1*0Sstevel@tonic-gate /* 2*0Sstevel@tonic-gate * CDDL HEADER START 3*0Sstevel@tonic-gate * 4*0Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*0Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 6*0Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 7*0Sstevel@tonic-gate * with the License. 8*0Sstevel@tonic-gate * 9*0Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*0Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 11*0Sstevel@tonic-gate * See the License for the specific language governing permissions 12*0Sstevel@tonic-gate * and limitations under the License. 13*0Sstevel@tonic-gate * 14*0Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 15*0Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*0Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 17*0Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 18*0Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 19*0Sstevel@tonic-gate * 20*0Sstevel@tonic-gate * CDDL HEADER END 21*0Sstevel@tonic-gate */ 22*0Sstevel@tonic-gate /* 23*0Sstevel@tonic-gate * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24*0Sstevel@tonic-gate * Use is subject to license terms. 25*0Sstevel@tonic-gate */ 26*0Sstevel@tonic-gate 27*0Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 28*0Sstevel@tonic-gate 29*0Sstevel@tonic-gate /* 30*0Sstevel@tonic-gate * If compiled without -DRF_INLINE_MACROS then needs -lm at link time 31*0Sstevel@tonic-gate * If compiled with -DRF_INLINE_MACROS then needs conv.il at compile time 32*0Sstevel@tonic-gate * (i.e. cc <compileer_flags> -DRF_INLINE_MACROS conv.il mont_mulf.c ) 33*0Sstevel@tonic-gate */ 34*0Sstevel@tonic-gate 35*0Sstevel@tonic-gate #include <sys/types.h> 36*0Sstevel@tonic-gate #include <math.h> 37*0Sstevel@tonic-gate 38*0Sstevel@tonic-gate static const double TwoTo16 = 65536.0; 39*0Sstevel@tonic-gate static const double TwoToMinus16 = 1.0/65536.0; 40*0Sstevel@tonic-gate static const double Zero = 0.0; 41*0Sstevel@tonic-gate static const double TwoTo32 = 65536.0 * 65536.0; 42*0Sstevel@tonic-gate static const double TwoToMinus32 = 1.0 / (65536.0 * 65536.0); 43*0Sstevel@tonic-gate 44*0Sstevel@tonic-gate #ifdef RF_INLINE_MACROS 45*0Sstevel@tonic-gate 46*0Sstevel@tonic-gate double upper32(double); 47*0Sstevel@tonic-gate double lower32(double, double); 48*0Sstevel@tonic-gate double mod(double, double, double); 49*0Sstevel@tonic-gate 50*0Sstevel@tonic-gate #else 51*0Sstevel@tonic-gate 52*0Sstevel@tonic-gate static double 53*0Sstevel@tonic-gate upper32(double x) 54*0Sstevel@tonic-gate { 55*0Sstevel@tonic-gate return (floor(x * TwoToMinus32)); 56*0Sstevel@tonic-gate } 57*0Sstevel@tonic-gate 58*0Sstevel@tonic-gate 59*0Sstevel@tonic-gate static double 60*0Sstevel@tonic-gate lower32(double x, double y) 61*0Sstevel@tonic-gate { 62*0Sstevel@tonic-gate return (x - TwoTo32 * floor(x * TwoToMinus32)); 63*0Sstevel@tonic-gate } 64*0Sstevel@tonic-gate 65*0Sstevel@tonic-gate static double 66*0Sstevel@tonic-gate mod(double x, double oneoverm, double m) 67*0Sstevel@tonic-gate { 68*0Sstevel@tonic-gate return (x - m * floor(x * oneoverm)); 69*0Sstevel@tonic-gate } 70*0Sstevel@tonic-gate 71*0Sstevel@tonic-gate #endif 72*0Sstevel@tonic-gate 73*0Sstevel@tonic-gate 74*0Sstevel@tonic-gate static void 75*0Sstevel@tonic-gate cleanup(double *dt, int from, int tlen) 76*0Sstevel@tonic-gate { 77*0Sstevel@tonic-gate int i; 78*0Sstevel@tonic-gate double tmp, tmp1, x, x1; 79*0Sstevel@tonic-gate 80*0Sstevel@tonic-gate tmp = tmp1 = Zero; 81*0Sstevel@tonic-gate 82*0Sstevel@tonic-gate for (i = 2 * from; i < 2 * tlen; i += 2) { 83*0Sstevel@tonic-gate x = dt[i]; 84*0Sstevel@tonic-gate x1 = dt[i + 1]; 85*0Sstevel@tonic-gate dt[i] = lower32(x, Zero) + tmp; 86*0Sstevel@tonic-gate dt[i + 1] = lower32(x1, Zero) + tmp1; 87*0Sstevel@tonic-gate tmp = upper32(x); 88*0Sstevel@tonic-gate tmp1 = upper32(x1); 89*0Sstevel@tonic-gate } 90*0Sstevel@tonic-gate } 91*0Sstevel@tonic-gate 92*0Sstevel@tonic-gate 93*0Sstevel@tonic-gate void 94*0Sstevel@tonic-gate conv_d16_to_i32(uint32_t *i32, double *d16, int64_t *tmp, int ilen) 95*0Sstevel@tonic-gate { 96*0Sstevel@tonic-gate int i; 97*0Sstevel@tonic-gate int64_t t, t1, /* using int64_t and not uint64_t */ 98*0Sstevel@tonic-gate a, b, c, d; /* because more efficient code is */ 99*0Sstevel@tonic-gate /* generated this way, and there */ 100*0Sstevel@tonic-gate /* is no overflow */ 101*0Sstevel@tonic-gate t1 = 0; 102*0Sstevel@tonic-gate a = (int64_t)d16[0]; 103*0Sstevel@tonic-gate b = (int64_t)d16[1]; 104*0Sstevel@tonic-gate for (i = 0; i < ilen - 1; i++) { 105*0Sstevel@tonic-gate c = (int64_t)d16[2 * i + 2]; 106*0Sstevel@tonic-gate t1 += a & 0xffffffff; 107*0Sstevel@tonic-gate t = (a >> 32); 108*0Sstevel@tonic-gate d = (int64_t)d16[2 * i + 3]; 109*0Sstevel@tonic-gate t1 += (b & 0xffff) << 16; 110*0Sstevel@tonic-gate t += (b >> 16) + (t1 >> 32); 111*0Sstevel@tonic-gate i32[i] = t1 & 0xffffffff; 112*0Sstevel@tonic-gate t1 = t; 113*0Sstevel@tonic-gate a = c; 114*0Sstevel@tonic-gate b = d; 115*0Sstevel@tonic-gate } 116*0Sstevel@tonic-gate t1 += a & 0xffffffff; 117*0Sstevel@tonic-gate t = (a >> 32); 118*0Sstevel@tonic-gate t1 += (b & 0xffff) << 16; 119*0Sstevel@tonic-gate i32[i] = t1 & 0xffffffff; 120*0Sstevel@tonic-gate } 121*0Sstevel@tonic-gate 122*0Sstevel@tonic-gate void 123*0Sstevel@tonic-gate conv_i32_to_d32(double *d32, uint32_t *i32, int len) 124*0Sstevel@tonic-gate { 125*0Sstevel@tonic-gate int i; 126*0Sstevel@tonic-gate 127*0Sstevel@tonic-gate #pragma pipeloop(0) 128*0Sstevel@tonic-gate for (i = 0; i < len; i++) 129*0Sstevel@tonic-gate d32[i] = (double)(i32[i]); 130*0Sstevel@tonic-gate } 131*0Sstevel@tonic-gate 132*0Sstevel@tonic-gate 133*0Sstevel@tonic-gate void 134*0Sstevel@tonic-gate conv_i32_to_d16(double *d16, uint32_t *i32, int len) 135*0Sstevel@tonic-gate { 136*0Sstevel@tonic-gate int i; 137*0Sstevel@tonic-gate uint32_t a; 138*0Sstevel@tonic-gate 139*0Sstevel@tonic-gate #pragma pipeloop(0) 140*0Sstevel@tonic-gate for (i = 0; i < len; i++) { 141*0Sstevel@tonic-gate a = i32[i]; 142*0Sstevel@tonic-gate d16[2 * i] = (double)(a & 0xffff); 143*0Sstevel@tonic-gate d16[2 * i + 1] = (double)(a >> 16); 144*0Sstevel@tonic-gate } 145*0Sstevel@tonic-gate } 146*0Sstevel@tonic-gate 147*0Sstevel@tonic-gate #ifdef RF_INLINE_MACROS 148*0Sstevel@tonic-gate 149*0Sstevel@tonic-gate void 150*0Sstevel@tonic-gate i16_to_d16_and_d32x4(const double *, /* 1/(2^16) */ 151*0Sstevel@tonic-gate const double *, /* 2^16 */ 152*0Sstevel@tonic-gate const double *, /* 0 */ 153*0Sstevel@tonic-gate double *, /* result16 */ 154*0Sstevel@tonic-gate double *, /* result32 */ 155*0Sstevel@tonic-gate float *); /* source - should be unsigned int* */ 156*0Sstevel@tonic-gate /* converted to float* */ 157*0Sstevel@tonic-gate 158*0Sstevel@tonic-gate #else 159*0Sstevel@tonic-gate 160*0Sstevel@tonic-gate 161*0Sstevel@tonic-gate static void 162*0Sstevel@tonic-gate i16_to_d16_and_d32x4(const double *dummy1, /* 1/(2^16) */ 163*0Sstevel@tonic-gate const double *dummy2, /* 2^16 */ 164*0Sstevel@tonic-gate const double *dummy3, /* 0 */ 165*0Sstevel@tonic-gate double *result16, 166*0Sstevel@tonic-gate double *result32, 167*0Sstevel@tonic-gate float *src) /* source - should be unsigned int* */ 168*0Sstevel@tonic-gate /* converted to float* */ 169*0Sstevel@tonic-gate { 170*0Sstevel@tonic-gate uint32_t *i32; 171*0Sstevel@tonic-gate uint32_t a, b, c, d; 172*0Sstevel@tonic-gate 173*0Sstevel@tonic-gate i32 = (uint32_t *)src; 174*0Sstevel@tonic-gate a = i32[0]; 175*0Sstevel@tonic-gate b = i32[1]; 176*0Sstevel@tonic-gate c = i32[2]; 177*0Sstevel@tonic-gate d = i32[3]; 178*0Sstevel@tonic-gate result16[0] = (double)(a & 0xffff); 179*0Sstevel@tonic-gate result16[1] = (double)(a >> 16); 180*0Sstevel@tonic-gate result32[0] = (double)a; 181*0Sstevel@tonic-gate result16[2] = (double)(b & 0xffff); 182*0Sstevel@tonic-gate result16[3] = (double)(b >> 16); 183*0Sstevel@tonic-gate result32[1] = (double)b; 184*0Sstevel@tonic-gate result16[4] = (double)(c & 0xffff); 185*0Sstevel@tonic-gate result16[5] = (double)(c >> 16); 186*0Sstevel@tonic-gate result32[2] = (double)c; 187*0Sstevel@tonic-gate result16[6] = (double)(d & 0xffff); 188*0Sstevel@tonic-gate result16[7] = (double)(d >> 16); 189*0Sstevel@tonic-gate result32[3] = (double)d; 190*0Sstevel@tonic-gate } 191*0Sstevel@tonic-gate 192*0Sstevel@tonic-gate #endif 193*0Sstevel@tonic-gate 194*0Sstevel@tonic-gate 195*0Sstevel@tonic-gate void 196*0Sstevel@tonic-gate conv_i32_to_d32_and_d16(double *d32, double *d16, uint32_t *i32, int len) 197*0Sstevel@tonic-gate { 198*0Sstevel@tonic-gate int i; 199*0Sstevel@tonic-gate uint32_t a; 200*0Sstevel@tonic-gate 201*0Sstevel@tonic-gate #pragma pipeloop(0) 202*0Sstevel@tonic-gate for (i = 0; i < len - 3; i += 4) { 203*0Sstevel@tonic-gate i16_to_d16_and_d32x4(&TwoToMinus16, &TwoTo16, &Zero, 204*0Sstevel@tonic-gate &(d16[2*i]), &(d32[i]), 205*0Sstevel@tonic-gate (float *)(&(i32[i]))); 206*0Sstevel@tonic-gate } 207*0Sstevel@tonic-gate for (; i < len; i++) { 208*0Sstevel@tonic-gate a = i32[i]; 209*0Sstevel@tonic-gate d32[i] = (double)(i32[i]); 210*0Sstevel@tonic-gate d16[2 * i] = (double)(a & 0xffff); 211*0Sstevel@tonic-gate d16[2 * i + 1] = (double)(a >> 16); 212*0Sstevel@tonic-gate } 213*0Sstevel@tonic-gate } 214*0Sstevel@tonic-gate 215*0Sstevel@tonic-gate 216*0Sstevel@tonic-gate static void 217*0Sstevel@tonic-gate adjust_montf_result(uint32_t *i32, uint32_t *nint, int len) 218*0Sstevel@tonic-gate { 219*0Sstevel@tonic-gate int64_t acc; 220*0Sstevel@tonic-gate int i; 221*0Sstevel@tonic-gate 222*0Sstevel@tonic-gate if (i32[len] > 0) 223*0Sstevel@tonic-gate i = -1; 224*0Sstevel@tonic-gate else { 225*0Sstevel@tonic-gate for (i = len - 1; i >= 0; i--) { 226*0Sstevel@tonic-gate if (i32[i] != nint[i]) break; 227*0Sstevel@tonic-gate } 228*0Sstevel@tonic-gate } 229*0Sstevel@tonic-gate if ((i < 0) || (i32[i] > nint[i])) { 230*0Sstevel@tonic-gate acc = 0; 231*0Sstevel@tonic-gate for (i = 0; i < len; i++) { 232*0Sstevel@tonic-gate acc = acc + (uint64_t)(i32[i]) - (uint64_t)(nint[i]); 233*0Sstevel@tonic-gate i32[i] = acc & 0xffffffff; 234*0Sstevel@tonic-gate acc = acc >> 32; 235*0Sstevel@tonic-gate } 236*0Sstevel@tonic-gate } 237*0Sstevel@tonic-gate } 238*0Sstevel@tonic-gate 239*0Sstevel@tonic-gate 240*0Sstevel@tonic-gate /* 241*0Sstevel@tonic-gate * the lengths of the input arrays should be at least the following: 242*0Sstevel@tonic-gate * result[nlen+1], dm1[nlen], dm2[2*nlen+1], dt[4*nlen+2], dn[nlen], nint[nlen] 243*0Sstevel@tonic-gate * all of them should be different from one another 244*0Sstevel@tonic-gate */ 245*0Sstevel@tonic-gate void mont_mulf_noconv(uint32_t *result, 246*0Sstevel@tonic-gate double *dm1, double *dm2, double *dt, 247*0Sstevel@tonic-gate double *dn, uint32_t *nint, 248*0Sstevel@tonic-gate int nlen, double dn0) 249*0Sstevel@tonic-gate { 250*0Sstevel@tonic-gate int i, j, jj; 251*0Sstevel@tonic-gate double digit, m2j, a, b; 252*0Sstevel@tonic-gate double *pdm1, *pdm2, *pdn, *pdtj, pdn_0, pdm1_0; 253*0Sstevel@tonic-gate 254*0Sstevel@tonic-gate pdm1 = &(dm1[0]); 255*0Sstevel@tonic-gate pdm2 = &(dm2[0]); 256*0Sstevel@tonic-gate pdn = &(dn[0]); 257*0Sstevel@tonic-gate pdm2[2 * nlen] = Zero; 258*0Sstevel@tonic-gate 259*0Sstevel@tonic-gate if (nlen != 16) { 260*0Sstevel@tonic-gate for (i = 0; i < 4 * nlen + 2; i++) 261*0Sstevel@tonic-gate dt[i] = Zero; 262*0Sstevel@tonic-gate a = dt[0] = pdm1[0] * pdm2[0]; 263*0Sstevel@tonic-gate digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16); 264*0Sstevel@tonic-gate 265*0Sstevel@tonic-gate pdtj = &(dt[0]); 266*0Sstevel@tonic-gate for (j = jj = 0; j < 2 * nlen; j++, jj++, pdtj++) { 267*0Sstevel@tonic-gate m2j = pdm2[j]; 268*0Sstevel@tonic-gate a = pdtj[0] + pdn[0] * digit; 269*0Sstevel@tonic-gate b = pdtj[1] + pdm1[0] * pdm2[j + 1] + a * TwoToMinus16; 270*0Sstevel@tonic-gate pdtj[1] = b; 271*0Sstevel@tonic-gate 272*0Sstevel@tonic-gate #pragma pipeloop(0) 273*0Sstevel@tonic-gate for (i = 1; i < nlen; i++) { 274*0Sstevel@tonic-gate pdtj[2 * i] += pdm1[i] * m2j + pdn[i] * digit; 275*0Sstevel@tonic-gate } 276*0Sstevel@tonic-gate if (jj == 30) { 277*0Sstevel@tonic-gate cleanup(dt, j / 2 + 1, 2 * nlen + 1); 278*0Sstevel@tonic-gate jj = 0; 279*0Sstevel@tonic-gate } 280*0Sstevel@tonic-gate 281*0Sstevel@tonic-gate digit = mod(lower32(b, Zero) * dn0, 282*0Sstevel@tonic-gate TwoToMinus16, TwoTo16); 283*0Sstevel@tonic-gate } 284*0Sstevel@tonic-gate } else { 285*0Sstevel@tonic-gate a = dt[0] = pdm1[0] * pdm2[0]; 286*0Sstevel@tonic-gate 287*0Sstevel@tonic-gate dt[65] = dt[64] = dt[63] = dt[62] = dt[61] = dt[60] = 288*0Sstevel@tonic-gate dt[59] = dt[58] = dt[57] = dt[56] = dt[55] = 289*0Sstevel@tonic-gate dt[54] = dt[53] = dt[52] = dt[51] = dt[50] = 290*0Sstevel@tonic-gate dt[49] = dt[48] = dt[47] = dt[46] = dt[45] = 291*0Sstevel@tonic-gate dt[44] = dt[43] = dt[42] = dt[41] = dt[40] = 292*0Sstevel@tonic-gate dt[39] = dt[38] = dt[37] = dt[36] = dt[35] = 293*0Sstevel@tonic-gate dt[34] = dt[33] = dt[32] = dt[31] = dt[30] = 294*0Sstevel@tonic-gate dt[29] = dt[28] = dt[27] = dt[26] = dt[25] = 295*0Sstevel@tonic-gate dt[24] = dt[23] = dt[22] = dt[21] = dt[20] = 296*0Sstevel@tonic-gate dt[19] = dt[18] = dt[17] = dt[16] = dt[15] = 297*0Sstevel@tonic-gate dt[14] = dt[13] = dt[12] = dt[11] = dt[10] = 298*0Sstevel@tonic-gate dt[9] = dt[8] = dt[7] = dt[6] = dt[5] = dt[4] = 299*0Sstevel@tonic-gate dt[3] = dt[2] = dt[1] = Zero; 300*0Sstevel@tonic-gate 301*0Sstevel@tonic-gate pdn_0 = pdn[0]; 302*0Sstevel@tonic-gate pdm1_0 = pdm1[0]; 303*0Sstevel@tonic-gate 304*0Sstevel@tonic-gate digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16); 305*0Sstevel@tonic-gate pdtj = &(dt[0]); 306*0Sstevel@tonic-gate 307*0Sstevel@tonic-gate for (j = 0; j < 32; j++, pdtj++) { 308*0Sstevel@tonic-gate 309*0Sstevel@tonic-gate m2j = pdm2[j]; 310*0Sstevel@tonic-gate a = pdtj[0] + pdn_0 * digit; 311*0Sstevel@tonic-gate b = pdtj[1] + pdm1_0 * pdm2[j + 1] + a * TwoToMinus16; 312*0Sstevel@tonic-gate pdtj[1] = b; 313*0Sstevel@tonic-gate 314*0Sstevel@tonic-gate pdtj[2] += pdm1[1] *m2j + pdn[1] * digit; 315*0Sstevel@tonic-gate pdtj[4] += pdm1[2] *m2j + pdn[2] * digit; 316*0Sstevel@tonic-gate pdtj[6] += pdm1[3] *m2j + pdn[3] * digit; 317*0Sstevel@tonic-gate pdtj[8] += pdm1[4] *m2j + pdn[4] * digit; 318*0Sstevel@tonic-gate pdtj[10] += pdm1[5] *m2j + pdn[5] * digit; 319*0Sstevel@tonic-gate pdtj[12] += pdm1[6] *m2j + pdn[6] * digit; 320*0Sstevel@tonic-gate pdtj[14] += pdm1[7] *m2j + pdn[7] * digit; 321*0Sstevel@tonic-gate pdtj[16] += pdm1[8] *m2j + pdn[8] * digit; 322*0Sstevel@tonic-gate pdtj[18] += pdm1[9] *m2j + pdn[9] * digit; 323*0Sstevel@tonic-gate pdtj[20] += pdm1[10] *m2j + pdn[10] * digit; 324*0Sstevel@tonic-gate pdtj[22] += pdm1[11] *m2j + pdn[11] * digit; 325*0Sstevel@tonic-gate pdtj[24] += pdm1[12] *m2j + pdn[12] * digit; 326*0Sstevel@tonic-gate pdtj[26] += pdm1[13] *m2j + pdn[13] * digit; 327*0Sstevel@tonic-gate pdtj[28] += pdm1[14] *m2j + pdn[14] * digit; 328*0Sstevel@tonic-gate pdtj[30] += pdm1[15] *m2j + pdn[15] * digit; 329*0Sstevel@tonic-gate /* no need for cleenup, cannot overflow */ 330*0Sstevel@tonic-gate digit = mod(lower32(b, Zero) * dn0, 331*0Sstevel@tonic-gate TwoToMinus16, TwoTo16); 332*0Sstevel@tonic-gate } 333*0Sstevel@tonic-gate } 334*0Sstevel@tonic-gate 335*0Sstevel@tonic-gate conv_d16_to_i32(result, dt + 2 * nlen, (int64_t *)dt, nlen + 1); 336*0Sstevel@tonic-gate adjust_montf_result(result, nint, nlen); 337*0Sstevel@tonic-gate } 338