16429Svs195195 /* 26429Svs195195 * CDDL HEADER START 36429Svs195195 * 46429Svs195195 * The contents of this file are subject to the terms of the 56429Svs195195 * Common Development and Distribution License (the "License"). 66429Svs195195 * You may not use this file except in compliance with the License. 76429Svs195195 * 86429Svs195195 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 96429Svs195195 * or http://www.opensolaris.org/os/licensing. 106429Svs195195 * See the License for the specific language governing permissions 116429Svs195195 * and limitations under the License. 126429Svs195195 * 136429Svs195195 * When distributing Covered Code, include this CDDL HEADER in each 146429Svs195195 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 156429Svs195195 * If applicable, add the following below this CDDL HEADER, with the 166429Svs195195 * fields enclosed by brackets "[]" replaced with your own identifying 176429Svs195195 * information: Portions Copyright [yyyy] [name of copyright owner] 186429Svs195195 * 196429Svs195195 * CDDL HEADER END 206429Svs195195 */ 216429Svs195195 226429Svs195195 /* 23*6491Sia112686 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 246429Svs195195 * Use is subject to license terms. 256429Svs195195 */ 266429Svs195195 276429Svs195195 #pragma ident "%Z%%M% %I% %E% SMI" 286429Svs195195 296429Svs195195 #include <errno.h> 306429Svs195195 #include <stdio.h> 316429Svs195195 #include <stdlib.h> 326429Svs195195 #include <sys/time.h> 336429Svs195195 #include <unistd.h> 346429Svs195195 #include <externs.h> 356429Svs195195 #include <fp.h> 366429Svs195195 #include <fps_ereport.h> 376429Svs195195 #include <fpstestmsg.h> 386429Svs195195 #include <linpack.h> 396429Svs195195 406429Svs195195 #ifdef __i386 416429Svs195195 #include "/shared/dp/mercury/latest/prod/include/cc/sunperf.h" 426429Svs195195 #else 436429Svs195195 #include <sunperf.h> 446429Svs195195 #endif 456429Svs195195 466429Svs195195 double fabs(double x); 476429Svs195195 486429Svs195195 extern void ___pl_dss_set_chip_cache_(int *cache_size); 496429Svs195195 static double dran(int iseed[4]); 506429Svs195195 static int LINSUB(REAL * residn, REAL * resid, 516429Svs195195 REAL * eps, REAL * x11, REAL * xn1, int fps_verbose_msg); 526429Svs195195 static int MATGEN(REAL a[], int lda, int n, REAL b[], REAL * norma); 536429Svs195195 static REAL EPSLON(REAL x); 546429Svs195195 static void MXPY(int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]); 556429Svs195195 566429Svs195195 extern int errno; 576429Svs195195 static int LAPACK_ECACHE_SIZE = 8 * 1024 * 1024; 586429Svs195195 static int MAT_SIZE; 596429Svs195195 606429Svs195195 /* 616429Svs195195 * LINPACK(int Stress, int unit, struct fps_test_ereport *report, 626429Svs195195 * int fps_verbose_msg) 636429Svs195195 * performs the single and double precision lapack test. If an 646429Svs195195 * error is found, relevant data is collected and stored in report. 656429Svs195195 */ 666429Svs195195 int 676429Svs195195 LINPACK(int Stress, int unit, struct fps_test_ereport *report, 686429Svs195195 int fps_verbose_msg) 696429Svs195195 { 706429Svs195195 char err_data[MAX_INFO_SIZE]; 716429Svs195195 char l_buf[64]; 726429Svs195195 int c_index; 736429Svs195195 int ret; 746429Svs195195 REAL eps; 756429Svs195195 REAL resid; 766429Svs195195 REAL residn; 776429Svs195195 REAL x11; 786429Svs195195 REAL xn1; 796429Svs195195 REAL EPS; 806429Svs195195 REAL RESID; 816429Svs195195 REAL RESIDN; 826429Svs195195 REAL X11; 836429Svs195195 REAL XN1; 846429Svs195195 uint64_t expected[5]; 856429Svs195195 uint64_t observed[5]; 866429Svs195195 876429Svs195195 #ifdef FPS_LAPA_UNK 886429Svs195195 #ifndef DP 896429Svs195195 if (Stress > 1000) 906429Svs195195 return (0); 916429Svs195195 #endif /* DP */ 926429Svs195195 #endif /* FPS_LAPA_UNK */ 936429Svs195195 946429Svs195195 if (Stress > 10000) 956429Svs195195 return (0); 966429Svs195195 976429Svs195195 /* 986429Svs195195 * make sure is no dependency on the E$ size Without this call the 996429Svs195195 * computed results will depend on the size of the E$ ( 1006429Svs195195 * sos10/libsunperf ) IIIi computed results != IV+/IV/III+/III ... 1016429Svs195195 */ 1026429Svs195195 ___pl_dss_set_chip_cache_(&LAPACK_ECACHE_SIZE); 1036429Svs195195 1046429Svs195195 c_index = Stress; 1056429Svs195195 1066429Svs195195 if (2000 == c_index) 1076429Svs195195 c_index = 1001; 1086429Svs195195 if (3000 == c_index) 1096429Svs195195 c_index = 1002; 1106429Svs195195 if (4016 == c_index) 1116429Svs195195 c_index = 1003; 1126429Svs195195 if (5000 == c_index) 1136429Svs195195 c_index = 1004; 1146429Svs195195 if (6000 == c_index) 1156429Svs195195 c_index = 1005; 1166429Svs195195 if (7016 == c_index) 1176429Svs195195 c_index = 1006; 1186429Svs195195 if (8034 == c_index) 1196429Svs195195 c_index = 1007; 1206429Svs195195 if (9000 == c_index) 1216429Svs195195 c_index = 1008; 1226429Svs195195 if (10000 == c_index) 1236429Svs195195 c_index = 1009; 1246429Svs195195 1256429Svs195195 (void) snprintf(l_buf, 63, "%s(%d,cpu=%d)", PREC, Stress, unit); 1266429Svs195195 fps_msg(fps_verbose_msg, gettext(FPSM_02), l_buf, unit); 1276429Svs195195 1286429Svs195195 MAT_SIZE = Stress; 1296429Svs195195 ret = LINSUB(&residn, &resid, &eps, &x11, &xn1, fps_verbose_msg); 1306429Svs195195 1316429Svs195195 if (2 == ret) { 1326429Svs195195 if (errno == EAGAIN || errno == ENOMEM) 1336429Svs195195 _exit(FPU_SYSCALL_TRYAGAIN); 1346429Svs195195 else 1356429Svs195195 _exit(FPU_SYSCALL_FAIL); 1366429Svs195195 } 1376429Svs195195 1386429Svs195195 #ifdef FPS_LAPA_UNK 1396429Svs195195 RESIDN = RESID = X11 = XN1 = 0.0000000000000000e+00; 1406429Svs195195 1416429Svs195195 #ifdef DP 1426429Svs195195 EPS = 2.2204460492503131e-16; 1436429Svs195195 #else /* DP */ 1446429Svs195195 EPS = 1.1920928955078125e-07; 1456429Svs195195 #endif /* DP */ 1466429Svs195195 1476429Svs195195 #else /* FPS_LAPA_UNK */ 1486429Svs195195 1496429Svs195195 RESIDN = LinpValsA[c_index].residn; 1506429Svs195195 RESID = LinpValsA[c_index].resid; 1516429Svs195195 EPS = LinpValsA[c_index].eps; 1526429Svs195195 X11 = LinpValsA[c_index].x11; 1536429Svs195195 XN1 = LinpValsA[c_index].xn1; 1546429Svs195195 1556429Svs195195 #endif /* FPS_LAPA_UNK */ 1566429Svs195195 1576429Svs195195 if ((residn == RESIDN) && (resid == RESID) && (eps == EPS) && 1586429Svs195195 (x11 == X11) && (xn1 == XN1)) { 1596429Svs195195 1606429Svs195195 return (0); 1616429Svs195195 } else { 1626429Svs195195 snprintf(err_data, sizeof (err_data), 1636429Svs195195 "\nExpected: %.16e, %.16e, %.16e, %.16e, %.16e" 1646429Svs195195 "\nObserved: %.16e, %.16e, %.16e, %.16e, %.16e", 1656429Svs195195 RESIDN, RESID, EPS, X11, XN1, residn, resid, eps, x11, xn1); 1666429Svs195195 1676429Svs195195 1686429Svs195195 #ifdef DP 1696429Svs195195 observed[0] = *(uint64_t *)&residn; 1706429Svs195195 observed[1] = *(uint64_t *)&resid; 1716429Svs195195 observed[2] = *(uint64_t *)&eps; 1726429Svs195195 observed[3] = *(uint64_t *)&x11; 1736429Svs195195 observed[4] = *(uint64_t *)&xn1; 1746429Svs195195 expected[0] = *(uint64_t *)&RESIDN; 1756429Svs195195 expected[1] = *(uint64_t *)&RESID; 1766429Svs195195 expected[2] = *(uint64_t *)&EPS; 1776429Svs195195 expected[3] = *(uint64_t *)&X11; 1786429Svs195195 expected[4] = *(uint64_t *)&XN1; 1796429Svs195195 1806429Svs195195 setup_fps_test_struct(IS_EREPORT_INFO, report, 1816429Svs195195 6317, &observed, &expected, 5, 5, err_data); 1826429Svs195195 #else 1836429Svs195195 observed[0] = (uint64_t)(*(uint32_t *)&residn); 1846429Svs195195 observed[1] = (uint64_t)(*(uint32_t *)&resid); 1856429Svs195195 observed[2] = (uint64_t)(*(uint32_t *)&eps); 1866429Svs195195 observed[3] = (uint64_t)(*(uint32_t *)&x11); 1876429Svs195195 observed[4] = (uint64_t)(*(uint32_t *)&xn1); 1886429Svs195195 expected[0] = (uint64_t)(*(uint32_t *)&RESIDN); 1896429Svs195195 expected[1] = (uint64_t)(*(uint32_t *)&RESID); 1906429Svs195195 expected[2] = (uint64_t)(*(uint32_t *)&EPS); 1916429Svs195195 expected[3] = (uint64_t)(*(uint32_t *)&X11); 1926429Svs195195 expected[4] = (uint64_t)(*(uint32_t *)&XN1); 1936429Svs195195 1946429Svs195195 setup_fps_test_struct(IS_EREPORT_INFO, report, 1956429Svs195195 6316, &observed, &expected, 5, 5, err_data); 1966429Svs195195 #endif 1976429Svs195195 1986429Svs195195 return (-1); 1996429Svs195195 } 2006429Svs195195 } 2016429Svs195195 2026429Svs195195 /* 2036429Svs195195 * LINSUB(REAL *residn, REAL *resid, REAL *eps, 2046429Svs195195 * REAL *x11, REAL *xn1, int fps_verbose_msg)begins 2056429Svs195195 * the lapack calculation calls. 2066429Svs195195 */ 2076429Svs195195 static int 2086429Svs195195 LINSUB(REAL *residn, REAL *resid, 2096429Svs195195 REAL *eps, REAL *x11, REAL *xn1, 2106429Svs195195 int fps_verbose_msg) 2116429Svs195195 { 2126429Svs195195 int i; 2136429Svs195195 int lda; 2146429Svs195195 int n; 2156429Svs195195 int nr_malloc; 2166429Svs195195 REAL *a; 2176429Svs195195 REAL abs; 2186429Svs195195 REAL *b; 2196429Svs195195 REAL norma; 2206429Svs195195 REAL normx; 2216429Svs195195 REAL *x; 2226429Svs195195 struct timeval timeout; 2236429Svs195195 long info; 2246429Svs195195 long *ipvt; 2256429Svs195195 2266429Svs195195 timeout.tv_sec = 0; 2276429Svs195195 timeout.tv_usec = 10000; /* microseconds, 10ms */ 2286429Svs195195 nr_malloc = 0; 2296429Svs195195 2306429Svs195195 mallocAgain: 2316429Svs195195 2326429Svs195195 a = (REAL *) malloc((MAT_SIZE + 8) * (MAT_SIZE + 1) * 2336429Svs195195 (size_t)sizeof (REAL)); 2346429Svs195195 b = (REAL *) malloc(MAT_SIZE * (size_t)sizeof (REAL)); 2356429Svs195195 x = (REAL *) malloc(MAT_SIZE * (size_t)sizeof (REAL)); 2366429Svs195195 2376429Svs195195 ipvt = (long *)malloc(MAT_SIZE * (size_t)sizeof (long)); 2386429Svs195195 2396429Svs195195 if ((NULL == a) || (NULL == b) || 2406429Svs195195 (NULL == x) || (NULL == ipvt)) { 2416429Svs195195 if (NULL != a) 2426429Svs195195 free(a); 2436429Svs195195 if (NULL != b) 2446429Svs195195 free(b); 2456429Svs195195 if (NULL != x) 2466429Svs195195 free(x); 2476429Svs195195 if (NULL != ipvt) 2486429Svs195195 free(ipvt); 2496429Svs195195 2506429Svs195195 /* sleep 10 ms. wait for 100 ms */ 2516429Svs195195 if (nr_malloc++ < 11) { 2526429Svs195195 (void) select(1, NULL, NULL, NULL, &timeout); 2536429Svs195195 goto mallocAgain; 2546429Svs195195 } 2556429Svs195195 fps_msg(fps_verbose_msg, 2566429Svs195195 "Malloc failed in lapack, matrix size %d", 2576429Svs195195 MAT_SIZE); 2586429Svs195195 2596429Svs195195 return (2); 2606429Svs195195 } 2616429Svs195195 lda = MAT_SIZE + 8; 2626429Svs195195 n = MAT_SIZE; 2636429Svs195195 2646429Svs195195 (void) MATGEN(a, lda, n, b, &norma); 2656429Svs195195 GEFA(n, n, a, lda, ipvt, &info); 2666429Svs195195 GESL('N', n, 1, a, lda, ipvt, b, n, &info); 2676429Svs195195 free(ipvt); 2686429Svs195195 2696429Svs195195 for (i = 0; i < n; i++) { 2706429Svs195195 x[i] = b[i]; 2716429Svs195195 } 2726429Svs195195 2736429Svs195195 (void) MATGEN((REAL *) a, lda, n, b, &norma); 2746429Svs195195 2756429Svs195195 for (i = 0; i < n; i++) { 2766429Svs195195 b[i] = -b[i]; 2776429Svs195195 } 2786429Svs195195 2796429Svs195195 MXPY(n, b, n, lda, x, (REAL *) a); 2806429Svs195195 free(a); 2816429Svs195195 2826429Svs195195 *resid = 0.0; 2836429Svs195195 normx = 0.0; 2846429Svs195195 2856429Svs195195 for (i = 0; i < n; i++) { 2866429Svs195195 abs = (REAL)fabs((double)b[i]); 2876429Svs195195 *resid = (*resid > abs) ? *resid : abs; 2886429Svs195195 abs = (REAL)fabs((double)x[i]); 2896429Svs195195 normx = (normx > abs) ? normx : abs; 2906429Svs195195 } 2916429Svs195195 2926429Svs195195 free(b); 2936429Svs195195 2946429Svs195195 *eps = EPSLON((REAL) LP_ONE); 2956429Svs195195 2966429Svs195195 *residn = *resid / (n * norma * normx * (*eps)); 2976429Svs195195 2986429Svs195195 *x11 = x[0] - 1; 2996429Svs195195 *xn1 = x[n - 1] - 1; 3006429Svs195195 3016429Svs195195 free(x); 3026429Svs195195 3036429Svs195195 return (0); 3046429Svs195195 } 3056429Svs195195 3066429Svs195195 /* 3076429Svs195195 * dran(int iseed[4]) returns a random real number from a 3086429Svs195195 * uniform (0,1) distribution. 3096429Svs195195 */ 3106429Svs195195 static double 3116429Svs195195 dran(int iseed[4]) 3126429Svs195195 { 3136429Svs195195 double r; 3146429Svs195195 double value; 3156429Svs195195 int ipw2; 3166429Svs195195 int it1; 3176429Svs195195 int it2; 3186429Svs195195 int it3; 3196429Svs195195 int it4; 3206429Svs195195 int m1; 3216429Svs195195 int m2; 3226429Svs195195 int m3; 3236429Svs195195 int m4; 3246429Svs195195 3256429Svs195195 /* Set constants */ 3266429Svs195195 m1 = 494; 3276429Svs195195 m2 = 322; 3286429Svs195195 m3 = 2508; 3296429Svs195195 m4 = 2549; 3306429Svs195195 ipw2 = 4096; 3316429Svs195195 r = 1.0 / ipw2; 3326429Svs195195 3336429Svs195195 /* multiply the seed by the multiplier modulo 2**48 */ 3346429Svs195195 it4 = iseed[3] * m4; 3356429Svs195195 it3 = it4 / ipw2; 3366429Svs195195 it4 = it4 - ipw2 * it3; 3376429Svs195195 it3 = it3 + iseed[2] * m4 + iseed[3] * m3; 3386429Svs195195 it2 = it3 / ipw2; 3396429Svs195195 it3 = it3 - ipw2 * it2; 3406429Svs195195 it2 = it2 + iseed[1] * m4 + iseed[2] * m3 + iseed[3] * m2; 3416429Svs195195 it1 = it2 / ipw2; 3426429Svs195195 it2 = it2 - ipw2 * it1; 3436429Svs195195 it1 = it1 + iseed[0] * m4 + iseed[1] * m3 + iseed[2] * m2 + 3446429Svs195195 iseed[3] * m1; 3456429Svs195195 it1 = it1 % ipw2; 3466429Svs195195 3476429Svs195195 /* return updated seed */ 3486429Svs195195 iseed[0] = it1; 3496429Svs195195 iseed[1] = it2; 3506429Svs195195 iseed[2] = it3; 3516429Svs195195 iseed[3] = it4; 3526429Svs195195 3536429Svs195195 /* convert 48-bit integer to a real number in the interval (0,1) */ 3546429Svs195195 value = r * ((double)it1 + r * ((double)it2 + r * ((double)it3 + 3556429Svs195195 r * ((double)it4)))); 3566429Svs195195 3576429Svs195195 return (value); 3586429Svs195195 } 3596429Svs195195 3606429Svs195195 /* 3616429Svs195195 * MATGEN(REAL a[], int lda, int n, REAL b[], REAL *norma) 3626429Svs195195 * generates matrix a and b. 3636429Svs195195 */ 3646429Svs195195 3656429Svs195195 #define ALPHA 1.68750 3666429Svs195195 static int 3676429Svs195195 MATGEN(REAL a[], int lda, int n, REAL b[], REAL *norma) 3686429Svs195195 { 3696429Svs195195 int i; 3706429Svs195195 int init[4]; 3716429Svs195195 int j; 3726429Svs195195 REAL value; 3736429Svs195195 3746429Svs195195 init[0] = 1; 3756429Svs195195 init[1] = 2; 3766429Svs195195 init[2] = 3; 3776429Svs195195 init[3] = 1325; 3786429Svs195195 *norma = LP_ZERO; 3796429Svs195195 for (j = 0; j < n; j++) { 3806429Svs195195 for (i = 0; i < n; i++) { 3816429Svs195195 #ifdef FPS_LAPA_UNK 3826429Svs195195 a[lda*j+i] = 3836429Svs195195 (i < j) ? (double)(i+1) : (double)(j+ALPHA); 3846429Svs195195 if (fabs(a[lda*j+i]) > *norma) 3856429Svs195195 *norma = fabs(a[lda*j+i]); 3866429Svs195195 } /* i */ 3876429Svs195195 #else 3886429Svs195195 value = (REAL) dran(init) - 0.5; 3896429Svs195195 a[lda * j + i] = value; 3906429Svs195195 value = fabs(value); 3916429Svs195195 if (value > *norma) { 3926429Svs195195 *norma = value; 3936429Svs195195 } 3946429Svs195195 } /* i */ 3956429Svs195195 #endif /* FPS_LAPA_UNK */ 3966429Svs195195 } /* j */ 3976429Svs195195 3986429Svs195195 3996429Svs195195 for (i = 0; i < n; i++) { 4006429Svs195195 b[i] = LP_ZERO; 4016429Svs195195 } 4026429Svs195195 for (j = 0; j < n; j++) { 4036429Svs195195 for (i = 0; i < n; i++) { 4046429Svs195195 b[i] = b[i] + a[lda * j + i]; 4056429Svs195195 } 4066429Svs195195 } 4076429Svs195195 4086429Svs195195 return (0); 4096429Svs195195 } 4106429Svs195195 4116429Svs195195 /* 4126429Svs195195 * IAMAX(int n, REAL dx[])finds the index of element 4136429Svs195195 * having maximum absolute value. 4146429Svs195195 */ 4156429Svs195195 int 4166429Svs195195 IAMAX(int n, REAL dx[]) 4176429Svs195195 { 4186429Svs195195 double abs; 4196429Svs195195 double dmax; 4206429Svs195195 int i; 4216429Svs195195 int itemp; 4226429Svs195195 4236429Svs195195 if (n < 1) 4246429Svs195195 return (-1); 4256429Svs195195 if (n == 1) 4266429Svs195195 return (0); 4276429Svs195195 4286429Svs195195 itemp = 0; 4296429Svs195195 dmax = fabs((double)dx[0]); 4306429Svs195195 4316429Svs195195 for (i = 1; i < n; i++) { 4326429Svs195195 abs = fabs((double)dx[i]); 4336429Svs195195 if (abs > dmax) { 4346429Svs195195 itemp = i; 4356429Svs195195 dmax = abs; 4366429Svs195195 } 4376429Svs195195 } 4386429Svs195195 4396429Svs195195 return (itemp); 4406429Svs195195 } 4416429Svs195195 4426429Svs195195 /* 4436429Svs195195 * EPSLON(REAL x) estimates unit roundoff in 4446429Svs195195 * quantities of size x. 4456429Svs195195 */ 4466429Svs195195 static REAL 4476429Svs195195 EPSLON(REAL x) 4486429Svs195195 { 4496429Svs195195 REAL a; 4506429Svs195195 REAL abs; 4516429Svs195195 REAL b; 4526429Svs195195 REAL c; 4536429Svs195195 REAL eps; 4546429Svs195195 4556429Svs195195 a = 4.0e0 / 3.0e0; 4566429Svs195195 eps = LP_ZERO; 4576429Svs195195 4586429Svs195195 while (eps == LP_ZERO) { 4596429Svs195195 b = a - LP_ONE; 4606429Svs195195 c = b + b + b; 4616429Svs195195 eps = (REAL)fabs((double)(c - LP_ONE)); 4626429Svs195195 } 4636429Svs195195 4646429Svs195195 abs = (REAL)fabs((double)x); 4656429Svs195195 4666429Svs195195 return (eps * abs); 4676429Svs195195 } 4686429Svs195195 4696429Svs195195 /* 4706429Svs195195 * MXPY(int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]) 4716429Svs195195 * multiplies matrix m times vector x and add the result to 4726429Svs195195 * vector y. 4736429Svs195195 */ 4746429Svs195195 static void 4756429Svs195195 MXPY(int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]) 4766429Svs195195 { 4776429Svs195195 int i; 4786429Svs195195 int j; 4796429Svs195195 int jmin; 4806429Svs195195 4816429Svs195195 /* cleanup odd vector */ 4826429Svs195195 j = n2 % 2; 4836429Svs195195 if (j >= 1) { 4846429Svs195195 j = j - 1; 4856429Svs195195 for (i = 0; i < n1; i++) 4866429Svs195195 y[i] = (y[i]) + x[j] * m[ldm * j + i]; 4876429Svs195195 } 4886429Svs195195 4896429Svs195195 /* cleanup odd group of two vectors */ 4906429Svs195195 j = n2 % 4; 4916429Svs195195 if (j >= 2) { 4926429Svs195195 j = j - 1; 4936429Svs195195 for (i = 0; i < n1; i++) 4946429Svs195195 y[i] = ((y[i]) 4956429Svs195195 + x[j - 1] * m[ldm * (j - 1) + i]) 4966429Svs195195 + x[j] * m[ldm * j + i]; 4976429Svs195195 } 4986429Svs195195 4996429Svs195195 /* cleanup odd group of four vectors */ 5006429Svs195195 j = n2 % 8; 5016429Svs195195 if (j >= 4) { 5026429Svs195195 j = j - 1; 5036429Svs195195 for (i = 0; i < n1; i++) 5046429Svs195195 y[i] = ((((y[i]) 5056429Svs195195 + x[j - 3] * m[ldm * (j - 3) + i]) 5066429Svs195195 + x[j - 2] * m[ldm * (j - 2) + i]) 5076429Svs195195 + x[j - 1] * m[ldm * (j - 1) + i]) 5086429Svs195195 + x[j] * m[ldm * j + i]; 5096429Svs195195 } 5106429Svs195195 5116429Svs195195 /* cleanup odd group of eight vectors */ 5126429Svs195195 j = n2 % 16; 5136429Svs195195 if (j >= 8) { 5146429Svs195195 j = j - 1; 5156429Svs195195 for (i = 0; i < n1; i++) 5166429Svs195195 y[i] = ((((((((y[i]) 5176429Svs195195 + x[j - 7] * m[ldm * (j - 7) + i]) 5186429Svs195195 + x[j - 6] * m[ldm * (j - 6) + i]) 5196429Svs195195 + x[j - 5] * m[ldm * (j - 5) + i]) 5206429Svs195195 + x[j - 4] * m[ldm * (j - 4) + i]) 5216429Svs195195 + x[j - 3] * m[ldm * (j - 3) + i]) 5226429Svs195195 + x[j - 2] * m[ldm * (j - 2) + i]) 5236429Svs195195 + x[j - 1] * m[ldm * (j - 1) + i]) 5246429Svs195195 + x[j] * m[ldm * j + i]; 5256429Svs195195 } 5266429Svs195195 5276429Svs195195 /* main loop - groups of sixteen vectors */ 5286429Svs195195 jmin = (n2 % 16) + 16; 5296429Svs195195 for (j = jmin - 1; j < n2; j = j + 16) { 5306429Svs195195 for (i = 0; i < n1; i++) 5316429Svs195195 y[i] = ((((((((((((((((y[i]) 5326429Svs195195 + x[j - 15] * m[ldm * (j - 15) + i]) 5336429Svs195195 + x[j - 14] * m[ldm * (j - 14) + i]) 5346429Svs195195 + x[j - 13] * m[ldm * (j - 13) + i]) 5356429Svs195195 + x[j - 12] * m[ldm * (j - 12) + i]) 5366429Svs195195 + x[j - 11] * m[ldm * (j - 11) + i]) 5376429Svs195195 + x[j - 10] * m[ldm * (j - 10) + i]) 5386429Svs195195 + x[j - 9] * m[ldm * (j - 9) + i]) 5396429Svs195195 + x[j - 8] * m[ldm * (j - 8) + i]) 5406429Svs195195 + x[j - 7] * m[ldm * (j - 7) + i]) 5416429Svs195195 + x[j - 6] * m[ldm * (j - 6) + i]) 5426429Svs195195 + x[j - 5] * m[ldm * (j - 5) + i]) 5436429Svs195195 + x[j - 4] * m[ldm * (j - 4) + i]) 5446429Svs195195 + x[j - 3] * m[ldm * (j - 3) + i]) 5456429Svs195195 + x[j - 2] * m[ldm * (j - 2) + i]) 5466429Svs195195 + x[j - 1] * m[ldm * (j - 1) + i]) 5476429Svs195195 + x[j] * m[ldm * j + i]; 5486429Svs195195 } 5496429Svs195195 } 550