xref: /onnv-gate/usr/src/cmd/fps/fptest/linpack.c (revision 6491:448e02e63395)
16429Svs195195 /*
26429Svs195195  * CDDL HEADER START
36429Svs195195  *
46429Svs195195  * The contents of this file are subject to the terms of the
56429Svs195195  * Common Development and Distribution License (the "License").
66429Svs195195  * You may not use this file except in compliance with the License.
76429Svs195195  *
86429Svs195195  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
96429Svs195195  * or http://www.opensolaris.org/os/licensing.
106429Svs195195  * See the License for the specific language governing permissions
116429Svs195195  * and limitations under the License.
126429Svs195195  *
136429Svs195195  * When distributing Covered Code, include this CDDL HEADER in each
146429Svs195195  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
156429Svs195195  * If applicable, add the following below this CDDL HEADER, with the
166429Svs195195  * fields enclosed by brackets "[]" replaced with your own identifying
176429Svs195195  * information: Portions Copyright [yyyy] [name of copyright owner]
186429Svs195195  *
196429Svs195195  * CDDL HEADER END
206429Svs195195  */
216429Svs195195 
226429Svs195195 /*
23*6491Sia112686  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
246429Svs195195  * Use is subject to license terms.
256429Svs195195  */
266429Svs195195 
276429Svs195195 #pragma ident	"%Z%%M%	%I%	%E% SMI"
286429Svs195195 
296429Svs195195 #include <errno.h>
306429Svs195195 #include <stdio.h>
316429Svs195195 #include <stdlib.h>
326429Svs195195 #include <sys/time.h>
336429Svs195195 #include <unistd.h>
346429Svs195195 #include <externs.h>
356429Svs195195 #include <fp.h>
366429Svs195195 #include <fps_ereport.h>
376429Svs195195 #include <fpstestmsg.h>
386429Svs195195 #include <linpack.h>
396429Svs195195 
406429Svs195195 #ifdef __i386
416429Svs195195 #include "/shared/dp/mercury/latest/prod/include/cc/sunperf.h"
426429Svs195195 #else
436429Svs195195 #include <sunperf.h>
446429Svs195195 #endif
456429Svs195195 
466429Svs195195 double fabs(double x);
476429Svs195195 
486429Svs195195 extern void	___pl_dss_set_chip_cache_(int *cache_size);
496429Svs195195 static double dran(int iseed[4]);
506429Svs195195 static int LINSUB(REAL * residn, REAL * resid,
516429Svs195195     REAL * eps, REAL * x11, REAL * xn1, int fps_verbose_msg);
526429Svs195195 static int MATGEN(REAL a[], int lda, int n, REAL b[], REAL * norma);
536429Svs195195 static REAL EPSLON(REAL x);
546429Svs195195 static void MXPY(int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]);
556429Svs195195 
566429Svs195195 extern int errno;
576429Svs195195 static int LAPACK_ECACHE_SIZE = 8 * 1024 * 1024;
586429Svs195195 static int MAT_SIZE;
596429Svs195195 
606429Svs195195 /*
616429Svs195195  * LINPACK(int Stress, int unit, struct fps_test_ereport *report,
626429Svs195195  * int fps_verbose_msg)
636429Svs195195  * performs the single and double precision lapack test. If an
646429Svs195195  * error is found, relevant data is collected and stored in report.
656429Svs195195  */
666429Svs195195 int
676429Svs195195 LINPACK(int Stress, int unit, struct fps_test_ereport *report,
686429Svs195195     int fps_verbose_msg)
696429Svs195195 {
706429Svs195195 	char err_data[MAX_INFO_SIZE];
716429Svs195195 	char l_buf[64];
726429Svs195195 	int c_index;
736429Svs195195 	int ret;
746429Svs195195 	REAL eps;
756429Svs195195 	REAL resid;
766429Svs195195 	REAL residn;
776429Svs195195 	REAL x11;
786429Svs195195 	REAL xn1;
796429Svs195195 	REAL EPS;
806429Svs195195 	REAL RESID;
816429Svs195195 	REAL RESIDN;
826429Svs195195 	REAL X11;
836429Svs195195 	REAL XN1;
846429Svs195195 	uint64_t expected[5];
856429Svs195195 	uint64_t observed[5];
866429Svs195195 
876429Svs195195 #ifdef  FPS_LAPA_UNK
886429Svs195195 #ifndef DP
896429Svs195195 	if (Stress > 1000)
906429Svs195195 			return (0);
916429Svs195195 #endif /* DP */
926429Svs195195 #endif /* FPS_LAPA_UNK */
936429Svs195195 
946429Svs195195 	if (Stress > 10000)
956429Svs195195 		return (0);
966429Svs195195 
976429Svs195195 	/*
986429Svs195195 	 * make sure is no dependency on the E$ size Without this call the
996429Svs195195 	 * computed results will depend on the size of the E$ (
1006429Svs195195 	 * sos10/libsunperf ) IIIi computed results != IV+/IV/III+/III ...
1016429Svs195195 	 */
1026429Svs195195 	___pl_dss_set_chip_cache_(&LAPACK_ECACHE_SIZE);
1036429Svs195195 
1046429Svs195195 	c_index = Stress;
1056429Svs195195 
1066429Svs195195 	if (2000 == c_index)
1076429Svs195195 		c_index = 1001;
1086429Svs195195 	if (3000 == c_index)
1096429Svs195195 		c_index = 1002;
1106429Svs195195 	if (4016 == c_index)
1116429Svs195195 		c_index = 1003;
1126429Svs195195 	if (5000 == c_index)
1136429Svs195195 		c_index = 1004;
1146429Svs195195 	if (6000 == c_index)
1156429Svs195195 		c_index = 1005;
1166429Svs195195 	if (7016 == c_index)
1176429Svs195195 		c_index = 1006;
1186429Svs195195 	if (8034 == c_index)
1196429Svs195195 		c_index = 1007;
1206429Svs195195 	if (9000 == c_index)
1216429Svs195195 		c_index = 1008;
1226429Svs195195 	if (10000 == c_index)
1236429Svs195195 		c_index = 1009;
1246429Svs195195 
1256429Svs195195 	(void) snprintf(l_buf, 63, "%s(%d,cpu=%d)", PREC, Stress, unit);
1266429Svs195195 	fps_msg(fps_verbose_msg, gettext(FPSM_02), l_buf, unit);
1276429Svs195195 
1286429Svs195195 	MAT_SIZE = Stress;
1296429Svs195195 	ret = LINSUB(&residn, &resid, &eps, &x11, &xn1, fps_verbose_msg);
1306429Svs195195 
1316429Svs195195 	if (2 == ret) {
1326429Svs195195 		if (errno == EAGAIN || errno == ENOMEM)
1336429Svs195195 			_exit(FPU_SYSCALL_TRYAGAIN);
1346429Svs195195 		else
1356429Svs195195 			_exit(FPU_SYSCALL_FAIL);
1366429Svs195195 	}
1376429Svs195195 
1386429Svs195195 #ifdef  FPS_LAPA_UNK
1396429Svs195195 	RESIDN  = RESID   = X11 = XN1 = 0.0000000000000000e+00;
1406429Svs195195 
1416429Svs195195 #ifdef DP
1426429Svs195195 	EPS = 2.2204460492503131e-16;
1436429Svs195195 #else /* DP */
1446429Svs195195 	EPS = 1.1920928955078125e-07;
1456429Svs195195 #endif /* DP */
1466429Svs195195 
1476429Svs195195 #else /* FPS_LAPA_UNK */
1486429Svs195195 
1496429Svs195195 	RESIDN = LinpValsA[c_index].residn;
1506429Svs195195 	RESID = LinpValsA[c_index].resid;
1516429Svs195195 	EPS = LinpValsA[c_index].eps;
1526429Svs195195 	X11 = LinpValsA[c_index].x11;
1536429Svs195195 	XN1 = LinpValsA[c_index].xn1;
1546429Svs195195 
1556429Svs195195 #endif /* FPS_LAPA_UNK */
1566429Svs195195 
1576429Svs195195 	if ((residn == RESIDN) && (resid == RESID) && (eps == EPS) &&
1586429Svs195195 	    (x11 == X11) && (xn1 == XN1)) {
1596429Svs195195 
1606429Svs195195 		return (0);
1616429Svs195195 	} else {
1626429Svs195195 		snprintf(err_data, sizeof (err_data),
1636429Svs195195 		    "\nExpected: %.16e, %.16e, %.16e, %.16e, %.16e"
1646429Svs195195 		    "\nObserved: %.16e, %.16e, %.16e, %.16e, %.16e",
1656429Svs195195 		    RESIDN, RESID, EPS, X11, XN1, residn, resid, eps, x11, xn1);
1666429Svs195195 
1676429Svs195195 
1686429Svs195195 #ifdef	DP
1696429Svs195195 		observed[0] = *(uint64_t *)&residn;
1706429Svs195195 		observed[1] = *(uint64_t *)&resid;
1716429Svs195195 		observed[2] = *(uint64_t *)&eps;
1726429Svs195195 		observed[3] = *(uint64_t *)&x11;
1736429Svs195195 		observed[4] = *(uint64_t *)&xn1;
1746429Svs195195 		expected[0] = *(uint64_t *)&RESIDN;
1756429Svs195195 		expected[1] = *(uint64_t *)&RESID;
1766429Svs195195 		expected[2] = *(uint64_t *)&EPS;
1776429Svs195195 		expected[3] = *(uint64_t *)&X11;
1786429Svs195195 		expected[4] = *(uint64_t *)&XN1;
1796429Svs195195 
1806429Svs195195 		setup_fps_test_struct(IS_EREPORT_INFO, report,
1816429Svs195195 		    6317, &observed, &expected, 5, 5, err_data);
1826429Svs195195 #else
1836429Svs195195 		observed[0] = (uint64_t)(*(uint32_t *)&residn);
1846429Svs195195 		observed[1] = (uint64_t)(*(uint32_t *)&resid);
1856429Svs195195 		observed[2] = (uint64_t)(*(uint32_t *)&eps);
1866429Svs195195 		observed[3] = (uint64_t)(*(uint32_t *)&x11);
1876429Svs195195 		observed[4] = (uint64_t)(*(uint32_t *)&xn1);
1886429Svs195195 		expected[0] = (uint64_t)(*(uint32_t *)&RESIDN);
1896429Svs195195 		expected[1] = (uint64_t)(*(uint32_t *)&RESID);
1906429Svs195195 		expected[2] = (uint64_t)(*(uint32_t *)&EPS);
1916429Svs195195 		expected[3] = (uint64_t)(*(uint32_t *)&X11);
1926429Svs195195 		expected[4] = (uint64_t)(*(uint32_t *)&XN1);
1936429Svs195195 
1946429Svs195195 		setup_fps_test_struct(IS_EREPORT_INFO, report,
1956429Svs195195 		    6316, &observed, &expected, 5, 5, err_data);
1966429Svs195195 #endif
1976429Svs195195 
1986429Svs195195 		return (-1);
1996429Svs195195 	}
2006429Svs195195 }
2016429Svs195195 
2026429Svs195195 /*
2036429Svs195195  * LINSUB(REAL *residn, REAL *resid, REAL *eps,
2046429Svs195195  * REAL *x11, REAL *xn1, int fps_verbose_msg)begins
2056429Svs195195  * the lapack calculation calls.
2066429Svs195195  */
2076429Svs195195 static int
2086429Svs195195 LINSUB(REAL *residn, REAL *resid,
2096429Svs195195 	REAL *eps, REAL *x11, REAL *xn1,
2106429Svs195195 	int fps_verbose_msg)
2116429Svs195195 {
2126429Svs195195 	int i;
2136429Svs195195 	int lda;
2146429Svs195195 	int n;
2156429Svs195195 	int nr_malloc;
2166429Svs195195 	REAL *a;
2176429Svs195195 	REAL abs;
2186429Svs195195 	REAL *b;
2196429Svs195195 	REAL norma;
2206429Svs195195 	REAL normx;
2216429Svs195195 	REAL *x;
2226429Svs195195 	struct timeval timeout;
2236429Svs195195 	long info;
2246429Svs195195 	long *ipvt;
2256429Svs195195 
2266429Svs195195 	timeout.tv_sec = 0;
2276429Svs195195 	timeout.tv_usec = 10000; /* microseconds, 10ms */
2286429Svs195195 	nr_malloc = 0;
2296429Svs195195 
2306429Svs195195 mallocAgain:
2316429Svs195195 
2326429Svs195195 	a = (REAL *) malloc((MAT_SIZE + 8) * (MAT_SIZE + 1) *
2336429Svs195195 	    (size_t)sizeof (REAL));
2346429Svs195195 	b = (REAL *) malloc(MAT_SIZE * (size_t)sizeof (REAL));
2356429Svs195195 	x = (REAL *) malloc(MAT_SIZE * (size_t)sizeof (REAL));
2366429Svs195195 
2376429Svs195195 	ipvt = (long *)malloc(MAT_SIZE * (size_t)sizeof (long));
2386429Svs195195 
2396429Svs195195 	if ((NULL == a) || (NULL == b) ||
2406429Svs195195 	    (NULL == x) || (NULL == ipvt)) {
2416429Svs195195 		if (NULL != a)
2426429Svs195195 			free(a);
2436429Svs195195 		if (NULL != b)
2446429Svs195195 			free(b);
2456429Svs195195 		if (NULL != x)
2466429Svs195195 			free(x);
2476429Svs195195 		if (NULL != ipvt)
2486429Svs195195 			free(ipvt);
2496429Svs195195 
2506429Svs195195 		/* sleep 10 ms. wait for 100 ms */
2516429Svs195195 		if (nr_malloc++ < 11) {
2526429Svs195195 			(void) select(1, NULL, NULL, NULL, &timeout);
2536429Svs195195 			goto mallocAgain;
2546429Svs195195 		}
2556429Svs195195 		fps_msg(fps_verbose_msg,
2566429Svs195195 		    "Malloc failed in lapack, matrix size %d",
2576429Svs195195 		    MAT_SIZE);
2586429Svs195195 
2596429Svs195195 		return (2);
2606429Svs195195 	}
2616429Svs195195 	lda = MAT_SIZE + 8;
2626429Svs195195 	n = MAT_SIZE;
2636429Svs195195 
2646429Svs195195 	(void) MATGEN(a, lda, n, b, &norma);
2656429Svs195195 	GEFA(n, n, a, lda, ipvt, &info);
2666429Svs195195 	GESL('N', n, 1, a, lda, ipvt, b, n, &info);
2676429Svs195195 	free(ipvt);
2686429Svs195195 
2696429Svs195195 	for (i = 0; i < n; i++) {
2706429Svs195195 		x[i] = b[i];
2716429Svs195195 	}
2726429Svs195195 
2736429Svs195195 	(void) MATGEN((REAL *) a, lda, n, b, &norma);
2746429Svs195195 
2756429Svs195195 	for (i = 0; i < n; i++) {
2766429Svs195195 		b[i] = -b[i];
2776429Svs195195 	}
2786429Svs195195 
2796429Svs195195 	MXPY(n, b, n, lda, x, (REAL *) a);
2806429Svs195195 	free(a);
2816429Svs195195 
2826429Svs195195 	*resid = 0.0;
2836429Svs195195 	normx = 0.0;
2846429Svs195195 
2856429Svs195195 	for (i = 0; i < n; i++) {
2866429Svs195195 		abs = (REAL)fabs((double)b[i]);
2876429Svs195195 		*resid = (*resid > abs) ? *resid : abs;
2886429Svs195195 		abs = (REAL)fabs((double)x[i]);
2896429Svs195195 		normx = (normx > abs) ? normx : abs;
2906429Svs195195 	}
2916429Svs195195 
2926429Svs195195 	free(b);
2936429Svs195195 
2946429Svs195195 	*eps = EPSLON((REAL) LP_ONE);
2956429Svs195195 
2966429Svs195195 	*residn = *resid / (n * norma * normx * (*eps));
2976429Svs195195 
2986429Svs195195 	*x11 = x[0] - 1;
2996429Svs195195 	*xn1 = x[n - 1] - 1;
3006429Svs195195 
3016429Svs195195 	free(x);
3026429Svs195195 
3036429Svs195195 	return (0);
3046429Svs195195 }
3056429Svs195195 
3066429Svs195195 /*
3076429Svs195195  * dran(int iseed[4]) returns a random real number from a
3086429Svs195195  * uniform (0,1) distribution.
3096429Svs195195  */
3106429Svs195195 static double
3116429Svs195195 dran(int iseed[4])
3126429Svs195195 {
3136429Svs195195 	double r;
3146429Svs195195 	double value;
3156429Svs195195 	int ipw2;
3166429Svs195195 	int it1;
3176429Svs195195 	int it2;
3186429Svs195195 	int it3;
3196429Svs195195 	int it4;
3206429Svs195195 	int m1;
3216429Svs195195 	int m2;
3226429Svs195195 	int m3;
3236429Svs195195 	int m4;
3246429Svs195195 
3256429Svs195195 	/* Set constants */
3266429Svs195195 	m1 = 494;
3276429Svs195195 	m2 = 322;
3286429Svs195195 	m3 = 2508;
3296429Svs195195 	m4 = 2549;
3306429Svs195195 	ipw2 = 4096;
3316429Svs195195 	r = 1.0 / ipw2;
3326429Svs195195 
3336429Svs195195 	/* multiply the seed by the multiplier modulo 2**48 */
3346429Svs195195 	it4 = iseed[3] * m4;
3356429Svs195195 	it3 = it4 / ipw2;
3366429Svs195195 	it4 = it4 - ipw2 * it3;
3376429Svs195195 	it3 = it3 + iseed[2] * m4 + iseed[3] * m3;
3386429Svs195195 	it2 = it3 / ipw2;
3396429Svs195195 	it3 = it3 - ipw2 * it2;
3406429Svs195195 	it2 = it2 + iseed[1] * m4 + iseed[2] * m3 + iseed[3] * m2;
3416429Svs195195 	it1 = it2 / ipw2;
3426429Svs195195 	it2 = it2 - ipw2 * it1;
3436429Svs195195 	it1 = it1 + iseed[0] * m4 + iseed[1] * m3 + iseed[2] * m2 +
3446429Svs195195 	    iseed[3] * m1;
3456429Svs195195 	it1 = it1 % ipw2;
3466429Svs195195 
3476429Svs195195 	/* return updated seed */
3486429Svs195195 	iseed[0] = it1;
3496429Svs195195 	iseed[1] = it2;
3506429Svs195195 	iseed[2] = it3;
3516429Svs195195 	iseed[3] = it4;
3526429Svs195195 
3536429Svs195195 	/* convert 48-bit integer to a real number in the interval (0,1) */
3546429Svs195195 	value = r * ((double)it1 + r * ((double)it2 + r * ((double)it3 +
3556429Svs195195 	    r * ((double)it4))));
3566429Svs195195 
3576429Svs195195 	return (value);
3586429Svs195195 }
3596429Svs195195 
3606429Svs195195 /*
3616429Svs195195  * MATGEN(REAL a[], int lda, int n, REAL b[], REAL *norma)
3626429Svs195195  * generates matrix a and b.
3636429Svs195195  */
3646429Svs195195 
3656429Svs195195 #define	ALPHA 1.68750
3666429Svs195195 static int
3676429Svs195195 MATGEN(REAL a[], int lda, int n, REAL b[], REAL *norma)
3686429Svs195195 {
3696429Svs195195 	int		i;
3706429Svs195195 	int		init[4];
3716429Svs195195 	int		j;
3726429Svs195195 	REAL	value;
3736429Svs195195 
3746429Svs195195 	init[0] = 1;
3756429Svs195195 	init[1] = 2;
3766429Svs195195 	init[2] = 3;
3776429Svs195195 	init[3] = 1325;
3786429Svs195195 	*norma = LP_ZERO;
3796429Svs195195 	for (j = 0; j < n; j++) {
3806429Svs195195 		for (i = 0; i < n; i++) {
3816429Svs195195 #ifdef FPS_LAPA_UNK
3826429Svs195195 			a[lda*j+i] =
3836429Svs195195 			    (i < j) ? (double)(i+1) : (double)(j+ALPHA);
3846429Svs195195 			if (fabs(a[lda*j+i]) > *norma)
3856429Svs195195 				*norma = fabs(a[lda*j+i]);
3866429Svs195195 			} /* i */
3876429Svs195195 #else
3886429Svs195195 			value = (REAL) dran(init) - 0.5;
3896429Svs195195 			a[lda * j + i] = value;
3906429Svs195195 			value = fabs(value);
3916429Svs195195 			if (value > *norma) {
3926429Svs195195 				*norma = value;
3936429Svs195195 			}
3946429Svs195195 		} /* i */
3956429Svs195195 #endif /* FPS_LAPA_UNK */
3966429Svs195195 	} /* j */
3976429Svs195195 
3986429Svs195195 
3996429Svs195195 	for (i = 0; i < n; i++) {
4006429Svs195195 		b[i] = LP_ZERO;
4016429Svs195195 	}
4026429Svs195195 	for (j = 0; j < n; j++) {
4036429Svs195195 		for (i = 0; i < n; i++) {
4046429Svs195195 			b[i] = b[i] + a[lda * j + i];
4056429Svs195195 		}
4066429Svs195195 	}
4076429Svs195195 
4086429Svs195195 	return (0);
4096429Svs195195 }
4106429Svs195195 
4116429Svs195195 /*
4126429Svs195195  * IAMAX(int n, REAL dx[])finds the index of element
4136429Svs195195  * having maximum absolute value.
4146429Svs195195  */
4156429Svs195195 int
4166429Svs195195 IAMAX(int n, REAL dx[])
4176429Svs195195 {
4186429Svs195195 	double abs;
4196429Svs195195 	double dmax;
4206429Svs195195 	int i;
4216429Svs195195 	int itemp;
4226429Svs195195 
4236429Svs195195 	if (n < 1)
4246429Svs195195 		return (-1);
4256429Svs195195 	if (n == 1)
4266429Svs195195 		return (0);
4276429Svs195195 
4286429Svs195195 	itemp = 0;
4296429Svs195195 	dmax = fabs((double)dx[0]);
4306429Svs195195 
4316429Svs195195 	for (i = 1; i < n; i++) {
4326429Svs195195 		abs = fabs((double)dx[i]);
4336429Svs195195 		if (abs > dmax) {
4346429Svs195195 			itemp = i;
4356429Svs195195 			dmax = abs;
4366429Svs195195 		}
4376429Svs195195 	}
4386429Svs195195 
4396429Svs195195 	return (itemp);
4406429Svs195195 }
4416429Svs195195 
4426429Svs195195 /*
4436429Svs195195  * EPSLON(REAL x) estimates unit roundoff in
4446429Svs195195  * quantities of size x.
4456429Svs195195  */
4466429Svs195195 static REAL
4476429Svs195195 EPSLON(REAL x)
4486429Svs195195 {
4496429Svs195195 	REAL a;
4506429Svs195195 	REAL abs;
4516429Svs195195 	REAL b;
4526429Svs195195 	REAL c;
4536429Svs195195 	REAL eps;
4546429Svs195195 
4556429Svs195195 	a = 4.0e0 / 3.0e0;
4566429Svs195195 	eps = LP_ZERO;
4576429Svs195195 
4586429Svs195195 	while (eps == LP_ZERO) {
4596429Svs195195 		b = a - LP_ONE;
4606429Svs195195 		c = b + b + b;
4616429Svs195195 		eps = (REAL)fabs((double)(c - LP_ONE));
4626429Svs195195 	}
4636429Svs195195 
4646429Svs195195 	abs = (REAL)fabs((double)x);
4656429Svs195195 
4666429Svs195195 	return (eps * abs);
4676429Svs195195 }
4686429Svs195195 
4696429Svs195195 /*
4706429Svs195195  * MXPY(int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
4716429Svs195195  * multiplies matrix m times vector x and add the result to
4726429Svs195195  * vector y.
4736429Svs195195  */
4746429Svs195195 static void
4756429Svs195195 MXPY(int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
4766429Svs195195 {
4776429Svs195195 	int i;
4786429Svs195195 	int j;
4796429Svs195195 	int jmin;
4806429Svs195195 
4816429Svs195195 	/* cleanup odd vector */
4826429Svs195195 	j = n2 % 2;
4836429Svs195195 	if (j >= 1) {
4846429Svs195195 		j = j - 1;
4856429Svs195195 		for (i = 0; i < n1; i++)
4866429Svs195195 			y[i] = (y[i]) + x[j] * m[ldm * j + i];
4876429Svs195195 	}
4886429Svs195195 
4896429Svs195195 	/* cleanup odd group of two vectors */
4906429Svs195195 	j = n2 % 4;
4916429Svs195195 	if (j >= 2) {
4926429Svs195195 		j = j - 1;
4936429Svs195195 		for (i = 0; i < n1; i++)
4946429Svs195195 			y[i] = ((y[i])
4956429Svs195195 			    + x[j - 1] * m[ldm * (j - 1) + i])
4966429Svs195195 			    + x[j] * m[ldm * j + i];
4976429Svs195195 	}
4986429Svs195195 
4996429Svs195195 	/* cleanup odd group of four vectors */
5006429Svs195195 	j = n2 % 8;
5016429Svs195195 	if (j >= 4) {
5026429Svs195195 		j = j - 1;
5036429Svs195195 		for (i = 0; i < n1; i++)
5046429Svs195195 			y[i] = ((((y[i])
5056429Svs195195 			    + x[j - 3] * m[ldm * (j - 3) + i])
5066429Svs195195 			    + x[j - 2] * m[ldm * (j - 2) + i])
5076429Svs195195 			    + x[j - 1] * m[ldm * (j - 1) + i])
5086429Svs195195 			    + x[j] * m[ldm * j + i];
5096429Svs195195 	}
5106429Svs195195 
5116429Svs195195 	/* cleanup odd group of eight vectors */
5126429Svs195195 	j = n2 % 16;
5136429Svs195195 	if (j >= 8) {
5146429Svs195195 		j = j - 1;
5156429Svs195195 		for (i = 0; i < n1; i++)
5166429Svs195195 			y[i] = ((((((((y[i])
5176429Svs195195 			    + x[j - 7] * m[ldm * (j - 7) + i])
5186429Svs195195 			    + x[j - 6] * m[ldm * (j - 6) + i])
5196429Svs195195 			    + x[j - 5] * m[ldm * (j - 5) + i])
5206429Svs195195 			    + x[j - 4] * m[ldm * (j - 4) + i])
5216429Svs195195 			    + x[j - 3] * m[ldm * (j - 3) + i])
5226429Svs195195 			    + x[j - 2] * m[ldm * (j - 2) + i])
5236429Svs195195 			    + x[j - 1] * m[ldm * (j - 1) + i])
5246429Svs195195 			    + x[j] * m[ldm * j + i];
5256429Svs195195 	}
5266429Svs195195 
5276429Svs195195 	/* main loop - groups of sixteen vectors */
5286429Svs195195 	jmin = (n2 % 16) + 16;
5296429Svs195195 	for (j = jmin - 1; j < n2; j = j + 16) {
5306429Svs195195 		for (i = 0; i < n1; i++)
5316429Svs195195 			y[i] = ((((((((((((((((y[i])
5326429Svs195195 			    + x[j - 15] * m[ldm * (j - 15) + i])
5336429Svs195195 			    + x[j - 14] * m[ldm * (j - 14) + i])
5346429Svs195195 			    + x[j - 13] * m[ldm * (j - 13) + i])
5356429Svs195195 			    + x[j - 12] * m[ldm * (j - 12) + i])
5366429Svs195195 			    + x[j - 11] * m[ldm * (j - 11) + i])
5376429Svs195195 			    + x[j - 10] * m[ldm * (j - 10) + i])
5386429Svs195195 			    + x[j - 9] * m[ldm * (j - 9) + i])
5396429Svs195195 			    + x[j - 8] * m[ldm * (j - 8) + i])
5406429Svs195195 			    + x[j - 7] * m[ldm * (j - 7) + i])
5416429Svs195195 			    + x[j - 6] * m[ldm * (j - 6) + i])
5426429Svs195195 			    + x[j - 5] * m[ldm * (j - 5) + i])
5436429Svs195195 			    + x[j - 4] * m[ldm * (j - 4) + i])
5446429Svs195195 			    + x[j - 3] * m[ldm * (j - 3) + i])
5456429Svs195195 			    + x[j - 2] * m[ldm * (j - 2) + i])
5466429Svs195195 			    + x[j - 1] * m[ldm * (j - 1) + i])
5476429Svs195195 			    + x[j] * m[ldm * j + i];
5486429Svs195195 	}
5496429Svs195195 }
550