xref: /netbsd-src/external/gpl3/gcc/dist/libgfortran/generated/matmul_r17.c (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1*b1e83836Smrg /* Implementation of the MATMUL intrinsic
2*b1e83836Smrg    Copyright (C) 2002-2022 Free Software Foundation, Inc.
3*b1e83836Smrg    Contributed by Paul Brook <paul@nowt.org>
4*b1e83836Smrg 
5*b1e83836Smrg This file is part of the GNU Fortran runtime library (libgfortran).
6*b1e83836Smrg 
7*b1e83836Smrg Libgfortran is free software; you can redistribute it and/or
8*b1e83836Smrg modify it under the terms of the GNU General Public
9*b1e83836Smrg License as published by the Free Software Foundation; either
10*b1e83836Smrg version 3 of the License, or (at your option) any later version.
11*b1e83836Smrg 
12*b1e83836Smrg Libgfortran is distributed in the hope that it will be useful,
13*b1e83836Smrg but WITHOUT ANY WARRANTY; without even the implied warranty of
14*b1e83836Smrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15*b1e83836Smrg GNU General Public License for more details.
16*b1e83836Smrg 
17*b1e83836Smrg Under Section 7 of GPL version 3, you are granted additional
18*b1e83836Smrg permissions described in the GCC Runtime Library Exception, version
19*b1e83836Smrg 3.1, as published by the Free Software Foundation.
20*b1e83836Smrg 
21*b1e83836Smrg You should have received a copy of the GNU General Public License and
22*b1e83836Smrg a copy of the GCC Runtime Library Exception along with this program;
23*b1e83836Smrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24*b1e83836Smrg <http://www.gnu.org/licenses/>.  */
25*b1e83836Smrg 
26*b1e83836Smrg #include "libgfortran.h"
27*b1e83836Smrg #include <string.h>
28*b1e83836Smrg #include <assert.h>
29*b1e83836Smrg 
30*b1e83836Smrg 
31*b1e83836Smrg #if defined (HAVE_GFC_REAL_17)
32*b1e83836Smrg 
33*b1e83836Smrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34*b1e83836Smrg    passed to us by the front-end, in which case we call it for large
35*b1e83836Smrg    matrices.  */
36*b1e83836Smrg 
37*b1e83836Smrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38*b1e83836Smrg                           const int *, const GFC_REAL_17 *, const GFC_REAL_17 *,
39*b1e83836Smrg                           const int *, const GFC_REAL_17 *, const int *,
40*b1e83836Smrg                           const GFC_REAL_17 *, GFC_REAL_17 *, const int *,
41*b1e83836Smrg                           int, int);
42*b1e83836Smrg 
43*b1e83836Smrg /* The order of loops is different in the case of plain matrix
44*b1e83836Smrg    multiplication C=MATMUL(A,B), and in the frequent special case where
45*b1e83836Smrg    the argument A is the temporary result of a TRANSPOSE intrinsic:
46*b1e83836Smrg    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
47*b1e83836Smrg    looking at their strides.
48*b1e83836Smrg 
49*b1e83836Smrg    The equivalent Fortran pseudo-code is:
50*b1e83836Smrg 
51*b1e83836Smrg    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52*b1e83836Smrg    IF (.NOT.IS_TRANSPOSED(A)) THEN
53*b1e83836Smrg      C = 0
54*b1e83836Smrg      DO J=1,N
55*b1e83836Smrg        DO K=1,COUNT
56*b1e83836Smrg          DO I=1,M
57*b1e83836Smrg            C(I,J) = C(I,J)+A(I,K)*B(K,J)
58*b1e83836Smrg    ELSE
59*b1e83836Smrg      DO J=1,N
60*b1e83836Smrg        DO I=1,M
61*b1e83836Smrg          S = 0
62*b1e83836Smrg          DO K=1,COUNT
63*b1e83836Smrg            S = S+A(I,K)*B(K,J)
64*b1e83836Smrg          C(I,J) = S
65*b1e83836Smrg    ENDIF
66*b1e83836Smrg */
67*b1e83836Smrg 
68*b1e83836Smrg /* If try_blas is set to a nonzero value, then the matmul function will
69*b1e83836Smrg    see if there is a way to perform the matrix multiplication by a call
70*b1e83836Smrg    to the BLAS gemm function.  */
71*b1e83836Smrg 
72*b1e83836Smrg extern void matmul_r17 (gfc_array_r17 * const restrict retarray,
73*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
74*b1e83836Smrg 	int blas_limit, blas_call gemm);
75*b1e83836Smrg export_proto(matmul_r17);
76*b1e83836Smrg 
77*b1e83836Smrg /* Put exhaustive list of possible architectures here here, ORed together.  */
78*b1e83836Smrg 
79*b1e83836Smrg #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80*b1e83836Smrg 
81*b1e83836Smrg #ifdef HAVE_AVX
82*b1e83836Smrg static void
83*b1e83836Smrg matmul_r17_avx (gfc_array_r17 * const restrict retarray,
84*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
85*b1e83836Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86*b1e83836Smrg static void
matmul_r17_avx(gfc_array_r17 * const restrict retarray,gfc_array_r17 * const restrict a,gfc_array_r17 * const restrict b,int try_blas,int blas_limit,blas_call gemm)87*b1e83836Smrg matmul_r17_avx (gfc_array_r17 * const restrict retarray,
88*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
89*b1e83836Smrg 	int blas_limit, blas_call gemm)
90*b1e83836Smrg {
91*b1e83836Smrg   const GFC_REAL_17 * restrict abase;
92*b1e83836Smrg   const GFC_REAL_17 * restrict bbase;
93*b1e83836Smrg   GFC_REAL_17 * restrict dest;
94*b1e83836Smrg 
95*b1e83836Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96*b1e83836Smrg   index_type x, y, n, count, xcount, ycount;
97*b1e83836Smrg 
98*b1e83836Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
99*b1e83836Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
100*b1e83836Smrg 
101*b1e83836Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102*b1e83836Smrg 
103*b1e83836Smrg    Either A or B (but not both) can be rank 1:
104*b1e83836Smrg 
105*b1e83836Smrg    o One-dimensional argument A is implicitly treated as a row matrix
106*b1e83836Smrg      dimensioned [1,count], so xcount=1.
107*b1e83836Smrg 
108*b1e83836Smrg    o One-dimensional argument B is implicitly treated as a column matrix
109*b1e83836Smrg      dimensioned [count, 1], so ycount=1.
110*b1e83836Smrg */
111*b1e83836Smrg 
112*b1e83836Smrg   if (retarray->base_addr == NULL)
113*b1e83836Smrg     {
114*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
115*b1e83836Smrg         {
116*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
117*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118*b1e83836Smrg         }
119*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
120*b1e83836Smrg         {
121*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
122*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123*b1e83836Smrg         }
124*b1e83836Smrg       else
125*b1e83836Smrg         {
126*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
127*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128*b1e83836Smrg 
129*b1e83836Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
130*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131*b1e83836Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
132*b1e83836Smrg         }
133*b1e83836Smrg 
134*b1e83836Smrg       retarray->base_addr
135*b1e83836Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_17));
136*b1e83836Smrg       retarray->offset = 0;
137*b1e83836Smrg     }
138*b1e83836Smrg   else if (unlikely (compile_options.bounds_check))
139*b1e83836Smrg     {
140*b1e83836Smrg       index_type ret_extent, arg_extent;
141*b1e83836Smrg 
142*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
143*b1e83836Smrg 	{
144*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146*b1e83836Smrg 	  if (arg_extent != ret_extent)
147*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
148*b1e83836Smrg 	    		   "array (%ld/%ld) ",
149*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
150*b1e83836Smrg 	}
151*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
152*b1e83836Smrg 	{
153*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155*b1e83836Smrg 	  if (arg_extent != ret_extent)
156*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
157*b1e83836Smrg 	    		   "array (%ld/%ld) ",
158*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
159*b1e83836Smrg 	}
160*b1e83836Smrg       else
161*b1e83836Smrg 	{
162*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164*b1e83836Smrg 	  if (arg_extent != ret_extent)
165*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
166*b1e83836Smrg 	    		   "array (%ld/%ld) ",
167*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
168*b1e83836Smrg 
169*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171*b1e83836Smrg 	  if (arg_extent != ret_extent)
172*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
173*b1e83836Smrg 	    		   "array (%ld/%ld) ",
174*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
175*b1e83836Smrg 	}
176*b1e83836Smrg     }
177*b1e83836Smrg 
178*b1e83836Smrg 
179*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180*b1e83836Smrg     {
181*b1e83836Smrg       /* One-dimensional result may be addressed in the code below
182*b1e83836Smrg 	 either as a row or a column matrix. We want both cases to
183*b1e83836Smrg 	 work. */
184*b1e83836Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185*b1e83836Smrg     }
186*b1e83836Smrg   else
187*b1e83836Smrg     {
188*b1e83836Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189*b1e83836Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190*b1e83836Smrg     }
191*b1e83836Smrg 
192*b1e83836Smrg 
193*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
194*b1e83836Smrg     {
195*b1e83836Smrg       /* Treat it as a a row matrix A[1,count]. */
196*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197*b1e83836Smrg       aystride = 1;
198*b1e83836Smrg 
199*b1e83836Smrg       xcount = 1;
200*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
201*b1e83836Smrg     }
202*b1e83836Smrg   else
203*b1e83836Smrg     {
204*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205*b1e83836Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206*b1e83836Smrg 
207*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
208*b1e83836Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209*b1e83836Smrg     }
210*b1e83836Smrg 
211*b1e83836Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212*b1e83836Smrg     {
213*b1e83836Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214*b1e83836Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215*b1e83836Smrg 		       "in dimension 1: is %ld, should be %ld",
216*b1e83836Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217*b1e83836Smrg     }
218*b1e83836Smrg 
219*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
220*b1e83836Smrg     {
221*b1e83836Smrg       /* Treat it as a column matrix B[count,1] */
222*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223*b1e83836Smrg 
224*b1e83836Smrg       /* bystride should never be used for 1-dimensional b.
225*b1e83836Smrg          The value is only used for calculation of the
226*b1e83836Smrg          memory by the buffer.  */
227*b1e83836Smrg       bystride = 256;
228*b1e83836Smrg       ycount = 1;
229*b1e83836Smrg     }
230*b1e83836Smrg   else
231*b1e83836Smrg     {
232*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233*b1e83836Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234*b1e83836Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235*b1e83836Smrg     }
236*b1e83836Smrg 
237*b1e83836Smrg   abase = a->base_addr;
238*b1e83836Smrg   bbase = b->base_addr;
239*b1e83836Smrg   dest = retarray->base_addr;
240*b1e83836Smrg 
241*b1e83836Smrg   /* Now that everything is set up, we perform the multiplication
242*b1e83836Smrg      itself.  */
243*b1e83836Smrg 
244*b1e83836Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245*b1e83836Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
246*b1e83836Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
247*b1e83836Smrg 
248*b1e83836Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249*b1e83836Smrg       && (bxstride == 1 || bystride == 1)
250*b1e83836Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
251*b1e83836Smrg           > POW3(blas_limit)))
252*b1e83836Smrg     {
253*b1e83836Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
254*b1e83836Smrg       const GFC_REAL_17 one = 1, zero = 0;
255*b1e83836Smrg       const int lda = (axstride == 1) ? aystride : axstride,
256*b1e83836Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
257*b1e83836Smrg 
258*b1e83836Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259*b1e83836Smrg 	{
260*b1e83836Smrg 	  assert (gemm != NULL);
261*b1e83836Smrg 	  const char *transa, *transb;
262*b1e83836Smrg 	  if (try_blas & 2)
263*b1e83836Smrg 	    transa = "C";
264*b1e83836Smrg 	  else
265*b1e83836Smrg 	    transa = axstride == 1 ? "N" : "T";
266*b1e83836Smrg 
267*b1e83836Smrg 	  if (try_blas & 4)
268*b1e83836Smrg 	    transb = "C";
269*b1e83836Smrg 	  else
270*b1e83836Smrg 	    transb = bxstride == 1 ? "N" : "T";
271*b1e83836Smrg 
272*b1e83836Smrg 	  gemm (transa, transb , &m,
273*b1e83836Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
274*b1e83836Smrg 		&ldc, 1, 1);
275*b1e83836Smrg 	  return;
276*b1e83836Smrg 	}
277*b1e83836Smrg     }
278*b1e83836Smrg 
279*b1e83836Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
280*b1e83836Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
281*b1e83836Smrg     {
282*b1e83836Smrg       /* This block of code implements a tuned matmul, derived from
283*b1e83836Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
284*b1e83836Smrg 
285*b1e83836Smrg                Bo Kagstrom and Per Ling
286*b1e83836Smrg                Department of Computing Science
287*b1e83836Smrg                Umea University
288*b1e83836Smrg                S-901 87 Umea, Sweden
289*b1e83836Smrg 
290*b1e83836Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
291*b1e83836Smrg 
292*b1e83836Smrg       const GFC_REAL_17 *a, *b;
293*b1e83836Smrg       GFC_REAL_17 *c;
294*b1e83836Smrg       const index_type m = xcount, n = ycount, k = count;
295*b1e83836Smrg 
296*b1e83836Smrg       /* System generated locals */
297*b1e83836Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
298*b1e83836Smrg 		 i1, i2, i3, i4, i5, i6;
299*b1e83836Smrg 
300*b1e83836Smrg       /* Local variables */
301*b1e83836Smrg       GFC_REAL_17 f11, f12, f21, f22, f31, f32, f41, f42,
302*b1e83836Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
303*b1e83836Smrg       index_type i, j, l, ii, jj, ll;
304*b1e83836Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
305*b1e83836Smrg       GFC_REAL_17 *t1;
306*b1e83836Smrg 
307*b1e83836Smrg       a = abase;
308*b1e83836Smrg       b = bbase;
309*b1e83836Smrg       c = retarray->base_addr;
310*b1e83836Smrg 
311*b1e83836Smrg       /* Parameter adjustments */
312*b1e83836Smrg       c_dim1 = rystride;
313*b1e83836Smrg       c_offset = 1 + c_dim1;
314*b1e83836Smrg       c -= c_offset;
315*b1e83836Smrg       a_dim1 = aystride;
316*b1e83836Smrg       a_offset = 1 + a_dim1;
317*b1e83836Smrg       a -= a_offset;
318*b1e83836Smrg       b_dim1 = bystride;
319*b1e83836Smrg       b_offset = 1 + b_dim1;
320*b1e83836Smrg       b -= b_offset;
321*b1e83836Smrg 
322*b1e83836Smrg       /* Empty c first.  */
323*b1e83836Smrg       for (j=1; j<=n; j++)
324*b1e83836Smrg 	for (i=1; i<=m; i++)
325*b1e83836Smrg 	  c[i + j * c_dim1] = (GFC_REAL_17)0;
326*b1e83836Smrg 
327*b1e83836Smrg       /* Early exit if possible */
328*b1e83836Smrg       if (m == 0 || n == 0 || k == 0)
329*b1e83836Smrg 	return;
330*b1e83836Smrg 
331*b1e83836Smrg       /* Adjust size of t1 to what is needed.  */
332*b1e83836Smrg       index_type t1_dim, a_sz;
333*b1e83836Smrg       if (aystride == 1)
334*b1e83836Smrg         a_sz = rystride;
335*b1e83836Smrg       else
336*b1e83836Smrg         a_sz = a_dim1;
337*b1e83836Smrg 
338*b1e83836Smrg       t1_dim = a_sz * 256 + b_dim1;
339*b1e83836Smrg       if (t1_dim > 65536)
340*b1e83836Smrg 	t1_dim = 65536;
341*b1e83836Smrg 
342*b1e83836Smrg       t1 = malloc (t1_dim * sizeof(GFC_REAL_17));
343*b1e83836Smrg 
344*b1e83836Smrg       /* Start turning the crank. */
345*b1e83836Smrg       i1 = n;
346*b1e83836Smrg       for (jj = 1; jj <= i1; jj += 512)
347*b1e83836Smrg 	{
348*b1e83836Smrg 	  /* Computing MIN */
349*b1e83836Smrg 	  i2 = 512;
350*b1e83836Smrg 	  i3 = n - jj + 1;
351*b1e83836Smrg 	  jsec = min(i2,i3);
352*b1e83836Smrg 	  ujsec = jsec - jsec % 4;
353*b1e83836Smrg 	  i2 = k;
354*b1e83836Smrg 	  for (ll = 1; ll <= i2; ll += 256)
355*b1e83836Smrg 	    {
356*b1e83836Smrg 	      /* Computing MIN */
357*b1e83836Smrg 	      i3 = 256;
358*b1e83836Smrg 	      i4 = k - ll + 1;
359*b1e83836Smrg 	      lsec = min(i3,i4);
360*b1e83836Smrg 	      ulsec = lsec - lsec % 2;
361*b1e83836Smrg 
362*b1e83836Smrg 	      i3 = m;
363*b1e83836Smrg 	      for (ii = 1; ii <= i3; ii += 256)
364*b1e83836Smrg 		{
365*b1e83836Smrg 		  /* Computing MIN */
366*b1e83836Smrg 		  i4 = 256;
367*b1e83836Smrg 		  i5 = m - ii + 1;
368*b1e83836Smrg 		  isec = min(i4,i5);
369*b1e83836Smrg 		  uisec = isec - isec % 2;
370*b1e83836Smrg 		  i4 = ll + ulsec - 1;
371*b1e83836Smrg 		  for (l = ll; l <= i4; l += 2)
372*b1e83836Smrg 		    {
373*b1e83836Smrg 		      i5 = ii + uisec - 1;
374*b1e83836Smrg 		      for (i = ii; i <= i5; i += 2)
375*b1e83836Smrg 			{
376*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
377*b1e83836Smrg 					a[i + l * a_dim1];
378*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
379*b1e83836Smrg 					a[i + (l + 1) * a_dim1];
380*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
381*b1e83836Smrg 					a[i + 1 + l * a_dim1];
382*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
383*b1e83836Smrg 					a[i + 1 + (l + 1) * a_dim1];
384*b1e83836Smrg 			}
385*b1e83836Smrg 		      if (uisec < isec)
386*b1e83836Smrg 			{
387*b1e83836Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
388*b1e83836Smrg 				    a[ii + isec - 1 + l * a_dim1];
389*b1e83836Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
390*b1e83836Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
391*b1e83836Smrg 			}
392*b1e83836Smrg 		    }
393*b1e83836Smrg 		  if (ulsec < lsec)
394*b1e83836Smrg 		    {
395*b1e83836Smrg 		      i4 = ii + isec - 1;
396*b1e83836Smrg 		      for (i = ii; i<= i4; ++i)
397*b1e83836Smrg 			{
398*b1e83836Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
399*b1e83836Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
400*b1e83836Smrg 			}
401*b1e83836Smrg 		    }
402*b1e83836Smrg 
403*b1e83836Smrg 		  uisec = isec - isec % 4;
404*b1e83836Smrg 		  i4 = jj + ujsec - 1;
405*b1e83836Smrg 		  for (j = jj; j <= i4; j += 4)
406*b1e83836Smrg 		    {
407*b1e83836Smrg 		      i5 = ii + uisec - 1;
408*b1e83836Smrg 		      for (i = ii; i <= i5; i += 4)
409*b1e83836Smrg 			{
410*b1e83836Smrg 			  f11 = c[i + j * c_dim1];
411*b1e83836Smrg 			  f21 = c[i + 1 + j * c_dim1];
412*b1e83836Smrg 			  f12 = c[i + (j + 1) * c_dim1];
413*b1e83836Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
414*b1e83836Smrg 			  f13 = c[i + (j + 2) * c_dim1];
415*b1e83836Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
416*b1e83836Smrg 			  f14 = c[i + (j + 3) * c_dim1];
417*b1e83836Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
418*b1e83836Smrg 			  f31 = c[i + 2 + j * c_dim1];
419*b1e83836Smrg 			  f41 = c[i + 3 + j * c_dim1];
420*b1e83836Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
421*b1e83836Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
422*b1e83836Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
423*b1e83836Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
424*b1e83836Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
425*b1e83836Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
426*b1e83836Smrg 			  i6 = ll + lsec - 1;
427*b1e83836Smrg 			  for (l = ll; l <= i6; ++l)
428*b1e83836Smrg 			    {
429*b1e83836Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430*b1e83836Smrg 				      * b[l + j * b_dim1];
431*b1e83836Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432*b1e83836Smrg 				      * b[l + j * b_dim1];
433*b1e83836Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
434*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
435*b1e83836Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
436*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
437*b1e83836Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
438*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
439*b1e83836Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
440*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
441*b1e83836Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
442*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
443*b1e83836Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
444*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
445*b1e83836Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446*b1e83836Smrg 				      * b[l + j * b_dim1];
447*b1e83836Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448*b1e83836Smrg 				      * b[l + j * b_dim1];
449*b1e83836Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
450*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
451*b1e83836Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
452*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
453*b1e83836Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
454*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
455*b1e83836Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
456*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
457*b1e83836Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
458*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
459*b1e83836Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
460*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
461*b1e83836Smrg 			    }
462*b1e83836Smrg 			  c[i + j * c_dim1] = f11;
463*b1e83836Smrg 			  c[i + 1 + j * c_dim1] = f21;
464*b1e83836Smrg 			  c[i + (j + 1) * c_dim1] = f12;
465*b1e83836Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
466*b1e83836Smrg 			  c[i + (j + 2) * c_dim1] = f13;
467*b1e83836Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
468*b1e83836Smrg 			  c[i + (j + 3) * c_dim1] = f14;
469*b1e83836Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
470*b1e83836Smrg 			  c[i + 2 + j * c_dim1] = f31;
471*b1e83836Smrg 			  c[i + 3 + j * c_dim1] = f41;
472*b1e83836Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
473*b1e83836Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
474*b1e83836Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
475*b1e83836Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
476*b1e83836Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
477*b1e83836Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
478*b1e83836Smrg 			}
479*b1e83836Smrg 		      if (uisec < isec)
480*b1e83836Smrg 			{
481*b1e83836Smrg 			  i5 = ii + isec - 1;
482*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
483*b1e83836Smrg 			    {
484*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
485*b1e83836Smrg 			      f12 = c[i + (j + 1) * c_dim1];
486*b1e83836Smrg 			      f13 = c[i + (j + 2) * c_dim1];
487*b1e83836Smrg 			      f14 = c[i + (j + 3) * c_dim1];
488*b1e83836Smrg 			      i6 = ll + lsec - 1;
489*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
490*b1e83836Smrg 				{
491*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
492*b1e83836Smrg 					  257] * b[l + j * b_dim1];
493*b1e83836Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
494*b1e83836Smrg 					  257] * b[l + (j + 1) * b_dim1];
495*b1e83836Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
496*b1e83836Smrg 					  257] * b[l + (j + 2) * b_dim1];
497*b1e83836Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498*b1e83836Smrg 					  257] * b[l + (j + 3) * b_dim1];
499*b1e83836Smrg 				}
500*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
501*b1e83836Smrg 			      c[i + (j + 1) * c_dim1] = f12;
502*b1e83836Smrg 			      c[i + (j + 2) * c_dim1] = f13;
503*b1e83836Smrg 			      c[i + (j + 3) * c_dim1] = f14;
504*b1e83836Smrg 			    }
505*b1e83836Smrg 			}
506*b1e83836Smrg 		    }
507*b1e83836Smrg 		  if (ujsec < jsec)
508*b1e83836Smrg 		    {
509*b1e83836Smrg 		      i4 = jj + jsec - 1;
510*b1e83836Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
511*b1e83836Smrg 			{
512*b1e83836Smrg 			  i5 = ii + uisec - 1;
513*b1e83836Smrg 			  for (i = ii; i <= i5; i += 4)
514*b1e83836Smrg 			    {
515*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
516*b1e83836Smrg 			      f21 = c[i + 1 + j * c_dim1];
517*b1e83836Smrg 			      f31 = c[i + 2 + j * c_dim1];
518*b1e83836Smrg 			      f41 = c[i + 3 + j * c_dim1];
519*b1e83836Smrg 			      i6 = ll + lsec - 1;
520*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
521*b1e83836Smrg 				{
522*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
523*b1e83836Smrg 					  257] * b[l + j * b_dim1];
524*b1e83836Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
525*b1e83836Smrg 					  257] * b[l + j * b_dim1];
526*b1e83836Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
527*b1e83836Smrg 					  257] * b[l + j * b_dim1];
528*b1e83836Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
529*b1e83836Smrg 					  257] * b[l + j * b_dim1];
530*b1e83836Smrg 				}
531*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
532*b1e83836Smrg 			      c[i + 1 + j * c_dim1] = f21;
533*b1e83836Smrg 			      c[i + 2 + j * c_dim1] = f31;
534*b1e83836Smrg 			      c[i + 3 + j * c_dim1] = f41;
535*b1e83836Smrg 			    }
536*b1e83836Smrg 			  i5 = ii + isec - 1;
537*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
538*b1e83836Smrg 			    {
539*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
540*b1e83836Smrg 			      i6 = ll + lsec - 1;
541*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
542*b1e83836Smrg 				{
543*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
544*b1e83836Smrg 					  257] * b[l + j * b_dim1];
545*b1e83836Smrg 				}
546*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
547*b1e83836Smrg 			    }
548*b1e83836Smrg 			}
549*b1e83836Smrg 		    }
550*b1e83836Smrg 		}
551*b1e83836Smrg 	    }
552*b1e83836Smrg 	}
553*b1e83836Smrg       free(t1);
554*b1e83836Smrg       return;
555*b1e83836Smrg     }
556*b1e83836Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
557*b1e83836Smrg     {
558*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
559*b1e83836Smrg 	{
560*b1e83836Smrg 	  const GFC_REAL_17 *restrict abase_x;
561*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
562*b1e83836Smrg 	  GFC_REAL_17 *restrict dest_y;
563*b1e83836Smrg 	  GFC_REAL_17 s;
564*b1e83836Smrg 
565*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
566*b1e83836Smrg 	    {
567*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
568*b1e83836Smrg 	      dest_y = &dest[y*rystride];
569*b1e83836Smrg 	      for (x = 0; x < xcount; x++)
570*b1e83836Smrg 		{
571*b1e83836Smrg 		  abase_x = &abase[x*axstride];
572*b1e83836Smrg 		  s = (GFC_REAL_17) 0;
573*b1e83836Smrg 		  for (n = 0; n < count; n++)
574*b1e83836Smrg 		    s += abase_x[n] * bbase_y[n];
575*b1e83836Smrg 		  dest_y[x] = s;
576*b1e83836Smrg 		}
577*b1e83836Smrg 	    }
578*b1e83836Smrg 	}
579*b1e83836Smrg       else
580*b1e83836Smrg 	{
581*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
582*b1e83836Smrg 	  GFC_REAL_17 s;
583*b1e83836Smrg 
584*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
585*b1e83836Smrg 	    {
586*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
587*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
588*b1e83836Smrg 	      for (n = 0; n < count; n++)
589*b1e83836Smrg 		s += abase[n*axstride] * bbase_y[n];
590*b1e83836Smrg 	      dest[y*rystride] = s;
591*b1e83836Smrg 	    }
592*b1e83836Smrg 	}
593*b1e83836Smrg     }
594*b1e83836Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
595*b1e83836Smrg     {
596*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
597*b1e83836Smrg       GFC_REAL_17 s;
598*b1e83836Smrg 
599*b1e83836Smrg       for (y = 0; y < ycount; y++)
600*b1e83836Smrg 	{
601*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
602*b1e83836Smrg 	  s = (GFC_REAL_17) 0;
603*b1e83836Smrg 	  for (n = 0; n < count; n++)
604*b1e83836Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
605*b1e83836Smrg 	  dest[y*rxstride] = s;
606*b1e83836Smrg 	}
607*b1e83836Smrg     }
608*b1e83836Smrg   else if (axstride < aystride)
609*b1e83836Smrg     {
610*b1e83836Smrg       for (y = 0; y < ycount; y++)
611*b1e83836Smrg 	for (x = 0; x < xcount; x++)
612*b1e83836Smrg 	  dest[x*rxstride + y*rystride] = (GFC_REAL_17)0;
613*b1e83836Smrg 
614*b1e83836Smrg       for (y = 0; y < ycount; y++)
615*b1e83836Smrg 	for (n = 0; n < count; n++)
616*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
617*b1e83836Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
618*b1e83836Smrg 	    dest[x*rxstride + y*rystride] +=
619*b1e83836Smrg 					abase[x*axstride + n*aystride] *
620*b1e83836Smrg 					bbase[n*bxstride + y*bystride];
621*b1e83836Smrg     }
622*b1e83836Smrg   else
623*b1e83836Smrg     {
624*b1e83836Smrg       const GFC_REAL_17 *restrict abase_x;
625*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
626*b1e83836Smrg       GFC_REAL_17 *restrict dest_y;
627*b1e83836Smrg       GFC_REAL_17 s;
628*b1e83836Smrg 
629*b1e83836Smrg       for (y = 0; y < ycount; y++)
630*b1e83836Smrg 	{
631*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
632*b1e83836Smrg 	  dest_y = &dest[y*rystride];
633*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
634*b1e83836Smrg 	    {
635*b1e83836Smrg 	      abase_x = &abase[x*axstride];
636*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
637*b1e83836Smrg 	      for (n = 0; n < count; n++)
638*b1e83836Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
639*b1e83836Smrg 	      dest_y[x*rxstride] = s;
640*b1e83836Smrg 	    }
641*b1e83836Smrg 	}
642*b1e83836Smrg     }
643*b1e83836Smrg }
644*b1e83836Smrg #undef POW3
645*b1e83836Smrg #undef min
646*b1e83836Smrg #undef max
647*b1e83836Smrg 
648*b1e83836Smrg #endif /* HAVE_AVX */
649*b1e83836Smrg 
650*b1e83836Smrg #ifdef HAVE_AVX2
651*b1e83836Smrg static void
652*b1e83836Smrg matmul_r17_avx2 (gfc_array_r17 * const restrict retarray,
653*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
654*b1e83836Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
655*b1e83836Smrg static void
matmul_r17_avx2(gfc_array_r17 * const restrict retarray,gfc_array_r17 * const restrict a,gfc_array_r17 * const restrict b,int try_blas,int blas_limit,blas_call gemm)656*b1e83836Smrg matmul_r17_avx2 (gfc_array_r17 * const restrict retarray,
657*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
658*b1e83836Smrg 	int blas_limit, blas_call gemm)
659*b1e83836Smrg {
660*b1e83836Smrg   const GFC_REAL_17 * restrict abase;
661*b1e83836Smrg   const GFC_REAL_17 * restrict bbase;
662*b1e83836Smrg   GFC_REAL_17 * restrict dest;
663*b1e83836Smrg 
664*b1e83836Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
665*b1e83836Smrg   index_type x, y, n, count, xcount, ycount;
666*b1e83836Smrg 
667*b1e83836Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
668*b1e83836Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
669*b1e83836Smrg 
670*b1e83836Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
671*b1e83836Smrg 
672*b1e83836Smrg    Either A or B (but not both) can be rank 1:
673*b1e83836Smrg 
674*b1e83836Smrg    o One-dimensional argument A is implicitly treated as a row matrix
675*b1e83836Smrg      dimensioned [1,count], so xcount=1.
676*b1e83836Smrg 
677*b1e83836Smrg    o One-dimensional argument B is implicitly treated as a column matrix
678*b1e83836Smrg      dimensioned [count, 1], so ycount=1.
679*b1e83836Smrg */
680*b1e83836Smrg 
681*b1e83836Smrg   if (retarray->base_addr == NULL)
682*b1e83836Smrg     {
683*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
684*b1e83836Smrg         {
685*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
686*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
687*b1e83836Smrg         }
688*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
689*b1e83836Smrg         {
690*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
691*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
692*b1e83836Smrg         }
693*b1e83836Smrg       else
694*b1e83836Smrg         {
695*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
696*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
697*b1e83836Smrg 
698*b1e83836Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
699*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
700*b1e83836Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
701*b1e83836Smrg         }
702*b1e83836Smrg 
703*b1e83836Smrg       retarray->base_addr
704*b1e83836Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_17));
705*b1e83836Smrg       retarray->offset = 0;
706*b1e83836Smrg     }
707*b1e83836Smrg   else if (unlikely (compile_options.bounds_check))
708*b1e83836Smrg     {
709*b1e83836Smrg       index_type ret_extent, arg_extent;
710*b1e83836Smrg 
711*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
712*b1e83836Smrg 	{
713*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
715*b1e83836Smrg 	  if (arg_extent != ret_extent)
716*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
717*b1e83836Smrg 	    		   "array (%ld/%ld) ",
718*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
719*b1e83836Smrg 	}
720*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
721*b1e83836Smrg 	{
722*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
723*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
724*b1e83836Smrg 	  if (arg_extent != ret_extent)
725*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
726*b1e83836Smrg 	    		   "array (%ld/%ld) ",
727*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
728*b1e83836Smrg 	}
729*b1e83836Smrg       else
730*b1e83836Smrg 	{
731*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
732*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
733*b1e83836Smrg 	  if (arg_extent != ret_extent)
734*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
735*b1e83836Smrg 	    		   "array (%ld/%ld) ",
736*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
737*b1e83836Smrg 
738*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
739*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
740*b1e83836Smrg 	  if (arg_extent != ret_extent)
741*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
742*b1e83836Smrg 	    		   "array (%ld/%ld) ",
743*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
744*b1e83836Smrg 	}
745*b1e83836Smrg     }
746*b1e83836Smrg 
747*b1e83836Smrg 
748*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
749*b1e83836Smrg     {
750*b1e83836Smrg       /* One-dimensional result may be addressed in the code below
751*b1e83836Smrg 	 either as a row or a column matrix. We want both cases to
752*b1e83836Smrg 	 work. */
753*b1e83836Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
754*b1e83836Smrg     }
755*b1e83836Smrg   else
756*b1e83836Smrg     {
757*b1e83836Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
758*b1e83836Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
759*b1e83836Smrg     }
760*b1e83836Smrg 
761*b1e83836Smrg 
762*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
763*b1e83836Smrg     {
764*b1e83836Smrg       /* Treat it as a a row matrix A[1,count]. */
765*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
766*b1e83836Smrg       aystride = 1;
767*b1e83836Smrg 
768*b1e83836Smrg       xcount = 1;
769*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
770*b1e83836Smrg     }
771*b1e83836Smrg   else
772*b1e83836Smrg     {
773*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
774*b1e83836Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
775*b1e83836Smrg 
776*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
777*b1e83836Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
778*b1e83836Smrg     }
779*b1e83836Smrg 
780*b1e83836Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
781*b1e83836Smrg     {
782*b1e83836Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
783*b1e83836Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
784*b1e83836Smrg 		       "in dimension 1: is %ld, should be %ld",
785*b1e83836Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
786*b1e83836Smrg     }
787*b1e83836Smrg 
788*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
789*b1e83836Smrg     {
790*b1e83836Smrg       /* Treat it as a column matrix B[count,1] */
791*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
792*b1e83836Smrg 
793*b1e83836Smrg       /* bystride should never be used for 1-dimensional b.
794*b1e83836Smrg          The value is only used for calculation of the
795*b1e83836Smrg          memory by the buffer.  */
796*b1e83836Smrg       bystride = 256;
797*b1e83836Smrg       ycount = 1;
798*b1e83836Smrg     }
799*b1e83836Smrg   else
800*b1e83836Smrg     {
801*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
802*b1e83836Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
803*b1e83836Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
804*b1e83836Smrg     }
805*b1e83836Smrg 
806*b1e83836Smrg   abase = a->base_addr;
807*b1e83836Smrg   bbase = b->base_addr;
808*b1e83836Smrg   dest = retarray->base_addr;
809*b1e83836Smrg 
810*b1e83836Smrg   /* Now that everything is set up, we perform the multiplication
811*b1e83836Smrg      itself.  */
812*b1e83836Smrg 
813*b1e83836Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
814*b1e83836Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
815*b1e83836Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
816*b1e83836Smrg 
817*b1e83836Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
818*b1e83836Smrg       && (bxstride == 1 || bystride == 1)
819*b1e83836Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
820*b1e83836Smrg           > POW3(blas_limit)))
821*b1e83836Smrg     {
822*b1e83836Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
823*b1e83836Smrg       const GFC_REAL_17 one = 1, zero = 0;
824*b1e83836Smrg       const int lda = (axstride == 1) ? aystride : axstride,
825*b1e83836Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
826*b1e83836Smrg 
827*b1e83836Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
828*b1e83836Smrg 	{
829*b1e83836Smrg 	  assert (gemm != NULL);
830*b1e83836Smrg 	  const char *transa, *transb;
831*b1e83836Smrg 	  if (try_blas & 2)
832*b1e83836Smrg 	    transa = "C";
833*b1e83836Smrg 	  else
834*b1e83836Smrg 	    transa = axstride == 1 ? "N" : "T";
835*b1e83836Smrg 
836*b1e83836Smrg 	  if (try_blas & 4)
837*b1e83836Smrg 	    transb = "C";
838*b1e83836Smrg 	  else
839*b1e83836Smrg 	    transb = bxstride == 1 ? "N" : "T";
840*b1e83836Smrg 
841*b1e83836Smrg 	  gemm (transa, transb , &m,
842*b1e83836Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
843*b1e83836Smrg 		&ldc, 1, 1);
844*b1e83836Smrg 	  return;
845*b1e83836Smrg 	}
846*b1e83836Smrg     }
847*b1e83836Smrg 
848*b1e83836Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
849*b1e83836Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
850*b1e83836Smrg     {
851*b1e83836Smrg       /* This block of code implements a tuned matmul, derived from
852*b1e83836Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
853*b1e83836Smrg 
854*b1e83836Smrg                Bo Kagstrom and Per Ling
855*b1e83836Smrg                Department of Computing Science
856*b1e83836Smrg                Umea University
857*b1e83836Smrg                S-901 87 Umea, Sweden
858*b1e83836Smrg 
859*b1e83836Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
860*b1e83836Smrg 
861*b1e83836Smrg       const GFC_REAL_17 *a, *b;
862*b1e83836Smrg       GFC_REAL_17 *c;
863*b1e83836Smrg       const index_type m = xcount, n = ycount, k = count;
864*b1e83836Smrg 
865*b1e83836Smrg       /* System generated locals */
866*b1e83836Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
867*b1e83836Smrg 		 i1, i2, i3, i4, i5, i6;
868*b1e83836Smrg 
869*b1e83836Smrg       /* Local variables */
870*b1e83836Smrg       GFC_REAL_17 f11, f12, f21, f22, f31, f32, f41, f42,
871*b1e83836Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
872*b1e83836Smrg       index_type i, j, l, ii, jj, ll;
873*b1e83836Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
874*b1e83836Smrg       GFC_REAL_17 *t1;
875*b1e83836Smrg 
876*b1e83836Smrg       a = abase;
877*b1e83836Smrg       b = bbase;
878*b1e83836Smrg       c = retarray->base_addr;
879*b1e83836Smrg 
880*b1e83836Smrg       /* Parameter adjustments */
881*b1e83836Smrg       c_dim1 = rystride;
882*b1e83836Smrg       c_offset = 1 + c_dim1;
883*b1e83836Smrg       c -= c_offset;
884*b1e83836Smrg       a_dim1 = aystride;
885*b1e83836Smrg       a_offset = 1 + a_dim1;
886*b1e83836Smrg       a -= a_offset;
887*b1e83836Smrg       b_dim1 = bystride;
888*b1e83836Smrg       b_offset = 1 + b_dim1;
889*b1e83836Smrg       b -= b_offset;
890*b1e83836Smrg 
891*b1e83836Smrg       /* Empty c first.  */
892*b1e83836Smrg       for (j=1; j<=n; j++)
893*b1e83836Smrg 	for (i=1; i<=m; i++)
894*b1e83836Smrg 	  c[i + j * c_dim1] = (GFC_REAL_17)0;
895*b1e83836Smrg 
896*b1e83836Smrg       /* Early exit if possible */
897*b1e83836Smrg       if (m == 0 || n == 0 || k == 0)
898*b1e83836Smrg 	return;
899*b1e83836Smrg 
900*b1e83836Smrg       /* Adjust size of t1 to what is needed.  */
901*b1e83836Smrg       index_type t1_dim, a_sz;
902*b1e83836Smrg       if (aystride == 1)
903*b1e83836Smrg         a_sz = rystride;
904*b1e83836Smrg       else
905*b1e83836Smrg         a_sz = a_dim1;
906*b1e83836Smrg 
907*b1e83836Smrg       t1_dim = a_sz * 256 + b_dim1;
908*b1e83836Smrg       if (t1_dim > 65536)
909*b1e83836Smrg 	t1_dim = 65536;
910*b1e83836Smrg 
911*b1e83836Smrg       t1 = malloc (t1_dim * sizeof(GFC_REAL_17));
912*b1e83836Smrg 
913*b1e83836Smrg       /* Start turning the crank. */
914*b1e83836Smrg       i1 = n;
915*b1e83836Smrg       for (jj = 1; jj <= i1; jj += 512)
916*b1e83836Smrg 	{
917*b1e83836Smrg 	  /* Computing MIN */
918*b1e83836Smrg 	  i2 = 512;
919*b1e83836Smrg 	  i3 = n - jj + 1;
920*b1e83836Smrg 	  jsec = min(i2,i3);
921*b1e83836Smrg 	  ujsec = jsec - jsec % 4;
922*b1e83836Smrg 	  i2 = k;
923*b1e83836Smrg 	  for (ll = 1; ll <= i2; ll += 256)
924*b1e83836Smrg 	    {
925*b1e83836Smrg 	      /* Computing MIN */
926*b1e83836Smrg 	      i3 = 256;
927*b1e83836Smrg 	      i4 = k - ll + 1;
928*b1e83836Smrg 	      lsec = min(i3,i4);
929*b1e83836Smrg 	      ulsec = lsec - lsec % 2;
930*b1e83836Smrg 
931*b1e83836Smrg 	      i3 = m;
932*b1e83836Smrg 	      for (ii = 1; ii <= i3; ii += 256)
933*b1e83836Smrg 		{
934*b1e83836Smrg 		  /* Computing MIN */
935*b1e83836Smrg 		  i4 = 256;
936*b1e83836Smrg 		  i5 = m - ii + 1;
937*b1e83836Smrg 		  isec = min(i4,i5);
938*b1e83836Smrg 		  uisec = isec - isec % 2;
939*b1e83836Smrg 		  i4 = ll + ulsec - 1;
940*b1e83836Smrg 		  for (l = ll; l <= i4; l += 2)
941*b1e83836Smrg 		    {
942*b1e83836Smrg 		      i5 = ii + uisec - 1;
943*b1e83836Smrg 		      for (i = ii; i <= i5; i += 2)
944*b1e83836Smrg 			{
945*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
946*b1e83836Smrg 					a[i + l * a_dim1];
947*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
948*b1e83836Smrg 					a[i + (l + 1) * a_dim1];
949*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
950*b1e83836Smrg 					a[i + 1 + l * a_dim1];
951*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
952*b1e83836Smrg 					a[i + 1 + (l + 1) * a_dim1];
953*b1e83836Smrg 			}
954*b1e83836Smrg 		      if (uisec < isec)
955*b1e83836Smrg 			{
956*b1e83836Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
957*b1e83836Smrg 				    a[ii + isec - 1 + l * a_dim1];
958*b1e83836Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
959*b1e83836Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
960*b1e83836Smrg 			}
961*b1e83836Smrg 		    }
962*b1e83836Smrg 		  if (ulsec < lsec)
963*b1e83836Smrg 		    {
964*b1e83836Smrg 		      i4 = ii + isec - 1;
965*b1e83836Smrg 		      for (i = ii; i<= i4; ++i)
966*b1e83836Smrg 			{
967*b1e83836Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
968*b1e83836Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
969*b1e83836Smrg 			}
970*b1e83836Smrg 		    }
971*b1e83836Smrg 
972*b1e83836Smrg 		  uisec = isec - isec % 4;
973*b1e83836Smrg 		  i4 = jj + ujsec - 1;
974*b1e83836Smrg 		  for (j = jj; j <= i4; j += 4)
975*b1e83836Smrg 		    {
976*b1e83836Smrg 		      i5 = ii + uisec - 1;
977*b1e83836Smrg 		      for (i = ii; i <= i5; i += 4)
978*b1e83836Smrg 			{
979*b1e83836Smrg 			  f11 = c[i + j * c_dim1];
980*b1e83836Smrg 			  f21 = c[i + 1 + j * c_dim1];
981*b1e83836Smrg 			  f12 = c[i + (j + 1) * c_dim1];
982*b1e83836Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
983*b1e83836Smrg 			  f13 = c[i + (j + 2) * c_dim1];
984*b1e83836Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
985*b1e83836Smrg 			  f14 = c[i + (j + 3) * c_dim1];
986*b1e83836Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
987*b1e83836Smrg 			  f31 = c[i + 2 + j * c_dim1];
988*b1e83836Smrg 			  f41 = c[i + 3 + j * c_dim1];
989*b1e83836Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
990*b1e83836Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
991*b1e83836Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
992*b1e83836Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
993*b1e83836Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
994*b1e83836Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
995*b1e83836Smrg 			  i6 = ll + lsec - 1;
996*b1e83836Smrg 			  for (l = ll; l <= i6; ++l)
997*b1e83836Smrg 			    {
998*b1e83836Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
999*b1e83836Smrg 				      * b[l + j * b_dim1];
1000*b1e83836Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1001*b1e83836Smrg 				      * b[l + j * b_dim1];
1002*b1e83836Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1003*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
1004*b1e83836Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1005*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
1006*b1e83836Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1007*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
1008*b1e83836Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1009*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
1010*b1e83836Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1011*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
1012*b1e83836Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1013*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
1014*b1e83836Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1015*b1e83836Smrg 				      * b[l + j * b_dim1];
1016*b1e83836Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1017*b1e83836Smrg 				      * b[l + j * b_dim1];
1018*b1e83836Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1019*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
1020*b1e83836Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1021*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
1022*b1e83836Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1023*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
1024*b1e83836Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1025*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
1026*b1e83836Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1027*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
1028*b1e83836Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1029*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
1030*b1e83836Smrg 			    }
1031*b1e83836Smrg 			  c[i + j * c_dim1] = f11;
1032*b1e83836Smrg 			  c[i + 1 + j * c_dim1] = f21;
1033*b1e83836Smrg 			  c[i + (j + 1) * c_dim1] = f12;
1034*b1e83836Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1035*b1e83836Smrg 			  c[i + (j + 2) * c_dim1] = f13;
1036*b1e83836Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1037*b1e83836Smrg 			  c[i + (j + 3) * c_dim1] = f14;
1038*b1e83836Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1039*b1e83836Smrg 			  c[i + 2 + j * c_dim1] = f31;
1040*b1e83836Smrg 			  c[i + 3 + j * c_dim1] = f41;
1041*b1e83836Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1042*b1e83836Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1043*b1e83836Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1044*b1e83836Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1045*b1e83836Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1046*b1e83836Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1047*b1e83836Smrg 			}
1048*b1e83836Smrg 		      if (uisec < isec)
1049*b1e83836Smrg 			{
1050*b1e83836Smrg 			  i5 = ii + isec - 1;
1051*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1052*b1e83836Smrg 			    {
1053*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
1054*b1e83836Smrg 			      f12 = c[i + (j + 1) * c_dim1];
1055*b1e83836Smrg 			      f13 = c[i + (j + 2) * c_dim1];
1056*b1e83836Smrg 			      f14 = c[i + (j + 3) * c_dim1];
1057*b1e83836Smrg 			      i6 = ll + lsec - 1;
1058*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
1059*b1e83836Smrg 				{
1060*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1062*b1e83836Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063*b1e83836Smrg 					  257] * b[l + (j + 1) * b_dim1];
1064*b1e83836Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065*b1e83836Smrg 					  257] * b[l + (j + 2) * b_dim1];
1066*b1e83836Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1067*b1e83836Smrg 					  257] * b[l + (j + 3) * b_dim1];
1068*b1e83836Smrg 				}
1069*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
1070*b1e83836Smrg 			      c[i + (j + 1) * c_dim1] = f12;
1071*b1e83836Smrg 			      c[i + (j + 2) * c_dim1] = f13;
1072*b1e83836Smrg 			      c[i + (j + 3) * c_dim1] = f14;
1073*b1e83836Smrg 			    }
1074*b1e83836Smrg 			}
1075*b1e83836Smrg 		    }
1076*b1e83836Smrg 		  if (ujsec < jsec)
1077*b1e83836Smrg 		    {
1078*b1e83836Smrg 		      i4 = jj + jsec - 1;
1079*b1e83836Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
1080*b1e83836Smrg 			{
1081*b1e83836Smrg 			  i5 = ii + uisec - 1;
1082*b1e83836Smrg 			  for (i = ii; i <= i5; i += 4)
1083*b1e83836Smrg 			    {
1084*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
1085*b1e83836Smrg 			      f21 = c[i + 1 + j * c_dim1];
1086*b1e83836Smrg 			      f31 = c[i + 2 + j * c_dim1];
1087*b1e83836Smrg 			      f41 = c[i + 3 + j * c_dim1];
1088*b1e83836Smrg 			      i6 = ll + lsec - 1;
1089*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
1090*b1e83836Smrg 				{
1091*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1092*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1093*b1e83836Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1094*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1095*b1e83836Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1096*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1097*b1e83836Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1098*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1099*b1e83836Smrg 				}
1100*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
1101*b1e83836Smrg 			      c[i + 1 + j * c_dim1] = f21;
1102*b1e83836Smrg 			      c[i + 2 + j * c_dim1] = f31;
1103*b1e83836Smrg 			      c[i + 3 + j * c_dim1] = f41;
1104*b1e83836Smrg 			    }
1105*b1e83836Smrg 			  i5 = ii + isec - 1;
1106*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1107*b1e83836Smrg 			    {
1108*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
1109*b1e83836Smrg 			      i6 = ll + lsec - 1;
1110*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
1111*b1e83836Smrg 				{
1112*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1113*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1114*b1e83836Smrg 				}
1115*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
1116*b1e83836Smrg 			    }
1117*b1e83836Smrg 			}
1118*b1e83836Smrg 		    }
1119*b1e83836Smrg 		}
1120*b1e83836Smrg 	    }
1121*b1e83836Smrg 	}
1122*b1e83836Smrg       free(t1);
1123*b1e83836Smrg       return;
1124*b1e83836Smrg     }
1125*b1e83836Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1126*b1e83836Smrg     {
1127*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
1128*b1e83836Smrg 	{
1129*b1e83836Smrg 	  const GFC_REAL_17 *restrict abase_x;
1130*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
1131*b1e83836Smrg 	  GFC_REAL_17 *restrict dest_y;
1132*b1e83836Smrg 	  GFC_REAL_17 s;
1133*b1e83836Smrg 
1134*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
1135*b1e83836Smrg 	    {
1136*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
1137*b1e83836Smrg 	      dest_y = &dest[y*rystride];
1138*b1e83836Smrg 	      for (x = 0; x < xcount; x++)
1139*b1e83836Smrg 		{
1140*b1e83836Smrg 		  abase_x = &abase[x*axstride];
1141*b1e83836Smrg 		  s = (GFC_REAL_17) 0;
1142*b1e83836Smrg 		  for (n = 0; n < count; n++)
1143*b1e83836Smrg 		    s += abase_x[n] * bbase_y[n];
1144*b1e83836Smrg 		  dest_y[x] = s;
1145*b1e83836Smrg 		}
1146*b1e83836Smrg 	    }
1147*b1e83836Smrg 	}
1148*b1e83836Smrg       else
1149*b1e83836Smrg 	{
1150*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
1151*b1e83836Smrg 	  GFC_REAL_17 s;
1152*b1e83836Smrg 
1153*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
1154*b1e83836Smrg 	    {
1155*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
1156*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
1157*b1e83836Smrg 	      for (n = 0; n < count; n++)
1158*b1e83836Smrg 		s += abase[n*axstride] * bbase_y[n];
1159*b1e83836Smrg 	      dest[y*rystride] = s;
1160*b1e83836Smrg 	    }
1161*b1e83836Smrg 	}
1162*b1e83836Smrg     }
1163*b1e83836Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1164*b1e83836Smrg     {
1165*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
1166*b1e83836Smrg       GFC_REAL_17 s;
1167*b1e83836Smrg 
1168*b1e83836Smrg       for (y = 0; y < ycount; y++)
1169*b1e83836Smrg 	{
1170*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
1171*b1e83836Smrg 	  s = (GFC_REAL_17) 0;
1172*b1e83836Smrg 	  for (n = 0; n < count; n++)
1173*b1e83836Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1174*b1e83836Smrg 	  dest[y*rxstride] = s;
1175*b1e83836Smrg 	}
1176*b1e83836Smrg     }
1177*b1e83836Smrg   else if (axstride < aystride)
1178*b1e83836Smrg     {
1179*b1e83836Smrg       for (y = 0; y < ycount; y++)
1180*b1e83836Smrg 	for (x = 0; x < xcount; x++)
1181*b1e83836Smrg 	  dest[x*rxstride + y*rystride] = (GFC_REAL_17)0;
1182*b1e83836Smrg 
1183*b1e83836Smrg       for (y = 0; y < ycount; y++)
1184*b1e83836Smrg 	for (n = 0; n < count; n++)
1185*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
1186*b1e83836Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
1187*b1e83836Smrg 	    dest[x*rxstride + y*rystride] +=
1188*b1e83836Smrg 					abase[x*axstride + n*aystride] *
1189*b1e83836Smrg 					bbase[n*bxstride + y*bystride];
1190*b1e83836Smrg     }
1191*b1e83836Smrg   else
1192*b1e83836Smrg     {
1193*b1e83836Smrg       const GFC_REAL_17 *restrict abase_x;
1194*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
1195*b1e83836Smrg       GFC_REAL_17 *restrict dest_y;
1196*b1e83836Smrg       GFC_REAL_17 s;
1197*b1e83836Smrg 
1198*b1e83836Smrg       for (y = 0; y < ycount; y++)
1199*b1e83836Smrg 	{
1200*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
1201*b1e83836Smrg 	  dest_y = &dest[y*rystride];
1202*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
1203*b1e83836Smrg 	    {
1204*b1e83836Smrg 	      abase_x = &abase[x*axstride];
1205*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
1206*b1e83836Smrg 	      for (n = 0; n < count; n++)
1207*b1e83836Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1208*b1e83836Smrg 	      dest_y[x*rxstride] = s;
1209*b1e83836Smrg 	    }
1210*b1e83836Smrg 	}
1211*b1e83836Smrg     }
1212*b1e83836Smrg }
1213*b1e83836Smrg #undef POW3
1214*b1e83836Smrg #undef min
1215*b1e83836Smrg #undef max
1216*b1e83836Smrg 
1217*b1e83836Smrg #endif /* HAVE_AVX2 */
1218*b1e83836Smrg 
1219*b1e83836Smrg #ifdef HAVE_AVX512F
1220*b1e83836Smrg static void
1221*b1e83836Smrg matmul_r17_avx512f (gfc_array_r17 * const restrict retarray,
1222*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
1223*b1e83836Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1224*b1e83836Smrg static void
matmul_r17_avx512f(gfc_array_r17 * const restrict retarray,gfc_array_r17 * const restrict a,gfc_array_r17 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1225*b1e83836Smrg matmul_r17_avx512f (gfc_array_r17 * const restrict retarray,
1226*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
1227*b1e83836Smrg 	int blas_limit, blas_call gemm)
1228*b1e83836Smrg {
1229*b1e83836Smrg   const GFC_REAL_17 * restrict abase;
1230*b1e83836Smrg   const GFC_REAL_17 * restrict bbase;
1231*b1e83836Smrg   GFC_REAL_17 * restrict dest;
1232*b1e83836Smrg 
1233*b1e83836Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1234*b1e83836Smrg   index_type x, y, n, count, xcount, ycount;
1235*b1e83836Smrg 
1236*b1e83836Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
1237*b1e83836Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
1238*b1e83836Smrg 
1239*b1e83836Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1240*b1e83836Smrg 
1241*b1e83836Smrg    Either A or B (but not both) can be rank 1:
1242*b1e83836Smrg 
1243*b1e83836Smrg    o One-dimensional argument A is implicitly treated as a row matrix
1244*b1e83836Smrg      dimensioned [1,count], so xcount=1.
1245*b1e83836Smrg 
1246*b1e83836Smrg    o One-dimensional argument B is implicitly treated as a column matrix
1247*b1e83836Smrg      dimensioned [count, 1], so ycount=1.
1248*b1e83836Smrg */
1249*b1e83836Smrg 
1250*b1e83836Smrg   if (retarray->base_addr == NULL)
1251*b1e83836Smrg     {
1252*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1253*b1e83836Smrg         {
1254*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1255*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1256*b1e83836Smrg         }
1257*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258*b1e83836Smrg         {
1259*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1260*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1261*b1e83836Smrg         }
1262*b1e83836Smrg       else
1263*b1e83836Smrg         {
1264*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1265*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1266*b1e83836Smrg 
1267*b1e83836Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
1268*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1269*b1e83836Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1270*b1e83836Smrg         }
1271*b1e83836Smrg 
1272*b1e83836Smrg       retarray->base_addr
1273*b1e83836Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_17));
1274*b1e83836Smrg       retarray->offset = 0;
1275*b1e83836Smrg     }
1276*b1e83836Smrg   else if (unlikely (compile_options.bounds_check))
1277*b1e83836Smrg     {
1278*b1e83836Smrg       index_type ret_extent, arg_extent;
1279*b1e83836Smrg 
1280*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1281*b1e83836Smrg 	{
1282*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1283*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1284*b1e83836Smrg 	  if (arg_extent != ret_extent)
1285*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1286*b1e83836Smrg 	    		   "array (%ld/%ld) ",
1287*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
1288*b1e83836Smrg 	}
1289*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1290*b1e83836Smrg 	{
1291*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1292*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1293*b1e83836Smrg 	  if (arg_extent != ret_extent)
1294*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1295*b1e83836Smrg 	    		   "array (%ld/%ld) ",
1296*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
1297*b1e83836Smrg 	}
1298*b1e83836Smrg       else
1299*b1e83836Smrg 	{
1300*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1301*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1302*b1e83836Smrg 	  if (arg_extent != ret_extent)
1303*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1304*b1e83836Smrg 	    		   "array (%ld/%ld) ",
1305*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
1306*b1e83836Smrg 
1307*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1308*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1309*b1e83836Smrg 	  if (arg_extent != ret_extent)
1310*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
1311*b1e83836Smrg 	    		   "array (%ld/%ld) ",
1312*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
1313*b1e83836Smrg 	}
1314*b1e83836Smrg     }
1315*b1e83836Smrg 
1316*b1e83836Smrg 
1317*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1318*b1e83836Smrg     {
1319*b1e83836Smrg       /* One-dimensional result may be addressed in the code below
1320*b1e83836Smrg 	 either as a row or a column matrix. We want both cases to
1321*b1e83836Smrg 	 work. */
1322*b1e83836Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1323*b1e83836Smrg     }
1324*b1e83836Smrg   else
1325*b1e83836Smrg     {
1326*b1e83836Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1327*b1e83836Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1328*b1e83836Smrg     }
1329*b1e83836Smrg 
1330*b1e83836Smrg 
1331*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
1332*b1e83836Smrg     {
1333*b1e83836Smrg       /* Treat it as a a row matrix A[1,count]. */
1334*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1335*b1e83836Smrg       aystride = 1;
1336*b1e83836Smrg 
1337*b1e83836Smrg       xcount = 1;
1338*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
1339*b1e83836Smrg     }
1340*b1e83836Smrg   else
1341*b1e83836Smrg     {
1342*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1343*b1e83836Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1344*b1e83836Smrg 
1345*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
1346*b1e83836Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1347*b1e83836Smrg     }
1348*b1e83836Smrg 
1349*b1e83836Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1350*b1e83836Smrg     {
1351*b1e83836Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1352*b1e83836Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1353*b1e83836Smrg 		       "in dimension 1: is %ld, should be %ld",
1354*b1e83836Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1355*b1e83836Smrg     }
1356*b1e83836Smrg 
1357*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
1358*b1e83836Smrg     {
1359*b1e83836Smrg       /* Treat it as a column matrix B[count,1] */
1360*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1361*b1e83836Smrg 
1362*b1e83836Smrg       /* bystride should never be used for 1-dimensional b.
1363*b1e83836Smrg          The value is only used for calculation of the
1364*b1e83836Smrg          memory by the buffer.  */
1365*b1e83836Smrg       bystride = 256;
1366*b1e83836Smrg       ycount = 1;
1367*b1e83836Smrg     }
1368*b1e83836Smrg   else
1369*b1e83836Smrg     {
1370*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1371*b1e83836Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1372*b1e83836Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1373*b1e83836Smrg     }
1374*b1e83836Smrg 
1375*b1e83836Smrg   abase = a->base_addr;
1376*b1e83836Smrg   bbase = b->base_addr;
1377*b1e83836Smrg   dest = retarray->base_addr;
1378*b1e83836Smrg 
1379*b1e83836Smrg   /* Now that everything is set up, we perform the multiplication
1380*b1e83836Smrg      itself.  */
1381*b1e83836Smrg 
1382*b1e83836Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1383*b1e83836Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
1384*b1e83836Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
1385*b1e83836Smrg 
1386*b1e83836Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1387*b1e83836Smrg       && (bxstride == 1 || bystride == 1)
1388*b1e83836Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
1389*b1e83836Smrg           > POW3(blas_limit)))
1390*b1e83836Smrg     {
1391*b1e83836Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
1392*b1e83836Smrg       const GFC_REAL_17 one = 1, zero = 0;
1393*b1e83836Smrg       const int lda = (axstride == 1) ? aystride : axstride,
1394*b1e83836Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
1395*b1e83836Smrg 
1396*b1e83836Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1397*b1e83836Smrg 	{
1398*b1e83836Smrg 	  assert (gemm != NULL);
1399*b1e83836Smrg 	  const char *transa, *transb;
1400*b1e83836Smrg 	  if (try_blas & 2)
1401*b1e83836Smrg 	    transa = "C";
1402*b1e83836Smrg 	  else
1403*b1e83836Smrg 	    transa = axstride == 1 ? "N" : "T";
1404*b1e83836Smrg 
1405*b1e83836Smrg 	  if (try_blas & 4)
1406*b1e83836Smrg 	    transb = "C";
1407*b1e83836Smrg 	  else
1408*b1e83836Smrg 	    transb = bxstride == 1 ? "N" : "T";
1409*b1e83836Smrg 
1410*b1e83836Smrg 	  gemm (transa, transb , &m,
1411*b1e83836Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1412*b1e83836Smrg 		&ldc, 1, 1);
1413*b1e83836Smrg 	  return;
1414*b1e83836Smrg 	}
1415*b1e83836Smrg     }
1416*b1e83836Smrg 
1417*b1e83836Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
1418*b1e83836Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
1419*b1e83836Smrg     {
1420*b1e83836Smrg       /* This block of code implements a tuned matmul, derived from
1421*b1e83836Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1422*b1e83836Smrg 
1423*b1e83836Smrg                Bo Kagstrom and Per Ling
1424*b1e83836Smrg                Department of Computing Science
1425*b1e83836Smrg                Umea University
1426*b1e83836Smrg                S-901 87 Umea, Sweden
1427*b1e83836Smrg 
1428*b1e83836Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
1429*b1e83836Smrg 
1430*b1e83836Smrg       const GFC_REAL_17 *a, *b;
1431*b1e83836Smrg       GFC_REAL_17 *c;
1432*b1e83836Smrg       const index_type m = xcount, n = ycount, k = count;
1433*b1e83836Smrg 
1434*b1e83836Smrg       /* System generated locals */
1435*b1e83836Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1436*b1e83836Smrg 		 i1, i2, i3, i4, i5, i6;
1437*b1e83836Smrg 
1438*b1e83836Smrg       /* Local variables */
1439*b1e83836Smrg       GFC_REAL_17 f11, f12, f21, f22, f31, f32, f41, f42,
1440*b1e83836Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
1441*b1e83836Smrg       index_type i, j, l, ii, jj, ll;
1442*b1e83836Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1443*b1e83836Smrg       GFC_REAL_17 *t1;
1444*b1e83836Smrg 
1445*b1e83836Smrg       a = abase;
1446*b1e83836Smrg       b = bbase;
1447*b1e83836Smrg       c = retarray->base_addr;
1448*b1e83836Smrg 
1449*b1e83836Smrg       /* Parameter adjustments */
1450*b1e83836Smrg       c_dim1 = rystride;
1451*b1e83836Smrg       c_offset = 1 + c_dim1;
1452*b1e83836Smrg       c -= c_offset;
1453*b1e83836Smrg       a_dim1 = aystride;
1454*b1e83836Smrg       a_offset = 1 + a_dim1;
1455*b1e83836Smrg       a -= a_offset;
1456*b1e83836Smrg       b_dim1 = bystride;
1457*b1e83836Smrg       b_offset = 1 + b_dim1;
1458*b1e83836Smrg       b -= b_offset;
1459*b1e83836Smrg 
1460*b1e83836Smrg       /* Empty c first.  */
1461*b1e83836Smrg       for (j=1; j<=n; j++)
1462*b1e83836Smrg 	for (i=1; i<=m; i++)
1463*b1e83836Smrg 	  c[i + j * c_dim1] = (GFC_REAL_17)0;
1464*b1e83836Smrg 
1465*b1e83836Smrg       /* Early exit if possible */
1466*b1e83836Smrg       if (m == 0 || n == 0 || k == 0)
1467*b1e83836Smrg 	return;
1468*b1e83836Smrg 
1469*b1e83836Smrg       /* Adjust size of t1 to what is needed.  */
1470*b1e83836Smrg       index_type t1_dim, a_sz;
1471*b1e83836Smrg       if (aystride == 1)
1472*b1e83836Smrg         a_sz = rystride;
1473*b1e83836Smrg       else
1474*b1e83836Smrg         a_sz = a_dim1;
1475*b1e83836Smrg 
1476*b1e83836Smrg       t1_dim = a_sz * 256 + b_dim1;
1477*b1e83836Smrg       if (t1_dim > 65536)
1478*b1e83836Smrg 	t1_dim = 65536;
1479*b1e83836Smrg 
1480*b1e83836Smrg       t1 = malloc (t1_dim * sizeof(GFC_REAL_17));
1481*b1e83836Smrg 
1482*b1e83836Smrg       /* Start turning the crank. */
1483*b1e83836Smrg       i1 = n;
1484*b1e83836Smrg       for (jj = 1; jj <= i1; jj += 512)
1485*b1e83836Smrg 	{
1486*b1e83836Smrg 	  /* Computing MIN */
1487*b1e83836Smrg 	  i2 = 512;
1488*b1e83836Smrg 	  i3 = n - jj + 1;
1489*b1e83836Smrg 	  jsec = min(i2,i3);
1490*b1e83836Smrg 	  ujsec = jsec - jsec % 4;
1491*b1e83836Smrg 	  i2 = k;
1492*b1e83836Smrg 	  for (ll = 1; ll <= i2; ll += 256)
1493*b1e83836Smrg 	    {
1494*b1e83836Smrg 	      /* Computing MIN */
1495*b1e83836Smrg 	      i3 = 256;
1496*b1e83836Smrg 	      i4 = k - ll + 1;
1497*b1e83836Smrg 	      lsec = min(i3,i4);
1498*b1e83836Smrg 	      ulsec = lsec - lsec % 2;
1499*b1e83836Smrg 
1500*b1e83836Smrg 	      i3 = m;
1501*b1e83836Smrg 	      for (ii = 1; ii <= i3; ii += 256)
1502*b1e83836Smrg 		{
1503*b1e83836Smrg 		  /* Computing MIN */
1504*b1e83836Smrg 		  i4 = 256;
1505*b1e83836Smrg 		  i5 = m - ii + 1;
1506*b1e83836Smrg 		  isec = min(i4,i5);
1507*b1e83836Smrg 		  uisec = isec - isec % 2;
1508*b1e83836Smrg 		  i4 = ll + ulsec - 1;
1509*b1e83836Smrg 		  for (l = ll; l <= i4; l += 2)
1510*b1e83836Smrg 		    {
1511*b1e83836Smrg 		      i5 = ii + uisec - 1;
1512*b1e83836Smrg 		      for (i = ii; i <= i5; i += 2)
1513*b1e83836Smrg 			{
1514*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1515*b1e83836Smrg 					a[i + l * a_dim1];
1516*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1517*b1e83836Smrg 					a[i + (l + 1) * a_dim1];
1518*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1519*b1e83836Smrg 					a[i + 1 + l * a_dim1];
1520*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1521*b1e83836Smrg 					a[i + 1 + (l + 1) * a_dim1];
1522*b1e83836Smrg 			}
1523*b1e83836Smrg 		      if (uisec < isec)
1524*b1e83836Smrg 			{
1525*b1e83836Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
1526*b1e83836Smrg 				    a[ii + isec - 1 + l * a_dim1];
1527*b1e83836Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
1528*b1e83836Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
1529*b1e83836Smrg 			}
1530*b1e83836Smrg 		    }
1531*b1e83836Smrg 		  if (ulsec < lsec)
1532*b1e83836Smrg 		    {
1533*b1e83836Smrg 		      i4 = ii + isec - 1;
1534*b1e83836Smrg 		      for (i = ii; i<= i4; ++i)
1535*b1e83836Smrg 			{
1536*b1e83836Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
1537*b1e83836Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
1538*b1e83836Smrg 			}
1539*b1e83836Smrg 		    }
1540*b1e83836Smrg 
1541*b1e83836Smrg 		  uisec = isec - isec % 4;
1542*b1e83836Smrg 		  i4 = jj + ujsec - 1;
1543*b1e83836Smrg 		  for (j = jj; j <= i4; j += 4)
1544*b1e83836Smrg 		    {
1545*b1e83836Smrg 		      i5 = ii + uisec - 1;
1546*b1e83836Smrg 		      for (i = ii; i <= i5; i += 4)
1547*b1e83836Smrg 			{
1548*b1e83836Smrg 			  f11 = c[i + j * c_dim1];
1549*b1e83836Smrg 			  f21 = c[i + 1 + j * c_dim1];
1550*b1e83836Smrg 			  f12 = c[i + (j + 1) * c_dim1];
1551*b1e83836Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
1552*b1e83836Smrg 			  f13 = c[i + (j + 2) * c_dim1];
1553*b1e83836Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
1554*b1e83836Smrg 			  f14 = c[i + (j + 3) * c_dim1];
1555*b1e83836Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
1556*b1e83836Smrg 			  f31 = c[i + 2 + j * c_dim1];
1557*b1e83836Smrg 			  f41 = c[i + 3 + j * c_dim1];
1558*b1e83836Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
1559*b1e83836Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
1560*b1e83836Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
1561*b1e83836Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
1562*b1e83836Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
1563*b1e83836Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
1564*b1e83836Smrg 			  i6 = ll + lsec - 1;
1565*b1e83836Smrg 			  for (l = ll; l <= i6; ++l)
1566*b1e83836Smrg 			    {
1567*b1e83836Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1568*b1e83836Smrg 				      * b[l + j * b_dim1];
1569*b1e83836Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1570*b1e83836Smrg 				      * b[l + j * b_dim1];
1571*b1e83836Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1572*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
1573*b1e83836Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1574*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
1575*b1e83836Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1576*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
1577*b1e83836Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1578*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
1579*b1e83836Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1580*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
1581*b1e83836Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1582*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
1583*b1e83836Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1584*b1e83836Smrg 				      * b[l + j * b_dim1];
1585*b1e83836Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1586*b1e83836Smrg 				      * b[l + j * b_dim1];
1587*b1e83836Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1588*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
1589*b1e83836Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1590*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
1591*b1e83836Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1592*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
1593*b1e83836Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1594*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
1595*b1e83836Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1596*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
1597*b1e83836Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1598*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
1599*b1e83836Smrg 			    }
1600*b1e83836Smrg 			  c[i + j * c_dim1] = f11;
1601*b1e83836Smrg 			  c[i + 1 + j * c_dim1] = f21;
1602*b1e83836Smrg 			  c[i + (j + 1) * c_dim1] = f12;
1603*b1e83836Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1604*b1e83836Smrg 			  c[i + (j + 2) * c_dim1] = f13;
1605*b1e83836Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1606*b1e83836Smrg 			  c[i + (j + 3) * c_dim1] = f14;
1607*b1e83836Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1608*b1e83836Smrg 			  c[i + 2 + j * c_dim1] = f31;
1609*b1e83836Smrg 			  c[i + 3 + j * c_dim1] = f41;
1610*b1e83836Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1611*b1e83836Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1612*b1e83836Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1613*b1e83836Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1614*b1e83836Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1615*b1e83836Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1616*b1e83836Smrg 			}
1617*b1e83836Smrg 		      if (uisec < isec)
1618*b1e83836Smrg 			{
1619*b1e83836Smrg 			  i5 = ii + isec - 1;
1620*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1621*b1e83836Smrg 			    {
1622*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
1623*b1e83836Smrg 			      f12 = c[i + (j + 1) * c_dim1];
1624*b1e83836Smrg 			      f13 = c[i + (j + 2) * c_dim1];
1625*b1e83836Smrg 			      f14 = c[i + (j + 3) * c_dim1];
1626*b1e83836Smrg 			      i6 = ll + lsec - 1;
1627*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
1628*b1e83836Smrg 				{
1629*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1630*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1631*b1e83836Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1632*b1e83836Smrg 					  257] * b[l + (j + 1) * b_dim1];
1633*b1e83836Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1634*b1e83836Smrg 					  257] * b[l + (j + 2) * b_dim1];
1635*b1e83836Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1636*b1e83836Smrg 					  257] * b[l + (j + 3) * b_dim1];
1637*b1e83836Smrg 				}
1638*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
1639*b1e83836Smrg 			      c[i + (j + 1) * c_dim1] = f12;
1640*b1e83836Smrg 			      c[i + (j + 2) * c_dim1] = f13;
1641*b1e83836Smrg 			      c[i + (j + 3) * c_dim1] = f14;
1642*b1e83836Smrg 			    }
1643*b1e83836Smrg 			}
1644*b1e83836Smrg 		    }
1645*b1e83836Smrg 		  if (ujsec < jsec)
1646*b1e83836Smrg 		    {
1647*b1e83836Smrg 		      i4 = jj + jsec - 1;
1648*b1e83836Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
1649*b1e83836Smrg 			{
1650*b1e83836Smrg 			  i5 = ii + uisec - 1;
1651*b1e83836Smrg 			  for (i = ii; i <= i5; i += 4)
1652*b1e83836Smrg 			    {
1653*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
1654*b1e83836Smrg 			      f21 = c[i + 1 + j * c_dim1];
1655*b1e83836Smrg 			      f31 = c[i + 2 + j * c_dim1];
1656*b1e83836Smrg 			      f41 = c[i + 3 + j * c_dim1];
1657*b1e83836Smrg 			      i6 = ll + lsec - 1;
1658*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
1659*b1e83836Smrg 				{
1660*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1661*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1662*b1e83836Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1663*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1664*b1e83836Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1665*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1666*b1e83836Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1667*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1668*b1e83836Smrg 				}
1669*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
1670*b1e83836Smrg 			      c[i + 1 + j * c_dim1] = f21;
1671*b1e83836Smrg 			      c[i + 2 + j * c_dim1] = f31;
1672*b1e83836Smrg 			      c[i + 3 + j * c_dim1] = f41;
1673*b1e83836Smrg 			    }
1674*b1e83836Smrg 			  i5 = ii + isec - 1;
1675*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1676*b1e83836Smrg 			    {
1677*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
1678*b1e83836Smrg 			      i6 = ll + lsec - 1;
1679*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
1680*b1e83836Smrg 				{
1681*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1682*b1e83836Smrg 					  257] * b[l + j * b_dim1];
1683*b1e83836Smrg 				}
1684*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
1685*b1e83836Smrg 			    }
1686*b1e83836Smrg 			}
1687*b1e83836Smrg 		    }
1688*b1e83836Smrg 		}
1689*b1e83836Smrg 	    }
1690*b1e83836Smrg 	}
1691*b1e83836Smrg       free(t1);
1692*b1e83836Smrg       return;
1693*b1e83836Smrg     }
1694*b1e83836Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1695*b1e83836Smrg     {
1696*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
1697*b1e83836Smrg 	{
1698*b1e83836Smrg 	  const GFC_REAL_17 *restrict abase_x;
1699*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
1700*b1e83836Smrg 	  GFC_REAL_17 *restrict dest_y;
1701*b1e83836Smrg 	  GFC_REAL_17 s;
1702*b1e83836Smrg 
1703*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
1704*b1e83836Smrg 	    {
1705*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
1706*b1e83836Smrg 	      dest_y = &dest[y*rystride];
1707*b1e83836Smrg 	      for (x = 0; x < xcount; x++)
1708*b1e83836Smrg 		{
1709*b1e83836Smrg 		  abase_x = &abase[x*axstride];
1710*b1e83836Smrg 		  s = (GFC_REAL_17) 0;
1711*b1e83836Smrg 		  for (n = 0; n < count; n++)
1712*b1e83836Smrg 		    s += abase_x[n] * bbase_y[n];
1713*b1e83836Smrg 		  dest_y[x] = s;
1714*b1e83836Smrg 		}
1715*b1e83836Smrg 	    }
1716*b1e83836Smrg 	}
1717*b1e83836Smrg       else
1718*b1e83836Smrg 	{
1719*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
1720*b1e83836Smrg 	  GFC_REAL_17 s;
1721*b1e83836Smrg 
1722*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
1723*b1e83836Smrg 	    {
1724*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
1725*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
1726*b1e83836Smrg 	      for (n = 0; n < count; n++)
1727*b1e83836Smrg 		s += abase[n*axstride] * bbase_y[n];
1728*b1e83836Smrg 	      dest[y*rystride] = s;
1729*b1e83836Smrg 	    }
1730*b1e83836Smrg 	}
1731*b1e83836Smrg     }
1732*b1e83836Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1733*b1e83836Smrg     {
1734*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
1735*b1e83836Smrg       GFC_REAL_17 s;
1736*b1e83836Smrg 
1737*b1e83836Smrg       for (y = 0; y < ycount; y++)
1738*b1e83836Smrg 	{
1739*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
1740*b1e83836Smrg 	  s = (GFC_REAL_17) 0;
1741*b1e83836Smrg 	  for (n = 0; n < count; n++)
1742*b1e83836Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1743*b1e83836Smrg 	  dest[y*rxstride] = s;
1744*b1e83836Smrg 	}
1745*b1e83836Smrg     }
1746*b1e83836Smrg   else if (axstride < aystride)
1747*b1e83836Smrg     {
1748*b1e83836Smrg       for (y = 0; y < ycount; y++)
1749*b1e83836Smrg 	for (x = 0; x < xcount; x++)
1750*b1e83836Smrg 	  dest[x*rxstride + y*rystride] = (GFC_REAL_17)0;
1751*b1e83836Smrg 
1752*b1e83836Smrg       for (y = 0; y < ycount; y++)
1753*b1e83836Smrg 	for (n = 0; n < count; n++)
1754*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
1755*b1e83836Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
1756*b1e83836Smrg 	    dest[x*rxstride + y*rystride] +=
1757*b1e83836Smrg 					abase[x*axstride + n*aystride] *
1758*b1e83836Smrg 					bbase[n*bxstride + y*bystride];
1759*b1e83836Smrg     }
1760*b1e83836Smrg   else
1761*b1e83836Smrg     {
1762*b1e83836Smrg       const GFC_REAL_17 *restrict abase_x;
1763*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
1764*b1e83836Smrg       GFC_REAL_17 *restrict dest_y;
1765*b1e83836Smrg       GFC_REAL_17 s;
1766*b1e83836Smrg 
1767*b1e83836Smrg       for (y = 0; y < ycount; y++)
1768*b1e83836Smrg 	{
1769*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
1770*b1e83836Smrg 	  dest_y = &dest[y*rystride];
1771*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
1772*b1e83836Smrg 	    {
1773*b1e83836Smrg 	      abase_x = &abase[x*axstride];
1774*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
1775*b1e83836Smrg 	      for (n = 0; n < count; n++)
1776*b1e83836Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1777*b1e83836Smrg 	      dest_y[x*rxstride] = s;
1778*b1e83836Smrg 	    }
1779*b1e83836Smrg 	}
1780*b1e83836Smrg     }
1781*b1e83836Smrg }
1782*b1e83836Smrg #undef POW3
1783*b1e83836Smrg #undef min
1784*b1e83836Smrg #undef max
1785*b1e83836Smrg 
1786*b1e83836Smrg #endif  /* HAVE_AVX512F */
1787*b1e83836Smrg 
1788*b1e83836Smrg /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
1789*b1e83836Smrg 
1790*b1e83836Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1791*b1e83836Smrg void
1792*b1e83836Smrg matmul_r17_avx128_fma3 (gfc_array_r17 * const restrict retarray,
1793*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
1794*b1e83836Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1795*b1e83836Smrg internal_proto(matmul_r17_avx128_fma3);
1796*b1e83836Smrg #endif
1797*b1e83836Smrg 
1798*b1e83836Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1799*b1e83836Smrg void
1800*b1e83836Smrg matmul_r17_avx128_fma4 (gfc_array_r17 * const restrict retarray,
1801*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
1802*b1e83836Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1803*b1e83836Smrg internal_proto(matmul_r17_avx128_fma4);
1804*b1e83836Smrg #endif
1805*b1e83836Smrg 
1806*b1e83836Smrg /* Function to fall back to if there is no special processor-specific version.  */
1807*b1e83836Smrg static void
matmul_r17_vanilla(gfc_array_r17 * const restrict retarray,gfc_array_r17 * const restrict a,gfc_array_r17 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1808*b1e83836Smrg matmul_r17_vanilla (gfc_array_r17 * const restrict retarray,
1809*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
1810*b1e83836Smrg 	int blas_limit, blas_call gemm)
1811*b1e83836Smrg {
1812*b1e83836Smrg   const GFC_REAL_17 * restrict abase;
1813*b1e83836Smrg   const GFC_REAL_17 * restrict bbase;
1814*b1e83836Smrg   GFC_REAL_17 * restrict dest;
1815*b1e83836Smrg 
1816*b1e83836Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1817*b1e83836Smrg   index_type x, y, n, count, xcount, ycount;
1818*b1e83836Smrg 
1819*b1e83836Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
1820*b1e83836Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
1821*b1e83836Smrg 
1822*b1e83836Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1823*b1e83836Smrg 
1824*b1e83836Smrg    Either A or B (but not both) can be rank 1:
1825*b1e83836Smrg 
1826*b1e83836Smrg    o One-dimensional argument A is implicitly treated as a row matrix
1827*b1e83836Smrg      dimensioned [1,count], so xcount=1.
1828*b1e83836Smrg 
1829*b1e83836Smrg    o One-dimensional argument B is implicitly treated as a column matrix
1830*b1e83836Smrg      dimensioned [count, 1], so ycount=1.
1831*b1e83836Smrg */
1832*b1e83836Smrg 
1833*b1e83836Smrg   if (retarray->base_addr == NULL)
1834*b1e83836Smrg     {
1835*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1836*b1e83836Smrg         {
1837*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1838*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1839*b1e83836Smrg         }
1840*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1841*b1e83836Smrg         {
1842*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1843*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1844*b1e83836Smrg         }
1845*b1e83836Smrg       else
1846*b1e83836Smrg         {
1847*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1848*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1849*b1e83836Smrg 
1850*b1e83836Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
1851*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1852*b1e83836Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1853*b1e83836Smrg         }
1854*b1e83836Smrg 
1855*b1e83836Smrg       retarray->base_addr
1856*b1e83836Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_17));
1857*b1e83836Smrg       retarray->offset = 0;
1858*b1e83836Smrg     }
1859*b1e83836Smrg   else if (unlikely (compile_options.bounds_check))
1860*b1e83836Smrg     {
1861*b1e83836Smrg       index_type ret_extent, arg_extent;
1862*b1e83836Smrg 
1863*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1864*b1e83836Smrg 	{
1865*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1866*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1867*b1e83836Smrg 	  if (arg_extent != ret_extent)
1868*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1869*b1e83836Smrg 	    		   "array (%ld/%ld) ",
1870*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
1871*b1e83836Smrg 	}
1872*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1873*b1e83836Smrg 	{
1874*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1875*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1876*b1e83836Smrg 	  if (arg_extent != ret_extent)
1877*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1878*b1e83836Smrg 	    		   "array (%ld/%ld) ",
1879*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
1880*b1e83836Smrg 	}
1881*b1e83836Smrg       else
1882*b1e83836Smrg 	{
1883*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1884*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1885*b1e83836Smrg 	  if (arg_extent != ret_extent)
1886*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1887*b1e83836Smrg 	    		   "array (%ld/%ld) ",
1888*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
1889*b1e83836Smrg 
1890*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1891*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1892*b1e83836Smrg 	  if (arg_extent != ret_extent)
1893*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
1894*b1e83836Smrg 	    		   "array (%ld/%ld) ",
1895*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
1896*b1e83836Smrg 	}
1897*b1e83836Smrg     }
1898*b1e83836Smrg 
1899*b1e83836Smrg 
1900*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1901*b1e83836Smrg     {
1902*b1e83836Smrg       /* One-dimensional result may be addressed in the code below
1903*b1e83836Smrg 	 either as a row or a column matrix. We want both cases to
1904*b1e83836Smrg 	 work. */
1905*b1e83836Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1906*b1e83836Smrg     }
1907*b1e83836Smrg   else
1908*b1e83836Smrg     {
1909*b1e83836Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1910*b1e83836Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1911*b1e83836Smrg     }
1912*b1e83836Smrg 
1913*b1e83836Smrg 
1914*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
1915*b1e83836Smrg     {
1916*b1e83836Smrg       /* Treat it as a a row matrix A[1,count]. */
1917*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1918*b1e83836Smrg       aystride = 1;
1919*b1e83836Smrg 
1920*b1e83836Smrg       xcount = 1;
1921*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
1922*b1e83836Smrg     }
1923*b1e83836Smrg   else
1924*b1e83836Smrg     {
1925*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1926*b1e83836Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1927*b1e83836Smrg 
1928*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
1929*b1e83836Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1930*b1e83836Smrg     }
1931*b1e83836Smrg 
1932*b1e83836Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1933*b1e83836Smrg     {
1934*b1e83836Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1935*b1e83836Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1936*b1e83836Smrg 		       "in dimension 1: is %ld, should be %ld",
1937*b1e83836Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1938*b1e83836Smrg     }
1939*b1e83836Smrg 
1940*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
1941*b1e83836Smrg     {
1942*b1e83836Smrg       /* Treat it as a column matrix B[count,1] */
1943*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1944*b1e83836Smrg 
1945*b1e83836Smrg       /* bystride should never be used for 1-dimensional b.
1946*b1e83836Smrg          The value is only used for calculation of the
1947*b1e83836Smrg          memory by the buffer.  */
1948*b1e83836Smrg       bystride = 256;
1949*b1e83836Smrg       ycount = 1;
1950*b1e83836Smrg     }
1951*b1e83836Smrg   else
1952*b1e83836Smrg     {
1953*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1954*b1e83836Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1955*b1e83836Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1956*b1e83836Smrg     }
1957*b1e83836Smrg 
1958*b1e83836Smrg   abase = a->base_addr;
1959*b1e83836Smrg   bbase = b->base_addr;
1960*b1e83836Smrg   dest = retarray->base_addr;
1961*b1e83836Smrg 
1962*b1e83836Smrg   /* Now that everything is set up, we perform the multiplication
1963*b1e83836Smrg      itself.  */
1964*b1e83836Smrg 
1965*b1e83836Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1966*b1e83836Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
1967*b1e83836Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
1968*b1e83836Smrg 
1969*b1e83836Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1970*b1e83836Smrg       && (bxstride == 1 || bystride == 1)
1971*b1e83836Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
1972*b1e83836Smrg           > POW3(blas_limit)))
1973*b1e83836Smrg     {
1974*b1e83836Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
1975*b1e83836Smrg       const GFC_REAL_17 one = 1, zero = 0;
1976*b1e83836Smrg       const int lda = (axstride == 1) ? aystride : axstride,
1977*b1e83836Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
1978*b1e83836Smrg 
1979*b1e83836Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1980*b1e83836Smrg 	{
1981*b1e83836Smrg 	  assert (gemm != NULL);
1982*b1e83836Smrg 	  const char *transa, *transb;
1983*b1e83836Smrg 	  if (try_blas & 2)
1984*b1e83836Smrg 	    transa = "C";
1985*b1e83836Smrg 	  else
1986*b1e83836Smrg 	    transa = axstride == 1 ? "N" : "T";
1987*b1e83836Smrg 
1988*b1e83836Smrg 	  if (try_blas & 4)
1989*b1e83836Smrg 	    transb = "C";
1990*b1e83836Smrg 	  else
1991*b1e83836Smrg 	    transb = bxstride == 1 ? "N" : "T";
1992*b1e83836Smrg 
1993*b1e83836Smrg 	  gemm (transa, transb , &m,
1994*b1e83836Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1995*b1e83836Smrg 		&ldc, 1, 1);
1996*b1e83836Smrg 	  return;
1997*b1e83836Smrg 	}
1998*b1e83836Smrg     }
1999*b1e83836Smrg 
2000*b1e83836Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
2001*b1e83836Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
2002*b1e83836Smrg     {
2003*b1e83836Smrg       /* This block of code implements a tuned matmul, derived from
2004*b1e83836Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2005*b1e83836Smrg 
2006*b1e83836Smrg                Bo Kagstrom and Per Ling
2007*b1e83836Smrg                Department of Computing Science
2008*b1e83836Smrg                Umea University
2009*b1e83836Smrg                S-901 87 Umea, Sweden
2010*b1e83836Smrg 
2011*b1e83836Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2012*b1e83836Smrg 
2013*b1e83836Smrg       const GFC_REAL_17 *a, *b;
2014*b1e83836Smrg       GFC_REAL_17 *c;
2015*b1e83836Smrg       const index_type m = xcount, n = ycount, k = count;
2016*b1e83836Smrg 
2017*b1e83836Smrg       /* System generated locals */
2018*b1e83836Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2019*b1e83836Smrg 		 i1, i2, i3, i4, i5, i6;
2020*b1e83836Smrg 
2021*b1e83836Smrg       /* Local variables */
2022*b1e83836Smrg       GFC_REAL_17 f11, f12, f21, f22, f31, f32, f41, f42,
2023*b1e83836Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
2024*b1e83836Smrg       index_type i, j, l, ii, jj, ll;
2025*b1e83836Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2026*b1e83836Smrg       GFC_REAL_17 *t1;
2027*b1e83836Smrg 
2028*b1e83836Smrg       a = abase;
2029*b1e83836Smrg       b = bbase;
2030*b1e83836Smrg       c = retarray->base_addr;
2031*b1e83836Smrg 
2032*b1e83836Smrg       /* Parameter adjustments */
2033*b1e83836Smrg       c_dim1 = rystride;
2034*b1e83836Smrg       c_offset = 1 + c_dim1;
2035*b1e83836Smrg       c -= c_offset;
2036*b1e83836Smrg       a_dim1 = aystride;
2037*b1e83836Smrg       a_offset = 1 + a_dim1;
2038*b1e83836Smrg       a -= a_offset;
2039*b1e83836Smrg       b_dim1 = bystride;
2040*b1e83836Smrg       b_offset = 1 + b_dim1;
2041*b1e83836Smrg       b -= b_offset;
2042*b1e83836Smrg 
2043*b1e83836Smrg       /* Empty c first.  */
2044*b1e83836Smrg       for (j=1; j<=n; j++)
2045*b1e83836Smrg 	for (i=1; i<=m; i++)
2046*b1e83836Smrg 	  c[i + j * c_dim1] = (GFC_REAL_17)0;
2047*b1e83836Smrg 
2048*b1e83836Smrg       /* Early exit if possible */
2049*b1e83836Smrg       if (m == 0 || n == 0 || k == 0)
2050*b1e83836Smrg 	return;
2051*b1e83836Smrg 
2052*b1e83836Smrg       /* Adjust size of t1 to what is needed.  */
2053*b1e83836Smrg       index_type t1_dim, a_sz;
2054*b1e83836Smrg       if (aystride == 1)
2055*b1e83836Smrg         a_sz = rystride;
2056*b1e83836Smrg       else
2057*b1e83836Smrg         a_sz = a_dim1;
2058*b1e83836Smrg 
2059*b1e83836Smrg       t1_dim = a_sz * 256 + b_dim1;
2060*b1e83836Smrg       if (t1_dim > 65536)
2061*b1e83836Smrg 	t1_dim = 65536;
2062*b1e83836Smrg 
2063*b1e83836Smrg       t1 = malloc (t1_dim * sizeof(GFC_REAL_17));
2064*b1e83836Smrg 
2065*b1e83836Smrg       /* Start turning the crank. */
2066*b1e83836Smrg       i1 = n;
2067*b1e83836Smrg       for (jj = 1; jj <= i1; jj += 512)
2068*b1e83836Smrg 	{
2069*b1e83836Smrg 	  /* Computing MIN */
2070*b1e83836Smrg 	  i2 = 512;
2071*b1e83836Smrg 	  i3 = n - jj + 1;
2072*b1e83836Smrg 	  jsec = min(i2,i3);
2073*b1e83836Smrg 	  ujsec = jsec - jsec % 4;
2074*b1e83836Smrg 	  i2 = k;
2075*b1e83836Smrg 	  for (ll = 1; ll <= i2; ll += 256)
2076*b1e83836Smrg 	    {
2077*b1e83836Smrg 	      /* Computing MIN */
2078*b1e83836Smrg 	      i3 = 256;
2079*b1e83836Smrg 	      i4 = k - ll + 1;
2080*b1e83836Smrg 	      lsec = min(i3,i4);
2081*b1e83836Smrg 	      ulsec = lsec - lsec % 2;
2082*b1e83836Smrg 
2083*b1e83836Smrg 	      i3 = m;
2084*b1e83836Smrg 	      for (ii = 1; ii <= i3; ii += 256)
2085*b1e83836Smrg 		{
2086*b1e83836Smrg 		  /* Computing MIN */
2087*b1e83836Smrg 		  i4 = 256;
2088*b1e83836Smrg 		  i5 = m - ii + 1;
2089*b1e83836Smrg 		  isec = min(i4,i5);
2090*b1e83836Smrg 		  uisec = isec - isec % 2;
2091*b1e83836Smrg 		  i4 = ll + ulsec - 1;
2092*b1e83836Smrg 		  for (l = ll; l <= i4; l += 2)
2093*b1e83836Smrg 		    {
2094*b1e83836Smrg 		      i5 = ii + uisec - 1;
2095*b1e83836Smrg 		      for (i = ii; i <= i5; i += 2)
2096*b1e83836Smrg 			{
2097*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2098*b1e83836Smrg 					a[i + l * a_dim1];
2099*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2100*b1e83836Smrg 					a[i + (l + 1) * a_dim1];
2101*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2102*b1e83836Smrg 					a[i + 1 + l * a_dim1];
2103*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2104*b1e83836Smrg 					a[i + 1 + (l + 1) * a_dim1];
2105*b1e83836Smrg 			}
2106*b1e83836Smrg 		      if (uisec < isec)
2107*b1e83836Smrg 			{
2108*b1e83836Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
2109*b1e83836Smrg 				    a[ii + isec - 1 + l * a_dim1];
2110*b1e83836Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
2111*b1e83836Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2112*b1e83836Smrg 			}
2113*b1e83836Smrg 		    }
2114*b1e83836Smrg 		  if (ulsec < lsec)
2115*b1e83836Smrg 		    {
2116*b1e83836Smrg 		      i4 = ii + isec - 1;
2117*b1e83836Smrg 		      for (i = ii; i<= i4; ++i)
2118*b1e83836Smrg 			{
2119*b1e83836Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2120*b1e83836Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
2121*b1e83836Smrg 			}
2122*b1e83836Smrg 		    }
2123*b1e83836Smrg 
2124*b1e83836Smrg 		  uisec = isec - isec % 4;
2125*b1e83836Smrg 		  i4 = jj + ujsec - 1;
2126*b1e83836Smrg 		  for (j = jj; j <= i4; j += 4)
2127*b1e83836Smrg 		    {
2128*b1e83836Smrg 		      i5 = ii + uisec - 1;
2129*b1e83836Smrg 		      for (i = ii; i <= i5; i += 4)
2130*b1e83836Smrg 			{
2131*b1e83836Smrg 			  f11 = c[i + j * c_dim1];
2132*b1e83836Smrg 			  f21 = c[i + 1 + j * c_dim1];
2133*b1e83836Smrg 			  f12 = c[i + (j + 1) * c_dim1];
2134*b1e83836Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2135*b1e83836Smrg 			  f13 = c[i + (j + 2) * c_dim1];
2136*b1e83836Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2137*b1e83836Smrg 			  f14 = c[i + (j + 3) * c_dim1];
2138*b1e83836Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2139*b1e83836Smrg 			  f31 = c[i + 2 + j * c_dim1];
2140*b1e83836Smrg 			  f41 = c[i + 3 + j * c_dim1];
2141*b1e83836Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2142*b1e83836Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2143*b1e83836Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2144*b1e83836Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2145*b1e83836Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2146*b1e83836Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2147*b1e83836Smrg 			  i6 = ll + lsec - 1;
2148*b1e83836Smrg 			  for (l = ll; l <= i6; ++l)
2149*b1e83836Smrg 			    {
2150*b1e83836Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151*b1e83836Smrg 				      * b[l + j * b_dim1];
2152*b1e83836Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153*b1e83836Smrg 				      * b[l + j * b_dim1];
2154*b1e83836Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
2156*b1e83836Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
2158*b1e83836Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
2160*b1e83836Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
2162*b1e83836Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2163*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
2164*b1e83836Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2165*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
2166*b1e83836Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167*b1e83836Smrg 				      * b[l + j * b_dim1];
2168*b1e83836Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169*b1e83836Smrg 				      * b[l + j * b_dim1];
2170*b1e83836Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
2172*b1e83836Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
2174*b1e83836Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
2176*b1e83836Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
2178*b1e83836Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2179*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
2180*b1e83836Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2181*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
2182*b1e83836Smrg 			    }
2183*b1e83836Smrg 			  c[i + j * c_dim1] = f11;
2184*b1e83836Smrg 			  c[i + 1 + j * c_dim1] = f21;
2185*b1e83836Smrg 			  c[i + (j + 1) * c_dim1] = f12;
2186*b1e83836Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2187*b1e83836Smrg 			  c[i + (j + 2) * c_dim1] = f13;
2188*b1e83836Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2189*b1e83836Smrg 			  c[i + (j + 3) * c_dim1] = f14;
2190*b1e83836Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2191*b1e83836Smrg 			  c[i + 2 + j * c_dim1] = f31;
2192*b1e83836Smrg 			  c[i + 3 + j * c_dim1] = f41;
2193*b1e83836Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2194*b1e83836Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2195*b1e83836Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2196*b1e83836Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2197*b1e83836Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2198*b1e83836Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2199*b1e83836Smrg 			}
2200*b1e83836Smrg 		      if (uisec < isec)
2201*b1e83836Smrg 			{
2202*b1e83836Smrg 			  i5 = ii + isec - 1;
2203*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2204*b1e83836Smrg 			    {
2205*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
2206*b1e83836Smrg 			      f12 = c[i + (j + 1) * c_dim1];
2207*b1e83836Smrg 			      f13 = c[i + (j + 2) * c_dim1];
2208*b1e83836Smrg 			      f14 = c[i + (j + 3) * c_dim1];
2209*b1e83836Smrg 			      i6 = ll + lsec - 1;
2210*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
2211*b1e83836Smrg 				{
2212*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2214*b1e83836Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215*b1e83836Smrg 					  257] * b[l + (j + 1) * b_dim1];
2216*b1e83836Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217*b1e83836Smrg 					  257] * b[l + (j + 2) * b_dim1];
2218*b1e83836Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2219*b1e83836Smrg 					  257] * b[l + (j + 3) * b_dim1];
2220*b1e83836Smrg 				}
2221*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
2222*b1e83836Smrg 			      c[i + (j + 1) * c_dim1] = f12;
2223*b1e83836Smrg 			      c[i + (j + 2) * c_dim1] = f13;
2224*b1e83836Smrg 			      c[i + (j + 3) * c_dim1] = f14;
2225*b1e83836Smrg 			    }
2226*b1e83836Smrg 			}
2227*b1e83836Smrg 		    }
2228*b1e83836Smrg 		  if (ujsec < jsec)
2229*b1e83836Smrg 		    {
2230*b1e83836Smrg 		      i4 = jj + jsec - 1;
2231*b1e83836Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
2232*b1e83836Smrg 			{
2233*b1e83836Smrg 			  i5 = ii + uisec - 1;
2234*b1e83836Smrg 			  for (i = ii; i <= i5; i += 4)
2235*b1e83836Smrg 			    {
2236*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
2237*b1e83836Smrg 			      f21 = c[i + 1 + j * c_dim1];
2238*b1e83836Smrg 			      f31 = c[i + 2 + j * c_dim1];
2239*b1e83836Smrg 			      f41 = c[i + 3 + j * c_dim1];
2240*b1e83836Smrg 			      i6 = ll + lsec - 1;
2241*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
2242*b1e83836Smrg 				{
2243*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2244*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2245*b1e83836Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2246*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2247*b1e83836Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2248*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2249*b1e83836Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2250*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2251*b1e83836Smrg 				}
2252*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
2253*b1e83836Smrg 			      c[i + 1 + j * c_dim1] = f21;
2254*b1e83836Smrg 			      c[i + 2 + j * c_dim1] = f31;
2255*b1e83836Smrg 			      c[i + 3 + j * c_dim1] = f41;
2256*b1e83836Smrg 			    }
2257*b1e83836Smrg 			  i5 = ii + isec - 1;
2258*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2259*b1e83836Smrg 			    {
2260*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
2261*b1e83836Smrg 			      i6 = ll + lsec - 1;
2262*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
2263*b1e83836Smrg 				{
2264*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2265*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2266*b1e83836Smrg 				}
2267*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
2268*b1e83836Smrg 			    }
2269*b1e83836Smrg 			}
2270*b1e83836Smrg 		    }
2271*b1e83836Smrg 		}
2272*b1e83836Smrg 	    }
2273*b1e83836Smrg 	}
2274*b1e83836Smrg       free(t1);
2275*b1e83836Smrg       return;
2276*b1e83836Smrg     }
2277*b1e83836Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2278*b1e83836Smrg     {
2279*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
2280*b1e83836Smrg 	{
2281*b1e83836Smrg 	  const GFC_REAL_17 *restrict abase_x;
2282*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
2283*b1e83836Smrg 	  GFC_REAL_17 *restrict dest_y;
2284*b1e83836Smrg 	  GFC_REAL_17 s;
2285*b1e83836Smrg 
2286*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
2287*b1e83836Smrg 	    {
2288*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
2289*b1e83836Smrg 	      dest_y = &dest[y*rystride];
2290*b1e83836Smrg 	      for (x = 0; x < xcount; x++)
2291*b1e83836Smrg 		{
2292*b1e83836Smrg 		  abase_x = &abase[x*axstride];
2293*b1e83836Smrg 		  s = (GFC_REAL_17) 0;
2294*b1e83836Smrg 		  for (n = 0; n < count; n++)
2295*b1e83836Smrg 		    s += abase_x[n] * bbase_y[n];
2296*b1e83836Smrg 		  dest_y[x] = s;
2297*b1e83836Smrg 		}
2298*b1e83836Smrg 	    }
2299*b1e83836Smrg 	}
2300*b1e83836Smrg       else
2301*b1e83836Smrg 	{
2302*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
2303*b1e83836Smrg 	  GFC_REAL_17 s;
2304*b1e83836Smrg 
2305*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
2306*b1e83836Smrg 	    {
2307*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
2308*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
2309*b1e83836Smrg 	      for (n = 0; n < count; n++)
2310*b1e83836Smrg 		s += abase[n*axstride] * bbase_y[n];
2311*b1e83836Smrg 	      dest[y*rystride] = s;
2312*b1e83836Smrg 	    }
2313*b1e83836Smrg 	}
2314*b1e83836Smrg     }
2315*b1e83836Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2316*b1e83836Smrg     {
2317*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
2318*b1e83836Smrg       GFC_REAL_17 s;
2319*b1e83836Smrg 
2320*b1e83836Smrg       for (y = 0; y < ycount; y++)
2321*b1e83836Smrg 	{
2322*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
2323*b1e83836Smrg 	  s = (GFC_REAL_17) 0;
2324*b1e83836Smrg 	  for (n = 0; n < count; n++)
2325*b1e83836Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2326*b1e83836Smrg 	  dest[y*rxstride] = s;
2327*b1e83836Smrg 	}
2328*b1e83836Smrg     }
2329*b1e83836Smrg   else if (axstride < aystride)
2330*b1e83836Smrg     {
2331*b1e83836Smrg       for (y = 0; y < ycount; y++)
2332*b1e83836Smrg 	for (x = 0; x < xcount; x++)
2333*b1e83836Smrg 	  dest[x*rxstride + y*rystride] = (GFC_REAL_17)0;
2334*b1e83836Smrg 
2335*b1e83836Smrg       for (y = 0; y < ycount; y++)
2336*b1e83836Smrg 	for (n = 0; n < count; n++)
2337*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
2338*b1e83836Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
2339*b1e83836Smrg 	    dest[x*rxstride + y*rystride] +=
2340*b1e83836Smrg 					abase[x*axstride + n*aystride] *
2341*b1e83836Smrg 					bbase[n*bxstride + y*bystride];
2342*b1e83836Smrg     }
2343*b1e83836Smrg   else
2344*b1e83836Smrg     {
2345*b1e83836Smrg       const GFC_REAL_17 *restrict abase_x;
2346*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
2347*b1e83836Smrg       GFC_REAL_17 *restrict dest_y;
2348*b1e83836Smrg       GFC_REAL_17 s;
2349*b1e83836Smrg 
2350*b1e83836Smrg       for (y = 0; y < ycount; y++)
2351*b1e83836Smrg 	{
2352*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
2353*b1e83836Smrg 	  dest_y = &dest[y*rystride];
2354*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
2355*b1e83836Smrg 	    {
2356*b1e83836Smrg 	      abase_x = &abase[x*axstride];
2357*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
2358*b1e83836Smrg 	      for (n = 0; n < count; n++)
2359*b1e83836Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2360*b1e83836Smrg 	      dest_y[x*rxstride] = s;
2361*b1e83836Smrg 	    }
2362*b1e83836Smrg 	}
2363*b1e83836Smrg     }
2364*b1e83836Smrg }
2365*b1e83836Smrg #undef POW3
2366*b1e83836Smrg #undef min
2367*b1e83836Smrg #undef max
2368*b1e83836Smrg 
2369*b1e83836Smrg 
2370*b1e83836Smrg /* Compiling main function, with selection code for the processor.  */
2371*b1e83836Smrg 
2372*b1e83836Smrg /* Currently, this is i386 only.  Adjust for other architectures.  */
2373*b1e83836Smrg 
matmul_r17(gfc_array_r17 * const restrict retarray,gfc_array_r17 * const restrict a,gfc_array_r17 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2374*b1e83836Smrg void matmul_r17 (gfc_array_r17 * const restrict retarray,
2375*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
2376*b1e83836Smrg 	int blas_limit, blas_call gemm)
2377*b1e83836Smrg {
2378*b1e83836Smrg   static void (*matmul_p) (gfc_array_r17 * const restrict retarray,
2379*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
2380*b1e83836Smrg 	int blas_limit, blas_call gemm);
2381*b1e83836Smrg 
2382*b1e83836Smrg   void (*matmul_fn) (gfc_array_r17 * const restrict retarray,
2383*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
2384*b1e83836Smrg 	int blas_limit, blas_call gemm);
2385*b1e83836Smrg 
2386*b1e83836Smrg   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2387*b1e83836Smrg   if (matmul_fn == NULL)
2388*b1e83836Smrg     {
2389*b1e83836Smrg       matmul_fn = matmul_r17_vanilla;
2390*b1e83836Smrg       if (__builtin_cpu_is ("intel"))
2391*b1e83836Smrg 	{
2392*b1e83836Smrg           /* Run down the available processors in order of preference.  */
2393*b1e83836Smrg #ifdef HAVE_AVX512F
2394*b1e83836Smrg 	  if (__builtin_cpu_supports ("avx512f"))
2395*b1e83836Smrg 	    {
2396*b1e83836Smrg 	      matmul_fn = matmul_r17_avx512f;
2397*b1e83836Smrg 	      goto store;
2398*b1e83836Smrg 	    }
2399*b1e83836Smrg 
2400*b1e83836Smrg #endif  /* HAVE_AVX512F */
2401*b1e83836Smrg 
2402*b1e83836Smrg #ifdef HAVE_AVX2
2403*b1e83836Smrg 	  if (__builtin_cpu_supports ("avx2")
2404*b1e83836Smrg 	      && __builtin_cpu_supports ("fma"))
2405*b1e83836Smrg 	    {
2406*b1e83836Smrg 	      matmul_fn = matmul_r17_avx2;
2407*b1e83836Smrg 	      goto store;
2408*b1e83836Smrg 	    }
2409*b1e83836Smrg 
2410*b1e83836Smrg #endif
2411*b1e83836Smrg 
2412*b1e83836Smrg #ifdef HAVE_AVX
2413*b1e83836Smrg 	  if (__builtin_cpu_supports ("avx"))
2414*b1e83836Smrg  	    {
2415*b1e83836Smrg               matmul_fn = matmul_r17_avx;
2416*b1e83836Smrg 	      goto store;
2417*b1e83836Smrg 	    }
2418*b1e83836Smrg #endif  /* HAVE_AVX */
2419*b1e83836Smrg         }
2420*b1e83836Smrg     else if (__builtin_cpu_is ("amd"))
2421*b1e83836Smrg       {
2422*b1e83836Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2423*b1e83836Smrg 	if (__builtin_cpu_supports ("avx")
2424*b1e83836Smrg 	    && __builtin_cpu_supports ("fma"))
2425*b1e83836Smrg 	  {
2426*b1e83836Smrg             matmul_fn = matmul_r17_avx128_fma3;
2427*b1e83836Smrg 	    goto store;
2428*b1e83836Smrg 	  }
2429*b1e83836Smrg #endif
2430*b1e83836Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2431*b1e83836Smrg 	if (__builtin_cpu_supports ("avx")
2432*b1e83836Smrg 	    && __builtin_cpu_supports ("fma4"))
2433*b1e83836Smrg 	  {
2434*b1e83836Smrg             matmul_fn = matmul_r17_avx128_fma4;
2435*b1e83836Smrg 	    goto store;
2436*b1e83836Smrg 	  }
2437*b1e83836Smrg #endif
2438*b1e83836Smrg 
2439*b1e83836Smrg       }
2440*b1e83836Smrg    store:
2441*b1e83836Smrg       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2442*b1e83836Smrg    }
2443*b1e83836Smrg 
2444*b1e83836Smrg    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2445*b1e83836Smrg }
2446*b1e83836Smrg 
2447*b1e83836Smrg #else  /* Just the vanilla function.  */
2448*b1e83836Smrg 
2449*b1e83836Smrg void
matmul_r17(gfc_array_r17 * const restrict retarray,gfc_array_r17 * const restrict a,gfc_array_r17 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2450*b1e83836Smrg matmul_r17 (gfc_array_r17 * const restrict retarray,
2451*b1e83836Smrg 	gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
2452*b1e83836Smrg 	int blas_limit, blas_call gemm)
2453*b1e83836Smrg {
2454*b1e83836Smrg   const GFC_REAL_17 * restrict abase;
2455*b1e83836Smrg   const GFC_REAL_17 * restrict bbase;
2456*b1e83836Smrg   GFC_REAL_17 * restrict dest;
2457*b1e83836Smrg 
2458*b1e83836Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2459*b1e83836Smrg   index_type x, y, n, count, xcount, ycount;
2460*b1e83836Smrg 
2461*b1e83836Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
2462*b1e83836Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
2463*b1e83836Smrg 
2464*b1e83836Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2465*b1e83836Smrg 
2466*b1e83836Smrg    Either A or B (but not both) can be rank 1:
2467*b1e83836Smrg 
2468*b1e83836Smrg    o One-dimensional argument A is implicitly treated as a row matrix
2469*b1e83836Smrg      dimensioned [1,count], so xcount=1.
2470*b1e83836Smrg 
2471*b1e83836Smrg    o One-dimensional argument B is implicitly treated as a column matrix
2472*b1e83836Smrg      dimensioned [count, 1], so ycount=1.
2473*b1e83836Smrg */
2474*b1e83836Smrg 
2475*b1e83836Smrg   if (retarray->base_addr == NULL)
2476*b1e83836Smrg     {
2477*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
2478*b1e83836Smrg         {
2479*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2480*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2481*b1e83836Smrg         }
2482*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2483*b1e83836Smrg         {
2484*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2485*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2486*b1e83836Smrg         }
2487*b1e83836Smrg       else
2488*b1e83836Smrg         {
2489*b1e83836Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2490*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2491*b1e83836Smrg 
2492*b1e83836Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
2493*b1e83836Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2494*b1e83836Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
2495*b1e83836Smrg         }
2496*b1e83836Smrg 
2497*b1e83836Smrg       retarray->base_addr
2498*b1e83836Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_17));
2499*b1e83836Smrg       retarray->offset = 0;
2500*b1e83836Smrg     }
2501*b1e83836Smrg   else if (unlikely (compile_options.bounds_check))
2502*b1e83836Smrg     {
2503*b1e83836Smrg       index_type ret_extent, arg_extent;
2504*b1e83836Smrg 
2505*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
2506*b1e83836Smrg 	{
2507*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2508*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2509*b1e83836Smrg 	  if (arg_extent != ret_extent)
2510*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
2511*b1e83836Smrg 	    		   "array (%ld/%ld) ",
2512*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
2513*b1e83836Smrg 	}
2514*b1e83836Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2515*b1e83836Smrg 	{
2516*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2517*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2518*b1e83836Smrg 	  if (arg_extent != ret_extent)
2519*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
2520*b1e83836Smrg 	    		   "array (%ld/%ld) ",
2521*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
2522*b1e83836Smrg 	}
2523*b1e83836Smrg       else
2524*b1e83836Smrg 	{
2525*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2526*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2527*b1e83836Smrg 	  if (arg_extent != ret_extent)
2528*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
2529*b1e83836Smrg 	    		   "array (%ld/%ld) ",
2530*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
2531*b1e83836Smrg 
2532*b1e83836Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2533*b1e83836Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2534*b1e83836Smrg 	  if (arg_extent != ret_extent)
2535*b1e83836Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
2536*b1e83836Smrg 	    		   "array (%ld/%ld) ",
2537*b1e83836Smrg 			   (long int) ret_extent, (long int) arg_extent);
2538*b1e83836Smrg 	}
2539*b1e83836Smrg     }
2540*b1e83836Smrg 
2541*b1e83836Smrg 
2542*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2543*b1e83836Smrg     {
2544*b1e83836Smrg       /* One-dimensional result may be addressed in the code below
2545*b1e83836Smrg 	 either as a row or a column matrix. We want both cases to
2546*b1e83836Smrg 	 work. */
2547*b1e83836Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2548*b1e83836Smrg     }
2549*b1e83836Smrg   else
2550*b1e83836Smrg     {
2551*b1e83836Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2552*b1e83836Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2553*b1e83836Smrg     }
2554*b1e83836Smrg 
2555*b1e83836Smrg 
2556*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
2557*b1e83836Smrg     {
2558*b1e83836Smrg       /* Treat it as a a row matrix A[1,count]. */
2559*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2560*b1e83836Smrg       aystride = 1;
2561*b1e83836Smrg 
2562*b1e83836Smrg       xcount = 1;
2563*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
2564*b1e83836Smrg     }
2565*b1e83836Smrg   else
2566*b1e83836Smrg     {
2567*b1e83836Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2568*b1e83836Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2569*b1e83836Smrg 
2570*b1e83836Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
2571*b1e83836Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2572*b1e83836Smrg     }
2573*b1e83836Smrg 
2574*b1e83836Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2575*b1e83836Smrg     {
2576*b1e83836Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2577*b1e83836Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2578*b1e83836Smrg 		       "in dimension 1: is %ld, should be %ld",
2579*b1e83836Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2580*b1e83836Smrg     }
2581*b1e83836Smrg 
2582*b1e83836Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
2583*b1e83836Smrg     {
2584*b1e83836Smrg       /* Treat it as a column matrix B[count,1] */
2585*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2586*b1e83836Smrg 
2587*b1e83836Smrg       /* bystride should never be used for 1-dimensional b.
2588*b1e83836Smrg          The value is only used for calculation of the
2589*b1e83836Smrg          memory by the buffer.  */
2590*b1e83836Smrg       bystride = 256;
2591*b1e83836Smrg       ycount = 1;
2592*b1e83836Smrg     }
2593*b1e83836Smrg   else
2594*b1e83836Smrg     {
2595*b1e83836Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2596*b1e83836Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2597*b1e83836Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2598*b1e83836Smrg     }
2599*b1e83836Smrg 
2600*b1e83836Smrg   abase = a->base_addr;
2601*b1e83836Smrg   bbase = b->base_addr;
2602*b1e83836Smrg   dest = retarray->base_addr;
2603*b1e83836Smrg 
2604*b1e83836Smrg   /* Now that everything is set up, we perform the multiplication
2605*b1e83836Smrg      itself.  */
2606*b1e83836Smrg 
2607*b1e83836Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2608*b1e83836Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
2609*b1e83836Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
2610*b1e83836Smrg 
2611*b1e83836Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2612*b1e83836Smrg       && (bxstride == 1 || bystride == 1)
2613*b1e83836Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
2614*b1e83836Smrg           > POW3(blas_limit)))
2615*b1e83836Smrg     {
2616*b1e83836Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
2617*b1e83836Smrg       const GFC_REAL_17 one = 1, zero = 0;
2618*b1e83836Smrg       const int lda = (axstride == 1) ? aystride : axstride,
2619*b1e83836Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
2620*b1e83836Smrg 
2621*b1e83836Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2622*b1e83836Smrg 	{
2623*b1e83836Smrg 	  assert (gemm != NULL);
2624*b1e83836Smrg 	  const char *transa, *transb;
2625*b1e83836Smrg 	  if (try_blas & 2)
2626*b1e83836Smrg 	    transa = "C";
2627*b1e83836Smrg 	  else
2628*b1e83836Smrg 	    transa = axstride == 1 ? "N" : "T";
2629*b1e83836Smrg 
2630*b1e83836Smrg 	  if (try_blas & 4)
2631*b1e83836Smrg 	    transb = "C";
2632*b1e83836Smrg 	  else
2633*b1e83836Smrg 	    transb = bxstride == 1 ? "N" : "T";
2634*b1e83836Smrg 
2635*b1e83836Smrg 	  gemm (transa, transb , &m,
2636*b1e83836Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
2637*b1e83836Smrg 		&ldc, 1, 1);
2638*b1e83836Smrg 	  return;
2639*b1e83836Smrg 	}
2640*b1e83836Smrg     }
2641*b1e83836Smrg 
2642*b1e83836Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
2643*b1e83836Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
2644*b1e83836Smrg     {
2645*b1e83836Smrg       /* This block of code implements a tuned matmul, derived from
2646*b1e83836Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2647*b1e83836Smrg 
2648*b1e83836Smrg                Bo Kagstrom and Per Ling
2649*b1e83836Smrg                Department of Computing Science
2650*b1e83836Smrg                Umea University
2651*b1e83836Smrg                S-901 87 Umea, Sweden
2652*b1e83836Smrg 
2653*b1e83836Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2654*b1e83836Smrg 
2655*b1e83836Smrg       const GFC_REAL_17 *a, *b;
2656*b1e83836Smrg       GFC_REAL_17 *c;
2657*b1e83836Smrg       const index_type m = xcount, n = ycount, k = count;
2658*b1e83836Smrg 
2659*b1e83836Smrg       /* System generated locals */
2660*b1e83836Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2661*b1e83836Smrg 		 i1, i2, i3, i4, i5, i6;
2662*b1e83836Smrg 
2663*b1e83836Smrg       /* Local variables */
2664*b1e83836Smrg       GFC_REAL_17 f11, f12, f21, f22, f31, f32, f41, f42,
2665*b1e83836Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
2666*b1e83836Smrg       index_type i, j, l, ii, jj, ll;
2667*b1e83836Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2668*b1e83836Smrg       GFC_REAL_17 *t1;
2669*b1e83836Smrg 
2670*b1e83836Smrg       a = abase;
2671*b1e83836Smrg       b = bbase;
2672*b1e83836Smrg       c = retarray->base_addr;
2673*b1e83836Smrg 
2674*b1e83836Smrg       /* Parameter adjustments */
2675*b1e83836Smrg       c_dim1 = rystride;
2676*b1e83836Smrg       c_offset = 1 + c_dim1;
2677*b1e83836Smrg       c -= c_offset;
2678*b1e83836Smrg       a_dim1 = aystride;
2679*b1e83836Smrg       a_offset = 1 + a_dim1;
2680*b1e83836Smrg       a -= a_offset;
2681*b1e83836Smrg       b_dim1 = bystride;
2682*b1e83836Smrg       b_offset = 1 + b_dim1;
2683*b1e83836Smrg       b -= b_offset;
2684*b1e83836Smrg 
2685*b1e83836Smrg       /* Empty c first.  */
2686*b1e83836Smrg       for (j=1; j<=n; j++)
2687*b1e83836Smrg 	for (i=1; i<=m; i++)
2688*b1e83836Smrg 	  c[i + j * c_dim1] = (GFC_REAL_17)0;
2689*b1e83836Smrg 
2690*b1e83836Smrg       /* Early exit if possible */
2691*b1e83836Smrg       if (m == 0 || n == 0 || k == 0)
2692*b1e83836Smrg 	return;
2693*b1e83836Smrg 
2694*b1e83836Smrg       /* Adjust size of t1 to what is needed.  */
2695*b1e83836Smrg       index_type t1_dim, a_sz;
2696*b1e83836Smrg       if (aystride == 1)
2697*b1e83836Smrg         a_sz = rystride;
2698*b1e83836Smrg       else
2699*b1e83836Smrg         a_sz = a_dim1;
2700*b1e83836Smrg 
2701*b1e83836Smrg       t1_dim = a_sz * 256 + b_dim1;
2702*b1e83836Smrg       if (t1_dim > 65536)
2703*b1e83836Smrg 	t1_dim = 65536;
2704*b1e83836Smrg 
2705*b1e83836Smrg       t1 = malloc (t1_dim * sizeof(GFC_REAL_17));
2706*b1e83836Smrg 
2707*b1e83836Smrg       /* Start turning the crank. */
2708*b1e83836Smrg       i1 = n;
2709*b1e83836Smrg       for (jj = 1; jj <= i1; jj += 512)
2710*b1e83836Smrg 	{
2711*b1e83836Smrg 	  /* Computing MIN */
2712*b1e83836Smrg 	  i2 = 512;
2713*b1e83836Smrg 	  i3 = n - jj + 1;
2714*b1e83836Smrg 	  jsec = min(i2,i3);
2715*b1e83836Smrg 	  ujsec = jsec - jsec % 4;
2716*b1e83836Smrg 	  i2 = k;
2717*b1e83836Smrg 	  for (ll = 1; ll <= i2; ll += 256)
2718*b1e83836Smrg 	    {
2719*b1e83836Smrg 	      /* Computing MIN */
2720*b1e83836Smrg 	      i3 = 256;
2721*b1e83836Smrg 	      i4 = k - ll + 1;
2722*b1e83836Smrg 	      lsec = min(i3,i4);
2723*b1e83836Smrg 	      ulsec = lsec - lsec % 2;
2724*b1e83836Smrg 
2725*b1e83836Smrg 	      i3 = m;
2726*b1e83836Smrg 	      for (ii = 1; ii <= i3; ii += 256)
2727*b1e83836Smrg 		{
2728*b1e83836Smrg 		  /* Computing MIN */
2729*b1e83836Smrg 		  i4 = 256;
2730*b1e83836Smrg 		  i5 = m - ii + 1;
2731*b1e83836Smrg 		  isec = min(i4,i5);
2732*b1e83836Smrg 		  uisec = isec - isec % 2;
2733*b1e83836Smrg 		  i4 = ll + ulsec - 1;
2734*b1e83836Smrg 		  for (l = ll; l <= i4; l += 2)
2735*b1e83836Smrg 		    {
2736*b1e83836Smrg 		      i5 = ii + uisec - 1;
2737*b1e83836Smrg 		      for (i = ii; i <= i5; i += 2)
2738*b1e83836Smrg 			{
2739*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2740*b1e83836Smrg 					a[i + l * a_dim1];
2741*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2742*b1e83836Smrg 					a[i + (l + 1) * a_dim1];
2743*b1e83836Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2744*b1e83836Smrg 					a[i + 1 + l * a_dim1];
2745*b1e83836Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2746*b1e83836Smrg 					a[i + 1 + (l + 1) * a_dim1];
2747*b1e83836Smrg 			}
2748*b1e83836Smrg 		      if (uisec < isec)
2749*b1e83836Smrg 			{
2750*b1e83836Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
2751*b1e83836Smrg 				    a[ii + isec - 1 + l * a_dim1];
2752*b1e83836Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
2753*b1e83836Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2754*b1e83836Smrg 			}
2755*b1e83836Smrg 		    }
2756*b1e83836Smrg 		  if (ulsec < lsec)
2757*b1e83836Smrg 		    {
2758*b1e83836Smrg 		      i4 = ii + isec - 1;
2759*b1e83836Smrg 		      for (i = ii; i<= i4; ++i)
2760*b1e83836Smrg 			{
2761*b1e83836Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2762*b1e83836Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
2763*b1e83836Smrg 			}
2764*b1e83836Smrg 		    }
2765*b1e83836Smrg 
2766*b1e83836Smrg 		  uisec = isec - isec % 4;
2767*b1e83836Smrg 		  i4 = jj + ujsec - 1;
2768*b1e83836Smrg 		  for (j = jj; j <= i4; j += 4)
2769*b1e83836Smrg 		    {
2770*b1e83836Smrg 		      i5 = ii + uisec - 1;
2771*b1e83836Smrg 		      for (i = ii; i <= i5; i += 4)
2772*b1e83836Smrg 			{
2773*b1e83836Smrg 			  f11 = c[i + j * c_dim1];
2774*b1e83836Smrg 			  f21 = c[i + 1 + j * c_dim1];
2775*b1e83836Smrg 			  f12 = c[i + (j + 1) * c_dim1];
2776*b1e83836Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2777*b1e83836Smrg 			  f13 = c[i + (j + 2) * c_dim1];
2778*b1e83836Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2779*b1e83836Smrg 			  f14 = c[i + (j + 3) * c_dim1];
2780*b1e83836Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2781*b1e83836Smrg 			  f31 = c[i + 2 + j * c_dim1];
2782*b1e83836Smrg 			  f41 = c[i + 3 + j * c_dim1];
2783*b1e83836Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2784*b1e83836Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2785*b1e83836Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2786*b1e83836Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2787*b1e83836Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2788*b1e83836Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2789*b1e83836Smrg 			  i6 = ll + lsec - 1;
2790*b1e83836Smrg 			  for (l = ll; l <= i6; ++l)
2791*b1e83836Smrg 			    {
2792*b1e83836Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2793*b1e83836Smrg 				      * b[l + j * b_dim1];
2794*b1e83836Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2795*b1e83836Smrg 				      * b[l + j * b_dim1];
2796*b1e83836Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2797*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
2798*b1e83836Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2799*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
2800*b1e83836Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2801*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
2802*b1e83836Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2803*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
2804*b1e83836Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2805*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
2806*b1e83836Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2807*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
2808*b1e83836Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2809*b1e83836Smrg 				      * b[l + j * b_dim1];
2810*b1e83836Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2811*b1e83836Smrg 				      * b[l + j * b_dim1];
2812*b1e83836Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2813*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
2814*b1e83836Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2815*b1e83836Smrg 				      * b[l + (j + 1) * b_dim1];
2816*b1e83836Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2817*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
2818*b1e83836Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2819*b1e83836Smrg 				      * b[l + (j + 2) * b_dim1];
2820*b1e83836Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2821*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
2822*b1e83836Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2823*b1e83836Smrg 				      * b[l + (j + 3) * b_dim1];
2824*b1e83836Smrg 			    }
2825*b1e83836Smrg 			  c[i + j * c_dim1] = f11;
2826*b1e83836Smrg 			  c[i + 1 + j * c_dim1] = f21;
2827*b1e83836Smrg 			  c[i + (j + 1) * c_dim1] = f12;
2828*b1e83836Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2829*b1e83836Smrg 			  c[i + (j + 2) * c_dim1] = f13;
2830*b1e83836Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2831*b1e83836Smrg 			  c[i + (j + 3) * c_dim1] = f14;
2832*b1e83836Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2833*b1e83836Smrg 			  c[i + 2 + j * c_dim1] = f31;
2834*b1e83836Smrg 			  c[i + 3 + j * c_dim1] = f41;
2835*b1e83836Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2836*b1e83836Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2837*b1e83836Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2838*b1e83836Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2839*b1e83836Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2840*b1e83836Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2841*b1e83836Smrg 			}
2842*b1e83836Smrg 		      if (uisec < isec)
2843*b1e83836Smrg 			{
2844*b1e83836Smrg 			  i5 = ii + isec - 1;
2845*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2846*b1e83836Smrg 			    {
2847*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
2848*b1e83836Smrg 			      f12 = c[i + (j + 1) * c_dim1];
2849*b1e83836Smrg 			      f13 = c[i + (j + 2) * c_dim1];
2850*b1e83836Smrg 			      f14 = c[i + (j + 3) * c_dim1];
2851*b1e83836Smrg 			      i6 = ll + lsec - 1;
2852*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
2853*b1e83836Smrg 				{
2854*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2855*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2856*b1e83836Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2857*b1e83836Smrg 					  257] * b[l + (j + 1) * b_dim1];
2858*b1e83836Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2859*b1e83836Smrg 					  257] * b[l + (j + 2) * b_dim1];
2860*b1e83836Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2861*b1e83836Smrg 					  257] * b[l + (j + 3) * b_dim1];
2862*b1e83836Smrg 				}
2863*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
2864*b1e83836Smrg 			      c[i + (j + 1) * c_dim1] = f12;
2865*b1e83836Smrg 			      c[i + (j + 2) * c_dim1] = f13;
2866*b1e83836Smrg 			      c[i + (j + 3) * c_dim1] = f14;
2867*b1e83836Smrg 			    }
2868*b1e83836Smrg 			}
2869*b1e83836Smrg 		    }
2870*b1e83836Smrg 		  if (ujsec < jsec)
2871*b1e83836Smrg 		    {
2872*b1e83836Smrg 		      i4 = jj + jsec - 1;
2873*b1e83836Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
2874*b1e83836Smrg 			{
2875*b1e83836Smrg 			  i5 = ii + uisec - 1;
2876*b1e83836Smrg 			  for (i = ii; i <= i5; i += 4)
2877*b1e83836Smrg 			    {
2878*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
2879*b1e83836Smrg 			      f21 = c[i + 1 + j * c_dim1];
2880*b1e83836Smrg 			      f31 = c[i + 2 + j * c_dim1];
2881*b1e83836Smrg 			      f41 = c[i + 3 + j * c_dim1];
2882*b1e83836Smrg 			      i6 = ll + lsec - 1;
2883*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
2884*b1e83836Smrg 				{
2885*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2886*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2887*b1e83836Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2888*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2889*b1e83836Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2890*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2891*b1e83836Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2892*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2893*b1e83836Smrg 				}
2894*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
2895*b1e83836Smrg 			      c[i + 1 + j * c_dim1] = f21;
2896*b1e83836Smrg 			      c[i + 2 + j * c_dim1] = f31;
2897*b1e83836Smrg 			      c[i + 3 + j * c_dim1] = f41;
2898*b1e83836Smrg 			    }
2899*b1e83836Smrg 			  i5 = ii + isec - 1;
2900*b1e83836Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2901*b1e83836Smrg 			    {
2902*b1e83836Smrg 			      f11 = c[i + j * c_dim1];
2903*b1e83836Smrg 			      i6 = ll + lsec - 1;
2904*b1e83836Smrg 			      for (l = ll; l <= i6; ++l)
2905*b1e83836Smrg 				{
2906*b1e83836Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2907*b1e83836Smrg 					  257] * b[l + j * b_dim1];
2908*b1e83836Smrg 				}
2909*b1e83836Smrg 			      c[i + j * c_dim1] = f11;
2910*b1e83836Smrg 			    }
2911*b1e83836Smrg 			}
2912*b1e83836Smrg 		    }
2913*b1e83836Smrg 		}
2914*b1e83836Smrg 	    }
2915*b1e83836Smrg 	}
2916*b1e83836Smrg       free(t1);
2917*b1e83836Smrg       return;
2918*b1e83836Smrg     }
2919*b1e83836Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2920*b1e83836Smrg     {
2921*b1e83836Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
2922*b1e83836Smrg 	{
2923*b1e83836Smrg 	  const GFC_REAL_17 *restrict abase_x;
2924*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
2925*b1e83836Smrg 	  GFC_REAL_17 *restrict dest_y;
2926*b1e83836Smrg 	  GFC_REAL_17 s;
2927*b1e83836Smrg 
2928*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
2929*b1e83836Smrg 	    {
2930*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
2931*b1e83836Smrg 	      dest_y = &dest[y*rystride];
2932*b1e83836Smrg 	      for (x = 0; x < xcount; x++)
2933*b1e83836Smrg 		{
2934*b1e83836Smrg 		  abase_x = &abase[x*axstride];
2935*b1e83836Smrg 		  s = (GFC_REAL_17) 0;
2936*b1e83836Smrg 		  for (n = 0; n < count; n++)
2937*b1e83836Smrg 		    s += abase_x[n] * bbase_y[n];
2938*b1e83836Smrg 		  dest_y[x] = s;
2939*b1e83836Smrg 		}
2940*b1e83836Smrg 	    }
2941*b1e83836Smrg 	}
2942*b1e83836Smrg       else
2943*b1e83836Smrg 	{
2944*b1e83836Smrg 	  const GFC_REAL_17 *restrict bbase_y;
2945*b1e83836Smrg 	  GFC_REAL_17 s;
2946*b1e83836Smrg 
2947*b1e83836Smrg 	  for (y = 0; y < ycount; y++)
2948*b1e83836Smrg 	    {
2949*b1e83836Smrg 	      bbase_y = &bbase[y*bystride];
2950*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
2951*b1e83836Smrg 	      for (n = 0; n < count; n++)
2952*b1e83836Smrg 		s += abase[n*axstride] * bbase_y[n];
2953*b1e83836Smrg 	      dest[y*rystride] = s;
2954*b1e83836Smrg 	    }
2955*b1e83836Smrg 	}
2956*b1e83836Smrg     }
2957*b1e83836Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2958*b1e83836Smrg     {
2959*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
2960*b1e83836Smrg       GFC_REAL_17 s;
2961*b1e83836Smrg 
2962*b1e83836Smrg       for (y = 0; y < ycount; y++)
2963*b1e83836Smrg 	{
2964*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
2965*b1e83836Smrg 	  s = (GFC_REAL_17) 0;
2966*b1e83836Smrg 	  for (n = 0; n < count; n++)
2967*b1e83836Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2968*b1e83836Smrg 	  dest[y*rxstride] = s;
2969*b1e83836Smrg 	}
2970*b1e83836Smrg     }
2971*b1e83836Smrg   else if (axstride < aystride)
2972*b1e83836Smrg     {
2973*b1e83836Smrg       for (y = 0; y < ycount; y++)
2974*b1e83836Smrg 	for (x = 0; x < xcount; x++)
2975*b1e83836Smrg 	  dest[x*rxstride + y*rystride] = (GFC_REAL_17)0;
2976*b1e83836Smrg 
2977*b1e83836Smrg       for (y = 0; y < ycount; y++)
2978*b1e83836Smrg 	for (n = 0; n < count; n++)
2979*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
2980*b1e83836Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
2981*b1e83836Smrg 	    dest[x*rxstride + y*rystride] +=
2982*b1e83836Smrg 					abase[x*axstride + n*aystride] *
2983*b1e83836Smrg 					bbase[n*bxstride + y*bystride];
2984*b1e83836Smrg     }
2985*b1e83836Smrg   else
2986*b1e83836Smrg     {
2987*b1e83836Smrg       const GFC_REAL_17 *restrict abase_x;
2988*b1e83836Smrg       const GFC_REAL_17 *restrict bbase_y;
2989*b1e83836Smrg       GFC_REAL_17 *restrict dest_y;
2990*b1e83836Smrg       GFC_REAL_17 s;
2991*b1e83836Smrg 
2992*b1e83836Smrg       for (y = 0; y < ycount; y++)
2993*b1e83836Smrg 	{
2994*b1e83836Smrg 	  bbase_y = &bbase[y*bystride];
2995*b1e83836Smrg 	  dest_y = &dest[y*rystride];
2996*b1e83836Smrg 	  for (x = 0; x < xcount; x++)
2997*b1e83836Smrg 	    {
2998*b1e83836Smrg 	      abase_x = &abase[x*axstride];
2999*b1e83836Smrg 	      s = (GFC_REAL_17) 0;
3000*b1e83836Smrg 	      for (n = 0; n < count; n++)
3001*b1e83836Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
3002*b1e83836Smrg 	      dest_y[x*rxstride] = s;
3003*b1e83836Smrg 	    }
3004*b1e83836Smrg 	}
3005*b1e83836Smrg     }
3006*b1e83836Smrg }
3007*b1e83836Smrg #undef POW3
3008*b1e83836Smrg #undef min
3009*b1e83836Smrg #undef max
3010*b1e83836Smrg 
3011*b1e83836Smrg #endif
3012*b1e83836Smrg #endif
3013*b1e83836Smrg 
3014