xref: /netbsd-src/external/gpl3/gcc/dist/libgfortran/generated/matmul_i16.c (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1181254a7Smrg /* Implementation of the MATMUL intrinsic
2*b1e83836Smrg    Copyright (C) 2002-2022 Free Software Foundation, Inc.
3181254a7Smrg    Contributed by Paul Brook <paul@nowt.org>
4181254a7Smrg 
5181254a7Smrg This file is part of the GNU Fortran runtime library (libgfortran).
6181254a7Smrg 
7181254a7Smrg Libgfortran is free software; you can redistribute it and/or
8181254a7Smrg modify it under the terms of the GNU General Public
9181254a7Smrg License as published by the Free Software Foundation; either
10181254a7Smrg version 3 of the License, or (at your option) any later version.
11181254a7Smrg 
12181254a7Smrg Libgfortran is distributed in the hope that it will be useful,
13181254a7Smrg but WITHOUT ANY WARRANTY; without even the implied warranty of
14181254a7Smrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15181254a7Smrg GNU General Public License for more details.
16181254a7Smrg 
17181254a7Smrg Under Section 7 of GPL version 3, you are granted additional
18181254a7Smrg permissions described in the GCC Runtime Library Exception, version
19181254a7Smrg 3.1, as published by the Free Software Foundation.
20181254a7Smrg 
21181254a7Smrg You should have received a copy of the GNU General Public License and
22181254a7Smrg a copy of the GCC Runtime Library Exception along with this program;
23181254a7Smrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24181254a7Smrg <http://www.gnu.org/licenses/>.  */
25181254a7Smrg 
26181254a7Smrg #include "libgfortran.h"
27181254a7Smrg #include <string.h>
28181254a7Smrg #include <assert.h>
29181254a7Smrg 
30181254a7Smrg 
31181254a7Smrg #if defined (HAVE_GFC_INTEGER_16)
32181254a7Smrg 
33181254a7Smrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34181254a7Smrg    passed to us by the front-end, in which case we call it for large
35181254a7Smrg    matrices.  */
36181254a7Smrg 
37181254a7Smrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38181254a7Smrg                           const int *, const GFC_INTEGER_16 *, const GFC_INTEGER_16 *,
39181254a7Smrg                           const int *, const GFC_INTEGER_16 *, const int *,
40181254a7Smrg                           const GFC_INTEGER_16 *, GFC_INTEGER_16 *, const int *,
41181254a7Smrg                           int, int);
42181254a7Smrg 
43181254a7Smrg /* The order of loops is different in the case of plain matrix
44181254a7Smrg    multiplication C=MATMUL(A,B), and in the frequent special case where
45181254a7Smrg    the argument A is the temporary result of a TRANSPOSE intrinsic:
46181254a7Smrg    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
47181254a7Smrg    looking at their strides.
48181254a7Smrg 
49181254a7Smrg    The equivalent Fortran pseudo-code is:
50181254a7Smrg 
51181254a7Smrg    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52181254a7Smrg    IF (.NOT.IS_TRANSPOSED(A)) THEN
53181254a7Smrg      C = 0
54181254a7Smrg      DO J=1,N
55181254a7Smrg        DO K=1,COUNT
56181254a7Smrg          DO I=1,M
57181254a7Smrg            C(I,J) = C(I,J)+A(I,K)*B(K,J)
58181254a7Smrg    ELSE
59181254a7Smrg      DO J=1,N
60181254a7Smrg        DO I=1,M
61181254a7Smrg          S = 0
62181254a7Smrg          DO K=1,COUNT
63181254a7Smrg            S = S+A(I,K)*B(K,J)
64181254a7Smrg          C(I,J) = S
65181254a7Smrg    ENDIF
66181254a7Smrg */
67181254a7Smrg 
68181254a7Smrg /* If try_blas is set to a nonzero value, then the matmul function will
69181254a7Smrg    see if there is a way to perform the matrix multiplication by a call
70181254a7Smrg    to the BLAS gemm function.  */
71181254a7Smrg 
72181254a7Smrg extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
73181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
74181254a7Smrg 	int blas_limit, blas_call gemm);
75181254a7Smrg export_proto(matmul_i16);
76181254a7Smrg 
77181254a7Smrg /* Put exhaustive list of possible architectures here here, ORed together.  */
78181254a7Smrg 
79181254a7Smrg #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80181254a7Smrg 
81181254a7Smrg #ifdef HAVE_AVX
82181254a7Smrg static void
83181254a7Smrg matmul_i16_avx (gfc_array_i16 * const restrict retarray,
84181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
85181254a7Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86181254a7Smrg static void
matmul_i16_avx(gfc_array_i16 * const restrict retarray,gfc_array_i16 * const restrict a,gfc_array_i16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)87181254a7Smrg matmul_i16_avx (gfc_array_i16 * const restrict retarray,
88181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
89181254a7Smrg 	int blas_limit, blas_call gemm)
90181254a7Smrg {
91181254a7Smrg   const GFC_INTEGER_16 * restrict abase;
92181254a7Smrg   const GFC_INTEGER_16 * restrict bbase;
93181254a7Smrg   GFC_INTEGER_16 * restrict dest;
94181254a7Smrg 
95181254a7Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96181254a7Smrg   index_type x, y, n, count, xcount, ycount;
97181254a7Smrg 
98181254a7Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
99181254a7Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
100181254a7Smrg 
101181254a7Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102181254a7Smrg 
103181254a7Smrg    Either A or B (but not both) can be rank 1:
104181254a7Smrg 
105181254a7Smrg    o One-dimensional argument A is implicitly treated as a row matrix
106181254a7Smrg      dimensioned [1,count], so xcount=1.
107181254a7Smrg 
108181254a7Smrg    o One-dimensional argument B is implicitly treated as a column matrix
109181254a7Smrg      dimensioned [count, 1], so ycount=1.
110181254a7Smrg */
111181254a7Smrg 
112181254a7Smrg   if (retarray->base_addr == NULL)
113181254a7Smrg     {
114181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
115181254a7Smrg         {
116181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
117181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118181254a7Smrg         }
119181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
120181254a7Smrg         {
121181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
122181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123181254a7Smrg         }
124181254a7Smrg       else
125181254a7Smrg         {
126181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
127181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128181254a7Smrg 
129181254a7Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
130181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131181254a7Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
132181254a7Smrg         }
133181254a7Smrg 
134181254a7Smrg       retarray->base_addr
135181254a7Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
136181254a7Smrg       retarray->offset = 0;
137181254a7Smrg     }
138181254a7Smrg   else if (unlikely (compile_options.bounds_check))
139181254a7Smrg     {
140181254a7Smrg       index_type ret_extent, arg_extent;
141181254a7Smrg 
142181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
143181254a7Smrg 	{
144181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146181254a7Smrg 	  if (arg_extent != ret_extent)
147181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
148181254a7Smrg 	    		   "array (%ld/%ld) ",
149181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
150181254a7Smrg 	}
151181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
152181254a7Smrg 	{
153181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155181254a7Smrg 	  if (arg_extent != ret_extent)
156181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
157181254a7Smrg 	    		   "array (%ld/%ld) ",
158181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
159181254a7Smrg 	}
160181254a7Smrg       else
161181254a7Smrg 	{
162181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164181254a7Smrg 	  if (arg_extent != ret_extent)
165181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
166181254a7Smrg 	    		   "array (%ld/%ld) ",
167181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
168181254a7Smrg 
169181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171181254a7Smrg 	  if (arg_extent != ret_extent)
172181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
173181254a7Smrg 	    		   "array (%ld/%ld) ",
174181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
175181254a7Smrg 	}
176181254a7Smrg     }
177181254a7Smrg 
178181254a7Smrg 
179181254a7Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180181254a7Smrg     {
181181254a7Smrg       /* One-dimensional result may be addressed in the code below
182181254a7Smrg 	 either as a row or a column matrix. We want both cases to
183181254a7Smrg 	 work. */
184181254a7Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185181254a7Smrg     }
186181254a7Smrg   else
187181254a7Smrg     {
188181254a7Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189181254a7Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190181254a7Smrg     }
191181254a7Smrg 
192181254a7Smrg 
193181254a7Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
194181254a7Smrg     {
195181254a7Smrg       /* Treat it as a a row matrix A[1,count]. */
196181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197181254a7Smrg       aystride = 1;
198181254a7Smrg 
199181254a7Smrg       xcount = 1;
200181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
201181254a7Smrg     }
202181254a7Smrg   else
203181254a7Smrg     {
204181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205181254a7Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206181254a7Smrg 
207181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
208181254a7Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209181254a7Smrg     }
210181254a7Smrg 
211181254a7Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212181254a7Smrg     {
213181254a7Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214181254a7Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215181254a7Smrg 		       "in dimension 1: is %ld, should be %ld",
216181254a7Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217181254a7Smrg     }
218181254a7Smrg 
219181254a7Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
220181254a7Smrg     {
221181254a7Smrg       /* Treat it as a column matrix B[count,1] */
222181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223181254a7Smrg 
224181254a7Smrg       /* bystride should never be used for 1-dimensional b.
225181254a7Smrg          The value is only used for calculation of the
226181254a7Smrg          memory by the buffer.  */
227181254a7Smrg       bystride = 256;
228181254a7Smrg       ycount = 1;
229181254a7Smrg     }
230181254a7Smrg   else
231181254a7Smrg     {
232181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233181254a7Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234181254a7Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235181254a7Smrg     }
236181254a7Smrg 
237181254a7Smrg   abase = a->base_addr;
238181254a7Smrg   bbase = b->base_addr;
239181254a7Smrg   dest = retarray->base_addr;
240181254a7Smrg 
241181254a7Smrg   /* Now that everything is set up, we perform the multiplication
242181254a7Smrg      itself.  */
243181254a7Smrg 
244181254a7Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245181254a7Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
246181254a7Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
247181254a7Smrg 
248181254a7Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249181254a7Smrg       && (bxstride == 1 || bystride == 1)
250181254a7Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
251181254a7Smrg           > POW3(blas_limit)))
252181254a7Smrg     {
253181254a7Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
254181254a7Smrg       const GFC_INTEGER_16 one = 1, zero = 0;
255181254a7Smrg       const int lda = (axstride == 1) ? aystride : axstride,
256181254a7Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
257181254a7Smrg 
258181254a7Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259181254a7Smrg 	{
260181254a7Smrg 	  assert (gemm != NULL);
261181254a7Smrg 	  const char *transa, *transb;
262181254a7Smrg 	  if (try_blas & 2)
263181254a7Smrg 	    transa = "C";
264181254a7Smrg 	  else
265181254a7Smrg 	    transa = axstride == 1 ? "N" : "T";
266181254a7Smrg 
267181254a7Smrg 	  if (try_blas & 4)
268181254a7Smrg 	    transb = "C";
269181254a7Smrg 	  else
270181254a7Smrg 	    transb = bxstride == 1 ? "N" : "T";
271181254a7Smrg 
272181254a7Smrg 	  gemm (transa, transb , &m,
273181254a7Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
274181254a7Smrg 		&ldc, 1, 1);
275181254a7Smrg 	  return;
276181254a7Smrg 	}
277181254a7Smrg     }
278181254a7Smrg 
279fb8a8121Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
280fb8a8121Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
281181254a7Smrg     {
282181254a7Smrg       /* This block of code implements a tuned matmul, derived from
283181254a7Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
284181254a7Smrg 
285181254a7Smrg                Bo Kagstrom and Per Ling
286181254a7Smrg                Department of Computing Science
287181254a7Smrg                Umea University
288181254a7Smrg                S-901 87 Umea, Sweden
289181254a7Smrg 
290181254a7Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
291181254a7Smrg 
292181254a7Smrg       const GFC_INTEGER_16 *a, *b;
293181254a7Smrg       GFC_INTEGER_16 *c;
294181254a7Smrg       const index_type m = xcount, n = ycount, k = count;
295181254a7Smrg 
296181254a7Smrg       /* System generated locals */
297181254a7Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
298181254a7Smrg 		 i1, i2, i3, i4, i5, i6;
299181254a7Smrg 
300181254a7Smrg       /* Local variables */
301181254a7Smrg       GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
302181254a7Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
303181254a7Smrg       index_type i, j, l, ii, jj, ll;
304181254a7Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
305181254a7Smrg       GFC_INTEGER_16 *t1;
306181254a7Smrg 
307181254a7Smrg       a = abase;
308181254a7Smrg       b = bbase;
309181254a7Smrg       c = retarray->base_addr;
310181254a7Smrg 
311181254a7Smrg       /* Parameter adjustments */
312181254a7Smrg       c_dim1 = rystride;
313181254a7Smrg       c_offset = 1 + c_dim1;
314181254a7Smrg       c -= c_offset;
315181254a7Smrg       a_dim1 = aystride;
316181254a7Smrg       a_offset = 1 + a_dim1;
317181254a7Smrg       a -= a_offset;
318181254a7Smrg       b_dim1 = bystride;
319181254a7Smrg       b_offset = 1 + b_dim1;
320181254a7Smrg       b -= b_offset;
321181254a7Smrg 
322181254a7Smrg       /* Empty c first.  */
323181254a7Smrg       for (j=1; j<=n; j++)
324181254a7Smrg 	for (i=1; i<=m; i++)
325181254a7Smrg 	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
326181254a7Smrg 
327181254a7Smrg       /* Early exit if possible */
328181254a7Smrg       if (m == 0 || n == 0 || k == 0)
329181254a7Smrg 	return;
330181254a7Smrg 
331181254a7Smrg       /* Adjust size of t1 to what is needed.  */
332181254a7Smrg       index_type t1_dim, a_sz;
333181254a7Smrg       if (aystride == 1)
334181254a7Smrg         a_sz = rystride;
335181254a7Smrg       else
336181254a7Smrg         a_sz = a_dim1;
337181254a7Smrg 
338181254a7Smrg       t1_dim = a_sz * 256 + b_dim1;
339181254a7Smrg       if (t1_dim > 65536)
340181254a7Smrg 	t1_dim = 65536;
341181254a7Smrg 
342181254a7Smrg       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
343181254a7Smrg 
344181254a7Smrg       /* Start turning the crank. */
345181254a7Smrg       i1 = n;
346181254a7Smrg       for (jj = 1; jj <= i1; jj += 512)
347181254a7Smrg 	{
348181254a7Smrg 	  /* Computing MIN */
349181254a7Smrg 	  i2 = 512;
350181254a7Smrg 	  i3 = n - jj + 1;
351181254a7Smrg 	  jsec = min(i2,i3);
352181254a7Smrg 	  ujsec = jsec - jsec % 4;
353181254a7Smrg 	  i2 = k;
354181254a7Smrg 	  for (ll = 1; ll <= i2; ll += 256)
355181254a7Smrg 	    {
356181254a7Smrg 	      /* Computing MIN */
357181254a7Smrg 	      i3 = 256;
358181254a7Smrg 	      i4 = k - ll + 1;
359181254a7Smrg 	      lsec = min(i3,i4);
360181254a7Smrg 	      ulsec = lsec - lsec % 2;
361181254a7Smrg 
362181254a7Smrg 	      i3 = m;
363181254a7Smrg 	      for (ii = 1; ii <= i3; ii += 256)
364181254a7Smrg 		{
365181254a7Smrg 		  /* Computing MIN */
366181254a7Smrg 		  i4 = 256;
367181254a7Smrg 		  i5 = m - ii + 1;
368181254a7Smrg 		  isec = min(i4,i5);
369181254a7Smrg 		  uisec = isec - isec % 2;
370181254a7Smrg 		  i4 = ll + ulsec - 1;
371181254a7Smrg 		  for (l = ll; l <= i4; l += 2)
372181254a7Smrg 		    {
373181254a7Smrg 		      i5 = ii + uisec - 1;
374181254a7Smrg 		      for (i = ii; i <= i5; i += 2)
375181254a7Smrg 			{
376181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
377181254a7Smrg 					a[i + l * a_dim1];
378181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
379181254a7Smrg 					a[i + (l + 1) * a_dim1];
380181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
381181254a7Smrg 					a[i + 1 + l * a_dim1];
382181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
383181254a7Smrg 					a[i + 1 + (l + 1) * a_dim1];
384181254a7Smrg 			}
385181254a7Smrg 		      if (uisec < isec)
386181254a7Smrg 			{
387181254a7Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
388181254a7Smrg 				    a[ii + isec - 1 + l * a_dim1];
389181254a7Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
390181254a7Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
391181254a7Smrg 			}
392181254a7Smrg 		    }
393181254a7Smrg 		  if (ulsec < lsec)
394181254a7Smrg 		    {
395181254a7Smrg 		      i4 = ii + isec - 1;
396181254a7Smrg 		      for (i = ii; i<= i4; ++i)
397181254a7Smrg 			{
398181254a7Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
399181254a7Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
400181254a7Smrg 			}
401181254a7Smrg 		    }
402181254a7Smrg 
403181254a7Smrg 		  uisec = isec - isec % 4;
404181254a7Smrg 		  i4 = jj + ujsec - 1;
405181254a7Smrg 		  for (j = jj; j <= i4; j += 4)
406181254a7Smrg 		    {
407181254a7Smrg 		      i5 = ii + uisec - 1;
408181254a7Smrg 		      for (i = ii; i <= i5; i += 4)
409181254a7Smrg 			{
410181254a7Smrg 			  f11 = c[i + j * c_dim1];
411181254a7Smrg 			  f21 = c[i + 1 + j * c_dim1];
412181254a7Smrg 			  f12 = c[i + (j + 1) * c_dim1];
413181254a7Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
414181254a7Smrg 			  f13 = c[i + (j + 2) * c_dim1];
415181254a7Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
416181254a7Smrg 			  f14 = c[i + (j + 3) * c_dim1];
417181254a7Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
418181254a7Smrg 			  f31 = c[i + 2 + j * c_dim1];
419181254a7Smrg 			  f41 = c[i + 3 + j * c_dim1];
420181254a7Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
421181254a7Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
422181254a7Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
423181254a7Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
424181254a7Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
425181254a7Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
426181254a7Smrg 			  i6 = ll + lsec - 1;
427181254a7Smrg 			  for (l = ll; l <= i6; ++l)
428181254a7Smrg 			    {
429181254a7Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430181254a7Smrg 				      * b[l + j * b_dim1];
431181254a7Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432181254a7Smrg 				      * b[l + j * b_dim1];
433181254a7Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
434181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
435181254a7Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
436181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
437181254a7Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
438181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
439181254a7Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
440181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
441181254a7Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
442181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
443181254a7Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
444181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
445181254a7Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446181254a7Smrg 				      * b[l + j * b_dim1];
447181254a7Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448181254a7Smrg 				      * b[l + j * b_dim1];
449181254a7Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
450181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
451181254a7Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
452181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
453181254a7Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
454181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
455181254a7Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
456181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
457181254a7Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
458181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
459181254a7Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
460181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
461181254a7Smrg 			    }
462181254a7Smrg 			  c[i + j * c_dim1] = f11;
463181254a7Smrg 			  c[i + 1 + j * c_dim1] = f21;
464181254a7Smrg 			  c[i + (j + 1) * c_dim1] = f12;
465181254a7Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
466181254a7Smrg 			  c[i + (j + 2) * c_dim1] = f13;
467181254a7Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
468181254a7Smrg 			  c[i + (j + 3) * c_dim1] = f14;
469181254a7Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
470181254a7Smrg 			  c[i + 2 + j * c_dim1] = f31;
471181254a7Smrg 			  c[i + 3 + j * c_dim1] = f41;
472181254a7Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
473181254a7Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
474181254a7Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
475181254a7Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
476181254a7Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
477181254a7Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
478181254a7Smrg 			}
479181254a7Smrg 		      if (uisec < isec)
480181254a7Smrg 			{
481181254a7Smrg 			  i5 = ii + isec - 1;
482181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
483181254a7Smrg 			    {
484181254a7Smrg 			      f11 = c[i + j * c_dim1];
485181254a7Smrg 			      f12 = c[i + (j + 1) * c_dim1];
486181254a7Smrg 			      f13 = c[i + (j + 2) * c_dim1];
487181254a7Smrg 			      f14 = c[i + (j + 3) * c_dim1];
488181254a7Smrg 			      i6 = ll + lsec - 1;
489181254a7Smrg 			      for (l = ll; l <= i6; ++l)
490181254a7Smrg 				{
491181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
492181254a7Smrg 					  257] * b[l + j * b_dim1];
493181254a7Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
494181254a7Smrg 					  257] * b[l + (j + 1) * b_dim1];
495181254a7Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
496181254a7Smrg 					  257] * b[l + (j + 2) * b_dim1];
497181254a7Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498181254a7Smrg 					  257] * b[l + (j + 3) * b_dim1];
499181254a7Smrg 				}
500181254a7Smrg 			      c[i + j * c_dim1] = f11;
501181254a7Smrg 			      c[i + (j + 1) * c_dim1] = f12;
502181254a7Smrg 			      c[i + (j + 2) * c_dim1] = f13;
503181254a7Smrg 			      c[i + (j + 3) * c_dim1] = f14;
504181254a7Smrg 			    }
505181254a7Smrg 			}
506181254a7Smrg 		    }
507181254a7Smrg 		  if (ujsec < jsec)
508181254a7Smrg 		    {
509181254a7Smrg 		      i4 = jj + jsec - 1;
510181254a7Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
511181254a7Smrg 			{
512181254a7Smrg 			  i5 = ii + uisec - 1;
513181254a7Smrg 			  for (i = ii; i <= i5; i += 4)
514181254a7Smrg 			    {
515181254a7Smrg 			      f11 = c[i + j * c_dim1];
516181254a7Smrg 			      f21 = c[i + 1 + j * c_dim1];
517181254a7Smrg 			      f31 = c[i + 2 + j * c_dim1];
518181254a7Smrg 			      f41 = c[i + 3 + j * c_dim1];
519181254a7Smrg 			      i6 = ll + lsec - 1;
520181254a7Smrg 			      for (l = ll; l <= i6; ++l)
521181254a7Smrg 				{
522181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
523181254a7Smrg 					  257] * b[l + j * b_dim1];
524181254a7Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
525181254a7Smrg 					  257] * b[l + j * b_dim1];
526181254a7Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
527181254a7Smrg 					  257] * b[l + j * b_dim1];
528181254a7Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
529181254a7Smrg 					  257] * b[l + j * b_dim1];
530181254a7Smrg 				}
531181254a7Smrg 			      c[i + j * c_dim1] = f11;
532181254a7Smrg 			      c[i + 1 + j * c_dim1] = f21;
533181254a7Smrg 			      c[i + 2 + j * c_dim1] = f31;
534181254a7Smrg 			      c[i + 3 + j * c_dim1] = f41;
535181254a7Smrg 			    }
536181254a7Smrg 			  i5 = ii + isec - 1;
537181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
538181254a7Smrg 			    {
539181254a7Smrg 			      f11 = c[i + j * c_dim1];
540181254a7Smrg 			      i6 = ll + lsec - 1;
541181254a7Smrg 			      for (l = ll; l <= i6; ++l)
542181254a7Smrg 				{
543181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
544181254a7Smrg 					  257] * b[l + j * b_dim1];
545181254a7Smrg 				}
546181254a7Smrg 			      c[i + j * c_dim1] = f11;
547181254a7Smrg 			    }
548181254a7Smrg 			}
549181254a7Smrg 		    }
550181254a7Smrg 		}
551181254a7Smrg 	    }
552181254a7Smrg 	}
553181254a7Smrg       free(t1);
554181254a7Smrg       return;
555181254a7Smrg     }
556181254a7Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
557181254a7Smrg     {
558181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
559181254a7Smrg 	{
560181254a7Smrg 	  const GFC_INTEGER_16 *restrict abase_x;
561181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
562181254a7Smrg 	  GFC_INTEGER_16 *restrict dest_y;
563181254a7Smrg 	  GFC_INTEGER_16 s;
564181254a7Smrg 
565181254a7Smrg 	  for (y = 0; y < ycount; y++)
566181254a7Smrg 	    {
567181254a7Smrg 	      bbase_y = &bbase[y*bystride];
568181254a7Smrg 	      dest_y = &dest[y*rystride];
569181254a7Smrg 	      for (x = 0; x < xcount; x++)
570181254a7Smrg 		{
571181254a7Smrg 		  abase_x = &abase[x*axstride];
572181254a7Smrg 		  s = (GFC_INTEGER_16) 0;
573181254a7Smrg 		  for (n = 0; n < count; n++)
574181254a7Smrg 		    s += abase_x[n] * bbase_y[n];
575181254a7Smrg 		  dest_y[x] = s;
576181254a7Smrg 		}
577181254a7Smrg 	    }
578181254a7Smrg 	}
579181254a7Smrg       else
580181254a7Smrg 	{
581181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
582181254a7Smrg 	  GFC_INTEGER_16 s;
583181254a7Smrg 
584181254a7Smrg 	  for (y = 0; y < ycount; y++)
585181254a7Smrg 	    {
586181254a7Smrg 	      bbase_y = &bbase[y*bystride];
587181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
588181254a7Smrg 	      for (n = 0; n < count; n++)
589181254a7Smrg 		s += abase[n*axstride] * bbase_y[n];
590181254a7Smrg 	      dest[y*rystride] = s;
591181254a7Smrg 	    }
592181254a7Smrg 	}
593181254a7Smrg     }
594181254a7Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
595181254a7Smrg     {
596181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
597181254a7Smrg       GFC_INTEGER_16 s;
598181254a7Smrg 
599181254a7Smrg       for (y = 0; y < ycount; y++)
600181254a7Smrg 	{
601181254a7Smrg 	  bbase_y = &bbase[y*bystride];
602181254a7Smrg 	  s = (GFC_INTEGER_16) 0;
603181254a7Smrg 	  for (n = 0; n < count; n++)
604181254a7Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
605181254a7Smrg 	  dest[y*rxstride] = s;
606181254a7Smrg 	}
607181254a7Smrg     }
608fb8a8121Smrg   else if (axstride < aystride)
609fb8a8121Smrg     {
610fb8a8121Smrg       for (y = 0; y < ycount; y++)
611fb8a8121Smrg 	for (x = 0; x < xcount; x++)
612fb8a8121Smrg 	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
613fb8a8121Smrg 
614fb8a8121Smrg       for (y = 0; y < ycount; y++)
615fb8a8121Smrg 	for (n = 0; n < count; n++)
616fb8a8121Smrg 	  for (x = 0; x < xcount; x++)
617fb8a8121Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
618fb8a8121Smrg 	    dest[x*rxstride + y*rystride] +=
619fb8a8121Smrg 					abase[x*axstride + n*aystride] *
620fb8a8121Smrg 					bbase[n*bxstride + y*bystride];
621fb8a8121Smrg     }
622181254a7Smrg   else
623181254a7Smrg     {
624181254a7Smrg       const GFC_INTEGER_16 *restrict abase_x;
625181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
626181254a7Smrg       GFC_INTEGER_16 *restrict dest_y;
627181254a7Smrg       GFC_INTEGER_16 s;
628181254a7Smrg 
629181254a7Smrg       for (y = 0; y < ycount; y++)
630181254a7Smrg 	{
631181254a7Smrg 	  bbase_y = &bbase[y*bystride];
632181254a7Smrg 	  dest_y = &dest[y*rystride];
633181254a7Smrg 	  for (x = 0; x < xcount; x++)
634181254a7Smrg 	    {
635181254a7Smrg 	      abase_x = &abase[x*axstride];
636181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
637181254a7Smrg 	      for (n = 0; n < count; n++)
638181254a7Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
639181254a7Smrg 	      dest_y[x*rxstride] = s;
640181254a7Smrg 	    }
641181254a7Smrg 	}
642181254a7Smrg     }
643181254a7Smrg }
644181254a7Smrg #undef POW3
645181254a7Smrg #undef min
646181254a7Smrg #undef max
647181254a7Smrg 
648181254a7Smrg #endif /* HAVE_AVX */
649181254a7Smrg 
650181254a7Smrg #ifdef HAVE_AVX2
651181254a7Smrg static void
652181254a7Smrg matmul_i16_avx2 (gfc_array_i16 * const restrict retarray,
653181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
654181254a7Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
655181254a7Smrg static void
matmul_i16_avx2(gfc_array_i16 * const restrict retarray,gfc_array_i16 * const restrict a,gfc_array_i16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)656181254a7Smrg matmul_i16_avx2 (gfc_array_i16 * const restrict retarray,
657181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
658181254a7Smrg 	int blas_limit, blas_call gemm)
659181254a7Smrg {
660181254a7Smrg   const GFC_INTEGER_16 * restrict abase;
661181254a7Smrg   const GFC_INTEGER_16 * restrict bbase;
662181254a7Smrg   GFC_INTEGER_16 * restrict dest;
663181254a7Smrg 
664181254a7Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
665181254a7Smrg   index_type x, y, n, count, xcount, ycount;
666181254a7Smrg 
667181254a7Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
668181254a7Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
669181254a7Smrg 
670181254a7Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
671181254a7Smrg 
672181254a7Smrg    Either A or B (but not both) can be rank 1:
673181254a7Smrg 
674181254a7Smrg    o One-dimensional argument A is implicitly treated as a row matrix
675181254a7Smrg      dimensioned [1,count], so xcount=1.
676181254a7Smrg 
677181254a7Smrg    o One-dimensional argument B is implicitly treated as a column matrix
678181254a7Smrg      dimensioned [count, 1], so ycount=1.
679181254a7Smrg */
680181254a7Smrg 
681181254a7Smrg   if (retarray->base_addr == NULL)
682181254a7Smrg     {
683181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
684181254a7Smrg         {
685181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
686181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
687181254a7Smrg         }
688181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
689181254a7Smrg         {
690181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
691181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
692181254a7Smrg         }
693181254a7Smrg       else
694181254a7Smrg         {
695181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
696181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
697181254a7Smrg 
698181254a7Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
699181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
700181254a7Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
701181254a7Smrg         }
702181254a7Smrg 
703181254a7Smrg       retarray->base_addr
704181254a7Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
705181254a7Smrg       retarray->offset = 0;
706181254a7Smrg     }
707181254a7Smrg   else if (unlikely (compile_options.bounds_check))
708181254a7Smrg     {
709181254a7Smrg       index_type ret_extent, arg_extent;
710181254a7Smrg 
711181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
712181254a7Smrg 	{
713181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
715181254a7Smrg 	  if (arg_extent != ret_extent)
716181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
717181254a7Smrg 	    		   "array (%ld/%ld) ",
718181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
719181254a7Smrg 	}
720181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
721181254a7Smrg 	{
722181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
723181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
724181254a7Smrg 	  if (arg_extent != ret_extent)
725181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
726181254a7Smrg 	    		   "array (%ld/%ld) ",
727181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
728181254a7Smrg 	}
729181254a7Smrg       else
730181254a7Smrg 	{
731181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
732181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
733181254a7Smrg 	  if (arg_extent != ret_extent)
734181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
735181254a7Smrg 	    		   "array (%ld/%ld) ",
736181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
737181254a7Smrg 
738181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
739181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
740181254a7Smrg 	  if (arg_extent != ret_extent)
741181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
742181254a7Smrg 	    		   "array (%ld/%ld) ",
743181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
744181254a7Smrg 	}
745181254a7Smrg     }
746181254a7Smrg 
747181254a7Smrg 
748181254a7Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
749181254a7Smrg     {
750181254a7Smrg       /* One-dimensional result may be addressed in the code below
751181254a7Smrg 	 either as a row or a column matrix. We want both cases to
752181254a7Smrg 	 work. */
753181254a7Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
754181254a7Smrg     }
755181254a7Smrg   else
756181254a7Smrg     {
757181254a7Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
758181254a7Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
759181254a7Smrg     }
760181254a7Smrg 
761181254a7Smrg 
762181254a7Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
763181254a7Smrg     {
764181254a7Smrg       /* Treat it as a a row matrix A[1,count]. */
765181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
766181254a7Smrg       aystride = 1;
767181254a7Smrg 
768181254a7Smrg       xcount = 1;
769181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
770181254a7Smrg     }
771181254a7Smrg   else
772181254a7Smrg     {
773181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
774181254a7Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
775181254a7Smrg 
776181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
777181254a7Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
778181254a7Smrg     }
779181254a7Smrg 
780181254a7Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
781181254a7Smrg     {
782181254a7Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
783181254a7Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
784181254a7Smrg 		       "in dimension 1: is %ld, should be %ld",
785181254a7Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
786181254a7Smrg     }
787181254a7Smrg 
788181254a7Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
789181254a7Smrg     {
790181254a7Smrg       /* Treat it as a column matrix B[count,1] */
791181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
792181254a7Smrg 
793181254a7Smrg       /* bystride should never be used for 1-dimensional b.
794181254a7Smrg          The value is only used for calculation of the
795181254a7Smrg          memory by the buffer.  */
796181254a7Smrg       bystride = 256;
797181254a7Smrg       ycount = 1;
798181254a7Smrg     }
799181254a7Smrg   else
800181254a7Smrg     {
801181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
802181254a7Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
803181254a7Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
804181254a7Smrg     }
805181254a7Smrg 
806181254a7Smrg   abase = a->base_addr;
807181254a7Smrg   bbase = b->base_addr;
808181254a7Smrg   dest = retarray->base_addr;
809181254a7Smrg 
810181254a7Smrg   /* Now that everything is set up, we perform the multiplication
811181254a7Smrg      itself.  */
812181254a7Smrg 
813181254a7Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
814181254a7Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
815181254a7Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
816181254a7Smrg 
817181254a7Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
818181254a7Smrg       && (bxstride == 1 || bystride == 1)
819181254a7Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
820181254a7Smrg           > POW3(blas_limit)))
821181254a7Smrg     {
822181254a7Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
823181254a7Smrg       const GFC_INTEGER_16 one = 1, zero = 0;
824181254a7Smrg       const int lda = (axstride == 1) ? aystride : axstride,
825181254a7Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
826181254a7Smrg 
827181254a7Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
828181254a7Smrg 	{
829181254a7Smrg 	  assert (gemm != NULL);
830181254a7Smrg 	  const char *transa, *transb;
831181254a7Smrg 	  if (try_blas & 2)
832181254a7Smrg 	    transa = "C";
833181254a7Smrg 	  else
834181254a7Smrg 	    transa = axstride == 1 ? "N" : "T";
835181254a7Smrg 
836181254a7Smrg 	  if (try_blas & 4)
837181254a7Smrg 	    transb = "C";
838181254a7Smrg 	  else
839181254a7Smrg 	    transb = bxstride == 1 ? "N" : "T";
840181254a7Smrg 
841181254a7Smrg 	  gemm (transa, transb , &m,
842181254a7Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
843181254a7Smrg 		&ldc, 1, 1);
844181254a7Smrg 	  return;
845181254a7Smrg 	}
846181254a7Smrg     }
847181254a7Smrg 
848fb8a8121Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
849fb8a8121Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
850181254a7Smrg     {
851181254a7Smrg       /* This block of code implements a tuned matmul, derived from
852181254a7Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
853181254a7Smrg 
854181254a7Smrg                Bo Kagstrom and Per Ling
855181254a7Smrg                Department of Computing Science
856181254a7Smrg                Umea University
857181254a7Smrg                S-901 87 Umea, Sweden
858181254a7Smrg 
859181254a7Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
860181254a7Smrg 
861181254a7Smrg       const GFC_INTEGER_16 *a, *b;
862181254a7Smrg       GFC_INTEGER_16 *c;
863181254a7Smrg       const index_type m = xcount, n = ycount, k = count;
864181254a7Smrg 
865181254a7Smrg       /* System generated locals */
866181254a7Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
867181254a7Smrg 		 i1, i2, i3, i4, i5, i6;
868181254a7Smrg 
869181254a7Smrg       /* Local variables */
870181254a7Smrg       GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
871181254a7Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
872181254a7Smrg       index_type i, j, l, ii, jj, ll;
873181254a7Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
874181254a7Smrg       GFC_INTEGER_16 *t1;
875181254a7Smrg 
876181254a7Smrg       a = abase;
877181254a7Smrg       b = bbase;
878181254a7Smrg       c = retarray->base_addr;
879181254a7Smrg 
880181254a7Smrg       /* Parameter adjustments */
881181254a7Smrg       c_dim1 = rystride;
882181254a7Smrg       c_offset = 1 + c_dim1;
883181254a7Smrg       c -= c_offset;
884181254a7Smrg       a_dim1 = aystride;
885181254a7Smrg       a_offset = 1 + a_dim1;
886181254a7Smrg       a -= a_offset;
887181254a7Smrg       b_dim1 = bystride;
888181254a7Smrg       b_offset = 1 + b_dim1;
889181254a7Smrg       b -= b_offset;
890181254a7Smrg 
891181254a7Smrg       /* Empty c first.  */
892181254a7Smrg       for (j=1; j<=n; j++)
893181254a7Smrg 	for (i=1; i<=m; i++)
894181254a7Smrg 	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
895181254a7Smrg 
896181254a7Smrg       /* Early exit if possible */
897181254a7Smrg       if (m == 0 || n == 0 || k == 0)
898181254a7Smrg 	return;
899181254a7Smrg 
900181254a7Smrg       /* Adjust size of t1 to what is needed.  */
901181254a7Smrg       index_type t1_dim, a_sz;
902181254a7Smrg       if (aystride == 1)
903181254a7Smrg         a_sz = rystride;
904181254a7Smrg       else
905181254a7Smrg         a_sz = a_dim1;
906181254a7Smrg 
907181254a7Smrg       t1_dim = a_sz * 256 + b_dim1;
908181254a7Smrg       if (t1_dim > 65536)
909181254a7Smrg 	t1_dim = 65536;
910181254a7Smrg 
911181254a7Smrg       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
912181254a7Smrg 
913181254a7Smrg       /* Start turning the crank. */
914181254a7Smrg       i1 = n;
915181254a7Smrg       for (jj = 1; jj <= i1; jj += 512)
916181254a7Smrg 	{
917181254a7Smrg 	  /* Computing MIN */
918181254a7Smrg 	  i2 = 512;
919181254a7Smrg 	  i3 = n - jj + 1;
920181254a7Smrg 	  jsec = min(i2,i3);
921181254a7Smrg 	  ujsec = jsec - jsec % 4;
922181254a7Smrg 	  i2 = k;
923181254a7Smrg 	  for (ll = 1; ll <= i2; ll += 256)
924181254a7Smrg 	    {
925181254a7Smrg 	      /* Computing MIN */
926181254a7Smrg 	      i3 = 256;
927181254a7Smrg 	      i4 = k - ll + 1;
928181254a7Smrg 	      lsec = min(i3,i4);
929181254a7Smrg 	      ulsec = lsec - lsec % 2;
930181254a7Smrg 
931181254a7Smrg 	      i3 = m;
932181254a7Smrg 	      for (ii = 1; ii <= i3; ii += 256)
933181254a7Smrg 		{
934181254a7Smrg 		  /* Computing MIN */
935181254a7Smrg 		  i4 = 256;
936181254a7Smrg 		  i5 = m - ii + 1;
937181254a7Smrg 		  isec = min(i4,i5);
938181254a7Smrg 		  uisec = isec - isec % 2;
939181254a7Smrg 		  i4 = ll + ulsec - 1;
940181254a7Smrg 		  for (l = ll; l <= i4; l += 2)
941181254a7Smrg 		    {
942181254a7Smrg 		      i5 = ii + uisec - 1;
943181254a7Smrg 		      for (i = ii; i <= i5; i += 2)
944181254a7Smrg 			{
945181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
946181254a7Smrg 					a[i + l * a_dim1];
947181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
948181254a7Smrg 					a[i + (l + 1) * a_dim1];
949181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
950181254a7Smrg 					a[i + 1 + l * a_dim1];
951181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
952181254a7Smrg 					a[i + 1 + (l + 1) * a_dim1];
953181254a7Smrg 			}
954181254a7Smrg 		      if (uisec < isec)
955181254a7Smrg 			{
956181254a7Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
957181254a7Smrg 				    a[ii + isec - 1 + l * a_dim1];
958181254a7Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
959181254a7Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
960181254a7Smrg 			}
961181254a7Smrg 		    }
962181254a7Smrg 		  if (ulsec < lsec)
963181254a7Smrg 		    {
964181254a7Smrg 		      i4 = ii + isec - 1;
965181254a7Smrg 		      for (i = ii; i<= i4; ++i)
966181254a7Smrg 			{
967181254a7Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
968181254a7Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
969181254a7Smrg 			}
970181254a7Smrg 		    }
971181254a7Smrg 
972181254a7Smrg 		  uisec = isec - isec % 4;
973181254a7Smrg 		  i4 = jj + ujsec - 1;
974181254a7Smrg 		  for (j = jj; j <= i4; j += 4)
975181254a7Smrg 		    {
976181254a7Smrg 		      i5 = ii + uisec - 1;
977181254a7Smrg 		      for (i = ii; i <= i5; i += 4)
978181254a7Smrg 			{
979181254a7Smrg 			  f11 = c[i + j * c_dim1];
980181254a7Smrg 			  f21 = c[i + 1 + j * c_dim1];
981181254a7Smrg 			  f12 = c[i + (j + 1) * c_dim1];
982181254a7Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
983181254a7Smrg 			  f13 = c[i + (j + 2) * c_dim1];
984181254a7Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
985181254a7Smrg 			  f14 = c[i + (j + 3) * c_dim1];
986181254a7Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
987181254a7Smrg 			  f31 = c[i + 2 + j * c_dim1];
988181254a7Smrg 			  f41 = c[i + 3 + j * c_dim1];
989181254a7Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
990181254a7Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
991181254a7Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
992181254a7Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
993181254a7Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
994181254a7Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
995181254a7Smrg 			  i6 = ll + lsec - 1;
996181254a7Smrg 			  for (l = ll; l <= i6; ++l)
997181254a7Smrg 			    {
998181254a7Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
999181254a7Smrg 				      * b[l + j * b_dim1];
1000181254a7Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1001181254a7Smrg 				      * b[l + j * b_dim1];
1002181254a7Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1003181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
1004181254a7Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1005181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
1006181254a7Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1007181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
1008181254a7Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1009181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
1010181254a7Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1011181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
1012181254a7Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1013181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
1014181254a7Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1015181254a7Smrg 				      * b[l + j * b_dim1];
1016181254a7Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1017181254a7Smrg 				      * b[l + j * b_dim1];
1018181254a7Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1019181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
1020181254a7Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1021181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
1022181254a7Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1023181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
1024181254a7Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1025181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
1026181254a7Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1027181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
1028181254a7Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1029181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
1030181254a7Smrg 			    }
1031181254a7Smrg 			  c[i + j * c_dim1] = f11;
1032181254a7Smrg 			  c[i + 1 + j * c_dim1] = f21;
1033181254a7Smrg 			  c[i + (j + 1) * c_dim1] = f12;
1034181254a7Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1035181254a7Smrg 			  c[i + (j + 2) * c_dim1] = f13;
1036181254a7Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1037181254a7Smrg 			  c[i + (j + 3) * c_dim1] = f14;
1038181254a7Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1039181254a7Smrg 			  c[i + 2 + j * c_dim1] = f31;
1040181254a7Smrg 			  c[i + 3 + j * c_dim1] = f41;
1041181254a7Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1042181254a7Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1043181254a7Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1044181254a7Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1045181254a7Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1046181254a7Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1047181254a7Smrg 			}
1048181254a7Smrg 		      if (uisec < isec)
1049181254a7Smrg 			{
1050181254a7Smrg 			  i5 = ii + isec - 1;
1051181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1052181254a7Smrg 			    {
1053181254a7Smrg 			      f11 = c[i + j * c_dim1];
1054181254a7Smrg 			      f12 = c[i + (j + 1) * c_dim1];
1055181254a7Smrg 			      f13 = c[i + (j + 2) * c_dim1];
1056181254a7Smrg 			      f14 = c[i + (j + 3) * c_dim1];
1057181254a7Smrg 			      i6 = ll + lsec - 1;
1058181254a7Smrg 			      for (l = ll; l <= i6; ++l)
1059181254a7Smrg 				{
1060181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061181254a7Smrg 					  257] * b[l + j * b_dim1];
1062181254a7Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063181254a7Smrg 					  257] * b[l + (j + 1) * b_dim1];
1064181254a7Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065181254a7Smrg 					  257] * b[l + (j + 2) * b_dim1];
1066181254a7Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1067181254a7Smrg 					  257] * b[l + (j + 3) * b_dim1];
1068181254a7Smrg 				}
1069181254a7Smrg 			      c[i + j * c_dim1] = f11;
1070181254a7Smrg 			      c[i + (j + 1) * c_dim1] = f12;
1071181254a7Smrg 			      c[i + (j + 2) * c_dim1] = f13;
1072181254a7Smrg 			      c[i + (j + 3) * c_dim1] = f14;
1073181254a7Smrg 			    }
1074181254a7Smrg 			}
1075181254a7Smrg 		    }
1076181254a7Smrg 		  if (ujsec < jsec)
1077181254a7Smrg 		    {
1078181254a7Smrg 		      i4 = jj + jsec - 1;
1079181254a7Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
1080181254a7Smrg 			{
1081181254a7Smrg 			  i5 = ii + uisec - 1;
1082181254a7Smrg 			  for (i = ii; i <= i5; i += 4)
1083181254a7Smrg 			    {
1084181254a7Smrg 			      f11 = c[i + j * c_dim1];
1085181254a7Smrg 			      f21 = c[i + 1 + j * c_dim1];
1086181254a7Smrg 			      f31 = c[i + 2 + j * c_dim1];
1087181254a7Smrg 			      f41 = c[i + 3 + j * c_dim1];
1088181254a7Smrg 			      i6 = ll + lsec - 1;
1089181254a7Smrg 			      for (l = ll; l <= i6; ++l)
1090181254a7Smrg 				{
1091181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1092181254a7Smrg 					  257] * b[l + j * b_dim1];
1093181254a7Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1094181254a7Smrg 					  257] * b[l + j * b_dim1];
1095181254a7Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1096181254a7Smrg 					  257] * b[l + j * b_dim1];
1097181254a7Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1098181254a7Smrg 					  257] * b[l + j * b_dim1];
1099181254a7Smrg 				}
1100181254a7Smrg 			      c[i + j * c_dim1] = f11;
1101181254a7Smrg 			      c[i + 1 + j * c_dim1] = f21;
1102181254a7Smrg 			      c[i + 2 + j * c_dim1] = f31;
1103181254a7Smrg 			      c[i + 3 + j * c_dim1] = f41;
1104181254a7Smrg 			    }
1105181254a7Smrg 			  i5 = ii + isec - 1;
1106181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1107181254a7Smrg 			    {
1108181254a7Smrg 			      f11 = c[i + j * c_dim1];
1109181254a7Smrg 			      i6 = ll + lsec - 1;
1110181254a7Smrg 			      for (l = ll; l <= i6; ++l)
1111181254a7Smrg 				{
1112181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1113181254a7Smrg 					  257] * b[l + j * b_dim1];
1114181254a7Smrg 				}
1115181254a7Smrg 			      c[i + j * c_dim1] = f11;
1116181254a7Smrg 			    }
1117181254a7Smrg 			}
1118181254a7Smrg 		    }
1119181254a7Smrg 		}
1120181254a7Smrg 	    }
1121181254a7Smrg 	}
1122181254a7Smrg       free(t1);
1123181254a7Smrg       return;
1124181254a7Smrg     }
1125181254a7Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1126181254a7Smrg     {
1127181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
1128181254a7Smrg 	{
1129181254a7Smrg 	  const GFC_INTEGER_16 *restrict abase_x;
1130181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
1131181254a7Smrg 	  GFC_INTEGER_16 *restrict dest_y;
1132181254a7Smrg 	  GFC_INTEGER_16 s;
1133181254a7Smrg 
1134181254a7Smrg 	  for (y = 0; y < ycount; y++)
1135181254a7Smrg 	    {
1136181254a7Smrg 	      bbase_y = &bbase[y*bystride];
1137181254a7Smrg 	      dest_y = &dest[y*rystride];
1138181254a7Smrg 	      for (x = 0; x < xcount; x++)
1139181254a7Smrg 		{
1140181254a7Smrg 		  abase_x = &abase[x*axstride];
1141181254a7Smrg 		  s = (GFC_INTEGER_16) 0;
1142181254a7Smrg 		  for (n = 0; n < count; n++)
1143181254a7Smrg 		    s += abase_x[n] * bbase_y[n];
1144181254a7Smrg 		  dest_y[x] = s;
1145181254a7Smrg 		}
1146181254a7Smrg 	    }
1147181254a7Smrg 	}
1148181254a7Smrg       else
1149181254a7Smrg 	{
1150181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
1151181254a7Smrg 	  GFC_INTEGER_16 s;
1152181254a7Smrg 
1153181254a7Smrg 	  for (y = 0; y < ycount; y++)
1154181254a7Smrg 	    {
1155181254a7Smrg 	      bbase_y = &bbase[y*bystride];
1156181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
1157181254a7Smrg 	      for (n = 0; n < count; n++)
1158181254a7Smrg 		s += abase[n*axstride] * bbase_y[n];
1159181254a7Smrg 	      dest[y*rystride] = s;
1160181254a7Smrg 	    }
1161181254a7Smrg 	}
1162181254a7Smrg     }
1163181254a7Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1164181254a7Smrg     {
1165181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
1166181254a7Smrg       GFC_INTEGER_16 s;
1167181254a7Smrg 
1168181254a7Smrg       for (y = 0; y < ycount; y++)
1169181254a7Smrg 	{
1170181254a7Smrg 	  bbase_y = &bbase[y*bystride];
1171181254a7Smrg 	  s = (GFC_INTEGER_16) 0;
1172181254a7Smrg 	  for (n = 0; n < count; n++)
1173181254a7Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1174181254a7Smrg 	  dest[y*rxstride] = s;
1175181254a7Smrg 	}
1176181254a7Smrg     }
1177fb8a8121Smrg   else if (axstride < aystride)
1178fb8a8121Smrg     {
1179fb8a8121Smrg       for (y = 0; y < ycount; y++)
1180fb8a8121Smrg 	for (x = 0; x < xcount; x++)
1181fb8a8121Smrg 	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
1182fb8a8121Smrg 
1183fb8a8121Smrg       for (y = 0; y < ycount; y++)
1184fb8a8121Smrg 	for (n = 0; n < count; n++)
1185fb8a8121Smrg 	  for (x = 0; x < xcount; x++)
1186fb8a8121Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
1187fb8a8121Smrg 	    dest[x*rxstride + y*rystride] +=
1188fb8a8121Smrg 					abase[x*axstride + n*aystride] *
1189fb8a8121Smrg 					bbase[n*bxstride + y*bystride];
1190fb8a8121Smrg     }
1191181254a7Smrg   else
1192181254a7Smrg     {
1193181254a7Smrg       const GFC_INTEGER_16 *restrict abase_x;
1194181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
1195181254a7Smrg       GFC_INTEGER_16 *restrict dest_y;
1196181254a7Smrg       GFC_INTEGER_16 s;
1197181254a7Smrg 
1198181254a7Smrg       for (y = 0; y < ycount; y++)
1199181254a7Smrg 	{
1200181254a7Smrg 	  bbase_y = &bbase[y*bystride];
1201181254a7Smrg 	  dest_y = &dest[y*rystride];
1202181254a7Smrg 	  for (x = 0; x < xcount; x++)
1203181254a7Smrg 	    {
1204181254a7Smrg 	      abase_x = &abase[x*axstride];
1205181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
1206181254a7Smrg 	      for (n = 0; n < count; n++)
1207181254a7Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1208181254a7Smrg 	      dest_y[x*rxstride] = s;
1209181254a7Smrg 	    }
1210181254a7Smrg 	}
1211181254a7Smrg     }
1212181254a7Smrg }
1213181254a7Smrg #undef POW3
1214181254a7Smrg #undef min
1215181254a7Smrg #undef max
1216181254a7Smrg 
1217181254a7Smrg #endif /* HAVE_AVX2 */
1218181254a7Smrg 
1219181254a7Smrg #ifdef HAVE_AVX512F
1220181254a7Smrg static void
1221181254a7Smrg matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
1222181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
1223181254a7Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1224181254a7Smrg static void
matmul_i16_avx512f(gfc_array_i16 * const restrict retarray,gfc_array_i16 * const restrict a,gfc_array_i16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1225181254a7Smrg matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
1226181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
1227181254a7Smrg 	int blas_limit, blas_call gemm)
1228181254a7Smrg {
1229181254a7Smrg   const GFC_INTEGER_16 * restrict abase;
1230181254a7Smrg   const GFC_INTEGER_16 * restrict bbase;
1231181254a7Smrg   GFC_INTEGER_16 * restrict dest;
1232181254a7Smrg 
1233181254a7Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1234181254a7Smrg   index_type x, y, n, count, xcount, ycount;
1235181254a7Smrg 
1236181254a7Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
1237181254a7Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
1238181254a7Smrg 
1239181254a7Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1240181254a7Smrg 
1241181254a7Smrg    Either A or B (but not both) can be rank 1:
1242181254a7Smrg 
1243181254a7Smrg    o One-dimensional argument A is implicitly treated as a row matrix
1244181254a7Smrg      dimensioned [1,count], so xcount=1.
1245181254a7Smrg 
1246181254a7Smrg    o One-dimensional argument B is implicitly treated as a column matrix
1247181254a7Smrg      dimensioned [count, 1], so ycount=1.
1248181254a7Smrg */
1249181254a7Smrg 
1250181254a7Smrg   if (retarray->base_addr == NULL)
1251181254a7Smrg     {
1252181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1253181254a7Smrg         {
1254181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1255181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1256181254a7Smrg         }
1257181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258181254a7Smrg         {
1259181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1260181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1261181254a7Smrg         }
1262181254a7Smrg       else
1263181254a7Smrg         {
1264181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1265181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1266181254a7Smrg 
1267181254a7Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
1268181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1269181254a7Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1270181254a7Smrg         }
1271181254a7Smrg 
1272181254a7Smrg       retarray->base_addr
1273181254a7Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
1274181254a7Smrg       retarray->offset = 0;
1275181254a7Smrg     }
1276181254a7Smrg   else if (unlikely (compile_options.bounds_check))
1277181254a7Smrg     {
1278181254a7Smrg       index_type ret_extent, arg_extent;
1279181254a7Smrg 
1280181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1281181254a7Smrg 	{
1282181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1283181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1284181254a7Smrg 	  if (arg_extent != ret_extent)
1285181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1286181254a7Smrg 	    		   "array (%ld/%ld) ",
1287181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
1288181254a7Smrg 	}
1289181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1290181254a7Smrg 	{
1291181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1292181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1293181254a7Smrg 	  if (arg_extent != ret_extent)
1294181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1295181254a7Smrg 	    		   "array (%ld/%ld) ",
1296181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
1297181254a7Smrg 	}
1298181254a7Smrg       else
1299181254a7Smrg 	{
1300181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1301181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1302181254a7Smrg 	  if (arg_extent != ret_extent)
1303181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1304181254a7Smrg 	    		   "array (%ld/%ld) ",
1305181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
1306181254a7Smrg 
1307181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1308181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1309181254a7Smrg 	  if (arg_extent != ret_extent)
1310181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
1311181254a7Smrg 	    		   "array (%ld/%ld) ",
1312181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
1313181254a7Smrg 	}
1314181254a7Smrg     }
1315181254a7Smrg 
1316181254a7Smrg 
1317181254a7Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1318181254a7Smrg     {
1319181254a7Smrg       /* One-dimensional result may be addressed in the code below
1320181254a7Smrg 	 either as a row or a column matrix. We want both cases to
1321181254a7Smrg 	 work. */
1322181254a7Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1323181254a7Smrg     }
1324181254a7Smrg   else
1325181254a7Smrg     {
1326181254a7Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1327181254a7Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1328181254a7Smrg     }
1329181254a7Smrg 
1330181254a7Smrg 
1331181254a7Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
1332181254a7Smrg     {
1333181254a7Smrg       /* Treat it as a a row matrix A[1,count]. */
1334181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1335181254a7Smrg       aystride = 1;
1336181254a7Smrg 
1337181254a7Smrg       xcount = 1;
1338181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
1339181254a7Smrg     }
1340181254a7Smrg   else
1341181254a7Smrg     {
1342181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1343181254a7Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1344181254a7Smrg 
1345181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
1346181254a7Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1347181254a7Smrg     }
1348181254a7Smrg 
1349181254a7Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1350181254a7Smrg     {
1351181254a7Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1352181254a7Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1353181254a7Smrg 		       "in dimension 1: is %ld, should be %ld",
1354181254a7Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1355181254a7Smrg     }
1356181254a7Smrg 
1357181254a7Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
1358181254a7Smrg     {
1359181254a7Smrg       /* Treat it as a column matrix B[count,1] */
1360181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1361181254a7Smrg 
1362181254a7Smrg       /* bystride should never be used for 1-dimensional b.
1363181254a7Smrg          The value is only used for calculation of the
1364181254a7Smrg          memory by the buffer.  */
1365181254a7Smrg       bystride = 256;
1366181254a7Smrg       ycount = 1;
1367181254a7Smrg     }
1368181254a7Smrg   else
1369181254a7Smrg     {
1370181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1371181254a7Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1372181254a7Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1373181254a7Smrg     }
1374181254a7Smrg 
1375181254a7Smrg   abase = a->base_addr;
1376181254a7Smrg   bbase = b->base_addr;
1377181254a7Smrg   dest = retarray->base_addr;
1378181254a7Smrg 
1379181254a7Smrg   /* Now that everything is set up, we perform the multiplication
1380181254a7Smrg      itself.  */
1381181254a7Smrg 
1382181254a7Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1383181254a7Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
1384181254a7Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
1385181254a7Smrg 
1386181254a7Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1387181254a7Smrg       && (bxstride == 1 || bystride == 1)
1388181254a7Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
1389181254a7Smrg           > POW3(blas_limit)))
1390181254a7Smrg     {
1391181254a7Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
1392181254a7Smrg       const GFC_INTEGER_16 one = 1, zero = 0;
1393181254a7Smrg       const int lda = (axstride == 1) ? aystride : axstride,
1394181254a7Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
1395181254a7Smrg 
1396181254a7Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1397181254a7Smrg 	{
1398181254a7Smrg 	  assert (gemm != NULL);
1399181254a7Smrg 	  const char *transa, *transb;
1400181254a7Smrg 	  if (try_blas & 2)
1401181254a7Smrg 	    transa = "C";
1402181254a7Smrg 	  else
1403181254a7Smrg 	    transa = axstride == 1 ? "N" : "T";
1404181254a7Smrg 
1405181254a7Smrg 	  if (try_blas & 4)
1406181254a7Smrg 	    transb = "C";
1407181254a7Smrg 	  else
1408181254a7Smrg 	    transb = bxstride == 1 ? "N" : "T";
1409181254a7Smrg 
1410181254a7Smrg 	  gemm (transa, transb , &m,
1411181254a7Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1412181254a7Smrg 		&ldc, 1, 1);
1413181254a7Smrg 	  return;
1414181254a7Smrg 	}
1415181254a7Smrg     }
1416181254a7Smrg 
1417fb8a8121Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
1418fb8a8121Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
1419181254a7Smrg     {
1420181254a7Smrg       /* This block of code implements a tuned matmul, derived from
1421181254a7Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1422181254a7Smrg 
1423181254a7Smrg                Bo Kagstrom and Per Ling
1424181254a7Smrg                Department of Computing Science
1425181254a7Smrg                Umea University
1426181254a7Smrg                S-901 87 Umea, Sweden
1427181254a7Smrg 
1428181254a7Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
1429181254a7Smrg 
1430181254a7Smrg       const GFC_INTEGER_16 *a, *b;
1431181254a7Smrg       GFC_INTEGER_16 *c;
1432181254a7Smrg       const index_type m = xcount, n = ycount, k = count;
1433181254a7Smrg 
1434181254a7Smrg       /* System generated locals */
1435181254a7Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1436181254a7Smrg 		 i1, i2, i3, i4, i5, i6;
1437181254a7Smrg 
1438181254a7Smrg       /* Local variables */
1439181254a7Smrg       GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
1440181254a7Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
1441181254a7Smrg       index_type i, j, l, ii, jj, ll;
1442181254a7Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1443181254a7Smrg       GFC_INTEGER_16 *t1;
1444181254a7Smrg 
1445181254a7Smrg       a = abase;
1446181254a7Smrg       b = bbase;
1447181254a7Smrg       c = retarray->base_addr;
1448181254a7Smrg 
1449181254a7Smrg       /* Parameter adjustments */
1450181254a7Smrg       c_dim1 = rystride;
1451181254a7Smrg       c_offset = 1 + c_dim1;
1452181254a7Smrg       c -= c_offset;
1453181254a7Smrg       a_dim1 = aystride;
1454181254a7Smrg       a_offset = 1 + a_dim1;
1455181254a7Smrg       a -= a_offset;
1456181254a7Smrg       b_dim1 = bystride;
1457181254a7Smrg       b_offset = 1 + b_dim1;
1458181254a7Smrg       b -= b_offset;
1459181254a7Smrg 
1460181254a7Smrg       /* Empty c first.  */
1461181254a7Smrg       for (j=1; j<=n; j++)
1462181254a7Smrg 	for (i=1; i<=m; i++)
1463181254a7Smrg 	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
1464181254a7Smrg 
1465181254a7Smrg       /* Early exit if possible */
1466181254a7Smrg       if (m == 0 || n == 0 || k == 0)
1467181254a7Smrg 	return;
1468181254a7Smrg 
1469181254a7Smrg       /* Adjust size of t1 to what is needed.  */
1470181254a7Smrg       index_type t1_dim, a_sz;
1471181254a7Smrg       if (aystride == 1)
1472181254a7Smrg         a_sz = rystride;
1473181254a7Smrg       else
1474181254a7Smrg         a_sz = a_dim1;
1475181254a7Smrg 
1476181254a7Smrg       t1_dim = a_sz * 256 + b_dim1;
1477181254a7Smrg       if (t1_dim > 65536)
1478181254a7Smrg 	t1_dim = 65536;
1479181254a7Smrg 
1480181254a7Smrg       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
1481181254a7Smrg 
1482181254a7Smrg       /* Start turning the crank. */
1483181254a7Smrg       i1 = n;
1484181254a7Smrg       for (jj = 1; jj <= i1; jj += 512)
1485181254a7Smrg 	{
1486181254a7Smrg 	  /* Computing MIN */
1487181254a7Smrg 	  i2 = 512;
1488181254a7Smrg 	  i3 = n - jj + 1;
1489181254a7Smrg 	  jsec = min(i2,i3);
1490181254a7Smrg 	  ujsec = jsec - jsec % 4;
1491181254a7Smrg 	  i2 = k;
1492181254a7Smrg 	  for (ll = 1; ll <= i2; ll += 256)
1493181254a7Smrg 	    {
1494181254a7Smrg 	      /* Computing MIN */
1495181254a7Smrg 	      i3 = 256;
1496181254a7Smrg 	      i4 = k - ll + 1;
1497181254a7Smrg 	      lsec = min(i3,i4);
1498181254a7Smrg 	      ulsec = lsec - lsec % 2;
1499181254a7Smrg 
1500181254a7Smrg 	      i3 = m;
1501181254a7Smrg 	      for (ii = 1; ii <= i3; ii += 256)
1502181254a7Smrg 		{
1503181254a7Smrg 		  /* Computing MIN */
1504181254a7Smrg 		  i4 = 256;
1505181254a7Smrg 		  i5 = m - ii + 1;
1506181254a7Smrg 		  isec = min(i4,i5);
1507181254a7Smrg 		  uisec = isec - isec % 2;
1508181254a7Smrg 		  i4 = ll + ulsec - 1;
1509181254a7Smrg 		  for (l = ll; l <= i4; l += 2)
1510181254a7Smrg 		    {
1511181254a7Smrg 		      i5 = ii + uisec - 1;
1512181254a7Smrg 		      for (i = ii; i <= i5; i += 2)
1513181254a7Smrg 			{
1514181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1515181254a7Smrg 					a[i + l * a_dim1];
1516181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1517181254a7Smrg 					a[i + (l + 1) * a_dim1];
1518181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1519181254a7Smrg 					a[i + 1 + l * a_dim1];
1520181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1521181254a7Smrg 					a[i + 1 + (l + 1) * a_dim1];
1522181254a7Smrg 			}
1523181254a7Smrg 		      if (uisec < isec)
1524181254a7Smrg 			{
1525181254a7Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
1526181254a7Smrg 				    a[ii + isec - 1 + l * a_dim1];
1527181254a7Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
1528181254a7Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
1529181254a7Smrg 			}
1530181254a7Smrg 		    }
1531181254a7Smrg 		  if (ulsec < lsec)
1532181254a7Smrg 		    {
1533181254a7Smrg 		      i4 = ii + isec - 1;
1534181254a7Smrg 		      for (i = ii; i<= i4; ++i)
1535181254a7Smrg 			{
1536181254a7Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
1537181254a7Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
1538181254a7Smrg 			}
1539181254a7Smrg 		    }
1540181254a7Smrg 
1541181254a7Smrg 		  uisec = isec - isec % 4;
1542181254a7Smrg 		  i4 = jj + ujsec - 1;
1543181254a7Smrg 		  for (j = jj; j <= i4; j += 4)
1544181254a7Smrg 		    {
1545181254a7Smrg 		      i5 = ii + uisec - 1;
1546181254a7Smrg 		      for (i = ii; i <= i5; i += 4)
1547181254a7Smrg 			{
1548181254a7Smrg 			  f11 = c[i + j * c_dim1];
1549181254a7Smrg 			  f21 = c[i + 1 + j * c_dim1];
1550181254a7Smrg 			  f12 = c[i + (j + 1) * c_dim1];
1551181254a7Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
1552181254a7Smrg 			  f13 = c[i + (j + 2) * c_dim1];
1553181254a7Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
1554181254a7Smrg 			  f14 = c[i + (j + 3) * c_dim1];
1555181254a7Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
1556181254a7Smrg 			  f31 = c[i + 2 + j * c_dim1];
1557181254a7Smrg 			  f41 = c[i + 3 + j * c_dim1];
1558181254a7Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
1559181254a7Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
1560181254a7Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
1561181254a7Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
1562181254a7Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
1563181254a7Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
1564181254a7Smrg 			  i6 = ll + lsec - 1;
1565181254a7Smrg 			  for (l = ll; l <= i6; ++l)
1566181254a7Smrg 			    {
1567181254a7Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1568181254a7Smrg 				      * b[l + j * b_dim1];
1569181254a7Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1570181254a7Smrg 				      * b[l + j * b_dim1];
1571181254a7Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1572181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
1573181254a7Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1574181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
1575181254a7Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1576181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
1577181254a7Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1578181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
1579181254a7Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1580181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
1581181254a7Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1582181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
1583181254a7Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1584181254a7Smrg 				      * b[l + j * b_dim1];
1585181254a7Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1586181254a7Smrg 				      * b[l + j * b_dim1];
1587181254a7Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1588181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
1589181254a7Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1590181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
1591181254a7Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1592181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
1593181254a7Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1594181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
1595181254a7Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1596181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
1597181254a7Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1598181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
1599181254a7Smrg 			    }
1600181254a7Smrg 			  c[i + j * c_dim1] = f11;
1601181254a7Smrg 			  c[i + 1 + j * c_dim1] = f21;
1602181254a7Smrg 			  c[i + (j + 1) * c_dim1] = f12;
1603181254a7Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1604181254a7Smrg 			  c[i + (j + 2) * c_dim1] = f13;
1605181254a7Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1606181254a7Smrg 			  c[i + (j + 3) * c_dim1] = f14;
1607181254a7Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1608181254a7Smrg 			  c[i + 2 + j * c_dim1] = f31;
1609181254a7Smrg 			  c[i + 3 + j * c_dim1] = f41;
1610181254a7Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1611181254a7Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1612181254a7Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1613181254a7Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1614181254a7Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1615181254a7Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1616181254a7Smrg 			}
1617181254a7Smrg 		      if (uisec < isec)
1618181254a7Smrg 			{
1619181254a7Smrg 			  i5 = ii + isec - 1;
1620181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1621181254a7Smrg 			    {
1622181254a7Smrg 			      f11 = c[i + j * c_dim1];
1623181254a7Smrg 			      f12 = c[i + (j + 1) * c_dim1];
1624181254a7Smrg 			      f13 = c[i + (j + 2) * c_dim1];
1625181254a7Smrg 			      f14 = c[i + (j + 3) * c_dim1];
1626181254a7Smrg 			      i6 = ll + lsec - 1;
1627181254a7Smrg 			      for (l = ll; l <= i6; ++l)
1628181254a7Smrg 				{
1629181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1630181254a7Smrg 					  257] * b[l + j * b_dim1];
1631181254a7Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1632181254a7Smrg 					  257] * b[l + (j + 1) * b_dim1];
1633181254a7Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1634181254a7Smrg 					  257] * b[l + (j + 2) * b_dim1];
1635181254a7Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1636181254a7Smrg 					  257] * b[l + (j + 3) * b_dim1];
1637181254a7Smrg 				}
1638181254a7Smrg 			      c[i + j * c_dim1] = f11;
1639181254a7Smrg 			      c[i + (j + 1) * c_dim1] = f12;
1640181254a7Smrg 			      c[i + (j + 2) * c_dim1] = f13;
1641181254a7Smrg 			      c[i + (j + 3) * c_dim1] = f14;
1642181254a7Smrg 			    }
1643181254a7Smrg 			}
1644181254a7Smrg 		    }
1645181254a7Smrg 		  if (ujsec < jsec)
1646181254a7Smrg 		    {
1647181254a7Smrg 		      i4 = jj + jsec - 1;
1648181254a7Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
1649181254a7Smrg 			{
1650181254a7Smrg 			  i5 = ii + uisec - 1;
1651181254a7Smrg 			  for (i = ii; i <= i5; i += 4)
1652181254a7Smrg 			    {
1653181254a7Smrg 			      f11 = c[i + j * c_dim1];
1654181254a7Smrg 			      f21 = c[i + 1 + j * c_dim1];
1655181254a7Smrg 			      f31 = c[i + 2 + j * c_dim1];
1656181254a7Smrg 			      f41 = c[i + 3 + j * c_dim1];
1657181254a7Smrg 			      i6 = ll + lsec - 1;
1658181254a7Smrg 			      for (l = ll; l <= i6; ++l)
1659181254a7Smrg 				{
1660181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1661181254a7Smrg 					  257] * b[l + j * b_dim1];
1662181254a7Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1663181254a7Smrg 					  257] * b[l + j * b_dim1];
1664181254a7Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1665181254a7Smrg 					  257] * b[l + j * b_dim1];
1666181254a7Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1667181254a7Smrg 					  257] * b[l + j * b_dim1];
1668181254a7Smrg 				}
1669181254a7Smrg 			      c[i + j * c_dim1] = f11;
1670181254a7Smrg 			      c[i + 1 + j * c_dim1] = f21;
1671181254a7Smrg 			      c[i + 2 + j * c_dim1] = f31;
1672181254a7Smrg 			      c[i + 3 + j * c_dim1] = f41;
1673181254a7Smrg 			    }
1674181254a7Smrg 			  i5 = ii + isec - 1;
1675181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1676181254a7Smrg 			    {
1677181254a7Smrg 			      f11 = c[i + j * c_dim1];
1678181254a7Smrg 			      i6 = ll + lsec - 1;
1679181254a7Smrg 			      for (l = ll; l <= i6; ++l)
1680181254a7Smrg 				{
1681181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1682181254a7Smrg 					  257] * b[l + j * b_dim1];
1683181254a7Smrg 				}
1684181254a7Smrg 			      c[i + j * c_dim1] = f11;
1685181254a7Smrg 			    }
1686181254a7Smrg 			}
1687181254a7Smrg 		    }
1688181254a7Smrg 		}
1689181254a7Smrg 	    }
1690181254a7Smrg 	}
1691181254a7Smrg       free(t1);
1692181254a7Smrg       return;
1693181254a7Smrg     }
1694181254a7Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1695181254a7Smrg     {
1696181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
1697181254a7Smrg 	{
1698181254a7Smrg 	  const GFC_INTEGER_16 *restrict abase_x;
1699181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
1700181254a7Smrg 	  GFC_INTEGER_16 *restrict dest_y;
1701181254a7Smrg 	  GFC_INTEGER_16 s;
1702181254a7Smrg 
1703181254a7Smrg 	  for (y = 0; y < ycount; y++)
1704181254a7Smrg 	    {
1705181254a7Smrg 	      bbase_y = &bbase[y*bystride];
1706181254a7Smrg 	      dest_y = &dest[y*rystride];
1707181254a7Smrg 	      for (x = 0; x < xcount; x++)
1708181254a7Smrg 		{
1709181254a7Smrg 		  abase_x = &abase[x*axstride];
1710181254a7Smrg 		  s = (GFC_INTEGER_16) 0;
1711181254a7Smrg 		  for (n = 0; n < count; n++)
1712181254a7Smrg 		    s += abase_x[n] * bbase_y[n];
1713181254a7Smrg 		  dest_y[x] = s;
1714181254a7Smrg 		}
1715181254a7Smrg 	    }
1716181254a7Smrg 	}
1717181254a7Smrg       else
1718181254a7Smrg 	{
1719181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
1720181254a7Smrg 	  GFC_INTEGER_16 s;
1721181254a7Smrg 
1722181254a7Smrg 	  for (y = 0; y < ycount; y++)
1723181254a7Smrg 	    {
1724181254a7Smrg 	      bbase_y = &bbase[y*bystride];
1725181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
1726181254a7Smrg 	      for (n = 0; n < count; n++)
1727181254a7Smrg 		s += abase[n*axstride] * bbase_y[n];
1728181254a7Smrg 	      dest[y*rystride] = s;
1729181254a7Smrg 	    }
1730181254a7Smrg 	}
1731181254a7Smrg     }
1732181254a7Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1733181254a7Smrg     {
1734181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
1735181254a7Smrg       GFC_INTEGER_16 s;
1736181254a7Smrg 
1737181254a7Smrg       for (y = 0; y < ycount; y++)
1738181254a7Smrg 	{
1739181254a7Smrg 	  bbase_y = &bbase[y*bystride];
1740181254a7Smrg 	  s = (GFC_INTEGER_16) 0;
1741181254a7Smrg 	  for (n = 0; n < count; n++)
1742181254a7Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1743181254a7Smrg 	  dest[y*rxstride] = s;
1744181254a7Smrg 	}
1745181254a7Smrg     }
1746fb8a8121Smrg   else if (axstride < aystride)
1747fb8a8121Smrg     {
1748fb8a8121Smrg       for (y = 0; y < ycount; y++)
1749fb8a8121Smrg 	for (x = 0; x < xcount; x++)
1750fb8a8121Smrg 	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
1751fb8a8121Smrg 
1752fb8a8121Smrg       for (y = 0; y < ycount; y++)
1753fb8a8121Smrg 	for (n = 0; n < count; n++)
1754fb8a8121Smrg 	  for (x = 0; x < xcount; x++)
1755fb8a8121Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
1756fb8a8121Smrg 	    dest[x*rxstride + y*rystride] +=
1757fb8a8121Smrg 					abase[x*axstride + n*aystride] *
1758fb8a8121Smrg 					bbase[n*bxstride + y*bystride];
1759fb8a8121Smrg     }
1760181254a7Smrg   else
1761181254a7Smrg     {
1762181254a7Smrg       const GFC_INTEGER_16 *restrict abase_x;
1763181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
1764181254a7Smrg       GFC_INTEGER_16 *restrict dest_y;
1765181254a7Smrg       GFC_INTEGER_16 s;
1766181254a7Smrg 
1767181254a7Smrg       for (y = 0; y < ycount; y++)
1768181254a7Smrg 	{
1769181254a7Smrg 	  bbase_y = &bbase[y*bystride];
1770181254a7Smrg 	  dest_y = &dest[y*rystride];
1771181254a7Smrg 	  for (x = 0; x < xcount; x++)
1772181254a7Smrg 	    {
1773181254a7Smrg 	      abase_x = &abase[x*axstride];
1774181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
1775181254a7Smrg 	      for (n = 0; n < count; n++)
1776181254a7Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1777181254a7Smrg 	      dest_y[x*rxstride] = s;
1778181254a7Smrg 	    }
1779181254a7Smrg 	}
1780181254a7Smrg     }
1781181254a7Smrg }
1782181254a7Smrg #undef POW3
1783181254a7Smrg #undef min
1784181254a7Smrg #undef max
1785181254a7Smrg 
1786181254a7Smrg #endif  /* HAVE_AVX512F */
1787181254a7Smrg 
1788181254a7Smrg /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
1789181254a7Smrg 
1790181254a7Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1791181254a7Smrg void
1792181254a7Smrg matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
1793181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
1794181254a7Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1795181254a7Smrg internal_proto(matmul_i16_avx128_fma3);
1796181254a7Smrg #endif
1797181254a7Smrg 
1798181254a7Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1799181254a7Smrg void
1800181254a7Smrg matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
1801181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
1802181254a7Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1803181254a7Smrg internal_proto(matmul_i16_avx128_fma4);
1804181254a7Smrg #endif
1805181254a7Smrg 
1806181254a7Smrg /* Function to fall back to if there is no special processor-specific version.  */
1807181254a7Smrg static void
matmul_i16_vanilla(gfc_array_i16 * const restrict retarray,gfc_array_i16 * const restrict a,gfc_array_i16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1808181254a7Smrg matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
1809181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
1810181254a7Smrg 	int blas_limit, blas_call gemm)
1811181254a7Smrg {
1812181254a7Smrg   const GFC_INTEGER_16 * restrict abase;
1813181254a7Smrg   const GFC_INTEGER_16 * restrict bbase;
1814181254a7Smrg   GFC_INTEGER_16 * restrict dest;
1815181254a7Smrg 
1816181254a7Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1817181254a7Smrg   index_type x, y, n, count, xcount, ycount;
1818181254a7Smrg 
1819181254a7Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
1820181254a7Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
1821181254a7Smrg 
1822181254a7Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1823181254a7Smrg 
1824181254a7Smrg    Either A or B (but not both) can be rank 1:
1825181254a7Smrg 
1826181254a7Smrg    o One-dimensional argument A is implicitly treated as a row matrix
1827181254a7Smrg      dimensioned [1,count], so xcount=1.
1828181254a7Smrg 
1829181254a7Smrg    o One-dimensional argument B is implicitly treated as a column matrix
1830181254a7Smrg      dimensioned [count, 1], so ycount=1.
1831181254a7Smrg */
1832181254a7Smrg 
1833181254a7Smrg   if (retarray->base_addr == NULL)
1834181254a7Smrg     {
1835181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1836181254a7Smrg         {
1837181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1838181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1839181254a7Smrg         }
1840181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1841181254a7Smrg         {
1842181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1843181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1844181254a7Smrg         }
1845181254a7Smrg       else
1846181254a7Smrg         {
1847181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1848181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1849181254a7Smrg 
1850181254a7Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
1851181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1852181254a7Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1853181254a7Smrg         }
1854181254a7Smrg 
1855181254a7Smrg       retarray->base_addr
1856181254a7Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
1857181254a7Smrg       retarray->offset = 0;
1858181254a7Smrg     }
1859181254a7Smrg   else if (unlikely (compile_options.bounds_check))
1860181254a7Smrg     {
1861181254a7Smrg       index_type ret_extent, arg_extent;
1862181254a7Smrg 
1863181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1864181254a7Smrg 	{
1865181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1866181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1867181254a7Smrg 	  if (arg_extent != ret_extent)
1868181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1869181254a7Smrg 	    		   "array (%ld/%ld) ",
1870181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
1871181254a7Smrg 	}
1872181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1873181254a7Smrg 	{
1874181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1875181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1876181254a7Smrg 	  if (arg_extent != ret_extent)
1877181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1878181254a7Smrg 	    		   "array (%ld/%ld) ",
1879181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
1880181254a7Smrg 	}
1881181254a7Smrg       else
1882181254a7Smrg 	{
1883181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1884181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1885181254a7Smrg 	  if (arg_extent != ret_extent)
1886181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1887181254a7Smrg 	    		   "array (%ld/%ld) ",
1888181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
1889181254a7Smrg 
1890181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1891181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1892181254a7Smrg 	  if (arg_extent != ret_extent)
1893181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
1894181254a7Smrg 	    		   "array (%ld/%ld) ",
1895181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
1896181254a7Smrg 	}
1897181254a7Smrg     }
1898181254a7Smrg 
1899181254a7Smrg 
1900181254a7Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1901181254a7Smrg     {
1902181254a7Smrg       /* One-dimensional result may be addressed in the code below
1903181254a7Smrg 	 either as a row or a column matrix. We want both cases to
1904181254a7Smrg 	 work. */
1905181254a7Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1906181254a7Smrg     }
1907181254a7Smrg   else
1908181254a7Smrg     {
1909181254a7Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1910181254a7Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1911181254a7Smrg     }
1912181254a7Smrg 
1913181254a7Smrg 
1914181254a7Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
1915181254a7Smrg     {
1916181254a7Smrg       /* Treat it as a a row matrix A[1,count]. */
1917181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1918181254a7Smrg       aystride = 1;
1919181254a7Smrg 
1920181254a7Smrg       xcount = 1;
1921181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
1922181254a7Smrg     }
1923181254a7Smrg   else
1924181254a7Smrg     {
1925181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1926181254a7Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1927181254a7Smrg 
1928181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
1929181254a7Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1930181254a7Smrg     }
1931181254a7Smrg 
1932181254a7Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1933181254a7Smrg     {
1934181254a7Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1935181254a7Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1936181254a7Smrg 		       "in dimension 1: is %ld, should be %ld",
1937181254a7Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1938181254a7Smrg     }
1939181254a7Smrg 
1940181254a7Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
1941181254a7Smrg     {
1942181254a7Smrg       /* Treat it as a column matrix B[count,1] */
1943181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1944181254a7Smrg 
1945181254a7Smrg       /* bystride should never be used for 1-dimensional b.
1946181254a7Smrg          The value is only used for calculation of the
1947181254a7Smrg          memory by the buffer.  */
1948181254a7Smrg       bystride = 256;
1949181254a7Smrg       ycount = 1;
1950181254a7Smrg     }
1951181254a7Smrg   else
1952181254a7Smrg     {
1953181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1954181254a7Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1955181254a7Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1956181254a7Smrg     }
1957181254a7Smrg 
1958181254a7Smrg   abase = a->base_addr;
1959181254a7Smrg   bbase = b->base_addr;
1960181254a7Smrg   dest = retarray->base_addr;
1961181254a7Smrg 
1962181254a7Smrg   /* Now that everything is set up, we perform the multiplication
1963181254a7Smrg      itself.  */
1964181254a7Smrg 
1965181254a7Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1966181254a7Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
1967181254a7Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
1968181254a7Smrg 
1969181254a7Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1970181254a7Smrg       && (bxstride == 1 || bystride == 1)
1971181254a7Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
1972181254a7Smrg           > POW3(blas_limit)))
1973181254a7Smrg     {
1974181254a7Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
1975181254a7Smrg       const GFC_INTEGER_16 one = 1, zero = 0;
1976181254a7Smrg       const int lda = (axstride == 1) ? aystride : axstride,
1977181254a7Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
1978181254a7Smrg 
1979181254a7Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1980181254a7Smrg 	{
1981181254a7Smrg 	  assert (gemm != NULL);
1982181254a7Smrg 	  const char *transa, *transb;
1983181254a7Smrg 	  if (try_blas & 2)
1984181254a7Smrg 	    transa = "C";
1985181254a7Smrg 	  else
1986181254a7Smrg 	    transa = axstride == 1 ? "N" : "T";
1987181254a7Smrg 
1988181254a7Smrg 	  if (try_blas & 4)
1989181254a7Smrg 	    transb = "C";
1990181254a7Smrg 	  else
1991181254a7Smrg 	    transb = bxstride == 1 ? "N" : "T";
1992181254a7Smrg 
1993181254a7Smrg 	  gemm (transa, transb , &m,
1994181254a7Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1995181254a7Smrg 		&ldc, 1, 1);
1996181254a7Smrg 	  return;
1997181254a7Smrg 	}
1998181254a7Smrg     }
1999181254a7Smrg 
2000fb8a8121Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
2001fb8a8121Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
2002181254a7Smrg     {
2003181254a7Smrg       /* This block of code implements a tuned matmul, derived from
2004181254a7Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2005181254a7Smrg 
2006181254a7Smrg                Bo Kagstrom and Per Ling
2007181254a7Smrg                Department of Computing Science
2008181254a7Smrg                Umea University
2009181254a7Smrg                S-901 87 Umea, Sweden
2010181254a7Smrg 
2011181254a7Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2012181254a7Smrg 
2013181254a7Smrg       const GFC_INTEGER_16 *a, *b;
2014181254a7Smrg       GFC_INTEGER_16 *c;
2015181254a7Smrg       const index_type m = xcount, n = ycount, k = count;
2016181254a7Smrg 
2017181254a7Smrg       /* System generated locals */
2018181254a7Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2019181254a7Smrg 		 i1, i2, i3, i4, i5, i6;
2020181254a7Smrg 
2021181254a7Smrg       /* Local variables */
2022181254a7Smrg       GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
2023181254a7Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
2024181254a7Smrg       index_type i, j, l, ii, jj, ll;
2025181254a7Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2026181254a7Smrg       GFC_INTEGER_16 *t1;
2027181254a7Smrg 
2028181254a7Smrg       a = abase;
2029181254a7Smrg       b = bbase;
2030181254a7Smrg       c = retarray->base_addr;
2031181254a7Smrg 
2032181254a7Smrg       /* Parameter adjustments */
2033181254a7Smrg       c_dim1 = rystride;
2034181254a7Smrg       c_offset = 1 + c_dim1;
2035181254a7Smrg       c -= c_offset;
2036181254a7Smrg       a_dim1 = aystride;
2037181254a7Smrg       a_offset = 1 + a_dim1;
2038181254a7Smrg       a -= a_offset;
2039181254a7Smrg       b_dim1 = bystride;
2040181254a7Smrg       b_offset = 1 + b_dim1;
2041181254a7Smrg       b -= b_offset;
2042181254a7Smrg 
2043181254a7Smrg       /* Empty c first.  */
2044181254a7Smrg       for (j=1; j<=n; j++)
2045181254a7Smrg 	for (i=1; i<=m; i++)
2046181254a7Smrg 	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
2047181254a7Smrg 
2048181254a7Smrg       /* Early exit if possible */
2049181254a7Smrg       if (m == 0 || n == 0 || k == 0)
2050181254a7Smrg 	return;
2051181254a7Smrg 
2052181254a7Smrg       /* Adjust size of t1 to what is needed.  */
2053181254a7Smrg       index_type t1_dim, a_sz;
2054181254a7Smrg       if (aystride == 1)
2055181254a7Smrg         a_sz = rystride;
2056181254a7Smrg       else
2057181254a7Smrg         a_sz = a_dim1;
2058181254a7Smrg 
2059181254a7Smrg       t1_dim = a_sz * 256 + b_dim1;
2060181254a7Smrg       if (t1_dim > 65536)
2061181254a7Smrg 	t1_dim = 65536;
2062181254a7Smrg 
2063181254a7Smrg       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
2064181254a7Smrg 
2065181254a7Smrg       /* Start turning the crank. */
2066181254a7Smrg       i1 = n;
2067181254a7Smrg       for (jj = 1; jj <= i1; jj += 512)
2068181254a7Smrg 	{
2069181254a7Smrg 	  /* Computing MIN */
2070181254a7Smrg 	  i2 = 512;
2071181254a7Smrg 	  i3 = n - jj + 1;
2072181254a7Smrg 	  jsec = min(i2,i3);
2073181254a7Smrg 	  ujsec = jsec - jsec % 4;
2074181254a7Smrg 	  i2 = k;
2075181254a7Smrg 	  for (ll = 1; ll <= i2; ll += 256)
2076181254a7Smrg 	    {
2077181254a7Smrg 	      /* Computing MIN */
2078181254a7Smrg 	      i3 = 256;
2079181254a7Smrg 	      i4 = k - ll + 1;
2080181254a7Smrg 	      lsec = min(i3,i4);
2081181254a7Smrg 	      ulsec = lsec - lsec % 2;
2082181254a7Smrg 
2083181254a7Smrg 	      i3 = m;
2084181254a7Smrg 	      for (ii = 1; ii <= i3; ii += 256)
2085181254a7Smrg 		{
2086181254a7Smrg 		  /* Computing MIN */
2087181254a7Smrg 		  i4 = 256;
2088181254a7Smrg 		  i5 = m - ii + 1;
2089181254a7Smrg 		  isec = min(i4,i5);
2090181254a7Smrg 		  uisec = isec - isec % 2;
2091181254a7Smrg 		  i4 = ll + ulsec - 1;
2092181254a7Smrg 		  for (l = ll; l <= i4; l += 2)
2093181254a7Smrg 		    {
2094181254a7Smrg 		      i5 = ii + uisec - 1;
2095181254a7Smrg 		      for (i = ii; i <= i5; i += 2)
2096181254a7Smrg 			{
2097181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2098181254a7Smrg 					a[i + l * a_dim1];
2099181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2100181254a7Smrg 					a[i + (l + 1) * a_dim1];
2101181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2102181254a7Smrg 					a[i + 1 + l * a_dim1];
2103181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2104181254a7Smrg 					a[i + 1 + (l + 1) * a_dim1];
2105181254a7Smrg 			}
2106181254a7Smrg 		      if (uisec < isec)
2107181254a7Smrg 			{
2108181254a7Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
2109181254a7Smrg 				    a[ii + isec - 1 + l * a_dim1];
2110181254a7Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
2111181254a7Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2112181254a7Smrg 			}
2113181254a7Smrg 		    }
2114181254a7Smrg 		  if (ulsec < lsec)
2115181254a7Smrg 		    {
2116181254a7Smrg 		      i4 = ii + isec - 1;
2117181254a7Smrg 		      for (i = ii; i<= i4; ++i)
2118181254a7Smrg 			{
2119181254a7Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2120181254a7Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
2121181254a7Smrg 			}
2122181254a7Smrg 		    }
2123181254a7Smrg 
2124181254a7Smrg 		  uisec = isec - isec % 4;
2125181254a7Smrg 		  i4 = jj + ujsec - 1;
2126181254a7Smrg 		  for (j = jj; j <= i4; j += 4)
2127181254a7Smrg 		    {
2128181254a7Smrg 		      i5 = ii + uisec - 1;
2129181254a7Smrg 		      for (i = ii; i <= i5; i += 4)
2130181254a7Smrg 			{
2131181254a7Smrg 			  f11 = c[i + j * c_dim1];
2132181254a7Smrg 			  f21 = c[i + 1 + j * c_dim1];
2133181254a7Smrg 			  f12 = c[i + (j + 1) * c_dim1];
2134181254a7Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2135181254a7Smrg 			  f13 = c[i + (j + 2) * c_dim1];
2136181254a7Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2137181254a7Smrg 			  f14 = c[i + (j + 3) * c_dim1];
2138181254a7Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2139181254a7Smrg 			  f31 = c[i + 2 + j * c_dim1];
2140181254a7Smrg 			  f41 = c[i + 3 + j * c_dim1];
2141181254a7Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2142181254a7Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2143181254a7Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2144181254a7Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2145181254a7Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2146181254a7Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2147181254a7Smrg 			  i6 = ll + lsec - 1;
2148181254a7Smrg 			  for (l = ll; l <= i6; ++l)
2149181254a7Smrg 			    {
2150181254a7Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151181254a7Smrg 				      * b[l + j * b_dim1];
2152181254a7Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153181254a7Smrg 				      * b[l + j * b_dim1];
2154181254a7Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
2156181254a7Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
2158181254a7Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
2160181254a7Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
2162181254a7Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2163181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
2164181254a7Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2165181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
2166181254a7Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167181254a7Smrg 				      * b[l + j * b_dim1];
2168181254a7Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169181254a7Smrg 				      * b[l + j * b_dim1];
2170181254a7Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
2172181254a7Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
2174181254a7Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
2176181254a7Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
2178181254a7Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2179181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
2180181254a7Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2181181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
2182181254a7Smrg 			    }
2183181254a7Smrg 			  c[i + j * c_dim1] = f11;
2184181254a7Smrg 			  c[i + 1 + j * c_dim1] = f21;
2185181254a7Smrg 			  c[i + (j + 1) * c_dim1] = f12;
2186181254a7Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2187181254a7Smrg 			  c[i + (j + 2) * c_dim1] = f13;
2188181254a7Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2189181254a7Smrg 			  c[i + (j + 3) * c_dim1] = f14;
2190181254a7Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2191181254a7Smrg 			  c[i + 2 + j * c_dim1] = f31;
2192181254a7Smrg 			  c[i + 3 + j * c_dim1] = f41;
2193181254a7Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2194181254a7Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2195181254a7Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2196181254a7Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2197181254a7Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2198181254a7Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2199181254a7Smrg 			}
2200181254a7Smrg 		      if (uisec < isec)
2201181254a7Smrg 			{
2202181254a7Smrg 			  i5 = ii + isec - 1;
2203181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2204181254a7Smrg 			    {
2205181254a7Smrg 			      f11 = c[i + j * c_dim1];
2206181254a7Smrg 			      f12 = c[i + (j + 1) * c_dim1];
2207181254a7Smrg 			      f13 = c[i + (j + 2) * c_dim1];
2208181254a7Smrg 			      f14 = c[i + (j + 3) * c_dim1];
2209181254a7Smrg 			      i6 = ll + lsec - 1;
2210181254a7Smrg 			      for (l = ll; l <= i6; ++l)
2211181254a7Smrg 				{
2212181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213181254a7Smrg 					  257] * b[l + j * b_dim1];
2214181254a7Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215181254a7Smrg 					  257] * b[l + (j + 1) * b_dim1];
2216181254a7Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217181254a7Smrg 					  257] * b[l + (j + 2) * b_dim1];
2218181254a7Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2219181254a7Smrg 					  257] * b[l + (j + 3) * b_dim1];
2220181254a7Smrg 				}
2221181254a7Smrg 			      c[i + j * c_dim1] = f11;
2222181254a7Smrg 			      c[i + (j + 1) * c_dim1] = f12;
2223181254a7Smrg 			      c[i + (j + 2) * c_dim1] = f13;
2224181254a7Smrg 			      c[i + (j + 3) * c_dim1] = f14;
2225181254a7Smrg 			    }
2226181254a7Smrg 			}
2227181254a7Smrg 		    }
2228181254a7Smrg 		  if (ujsec < jsec)
2229181254a7Smrg 		    {
2230181254a7Smrg 		      i4 = jj + jsec - 1;
2231181254a7Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
2232181254a7Smrg 			{
2233181254a7Smrg 			  i5 = ii + uisec - 1;
2234181254a7Smrg 			  for (i = ii; i <= i5; i += 4)
2235181254a7Smrg 			    {
2236181254a7Smrg 			      f11 = c[i + j * c_dim1];
2237181254a7Smrg 			      f21 = c[i + 1 + j * c_dim1];
2238181254a7Smrg 			      f31 = c[i + 2 + j * c_dim1];
2239181254a7Smrg 			      f41 = c[i + 3 + j * c_dim1];
2240181254a7Smrg 			      i6 = ll + lsec - 1;
2241181254a7Smrg 			      for (l = ll; l <= i6; ++l)
2242181254a7Smrg 				{
2243181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2244181254a7Smrg 					  257] * b[l + j * b_dim1];
2245181254a7Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2246181254a7Smrg 					  257] * b[l + j * b_dim1];
2247181254a7Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2248181254a7Smrg 					  257] * b[l + j * b_dim1];
2249181254a7Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2250181254a7Smrg 					  257] * b[l + j * b_dim1];
2251181254a7Smrg 				}
2252181254a7Smrg 			      c[i + j * c_dim1] = f11;
2253181254a7Smrg 			      c[i + 1 + j * c_dim1] = f21;
2254181254a7Smrg 			      c[i + 2 + j * c_dim1] = f31;
2255181254a7Smrg 			      c[i + 3 + j * c_dim1] = f41;
2256181254a7Smrg 			    }
2257181254a7Smrg 			  i5 = ii + isec - 1;
2258181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2259181254a7Smrg 			    {
2260181254a7Smrg 			      f11 = c[i + j * c_dim1];
2261181254a7Smrg 			      i6 = ll + lsec - 1;
2262181254a7Smrg 			      for (l = ll; l <= i6; ++l)
2263181254a7Smrg 				{
2264181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2265181254a7Smrg 					  257] * b[l + j * b_dim1];
2266181254a7Smrg 				}
2267181254a7Smrg 			      c[i + j * c_dim1] = f11;
2268181254a7Smrg 			    }
2269181254a7Smrg 			}
2270181254a7Smrg 		    }
2271181254a7Smrg 		}
2272181254a7Smrg 	    }
2273181254a7Smrg 	}
2274181254a7Smrg       free(t1);
2275181254a7Smrg       return;
2276181254a7Smrg     }
2277181254a7Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2278181254a7Smrg     {
2279181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
2280181254a7Smrg 	{
2281181254a7Smrg 	  const GFC_INTEGER_16 *restrict abase_x;
2282181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
2283181254a7Smrg 	  GFC_INTEGER_16 *restrict dest_y;
2284181254a7Smrg 	  GFC_INTEGER_16 s;
2285181254a7Smrg 
2286181254a7Smrg 	  for (y = 0; y < ycount; y++)
2287181254a7Smrg 	    {
2288181254a7Smrg 	      bbase_y = &bbase[y*bystride];
2289181254a7Smrg 	      dest_y = &dest[y*rystride];
2290181254a7Smrg 	      for (x = 0; x < xcount; x++)
2291181254a7Smrg 		{
2292181254a7Smrg 		  abase_x = &abase[x*axstride];
2293181254a7Smrg 		  s = (GFC_INTEGER_16) 0;
2294181254a7Smrg 		  for (n = 0; n < count; n++)
2295181254a7Smrg 		    s += abase_x[n] * bbase_y[n];
2296181254a7Smrg 		  dest_y[x] = s;
2297181254a7Smrg 		}
2298181254a7Smrg 	    }
2299181254a7Smrg 	}
2300181254a7Smrg       else
2301181254a7Smrg 	{
2302181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
2303181254a7Smrg 	  GFC_INTEGER_16 s;
2304181254a7Smrg 
2305181254a7Smrg 	  for (y = 0; y < ycount; y++)
2306181254a7Smrg 	    {
2307181254a7Smrg 	      bbase_y = &bbase[y*bystride];
2308181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
2309181254a7Smrg 	      for (n = 0; n < count; n++)
2310181254a7Smrg 		s += abase[n*axstride] * bbase_y[n];
2311181254a7Smrg 	      dest[y*rystride] = s;
2312181254a7Smrg 	    }
2313181254a7Smrg 	}
2314181254a7Smrg     }
2315181254a7Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2316181254a7Smrg     {
2317181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
2318181254a7Smrg       GFC_INTEGER_16 s;
2319181254a7Smrg 
2320181254a7Smrg       for (y = 0; y < ycount; y++)
2321181254a7Smrg 	{
2322181254a7Smrg 	  bbase_y = &bbase[y*bystride];
2323181254a7Smrg 	  s = (GFC_INTEGER_16) 0;
2324181254a7Smrg 	  for (n = 0; n < count; n++)
2325181254a7Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2326181254a7Smrg 	  dest[y*rxstride] = s;
2327181254a7Smrg 	}
2328181254a7Smrg     }
2329fb8a8121Smrg   else if (axstride < aystride)
2330fb8a8121Smrg     {
2331fb8a8121Smrg       for (y = 0; y < ycount; y++)
2332fb8a8121Smrg 	for (x = 0; x < xcount; x++)
2333fb8a8121Smrg 	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
2334fb8a8121Smrg 
2335fb8a8121Smrg       for (y = 0; y < ycount; y++)
2336fb8a8121Smrg 	for (n = 0; n < count; n++)
2337fb8a8121Smrg 	  for (x = 0; x < xcount; x++)
2338fb8a8121Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
2339fb8a8121Smrg 	    dest[x*rxstride + y*rystride] +=
2340fb8a8121Smrg 					abase[x*axstride + n*aystride] *
2341fb8a8121Smrg 					bbase[n*bxstride + y*bystride];
2342fb8a8121Smrg     }
2343181254a7Smrg   else
2344181254a7Smrg     {
2345181254a7Smrg       const GFC_INTEGER_16 *restrict abase_x;
2346181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
2347181254a7Smrg       GFC_INTEGER_16 *restrict dest_y;
2348181254a7Smrg       GFC_INTEGER_16 s;
2349181254a7Smrg 
2350181254a7Smrg       for (y = 0; y < ycount; y++)
2351181254a7Smrg 	{
2352181254a7Smrg 	  bbase_y = &bbase[y*bystride];
2353181254a7Smrg 	  dest_y = &dest[y*rystride];
2354181254a7Smrg 	  for (x = 0; x < xcount; x++)
2355181254a7Smrg 	    {
2356181254a7Smrg 	      abase_x = &abase[x*axstride];
2357181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
2358181254a7Smrg 	      for (n = 0; n < count; n++)
2359181254a7Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2360181254a7Smrg 	      dest_y[x*rxstride] = s;
2361181254a7Smrg 	    }
2362181254a7Smrg 	}
2363181254a7Smrg     }
2364181254a7Smrg }
2365181254a7Smrg #undef POW3
2366181254a7Smrg #undef min
2367181254a7Smrg #undef max
2368181254a7Smrg 
2369181254a7Smrg 
2370181254a7Smrg /* Compiling main function, with selection code for the processor.  */
2371181254a7Smrg 
2372181254a7Smrg /* Currently, this is i386 only.  Adjust for other architectures.  */
2373181254a7Smrg 
matmul_i16(gfc_array_i16 * const restrict retarray,gfc_array_i16 * const restrict a,gfc_array_i16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2374181254a7Smrg void matmul_i16 (gfc_array_i16 * const restrict retarray,
2375181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
2376181254a7Smrg 	int blas_limit, blas_call gemm)
2377181254a7Smrg {
2378181254a7Smrg   static void (*matmul_p) (gfc_array_i16 * const restrict retarray,
2379181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
2380181254a7Smrg 	int blas_limit, blas_call gemm);
2381181254a7Smrg 
2382181254a7Smrg   void (*matmul_fn) (gfc_array_i16 * const restrict retarray,
2383181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
2384181254a7Smrg 	int blas_limit, blas_call gemm);
2385181254a7Smrg 
2386181254a7Smrg   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2387181254a7Smrg   if (matmul_fn == NULL)
2388181254a7Smrg     {
2389181254a7Smrg       matmul_fn = matmul_i16_vanilla;
2390*b1e83836Smrg       if (__builtin_cpu_is ("intel"))
2391181254a7Smrg 	{
2392181254a7Smrg           /* Run down the available processors in order of preference.  */
2393181254a7Smrg #ifdef HAVE_AVX512F
2394*b1e83836Smrg 	  if (__builtin_cpu_supports ("avx512f"))
2395181254a7Smrg 	    {
2396181254a7Smrg 	      matmul_fn = matmul_i16_avx512f;
2397181254a7Smrg 	      goto store;
2398181254a7Smrg 	    }
2399181254a7Smrg 
2400181254a7Smrg #endif  /* HAVE_AVX512F */
2401181254a7Smrg 
2402181254a7Smrg #ifdef HAVE_AVX2
2403*b1e83836Smrg 	  if (__builtin_cpu_supports ("avx2")
2404*b1e83836Smrg 	      && __builtin_cpu_supports ("fma"))
2405181254a7Smrg 	    {
2406181254a7Smrg 	      matmul_fn = matmul_i16_avx2;
2407181254a7Smrg 	      goto store;
2408181254a7Smrg 	    }
2409181254a7Smrg 
2410181254a7Smrg #endif
2411181254a7Smrg 
2412181254a7Smrg #ifdef HAVE_AVX
2413*b1e83836Smrg 	  if (__builtin_cpu_supports ("avx"))
2414181254a7Smrg  	    {
2415181254a7Smrg               matmul_fn = matmul_i16_avx;
2416181254a7Smrg 	      goto store;
2417181254a7Smrg 	    }
2418181254a7Smrg #endif  /* HAVE_AVX */
2419181254a7Smrg         }
2420*b1e83836Smrg     else if (__builtin_cpu_is ("amd"))
2421181254a7Smrg       {
2422181254a7Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2423*b1e83836Smrg 	if (__builtin_cpu_supports ("avx")
2424*b1e83836Smrg 	    && __builtin_cpu_supports ("fma"))
2425181254a7Smrg 	  {
2426181254a7Smrg             matmul_fn = matmul_i16_avx128_fma3;
2427181254a7Smrg 	    goto store;
2428181254a7Smrg 	  }
2429181254a7Smrg #endif
2430181254a7Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2431*b1e83836Smrg 	if (__builtin_cpu_supports ("avx")
2432*b1e83836Smrg 	    && __builtin_cpu_supports ("fma4"))
2433181254a7Smrg 	  {
2434181254a7Smrg             matmul_fn = matmul_i16_avx128_fma4;
2435181254a7Smrg 	    goto store;
2436181254a7Smrg 	  }
2437181254a7Smrg #endif
2438181254a7Smrg 
2439181254a7Smrg       }
2440181254a7Smrg    store:
2441181254a7Smrg       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2442181254a7Smrg    }
2443181254a7Smrg 
2444181254a7Smrg    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2445181254a7Smrg }
2446181254a7Smrg 
2447181254a7Smrg #else  /* Just the vanilla function.  */
2448181254a7Smrg 
2449181254a7Smrg void
matmul_i16(gfc_array_i16 * const restrict retarray,gfc_array_i16 * const restrict a,gfc_array_i16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2450181254a7Smrg matmul_i16 (gfc_array_i16 * const restrict retarray,
2451181254a7Smrg 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
2452181254a7Smrg 	int blas_limit, blas_call gemm)
2453181254a7Smrg {
2454181254a7Smrg   const GFC_INTEGER_16 * restrict abase;
2455181254a7Smrg   const GFC_INTEGER_16 * restrict bbase;
2456181254a7Smrg   GFC_INTEGER_16 * restrict dest;
2457181254a7Smrg 
2458181254a7Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2459181254a7Smrg   index_type x, y, n, count, xcount, ycount;
2460181254a7Smrg 
2461181254a7Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
2462181254a7Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
2463181254a7Smrg 
2464181254a7Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2465181254a7Smrg 
2466181254a7Smrg    Either A or B (but not both) can be rank 1:
2467181254a7Smrg 
2468181254a7Smrg    o One-dimensional argument A is implicitly treated as a row matrix
2469181254a7Smrg      dimensioned [1,count], so xcount=1.
2470181254a7Smrg 
2471181254a7Smrg    o One-dimensional argument B is implicitly treated as a column matrix
2472181254a7Smrg      dimensioned [count, 1], so ycount=1.
2473181254a7Smrg */
2474181254a7Smrg 
2475181254a7Smrg   if (retarray->base_addr == NULL)
2476181254a7Smrg     {
2477181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
2478181254a7Smrg         {
2479181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2480181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2481181254a7Smrg         }
2482181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2483181254a7Smrg         {
2484181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2485181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2486181254a7Smrg         }
2487181254a7Smrg       else
2488181254a7Smrg         {
2489181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2490181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2491181254a7Smrg 
2492181254a7Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
2493181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2494181254a7Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
2495181254a7Smrg         }
2496181254a7Smrg 
2497181254a7Smrg       retarray->base_addr
2498181254a7Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
2499181254a7Smrg       retarray->offset = 0;
2500181254a7Smrg     }
2501181254a7Smrg   else if (unlikely (compile_options.bounds_check))
2502181254a7Smrg     {
2503181254a7Smrg       index_type ret_extent, arg_extent;
2504181254a7Smrg 
2505181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
2506181254a7Smrg 	{
2507181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2508181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2509181254a7Smrg 	  if (arg_extent != ret_extent)
2510181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
2511181254a7Smrg 	    		   "array (%ld/%ld) ",
2512181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
2513181254a7Smrg 	}
2514181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2515181254a7Smrg 	{
2516181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2517181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2518181254a7Smrg 	  if (arg_extent != ret_extent)
2519181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
2520181254a7Smrg 	    		   "array (%ld/%ld) ",
2521181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
2522181254a7Smrg 	}
2523181254a7Smrg       else
2524181254a7Smrg 	{
2525181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2526181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2527181254a7Smrg 	  if (arg_extent != ret_extent)
2528181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
2529181254a7Smrg 	    		   "array (%ld/%ld) ",
2530181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
2531181254a7Smrg 
2532181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2533181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2534181254a7Smrg 	  if (arg_extent != ret_extent)
2535181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
2536181254a7Smrg 	    		   "array (%ld/%ld) ",
2537181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
2538181254a7Smrg 	}
2539181254a7Smrg     }
2540181254a7Smrg 
2541181254a7Smrg 
2542181254a7Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2543181254a7Smrg     {
2544181254a7Smrg       /* One-dimensional result may be addressed in the code below
2545181254a7Smrg 	 either as a row or a column matrix. We want both cases to
2546181254a7Smrg 	 work. */
2547181254a7Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2548181254a7Smrg     }
2549181254a7Smrg   else
2550181254a7Smrg     {
2551181254a7Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2552181254a7Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2553181254a7Smrg     }
2554181254a7Smrg 
2555181254a7Smrg 
2556181254a7Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
2557181254a7Smrg     {
2558181254a7Smrg       /* Treat it as a a row matrix A[1,count]. */
2559181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2560181254a7Smrg       aystride = 1;
2561181254a7Smrg 
2562181254a7Smrg       xcount = 1;
2563181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
2564181254a7Smrg     }
2565181254a7Smrg   else
2566181254a7Smrg     {
2567181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2568181254a7Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2569181254a7Smrg 
2570181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
2571181254a7Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2572181254a7Smrg     }
2573181254a7Smrg 
2574181254a7Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2575181254a7Smrg     {
2576181254a7Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2577181254a7Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2578181254a7Smrg 		       "in dimension 1: is %ld, should be %ld",
2579181254a7Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2580181254a7Smrg     }
2581181254a7Smrg 
2582181254a7Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
2583181254a7Smrg     {
2584181254a7Smrg       /* Treat it as a column matrix B[count,1] */
2585181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2586181254a7Smrg 
2587181254a7Smrg       /* bystride should never be used for 1-dimensional b.
2588181254a7Smrg          The value is only used for calculation of the
2589181254a7Smrg          memory by the buffer.  */
2590181254a7Smrg       bystride = 256;
2591181254a7Smrg       ycount = 1;
2592181254a7Smrg     }
2593181254a7Smrg   else
2594181254a7Smrg     {
2595181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2596181254a7Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2597181254a7Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2598181254a7Smrg     }
2599181254a7Smrg 
2600181254a7Smrg   abase = a->base_addr;
2601181254a7Smrg   bbase = b->base_addr;
2602181254a7Smrg   dest = retarray->base_addr;
2603181254a7Smrg 
2604181254a7Smrg   /* Now that everything is set up, we perform the multiplication
2605181254a7Smrg      itself.  */
2606181254a7Smrg 
2607181254a7Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2608181254a7Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
2609181254a7Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
2610181254a7Smrg 
2611181254a7Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2612181254a7Smrg       && (bxstride == 1 || bystride == 1)
2613181254a7Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
2614181254a7Smrg           > POW3(blas_limit)))
2615181254a7Smrg     {
2616181254a7Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
2617181254a7Smrg       const GFC_INTEGER_16 one = 1, zero = 0;
2618181254a7Smrg       const int lda = (axstride == 1) ? aystride : axstride,
2619181254a7Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
2620181254a7Smrg 
2621181254a7Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2622181254a7Smrg 	{
2623181254a7Smrg 	  assert (gemm != NULL);
2624181254a7Smrg 	  const char *transa, *transb;
2625181254a7Smrg 	  if (try_blas & 2)
2626181254a7Smrg 	    transa = "C";
2627181254a7Smrg 	  else
2628181254a7Smrg 	    transa = axstride == 1 ? "N" : "T";
2629181254a7Smrg 
2630181254a7Smrg 	  if (try_blas & 4)
2631181254a7Smrg 	    transb = "C";
2632181254a7Smrg 	  else
2633181254a7Smrg 	    transb = bxstride == 1 ? "N" : "T";
2634181254a7Smrg 
2635181254a7Smrg 	  gemm (transa, transb , &m,
2636181254a7Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
2637181254a7Smrg 		&ldc, 1, 1);
2638181254a7Smrg 	  return;
2639181254a7Smrg 	}
2640181254a7Smrg     }
2641181254a7Smrg 
2642fb8a8121Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
2643fb8a8121Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
2644181254a7Smrg     {
2645181254a7Smrg       /* This block of code implements a tuned matmul, derived from
2646181254a7Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2647181254a7Smrg 
2648181254a7Smrg                Bo Kagstrom and Per Ling
2649181254a7Smrg                Department of Computing Science
2650181254a7Smrg                Umea University
2651181254a7Smrg                S-901 87 Umea, Sweden
2652181254a7Smrg 
2653181254a7Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2654181254a7Smrg 
2655181254a7Smrg       const GFC_INTEGER_16 *a, *b;
2656181254a7Smrg       GFC_INTEGER_16 *c;
2657181254a7Smrg       const index_type m = xcount, n = ycount, k = count;
2658181254a7Smrg 
2659181254a7Smrg       /* System generated locals */
2660181254a7Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2661181254a7Smrg 		 i1, i2, i3, i4, i5, i6;
2662181254a7Smrg 
2663181254a7Smrg       /* Local variables */
2664181254a7Smrg       GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
2665181254a7Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
2666181254a7Smrg       index_type i, j, l, ii, jj, ll;
2667181254a7Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2668181254a7Smrg       GFC_INTEGER_16 *t1;
2669181254a7Smrg 
2670181254a7Smrg       a = abase;
2671181254a7Smrg       b = bbase;
2672181254a7Smrg       c = retarray->base_addr;
2673181254a7Smrg 
2674181254a7Smrg       /* Parameter adjustments */
2675181254a7Smrg       c_dim1 = rystride;
2676181254a7Smrg       c_offset = 1 + c_dim1;
2677181254a7Smrg       c -= c_offset;
2678181254a7Smrg       a_dim1 = aystride;
2679181254a7Smrg       a_offset = 1 + a_dim1;
2680181254a7Smrg       a -= a_offset;
2681181254a7Smrg       b_dim1 = bystride;
2682181254a7Smrg       b_offset = 1 + b_dim1;
2683181254a7Smrg       b -= b_offset;
2684181254a7Smrg 
2685181254a7Smrg       /* Empty c first.  */
2686181254a7Smrg       for (j=1; j<=n; j++)
2687181254a7Smrg 	for (i=1; i<=m; i++)
2688181254a7Smrg 	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
2689181254a7Smrg 
2690181254a7Smrg       /* Early exit if possible */
2691181254a7Smrg       if (m == 0 || n == 0 || k == 0)
2692181254a7Smrg 	return;
2693181254a7Smrg 
2694181254a7Smrg       /* Adjust size of t1 to what is needed.  */
2695181254a7Smrg       index_type t1_dim, a_sz;
2696181254a7Smrg       if (aystride == 1)
2697181254a7Smrg         a_sz = rystride;
2698181254a7Smrg       else
2699181254a7Smrg         a_sz = a_dim1;
2700181254a7Smrg 
2701181254a7Smrg       t1_dim = a_sz * 256 + b_dim1;
2702181254a7Smrg       if (t1_dim > 65536)
2703181254a7Smrg 	t1_dim = 65536;
2704181254a7Smrg 
2705181254a7Smrg       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
2706181254a7Smrg 
2707181254a7Smrg       /* Start turning the crank. */
2708181254a7Smrg       i1 = n;
2709181254a7Smrg       for (jj = 1; jj <= i1; jj += 512)
2710181254a7Smrg 	{
2711181254a7Smrg 	  /* Computing MIN */
2712181254a7Smrg 	  i2 = 512;
2713181254a7Smrg 	  i3 = n - jj + 1;
2714181254a7Smrg 	  jsec = min(i2,i3);
2715181254a7Smrg 	  ujsec = jsec - jsec % 4;
2716181254a7Smrg 	  i2 = k;
2717181254a7Smrg 	  for (ll = 1; ll <= i2; ll += 256)
2718181254a7Smrg 	    {
2719181254a7Smrg 	      /* Computing MIN */
2720181254a7Smrg 	      i3 = 256;
2721181254a7Smrg 	      i4 = k - ll + 1;
2722181254a7Smrg 	      lsec = min(i3,i4);
2723181254a7Smrg 	      ulsec = lsec - lsec % 2;
2724181254a7Smrg 
2725181254a7Smrg 	      i3 = m;
2726181254a7Smrg 	      for (ii = 1; ii <= i3; ii += 256)
2727181254a7Smrg 		{
2728181254a7Smrg 		  /* Computing MIN */
2729181254a7Smrg 		  i4 = 256;
2730181254a7Smrg 		  i5 = m - ii + 1;
2731181254a7Smrg 		  isec = min(i4,i5);
2732181254a7Smrg 		  uisec = isec - isec % 2;
2733181254a7Smrg 		  i4 = ll + ulsec - 1;
2734181254a7Smrg 		  for (l = ll; l <= i4; l += 2)
2735181254a7Smrg 		    {
2736181254a7Smrg 		      i5 = ii + uisec - 1;
2737181254a7Smrg 		      for (i = ii; i <= i5; i += 2)
2738181254a7Smrg 			{
2739181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2740181254a7Smrg 					a[i + l * a_dim1];
2741181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2742181254a7Smrg 					a[i + (l + 1) * a_dim1];
2743181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2744181254a7Smrg 					a[i + 1 + l * a_dim1];
2745181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2746181254a7Smrg 					a[i + 1 + (l + 1) * a_dim1];
2747181254a7Smrg 			}
2748181254a7Smrg 		      if (uisec < isec)
2749181254a7Smrg 			{
2750181254a7Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
2751181254a7Smrg 				    a[ii + isec - 1 + l * a_dim1];
2752181254a7Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
2753181254a7Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2754181254a7Smrg 			}
2755181254a7Smrg 		    }
2756181254a7Smrg 		  if (ulsec < lsec)
2757181254a7Smrg 		    {
2758181254a7Smrg 		      i4 = ii + isec - 1;
2759181254a7Smrg 		      for (i = ii; i<= i4; ++i)
2760181254a7Smrg 			{
2761181254a7Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2762181254a7Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
2763181254a7Smrg 			}
2764181254a7Smrg 		    }
2765181254a7Smrg 
2766181254a7Smrg 		  uisec = isec - isec % 4;
2767181254a7Smrg 		  i4 = jj + ujsec - 1;
2768181254a7Smrg 		  for (j = jj; j <= i4; j += 4)
2769181254a7Smrg 		    {
2770181254a7Smrg 		      i5 = ii + uisec - 1;
2771181254a7Smrg 		      for (i = ii; i <= i5; i += 4)
2772181254a7Smrg 			{
2773181254a7Smrg 			  f11 = c[i + j * c_dim1];
2774181254a7Smrg 			  f21 = c[i + 1 + j * c_dim1];
2775181254a7Smrg 			  f12 = c[i + (j + 1) * c_dim1];
2776181254a7Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2777181254a7Smrg 			  f13 = c[i + (j + 2) * c_dim1];
2778181254a7Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2779181254a7Smrg 			  f14 = c[i + (j + 3) * c_dim1];
2780181254a7Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2781181254a7Smrg 			  f31 = c[i + 2 + j * c_dim1];
2782181254a7Smrg 			  f41 = c[i + 3 + j * c_dim1];
2783181254a7Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2784181254a7Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2785181254a7Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2786181254a7Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2787181254a7Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2788181254a7Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2789181254a7Smrg 			  i6 = ll + lsec - 1;
2790181254a7Smrg 			  for (l = ll; l <= i6; ++l)
2791181254a7Smrg 			    {
2792181254a7Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2793181254a7Smrg 				      * b[l + j * b_dim1];
2794181254a7Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2795181254a7Smrg 				      * b[l + j * b_dim1];
2796181254a7Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2797181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
2798181254a7Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2799181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
2800181254a7Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2801181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
2802181254a7Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2803181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
2804181254a7Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2805181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
2806181254a7Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2807181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
2808181254a7Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2809181254a7Smrg 				      * b[l + j * b_dim1];
2810181254a7Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2811181254a7Smrg 				      * b[l + j * b_dim1];
2812181254a7Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2813181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
2814181254a7Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2815181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
2816181254a7Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2817181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
2818181254a7Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2819181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
2820181254a7Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2821181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
2822181254a7Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2823181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
2824181254a7Smrg 			    }
2825181254a7Smrg 			  c[i + j * c_dim1] = f11;
2826181254a7Smrg 			  c[i + 1 + j * c_dim1] = f21;
2827181254a7Smrg 			  c[i + (j + 1) * c_dim1] = f12;
2828181254a7Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2829181254a7Smrg 			  c[i + (j + 2) * c_dim1] = f13;
2830181254a7Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2831181254a7Smrg 			  c[i + (j + 3) * c_dim1] = f14;
2832181254a7Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2833181254a7Smrg 			  c[i + 2 + j * c_dim1] = f31;
2834181254a7Smrg 			  c[i + 3 + j * c_dim1] = f41;
2835181254a7Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2836181254a7Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2837181254a7Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2838181254a7Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2839181254a7Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2840181254a7Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2841181254a7Smrg 			}
2842181254a7Smrg 		      if (uisec < isec)
2843181254a7Smrg 			{
2844181254a7Smrg 			  i5 = ii + isec - 1;
2845181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2846181254a7Smrg 			    {
2847181254a7Smrg 			      f11 = c[i + j * c_dim1];
2848181254a7Smrg 			      f12 = c[i + (j + 1) * c_dim1];
2849181254a7Smrg 			      f13 = c[i + (j + 2) * c_dim1];
2850181254a7Smrg 			      f14 = c[i + (j + 3) * c_dim1];
2851181254a7Smrg 			      i6 = ll + lsec - 1;
2852181254a7Smrg 			      for (l = ll; l <= i6; ++l)
2853181254a7Smrg 				{
2854181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2855181254a7Smrg 					  257] * b[l + j * b_dim1];
2856181254a7Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2857181254a7Smrg 					  257] * b[l + (j + 1) * b_dim1];
2858181254a7Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2859181254a7Smrg 					  257] * b[l + (j + 2) * b_dim1];
2860181254a7Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2861181254a7Smrg 					  257] * b[l + (j + 3) * b_dim1];
2862181254a7Smrg 				}
2863181254a7Smrg 			      c[i + j * c_dim1] = f11;
2864181254a7Smrg 			      c[i + (j + 1) * c_dim1] = f12;
2865181254a7Smrg 			      c[i + (j + 2) * c_dim1] = f13;
2866181254a7Smrg 			      c[i + (j + 3) * c_dim1] = f14;
2867181254a7Smrg 			    }
2868181254a7Smrg 			}
2869181254a7Smrg 		    }
2870181254a7Smrg 		  if (ujsec < jsec)
2871181254a7Smrg 		    {
2872181254a7Smrg 		      i4 = jj + jsec - 1;
2873181254a7Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
2874181254a7Smrg 			{
2875181254a7Smrg 			  i5 = ii + uisec - 1;
2876181254a7Smrg 			  for (i = ii; i <= i5; i += 4)
2877181254a7Smrg 			    {
2878181254a7Smrg 			      f11 = c[i + j * c_dim1];
2879181254a7Smrg 			      f21 = c[i + 1 + j * c_dim1];
2880181254a7Smrg 			      f31 = c[i + 2 + j * c_dim1];
2881181254a7Smrg 			      f41 = c[i + 3 + j * c_dim1];
2882181254a7Smrg 			      i6 = ll + lsec - 1;
2883181254a7Smrg 			      for (l = ll; l <= i6; ++l)
2884181254a7Smrg 				{
2885181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2886181254a7Smrg 					  257] * b[l + j * b_dim1];
2887181254a7Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2888181254a7Smrg 					  257] * b[l + j * b_dim1];
2889181254a7Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2890181254a7Smrg 					  257] * b[l + j * b_dim1];
2891181254a7Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2892181254a7Smrg 					  257] * b[l + j * b_dim1];
2893181254a7Smrg 				}
2894181254a7Smrg 			      c[i + j * c_dim1] = f11;
2895181254a7Smrg 			      c[i + 1 + j * c_dim1] = f21;
2896181254a7Smrg 			      c[i + 2 + j * c_dim1] = f31;
2897181254a7Smrg 			      c[i + 3 + j * c_dim1] = f41;
2898181254a7Smrg 			    }
2899181254a7Smrg 			  i5 = ii + isec - 1;
2900181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2901181254a7Smrg 			    {
2902181254a7Smrg 			      f11 = c[i + j * c_dim1];
2903181254a7Smrg 			      i6 = ll + lsec - 1;
2904181254a7Smrg 			      for (l = ll; l <= i6; ++l)
2905181254a7Smrg 				{
2906181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2907181254a7Smrg 					  257] * b[l + j * b_dim1];
2908181254a7Smrg 				}
2909181254a7Smrg 			      c[i + j * c_dim1] = f11;
2910181254a7Smrg 			    }
2911181254a7Smrg 			}
2912181254a7Smrg 		    }
2913181254a7Smrg 		}
2914181254a7Smrg 	    }
2915181254a7Smrg 	}
2916181254a7Smrg       free(t1);
2917181254a7Smrg       return;
2918181254a7Smrg     }
2919181254a7Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2920181254a7Smrg     {
2921181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
2922181254a7Smrg 	{
2923181254a7Smrg 	  const GFC_INTEGER_16 *restrict abase_x;
2924181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
2925181254a7Smrg 	  GFC_INTEGER_16 *restrict dest_y;
2926181254a7Smrg 	  GFC_INTEGER_16 s;
2927181254a7Smrg 
2928181254a7Smrg 	  for (y = 0; y < ycount; y++)
2929181254a7Smrg 	    {
2930181254a7Smrg 	      bbase_y = &bbase[y*bystride];
2931181254a7Smrg 	      dest_y = &dest[y*rystride];
2932181254a7Smrg 	      for (x = 0; x < xcount; x++)
2933181254a7Smrg 		{
2934181254a7Smrg 		  abase_x = &abase[x*axstride];
2935181254a7Smrg 		  s = (GFC_INTEGER_16) 0;
2936181254a7Smrg 		  for (n = 0; n < count; n++)
2937181254a7Smrg 		    s += abase_x[n] * bbase_y[n];
2938181254a7Smrg 		  dest_y[x] = s;
2939181254a7Smrg 		}
2940181254a7Smrg 	    }
2941181254a7Smrg 	}
2942181254a7Smrg       else
2943181254a7Smrg 	{
2944181254a7Smrg 	  const GFC_INTEGER_16 *restrict bbase_y;
2945181254a7Smrg 	  GFC_INTEGER_16 s;
2946181254a7Smrg 
2947181254a7Smrg 	  for (y = 0; y < ycount; y++)
2948181254a7Smrg 	    {
2949181254a7Smrg 	      bbase_y = &bbase[y*bystride];
2950181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
2951181254a7Smrg 	      for (n = 0; n < count; n++)
2952181254a7Smrg 		s += abase[n*axstride] * bbase_y[n];
2953181254a7Smrg 	      dest[y*rystride] = s;
2954181254a7Smrg 	    }
2955181254a7Smrg 	}
2956181254a7Smrg     }
2957181254a7Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2958181254a7Smrg     {
2959181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
2960181254a7Smrg       GFC_INTEGER_16 s;
2961181254a7Smrg 
2962181254a7Smrg       for (y = 0; y < ycount; y++)
2963181254a7Smrg 	{
2964181254a7Smrg 	  bbase_y = &bbase[y*bystride];
2965181254a7Smrg 	  s = (GFC_INTEGER_16) 0;
2966181254a7Smrg 	  for (n = 0; n < count; n++)
2967181254a7Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2968181254a7Smrg 	  dest[y*rxstride] = s;
2969181254a7Smrg 	}
2970181254a7Smrg     }
2971fb8a8121Smrg   else if (axstride < aystride)
2972fb8a8121Smrg     {
2973fb8a8121Smrg       for (y = 0; y < ycount; y++)
2974fb8a8121Smrg 	for (x = 0; x < xcount; x++)
2975fb8a8121Smrg 	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
2976fb8a8121Smrg 
2977fb8a8121Smrg       for (y = 0; y < ycount; y++)
2978fb8a8121Smrg 	for (n = 0; n < count; n++)
2979fb8a8121Smrg 	  for (x = 0; x < xcount; x++)
2980fb8a8121Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
2981fb8a8121Smrg 	    dest[x*rxstride + y*rystride] +=
2982fb8a8121Smrg 					abase[x*axstride + n*aystride] *
2983fb8a8121Smrg 					bbase[n*bxstride + y*bystride];
2984fb8a8121Smrg     }
2985181254a7Smrg   else
2986181254a7Smrg     {
2987181254a7Smrg       const GFC_INTEGER_16 *restrict abase_x;
2988181254a7Smrg       const GFC_INTEGER_16 *restrict bbase_y;
2989181254a7Smrg       GFC_INTEGER_16 *restrict dest_y;
2990181254a7Smrg       GFC_INTEGER_16 s;
2991181254a7Smrg 
2992181254a7Smrg       for (y = 0; y < ycount; y++)
2993181254a7Smrg 	{
2994181254a7Smrg 	  bbase_y = &bbase[y*bystride];
2995181254a7Smrg 	  dest_y = &dest[y*rystride];
2996181254a7Smrg 	  for (x = 0; x < xcount; x++)
2997181254a7Smrg 	    {
2998181254a7Smrg 	      abase_x = &abase[x*axstride];
2999181254a7Smrg 	      s = (GFC_INTEGER_16) 0;
3000181254a7Smrg 	      for (n = 0; n < count; n++)
3001181254a7Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
3002181254a7Smrg 	      dest_y[x*rxstride] = s;
3003181254a7Smrg 	    }
3004181254a7Smrg 	}
3005181254a7Smrg     }
3006181254a7Smrg }
3007181254a7Smrg #undef POW3
3008181254a7Smrg #undef min
3009181254a7Smrg #undef max
3010181254a7Smrg 
3011181254a7Smrg #endif
3012181254a7Smrg #endif
3013181254a7Smrg 
3014