xref: /netbsd-src/external/gpl3/gcc/dist/libgfortran/generated/matmulavx128_i2.c (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1181254a7Smrg /* Implementation of the MATMUL intrinsic
2*b1e83836Smrg    Copyright (C) 2002-2022 Free Software Foundation, Inc.
3181254a7Smrg    Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4181254a7Smrg 
5181254a7Smrg This file is part of the GNU Fortran runtime library (libgfortran).
6181254a7Smrg 
7181254a7Smrg Libgfortran is free software; you can redistribute it and/or
8181254a7Smrg modify it under the terms of the GNU General Public
9181254a7Smrg License as published by the Free Software Foundation; either
10181254a7Smrg version 3 of the License, or (at your option) any later version.
11181254a7Smrg 
12181254a7Smrg Libgfortran is distributed in the hope that it will be useful,
13181254a7Smrg but WITHOUT ANY WARRANTY; without even the implied warranty of
14181254a7Smrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15181254a7Smrg GNU General Public License for more details.
16181254a7Smrg 
17181254a7Smrg Under Section 7 of GPL version 3, you are granted additional
18181254a7Smrg permissions described in the GCC Runtime Library Exception, version
19181254a7Smrg 3.1, as published by the Free Software Foundation.
20181254a7Smrg 
21181254a7Smrg You should have received a copy of the GNU General Public License and
22181254a7Smrg a copy of the GCC Runtime Library Exception along with this program;
23181254a7Smrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24181254a7Smrg <http://www.gnu.org/licenses/>.  */
25181254a7Smrg 
26181254a7Smrg #include "libgfortran.h"
27181254a7Smrg #include <string.h>
28181254a7Smrg #include <assert.h>
29181254a7Smrg 
30181254a7Smrg 
31181254a7Smrg /* These are the specific versions of matmul with -mprefer-avx128.  */
32181254a7Smrg 
33181254a7Smrg #if defined (HAVE_GFC_INTEGER_2)
34181254a7Smrg 
35181254a7Smrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36181254a7Smrg    passed to us by the front-end, in which case we call it for large
37181254a7Smrg    matrices.  */
38181254a7Smrg 
39181254a7Smrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40181254a7Smrg                           const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
41181254a7Smrg                           const int *, const GFC_INTEGER_2 *, const int *,
42181254a7Smrg                           const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
43181254a7Smrg                           int, int);
44181254a7Smrg 
45181254a7Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46181254a7Smrg void
47181254a7Smrg matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
48181254a7Smrg 	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
49181254a7Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50181254a7Smrg internal_proto(matmul_i2_avx128_fma3);
51181254a7Smrg void
matmul_i2_avx128_fma3(gfc_array_i2 * const restrict retarray,gfc_array_i2 * const restrict a,gfc_array_i2 * const restrict b,int try_blas,int blas_limit,blas_call gemm)52181254a7Smrg matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
53181254a7Smrg 	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
54181254a7Smrg 	int blas_limit, blas_call gemm)
55181254a7Smrg {
56181254a7Smrg   const GFC_INTEGER_2 * restrict abase;
57181254a7Smrg   const GFC_INTEGER_2 * restrict bbase;
58181254a7Smrg   GFC_INTEGER_2 * restrict dest;
59181254a7Smrg 
60181254a7Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61181254a7Smrg   index_type x, y, n, count, xcount, ycount;
62181254a7Smrg 
63181254a7Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
64181254a7Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
65181254a7Smrg 
66181254a7Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67181254a7Smrg 
68181254a7Smrg    Either A or B (but not both) can be rank 1:
69181254a7Smrg 
70181254a7Smrg    o One-dimensional argument A is implicitly treated as a row matrix
71181254a7Smrg      dimensioned [1,count], so xcount=1.
72181254a7Smrg 
73181254a7Smrg    o One-dimensional argument B is implicitly treated as a column matrix
74181254a7Smrg      dimensioned [count, 1], so ycount=1.
75181254a7Smrg */
76181254a7Smrg 
77181254a7Smrg   if (retarray->base_addr == NULL)
78181254a7Smrg     {
79181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
80181254a7Smrg         {
81181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
82181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83181254a7Smrg         }
84181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
85181254a7Smrg         {
86181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
87181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88181254a7Smrg         }
89181254a7Smrg       else
90181254a7Smrg         {
91181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
92181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93181254a7Smrg 
94181254a7Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
95181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96181254a7Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
97181254a7Smrg         }
98181254a7Smrg 
99181254a7Smrg       retarray->base_addr
100181254a7Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
101181254a7Smrg       retarray->offset = 0;
102181254a7Smrg     }
103181254a7Smrg   else if (unlikely (compile_options.bounds_check))
104181254a7Smrg     {
105181254a7Smrg       index_type ret_extent, arg_extent;
106181254a7Smrg 
107181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
108181254a7Smrg 	{
109181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111181254a7Smrg 	  if (arg_extent != ret_extent)
112181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
113181254a7Smrg 	    		   "array (%ld/%ld) ",
114181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
115181254a7Smrg 	}
116181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
117181254a7Smrg 	{
118181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120181254a7Smrg 	  if (arg_extent != ret_extent)
121181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
122181254a7Smrg 	    		   "array (%ld/%ld) ",
123181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
124181254a7Smrg 	}
125181254a7Smrg       else
126181254a7Smrg 	{
127181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129181254a7Smrg 	  if (arg_extent != ret_extent)
130181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
131181254a7Smrg 	    		   "array (%ld/%ld) ",
132181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
133181254a7Smrg 
134181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
135181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
136181254a7Smrg 	  if (arg_extent != ret_extent)
137181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
138181254a7Smrg 	    		   "array (%ld/%ld) ",
139181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
140181254a7Smrg 	}
141181254a7Smrg     }
142181254a7Smrg 
143181254a7Smrg 
144181254a7Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
145181254a7Smrg     {
146181254a7Smrg       /* One-dimensional result may be addressed in the code below
147181254a7Smrg 	 either as a row or a column matrix. We want both cases to
148181254a7Smrg 	 work. */
149181254a7Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
150181254a7Smrg     }
151181254a7Smrg   else
152181254a7Smrg     {
153181254a7Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
154181254a7Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
155181254a7Smrg     }
156181254a7Smrg 
157181254a7Smrg 
158181254a7Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
159181254a7Smrg     {
160181254a7Smrg       /* Treat it as a a row matrix A[1,count]. */
161181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
162181254a7Smrg       aystride = 1;
163181254a7Smrg 
164181254a7Smrg       xcount = 1;
165181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
166181254a7Smrg     }
167181254a7Smrg   else
168181254a7Smrg     {
169181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
170181254a7Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
171181254a7Smrg 
172181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
173181254a7Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
174181254a7Smrg     }
175181254a7Smrg 
176181254a7Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
177181254a7Smrg     {
178181254a7Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
179181254a7Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
180181254a7Smrg 		       "in dimension 1: is %ld, should be %ld",
181181254a7Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
182181254a7Smrg     }
183181254a7Smrg 
184181254a7Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
185181254a7Smrg     {
186181254a7Smrg       /* Treat it as a column matrix B[count,1] */
187181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188181254a7Smrg 
189181254a7Smrg       /* bystride should never be used for 1-dimensional b.
190181254a7Smrg          The value is only used for calculation of the
191181254a7Smrg          memory by the buffer.  */
192181254a7Smrg       bystride = 256;
193181254a7Smrg       ycount = 1;
194181254a7Smrg     }
195181254a7Smrg   else
196181254a7Smrg     {
197181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198181254a7Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199181254a7Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200181254a7Smrg     }
201181254a7Smrg 
202181254a7Smrg   abase = a->base_addr;
203181254a7Smrg   bbase = b->base_addr;
204181254a7Smrg   dest = retarray->base_addr;
205181254a7Smrg 
206181254a7Smrg   /* Now that everything is set up, we perform the multiplication
207181254a7Smrg      itself.  */
208181254a7Smrg 
209181254a7Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210181254a7Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
211181254a7Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
212181254a7Smrg 
213181254a7Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214181254a7Smrg       && (bxstride == 1 || bystride == 1)
215181254a7Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
216181254a7Smrg           > POW3(blas_limit)))
217181254a7Smrg     {
218181254a7Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
219181254a7Smrg       const GFC_INTEGER_2 one = 1, zero = 0;
220181254a7Smrg       const int lda = (axstride == 1) ? aystride : axstride,
221181254a7Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
222181254a7Smrg 
223181254a7Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224181254a7Smrg 	{
225181254a7Smrg 	  assert (gemm != NULL);
226181254a7Smrg 	  const char *transa, *transb;
227181254a7Smrg 	  if (try_blas & 2)
228181254a7Smrg 	    transa = "C";
229181254a7Smrg 	  else
230181254a7Smrg 	    transa = axstride == 1 ? "N" : "T";
231181254a7Smrg 
232181254a7Smrg 	  if (try_blas & 4)
233181254a7Smrg 	    transb = "C";
234181254a7Smrg 	  else
235181254a7Smrg 	    transb = bxstride == 1 ? "N" : "T";
236181254a7Smrg 
237181254a7Smrg 	  gemm (transa, transb , &m,
238181254a7Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
239181254a7Smrg 		&ldc, 1, 1);
240181254a7Smrg 	  return;
241181254a7Smrg 	}
242181254a7Smrg     }
243181254a7Smrg 
244fb8a8121Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
245fb8a8121Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
246181254a7Smrg     {
247181254a7Smrg       /* This block of code implements a tuned matmul, derived from
248181254a7Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
249181254a7Smrg 
250181254a7Smrg                Bo Kagstrom and Per Ling
251181254a7Smrg                Department of Computing Science
252181254a7Smrg                Umea University
253181254a7Smrg                S-901 87 Umea, Sweden
254181254a7Smrg 
255181254a7Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
256181254a7Smrg 
257181254a7Smrg       const GFC_INTEGER_2 *a, *b;
258181254a7Smrg       GFC_INTEGER_2 *c;
259181254a7Smrg       const index_type m = xcount, n = ycount, k = count;
260181254a7Smrg 
261181254a7Smrg       /* System generated locals */
262181254a7Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
263181254a7Smrg 		 i1, i2, i3, i4, i5, i6;
264181254a7Smrg 
265181254a7Smrg       /* Local variables */
266181254a7Smrg       GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
267181254a7Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
268181254a7Smrg       index_type i, j, l, ii, jj, ll;
269181254a7Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
270181254a7Smrg       GFC_INTEGER_2 *t1;
271181254a7Smrg 
272181254a7Smrg       a = abase;
273181254a7Smrg       b = bbase;
274181254a7Smrg       c = retarray->base_addr;
275181254a7Smrg 
276181254a7Smrg       /* Parameter adjustments */
277181254a7Smrg       c_dim1 = rystride;
278181254a7Smrg       c_offset = 1 + c_dim1;
279181254a7Smrg       c -= c_offset;
280181254a7Smrg       a_dim1 = aystride;
281181254a7Smrg       a_offset = 1 + a_dim1;
282181254a7Smrg       a -= a_offset;
283181254a7Smrg       b_dim1 = bystride;
284181254a7Smrg       b_offset = 1 + b_dim1;
285181254a7Smrg       b -= b_offset;
286181254a7Smrg 
287181254a7Smrg       /* Empty c first.  */
288181254a7Smrg       for (j=1; j<=n; j++)
289181254a7Smrg 	for (i=1; i<=m; i++)
290181254a7Smrg 	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
291181254a7Smrg 
292181254a7Smrg       /* Early exit if possible */
293181254a7Smrg       if (m == 0 || n == 0 || k == 0)
294181254a7Smrg 	return;
295181254a7Smrg 
296181254a7Smrg       /* Adjust size of t1 to what is needed.  */
297181254a7Smrg       index_type t1_dim, a_sz;
298181254a7Smrg       if (aystride == 1)
299181254a7Smrg         a_sz = rystride;
300181254a7Smrg       else
301181254a7Smrg         a_sz = a_dim1;
302181254a7Smrg 
303181254a7Smrg       t1_dim = a_sz * 256 + b_dim1;
304181254a7Smrg       if (t1_dim > 65536)
305181254a7Smrg 	t1_dim = 65536;
306181254a7Smrg 
307181254a7Smrg       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
308181254a7Smrg 
309181254a7Smrg       /* Start turning the crank. */
310181254a7Smrg       i1 = n;
311181254a7Smrg       for (jj = 1; jj <= i1; jj += 512)
312181254a7Smrg 	{
313181254a7Smrg 	  /* Computing MIN */
314181254a7Smrg 	  i2 = 512;
315181254a7Smrg 	  i3 = n - jj + 1;
316181254a7Smrg 	  jsec = min(i2,i3);
317181254a7Smrg 	  ujsec = jsec - jsec % 4;
318181254a7Smrg 	  i2 = k;
319181254a7Smrg 	  for (ll = 1; ll <= i2; ll += 256)
320181254a7Smrg 	    {
321181254a7Smrg 	      /* Computing MIN */
322181254a7Smrg 	      i3 = 256;
323181254a7Smrg 	      i4 = k - ll + 1;
324181254a7Smrg 	      lsec = min(i3,i4);
325181254a7Smrg 	      ulsec = lsec - lsec % 2;
326181254a7Smrg 
327181254a7Smrg 	      i3 = m;
328181254a7Smrg 	      for (ii = 1; ii <= i3; ii += 256)
329181254a7Smrg 		{
330181254a7Smrg 		  /* Computing MIN */
331181254a7Smrg 		  i4 = 256;
332181254a7Smrg 		  i5 = m - ii + 1;
333181254a7Smrg 		  isec = min(i4,i5);
334181254a7Smrg 		  uisec = isec - isec % 2;
335181254a7Smrg 		  i4 = ll + ulsec - 1;
336181254a7Smrg 		  for (l = ll; l <= i4; l += 2)
337181254a7Smrg 		    {
338181254a7Smrg 		      i5 = ii + uisec - 1;
339181254a7Smrg 		      for (i = ii; i <= i5; i += 2)
340181254a7Smrg 			{
341181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
342181254a7Smrg 					a[i + l * a_dim1];
343181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
344181254a7Smrg 					a[i + (l + 1) * a_dim1];
345181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
346181254a7Smrg 					a[i + 1 + l * a_dim1];
347181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
348181254a7Smrg 					a[i + 1 + (l + 1) * a_dim1];
349181254a7Smrg 			}
350181254a7Smrg 		      if (uisec < isec)
351181254a7Smrg 			{
352181254a7Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
353181254a7Smrg 				    a[ii + isec - 1 + l * a_dim1];
354181254a7Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
355181254a7Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
356181254a7Smrg 			}
357181254a7Smrg 		    }
358181254a7Smrg 		  if (ulsec < lsec)
359181254a7Smrg 		    {
360181254a7Smrg 		      i4 = ii + isec - 1;
361181254a7Smrg 		      for (i = ii; i<= i4; ++i)
362181254a7Smrg 			{
363181254a7Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
364181254a7Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
365181254a7Smrg 			}
366181254a7Smrg 		    }
367181254a7Smrg 
368181254a7Smrg 		  uisec = isec - isec % 4;
369181254a7Smrg 		  i4 = jj + ujsec - 1;
370181254a7Smrg 		  for (j = jj; j <= i4; j += 4)
371181254a7Smrg 		    {
372181254a7Smrg 		      i5 = ii + uisec - 1;
373181254a7Smrg 		      for (i = ii; i <= i5; i += 4)
374181254a7Smrg 			{
375181254a7Smrg 			  f11 = c[i + j * c_dim1];
376181254a7Smrg 			  f21 = c[i + 1 + j * c_dim1];
377181254a7Smrg 			  f12 = c[i + (j + 1) * c_dim1];
378181254a7Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
379181254a7Smrg 			  f13 = c[i + (j + 2) * c_dim1];
380181254a7Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
381181254a7Smrg 			  f14 = c[i + (j + 3) * c_dim1];
382181254a7Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
383181254a7Smrg 			  f31 = c[i + 2 + j * c_dim1];
384181254a7Smrg 			  f41 = c[i + 3 + j * c_dim1];
385181254a7Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
386181254a7Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
387181254a7Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
388181254a7Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
389181254a7Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
390181254a7Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
391181254a7Smrg 			  i6 = ll + lsec - 1;
392181254a7Smrg 			  for (l = ll; l <= i6; ++l)
393181254a7Smrg 			    {
394181254a7Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
395181254a7Smrg 				      * b[l + j * b_dim1];
396181254a7Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
397181254a7Smrg 				      * b[l + j * b_dim1];
398181254a7Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
399181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
400181254a7Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
401181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
402181254a7Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
403181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
404181254a7Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
405181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
406181254a7Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
407181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
408181254a7Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
409181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
410181254a7Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
411181254a7Smrg 				      * b[l + j * b_dim1];
412181254a7Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
413181254a7Smrg 				      * b[l + j * b_dim1];
414181254a7Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
415181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
416181254a7Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
417181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
418181254a7Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
419181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
420181254a7Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
421181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
422181254a7Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
423181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
424181254a7Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
425181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
426181254a7Smrg 			    }
427181254a7Smrg 			  c[i + j * c_dim1] = f11;
428181254a7Smrg 			  c[i + 1 + j * c_dim1] = f21;
429181254a7Smrg 			  c[i + (j + 1) * c_dim1] = f12;
430181254a7Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
431181254a7Smrg 			  c[i + (j + 2) * c_dim1] = f13;
432181254a7Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
433181254a7Smrg 			  c[i + (j + 3) * c_dim1] = f14;
434181254a7Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
435181254a7Smrg 			  c[i + 2 + j * c_dim1] = f31;
436181254a7Smrg 			  c[i + 3 + j * c_dim1] = f41;
437181254a7Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
438181254a7Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
439181254a7Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
440181254a7Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
441181254a7Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
442181254a7Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
443181254a7Smrg 			}
444181254a7Smrg 		      if (uisec < isec)
445181254a7Smrg 			{
446181254a7Smrg 			  i5 = ii + isec - 1;
447181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
448181254a7Smrg 			    {
449181254a7Smrg 			      f11 = c[i + j * c_dim1];
450181254a7Smrg 			      f12 = c[i + (j + 1) * c_dim1];
451181254a7Smrg 			      f13 = c[i + (j + 2) * c_dim1];
452181254a7Smrg 			      f14 = c[i + (j + 3) * c_dim1];
453181254a7Smrg 			      i6 = ll + lsec - 1;
454181254a7Smrg 			      for (l = ll; l <= i6; ++l)
455181254a7Smrg 				{
456181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
457181254a7Smrg 					  257] * b[l + j * b_dim1];
458181254a7Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
459181254a7Smrg 					  257] * b[l + (j + 1) * b_dim1];
460181254a7Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
461181254a7Smrg 					  257] * b[l + (j + 2) * b_dim1];
462181254a7Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
463181254a7Smrg 					  257] * b[l + (j + 3) * b_dim1];
464181254a7Smrg 				}
465181254a7Smrg 			      c[i + j * c_dim1] = f11;
466181254a7Smrg 			      c[i + (j + 1) * c_dim1] = f12;
467181254a7Smrg 			      c[i + (j + 2) * c_dim1] = f13;
468181254a7Smrg 			      c[i + (j + 3) * c_dim1] = f14;
469181254a7Smrg 			    }
470181254a7Smrg 			}
471181254a7Smrg 		    }
472181254a7Smrg 		  if (ujsec < jsec)
473181254a7Smrg 		    {
474181254a7Smrg 		      i4 = jj + jsec - 1;
475181254a7Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
476181254a7Smrg 			{
477181254a7Smrg 			  i5 = ii + uisec - 1;
478181254a7Smrg 			  for (i = ii; i <= i5; i += 4)
479181254a7Smrg 			    {
480181254a7Smrg 			      f11 = c[i + j * c_dim1];
481181254a7Smrg 			      f21 = c[i + 1 + j * c_dim1];
482181254a7Smrg 			      f31 = c[i + 2 + j * c_dim1];
483181254a7Smrg 			      f41 = c[i + 3 + j * c_dim1];
484181254a7Smrg 			      i6 = ll + lsec - 1;
485181254a7Smrg 			      for (l = ll; l <= i6; ++l)
486181254a7Smrg 				{
487181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
488181254a7Smrg 					  257] * b[l + j * b_dim1];
489181254a7Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
490181254a7Smrg 					  257] * b[l + j * b_dim1];
491181254a7Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
492181254a7Smrg 					  257] * b[l + j * b_dim1];
493181254a7Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
494181254a7Smrg 					  257] * b[l + j * b_dim1];
495181254a7Smrg 				}
496181254a7Smrg 			      c[i + j * c_dim1] = f11;
497181254a7Smrg 			      c[i + 1 + j * c_dim1] = f21;
498181254a7Smrg 			      c[i + 2 + j * c_dim1] = f31;
499181254a7Smrg 			      c[i + 3 + j * c_dim1] = f41;
500181254a7Smrg 			    }
501181254a7Smrg 			  i5 = ii + isec - 1;
502181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
503181254a7Smrg 			    {
504181254a7Smrg 			      f11 = c[i + j * c_dim1];
505181254a7Smrg 			      i6 = ll + lsec - 1;
506181254a7Smrg 			      for (l = ll; l <= i6; ++l)
507181254a7Smrg 				{
508181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
509181254a7Smrg 					  257] * b[l + j * b_dim1];
510181254a7Smrg 				}
511181254a7Smrg 			      c[i + j * c_dim1] = f11;
512181254a7Smrg 			    }
513181254a7Smrg 			}
514181254a7Smrg 		    }
515181254a7Smrg 		}
516181254a7Smrg 	    }
517181254a7Smrg 	}
518181254a7Smrg       free(t1);
519181254a7Smrg       return;
520181254a7Smrg     }
521181254a7Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
522181254a7Smrg     {
523181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
524181254a7Smrg 	{
525181254a7Smrg 	  const GFC_INTEGER_2 *restrict abase_x;
526181254a7Smrg 	  const GFC_INTEGER_2 *restrict bbase_y;
527181254a7Smrg 	  GFC_INTEGER_2 *restrict dest_y;
528181254a7Smrg 	  GFC_INTEGER_2 s;
529181254a7Smrg 
530181254a7Smrg 	  for (y = 0; y < ycount; y++)
531181254a7Smrg 	    {
532181254a7Smrg 	      bbase_y = &bbase[y*bystride];
533181254a7Smrg 	      dest_y = &dest[y*rystride];
534181254a7Smrg 	      for (x = 0; x < xcount; x++)
535181254a7Smrg 		{
536181254a7Smrg 		  abase_x = &abase[x*axstride];
537181254a7Smrg 		  s = (GFC_INTEGER_2) 0;
538181254a7Smrg 		  for (n = 0; n < count; n++)
539181254a7Smrg 		    s += abase_x[n] * bbase_y[n];
540181254a7Smrg 		  dest_y[x] = s;
541181254a7Smrg 		}
542181254a7Smrg 	    }
543181254a7Smrg 	}
544181254a7Smrg       else
545181254a7Smrg 	{
546181254a7Smrg 	  const GFC_INTEGER_2 *restrict bbase_y;
547181254a7Smrg 	  GFC_INTEGER_2 s;
548181254a7Smrg 
549181254a7Smrg 	  for (y = 0; y < ycount; y++)
550181254a7Smrg 	    {
551181254a7Smrg 	      bbase_y = &bbase[y*bystride];
552181254a7Smrg 	      s = (GFC_INTEGER_2) 0;
553181254a7Smrg 	      for (n = 0; n < count; n++)
554181254a7Smrg 		s += abase[n*axstride] * bbase_y[n];
555181254a7Smrg 	      dest[y*rystride] = s;
556181254a7Smrg 	    }
557181254a7Smrg 	}
558181254a7Smrg     }
559181254a7Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
560181254a7Smrg     {
561181254a7Smrg       const GFC_INTEGER_2 *restrict bbase_y;
562181254a7Smrg       GFC_INTEGER_2 s;
563181254a7Smrg 
564181254a7Smrg       for (y = 0; y < ycount; y++)
565181254a7Smrg 	{
566181254a7Smrg 	  bbase_y = &bbase[y*bystride];
567181254a7Smrg 	  s = (GFC_INTEGER_2) 0;
568181254a7Smrg 	  for (n = 0; n < count; n++)
569181254a7Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
570181254a7Smrg 	  dest[y*rxstride] = s;
571181254a7Smrg 	}
572181254a7Smrg     }
573fb8a8121Smrg   else if (axstride < aystride)
574fb8a8121Smrg     {
575fb8a8121Smrg       for (y = 0; y < ycount; y++)
576fb8a8121Smrg 	for (x = 0; x < xcount; x++)
577fb8a8121Smrg 	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
578fb8a8121Smrg 
579fb8a8121Smrg       for (y = 0; y < ycount; y++)
580fb8a8121Smrg 	for (n = 0; n < count; n++)
581fb8a8121Smrg 	  for (x = 0; x < xcount; x++)
582fb8a8121Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
583fb8a8121Smrg 	    dest[x*rxstride + y*rystride] +=
584fb8a8121Smrg 					abase[x*axstride + n*aystride] *
585fb8a8121Smrg 					bbase[n*bxstride + y*bystride];
586fb8a8121Smrg     }
587181254a7Smrg   else
588181254a7Smrg     {
589181254a7Smrg       const GFC_INTEGER_2 *restrict abase_x;
590181254a7Smrg       const GFC_INTEGER_2 *restrict bbase_y;
591181254a7Smrg       GFC_INTEGER_2 *restrict dest_y;
592181254a7Smrg       GFC_INTEGER_2 s;
593181254a7Smrg 
594181254a7Smrg       for (y = 0; y < ycount; y++)
595181254a7Smrg 	{
596181254a7Smrg 	  bbase_y = &bbase[y*bystride];
597181254a7Smrg 	  dest_y = &dest[y*rystride];
598181254a7Smrg 	  for (x = 0; x < xcount; x++)
599181254a7Smrg 	    {
600181254a7Smrg 	      abase_x = &abase[x*axstride];
601181254a7Smrg 	      s = (GFC_INTEGER_2) 0;
602181254a7Smrg 	      for (n = 0; n < count; n++)
603181254a7Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
604181254a7Smrg 	      dest_y[x*rxstride] = s;
605181254a7Smrg 	    }
606181254a7Smrg 	}
607181254a7Smrg     }
608181254a7Smrg }
609181254a7Smrg #undef POW3
610181254a7Smrg #undef min
611181254a7Smrg #undef max
612181254a7Smrg 
613181254a7Smrg #endif
614181254a7Smrg 
615181254a7Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
616181254a7Smrg void
617181254a7Smrg matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
618181254a7Smrg 	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
619181254a7Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
620181254a7Smrg internal_proto(matmul_i2_avx128_fma4);
621181254a7Smrg void
matmul_i2_avx128_fma4(gfc_array_i2 * const restrict retarray,gfc_array_i2 * const restrict a,gfc_array_i2 * const restrict b,int try_blas,int blas_limit,blas_call gemm)622181254a7Smrg matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
623181254a7Smrg 	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
624181254a7Smrg 	int blas_limit, blas_call gemm)
625181254a7Smrg {
626181254a7Smrg   const GFC_INTEGER_2 * restrict abase;
627181254a7Smrg   const GFC_INTEGER_2 * restrict bbase;
628181254a7Smrg   GFC_INTEGER_2 * restrict dest;
629181254a7Smrg 
630181254a7Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
631181254a7Smrg   index_type x, y, n, count, xcount, ycount;
632181254a7Smrg 
633181254a7Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
634181254a7Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
635181254a7Smrg 
636181254a7Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
637181254a7Smrg 
638181254a7Smrg    Either A or B (but not both) can be rank 1:
639181254a7Smrg 
640181254a7Smrg    o One-dimensional argument A is implicitly treated as a row matrix
641181254a7Smrg      dimensioned [1,count], so xcount=1.
642181254a7Smrg 
643181254a7Smrg    o One-dimensional argument B is implicitly treated as a column matrix
644181254a7Smrg      dimensioned [count, 1], so ycount=1.
645181254a7Smrg */
646181254a7Smrg 
647181254a7Smrg   if (retarray->base_addr == NULL)
648181254a7Smrg     {
649181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
650181254a7Smrg         {
651181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
652181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
653181254a7Smrg         }
654181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
655181254a7Smrg         {
656181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
657181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
658181254a7Smrg         }
659181254a7Smrg       else
660181254a7Smrg         {
661181254a7Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
662181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
663181254a7Smrg 
664181254a7Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
665181254a7Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
666181254a7Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
667181254a7Smrg         }
668181254a7Smrg 
669181254a7Smrg       retarray->base_addr
670181254a7Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
671181254a7Smrg       retarray->offset = 0;
672181254a7Smrg     }
673181254a7Smrg   else if (unlikely (compile_options.bounds_check))
674181254a7Smrg     {
675181254a7Smrg       index_type ret_extent, arg_extent;
676181254a7Smrg 
677181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
678181254a7Smrg 	{
679181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
680181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
681181254a7Smrg 	  if (arg_extent != ret_extent)
682181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
683181254a7Smrg 	    		   "array (%ld/%ld) ",
684181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
685181254a7Smrg 	}
686181254a7Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
687181254a7Smrg 	{
688181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
689181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
690181254a7Smrg 	  if (arg_extent != ret_extent)
691181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
692181254a7Smrg 	    		   "array (%ld/%ld) ",
693181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
694181254a7Smrg 	}
695181254a7Smrg       else
696181254a7Smrg 	{
697181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
698181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
699181254a7Smrg 	  if (arg_extent != ret_extent)
700181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
701181254a7Smrg 	    		   "array (%ld/%ld) ",
702181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
703181254a7Smrg 
704181254a7Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
705181254a7Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
706181254a7Smrg 	  if (arg_extent != ret_extent)
707181254a7Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
708181254a7Smrg 	    		   "array (%ld/%ld) ",
709181254a7Smrg 			   (long int) ret_extent, (long int) arg_extent);
710181254a7Smrg 	}
711181254a7Smrg     }
712181254a7Smrg 
713181254a7Smrg 
714181254a7Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
715181254a7Smrg     {
716181254a7Smrg       /* One-dimensional result may be addressed in the code below
717181254a7Smrg 	 either as a row or a column matrix. We want both cases to
718181254a7Smrg 	 work. */
719181254a7Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
720181254a7Smrg     }
721181254a7Smrg   else
722181254a7Smrg     {
723181254a7Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
724181254a7Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
725181254a7Smrg     }
726181254a7Smrg 
727181254a7Smrg 
728181254a7Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
729181254a7Smrg     {
730181254a7Smrg       /* Treat it as a a row matrix A[1,count]. */
731181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
732181254a7Smrg       aystride = 1;
733181254a7Smrg 
734181254a7Smrg       xcount = 1;
735181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
736181254a7Smrg     }
737181254a7Smrg   else
738181254a7Smrg     {
739181254a7Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
740181254a7Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
741181254a7Smrg 
742181254a7Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
743181254a7Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
744181254a7Smrg     }
745181254a7Smrg 
746181254a7Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
747181254a7Smrg     {
748181254a7Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
749181254a7Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
750181254a7Smrg 		       "in dimension 1: is %ld, should be %ld",
751181254a7Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
752181254a7Smrg     }
753181254a7Smrg 
754181254a7Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
755181254a7Smrg     {
756181254a7Smrg       /* Treat it as a column matrix B[count,1] */
757181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
758181254a7Smrg 
759181254a7Smrg       /* bystride should never be used for 1-dimensional b.
760181254a7Smrg          The value is only used for calculation of the
761181254a7Smrg          memory by the buffer.  */
762181254a7Smrg       bystride = 256;
763181254a7Smrg       ycount = 1;
764181254a7Smrg     }
765181254a7Smrg   else
766181254a7Smrg     {
767181254a7Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
768181254a7Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
769181254a7Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
770181254a7Smrg     }
771181254a7Smrg 
772181254a7Smrg   abase = a->base_addr;
773181254a7Smrg   bbase = b->base_addr;
774181254a7Smrg   dest = retarray->base_addr;
775181254a7Smrg 
776181254a7Smrg   /* Now that everything is set up, we perform the multiplication
777181254a7Smrg      itself.  */
778181254a7Smrg 
779181254a7Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
780181254a7Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
781181254a7Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
782181254a7Smrg 
783181254a7Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
784181254a7Smrg       && (bxstride == 1 || bystride == 1)
785181254a7Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
786181254a7Smrg           > POW3(blas_limit)))
787181254a7Smrg     {
788181254a7Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
789181254a7Smrg       const GFC_INTEGER_2 one = 1, zero = 0;
790181254a7Smrg       const int lda = (axstride == 1) ? aystride : axstride,
791181254a7Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
792181254a7Smrg 
793181254a7Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
794181254a7Smrg 	{
795181254a7Smrg 	  assert (gemm != NULL);
796181254a7Smrg 	  const char *transa, *transb;
797181254a7Smrg 	  if (try_blas & 2)
798181254a7Smrg 	    transa = "C";
799181254a7Smrg 	  else
800181254a7Smrg 	    transa = axstride == 1 ? "N" : "T";
801181254a7Smrg 
802181254a7Smrg 	  if (try_blas & 4)
803181254a7Smrg 	    transb = "C";
804181254a7Smrg 	  else
805181254a7Smrg 	    transb = bxstride == 1 ? "N" : "T";
806181254a7Smrg 
807181254a7Smrg 	  gemm (transa, transb , &m,
808181254a7Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
809181254a7Smrg 		&ldc, 1, 1);
810181254a7Smrg 	  return;
811181254a7Smrg 	}
812181254a7Smrg     }
813181254a7Smrg 
814fb8a8121Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
815fb8a8121Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
816181254a7Smrg     {
817181254a7Smrg       /* This block of code implements a tuned matmul, derived from
818181254a7Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
819181254a7Smrg 
820181254a7Smrg                Bo Kagstrom and Per Ling
821181254a7Smrg                Department of Computing Science
822181254a7Smrg                Umea University
823181254a7Smrg                S-901 87 Umea, Sweden
824181254a7Smrg 
825181254a7Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
826181254a7Smrg 
827181254a7Smrg       const GFC_INTEGER_2 *a, *b;
828181254a7Smrg       GFC_INTEGER_2 *c;
829181254a7Smrg       const index_type m = xcount, n = ycount, k = count;
830181254a7Smrg 
831181254a7Smrg       /* System generated locals */
832181254a7Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
833181254a7Smrg 		 i1, i2, i3, i4, i5, i6;
834181254a7Smrg 
835181254a7Smrg       /* Local variables */
836181254a7Smrg       GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
837181254a7Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
838181254a7Smrg       index_type i, j, l, ii, jj, ll;
839181254a7Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
840181254a7Smrg       GFC_INTEGER_2 *t1;
841181254a7Smrg 
842181254a7Smrg       a = abase;
843181254a7Smrg       b = bbase;
844181254a7Smrg       c = retarray->base_addr;
845181254a7Smrg 
846181254a7Smrg       /* Parameter adjustments */
847181254a7Smrg       c_dim1 = rystride;
848181254a7Smrg       c_offset = 1 + c_dim1;
849181254a7Smrg       c -= c_offset;
850181254a7Smrg       a_dim1 = aystride;
851181254a7Smrg       a_offset = 1 + a_dim1;
852181254a7Smrg       a -= a_offset;
853181254a7Smrg       b_dim1 = bystride;
854181254a7Smrg       b_offset = 1 + b_dim1;
855181254a7Smrg       b -= b_offset;
856181254a7Smrg 
857181254a7Smrg       /* Empty c first.  */
858181254a7Smrg       for (j=1; j<=n; j++)
859181254a7Smrg 	for (i=1; i<=m; i++)
860181254a7Smrg 	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
861181254a7Smrg 
862181254a7Smrg       /* Early exit if possible */
863181254a7Smrg       if (m == 0 || n == 0 || k == 0)
864181254a7Smrg 	return;
865181254a7Smrg 
866181254a7Smrg       /* Adjust size of t1 to what is needed.  */
867181254a7Smrg       index_type t1_dim, a_sz;
868181254a7Smrg       if (aystride == 1)
869181254a7Smrg         a_sz = rystride;
870181254a7Smrg       else
871181254a7Smrg         a_sz = a_dim1;
872181254a7Smrg 
873181254a7Smrg       t1_dim = a_sz * 256 + b_dim1;
874181254a7Smrg       if (t1_dim > 65536)
875181254a7Smrg 	t1_dim = 65536;
876181254a7Smrg 
877181254a7Smrg       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
878181254a7Smrg 
879181254a7Smrg       /* Start turning the crank. */
880181254a7Smrg       i1 = n;
881181254a7Smrg       for (jj = 1; jj <= i1; jj += 512)
882181254a7Smrg 	{
883181254a7Smrg 	  /* Computing MIN */
884181254a7Smrg 	  i2 = 512;
885181254a7Smrg 	  i3 = n - jj + 1;
886181254a7Smrg 	  jsec = min(i2,i3);
887181254a7Smrg 	  ujsec = jsec - jsec % 4;
888181254a7Smrg 	  i2 = k;
889181254a7Smrg 	  for (ll = 1; ll <= i2; ll += 256)
890181254a7Smrg 	    {
891181254a7Smrg 	      /* Computing MIN */
892181254a7Smrg 	      i3 = 256;
893181254a7Smrg 	      i4 = k - ll + 1;
894181254a7Smrg 	      lsec = min(i3,i4);
895181254a7Smrg 	      ulsec = lsec - lsec % 2;
896181254a7Smrg 
897181254a7Smrg 	      i3 = m;
898181254a7Smrg 	      for (ii = 1; ii <= i3; ii += 256)
899181254a7Smrg 		{
900181254a7Smrg 		  /* Computing MIN */
901181254a7Smrg 		  i4 = 256;
902181254a7Smrg 		  i5 = m - ii + 1;
903181254a7Smrg 		  isec = min(i4,i5);
904181254a7Smrg 		  uisec = isec - isec % 2;
905181254a7Smrg 		  i4 = ll + ulsec - 1;
906181254a7Smrg 		  for (l = ll; l <= i4; l += 2)
907181254a7Smrg 		    {
908181254a7Smrg 		      i5 = ii + uisec - 1;
909181254a7Smrg 		      for (i = ii; i <= i5; i += 2)
910181254a7Smrg 			{
911181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
912181254a7Smrg 					a[i + l * a_dim1];
913181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
914181254a7Smrg 					a[i + (l + 1) * a_dim1];
915181254a7Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
916181254a7Smrg 					a[i + 1 + l * a_dim1];
917181254a7Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
918181254a7Smrg 					a[i + 1 + (l + 1) * a_dim1];
919181254a7Smrg 			}
920181254a7Smrg 		      if (uisec < isec)
921181254a7Smrg 			{
922181254a7Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
923181254a7Smrg 				    a[ii + isec - 1 + l * a_dim1];
924181254a7Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
925181254a7Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
926181254a7Smrg 			}
927181254a7Smrg 		    }
928181254a7Smrg 		  if (ulsec < lsec)
929181254a7Smrg 		    {
930181254a7Smrg 		      i4 = ii + isec - 1;
931181254a7Smrg 		      for (i = ii; i<= i4; ++i)
932181254a7Smrg 			{
933181254a7Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
934181254a7Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
935181254a7Smrg 			}
936181254a7Smrg 		    }
937181254a7Smrg 
938181254a7Smrg 		  uisec = isec - isec % 4;
939181254a7Smrg 		  i4 = jj + ujsec - 1;
940181254a7Smrg 		  for (j = jj; j <= i4; j += 4)
941181254a7Smrg 		    {
942181254a7Smrg 		      i5 = ii + uisec - 1;
943181254a7Smrg 		      for (i = ii; i <= i5; i += 4)
944181254a7Smrg 			{
945181254a7Smrg 			  f11 = c[i + j * c_dim1];
946181254a7Smrg 			  f21 = c[i + 1 + j * c_dim1];
947181254a7Smrg 			  f12 = c[i + (j + 1) * c_dim1];
948181254a7Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
949181254a7Smrg 			  f13 = c[i + (j + 2) * c_dim1];
950181254a7Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
951181254a7Smrg 			  f14 = c[i + (j + 3) * c_dim1];
952181254a7Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
953181254a7Smrg 			  f31 = c[i + 2 + j * c_dim1];
954181254a7Smrg 			  f41 = c[i + 3 + j * c_dim1];
955181254a7Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
956181254a7Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
957181254a7Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
958181254a7Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
959181254a7Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
960181254a7Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
961181254a7Smrg 			  i6 = ll + lsec - 1;
962181254a7Smrg 			  for (l = ll; l <= i6; ++l)
963181254a7Smrg 			    {
964181254a7Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
965181254a7Smrg 				      * b[l + j * b_dim1];
966181254a7Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
967181254a7Smrg 				      * b[l + j * b_dim1];
968181254a7Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
969181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
970181254a7Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
971181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
972181254a7Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
973181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
974181254a7Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
975181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
976181254a7Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
977181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
978181254a7Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
979181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
980181254a7Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
981181254a7Smrg 				      * b[l + j * b_dim1];
982181254a7Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
983181254a7Smrg 				      * b[l + j * b_dim1];
984181254a7Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
985181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
986181254a7Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
987181254a7Smrg 				      * b[l + (j + 1) * b_dim1];
988181254a7Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
989181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
990181254a7Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
991181254a7Smrg 				      * b[l + (j + 2) * b_dim1];
992181254a7Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
993181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
994181254a7Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
995181254a7Smrg 				      * b[l + (j + 3) * b_dim1];
996181254a7Smrg 			    }
997181254a7Smrg 			  c[i + j * c_dim1] = f11;
998181254a7Smrg 			  c[i + 1 + j * c_dim1] = f21;
999181254a7Smrg 			  c[i + (j + 1) * c_dim1] = f12;
1000181254a7Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1001181254a7Smrg 			  c[i + (j + 2) * c_dim1] = f13;
1002181254a7Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1003181254a7Smrg 			  c[i + (j + 3) * c_dim1] = f14;
1004181254a7Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1005181254a7Smrg 			  c[i + 2 + j * c_dim1] = f31;
1006181254a7Smrg 			  c[i + 3 + j * c_dim1] = f41;
1007181254a7Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1008181254a7Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1009181254a7Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1010181254a7Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1011181254a7Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1012181254a7Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1013181254a7Smrg 			}
1014181254a7Smrg 		      if (uisec < isec)
1015181254a7Smrg 			{
1016181254a7Smrg 			  i5 = ii + isec - 1;
1017181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1018181254a7Smrg 			    {
1019181254a7Smrg 			      f11 = c[i + j * c_dim1];
1020181254a7Smrg 			      f12 = c[i + (j + 1) * c_dim1];
1021181254a7Smrg 			      f13 = c[i + (j + 2) * c_dim1];
1022181254a7Smrg 			      f14 = c[i + (j + 3) * c_dim1];
1023181254a7Smrg 			      i6 = ll + lsec - 1;
1024181254a7Smrg 			      for (l = ll; l <= i6; ++l)
1025181254a7Smrg 				{
1026181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027181254a7Smrg 					  257] * b[l + j * b_dim1];
1028181254a7Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029181254a7Smrg 					  257] * b[l + (j + 1) * b_dim1];
1030181254a7Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031181254a7Smrg 					  257] * b[l + (j + 2) * b_dim1];
1032181254a7Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033181254a7Smrg 					  257] * b[l + (j + 3) * b_dim1];
1034181254a7Smrg 				}
1035181254a7Smrg 			      c[i + j * c_dim1] = f11;
1036181254a7Smrg 			      c[i + (j + 1) * c_dim1] = f12;
1037181254a7Smrg 			      c[i + (j + 2) * c_dim1] = f13;
1038181254a7Smrg 			      c[i + (j + 3) * c_dim1] = f14;
1039181254a7Smrg 			    }
1040181254a7Smrg 			}
1041181254a7Smrg 		    }
1042181254a7Smrg 		  if (ujsec < jsec)
1043181254a7Smrg 		    {
1044181254a7Smrg 		      i4 = jj + jsec - 1;
1045181254a7Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
1046181254a7Smrg 			{
1047181254a7Smrg 			  i5 = ii + uisec - 1;
1048181254a7Smrg 			  for (i = ii; i <= i5; i += 4)
1049181254a7Smrg 			    {
1050181254a7Smrg 			      f11 = c[i + j * c_dim1];
1051181254a7Smrg 			      f21 = c[i + 1 + j * c_dim1];
1052181254a7Smrg 			      f31 = c[i + 2 + j * c_dim1];
1053181254a7Smrg 			      f41 = c[i + 3 + j * c_dim1];
1054181254a7Smrg 			      i6 = ll + lsec - 1;
1055181254a7Smrg 			      for (l = ll; l <= i6; ++l)
1056181254a7Smrg 				{
1057181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058181254a7Smrg 					  257] * b[l + j * b_dim1];
1059181254a7Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060181254a7Smrg 					  257] * b[l + j * b_dim1];
1061181254a7Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062181254a7Smrg 					  257] * b[l + j * b_dim1];
1063181254a7Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064181254a7Smrg 					  257] * b[l + j * b_dim1];
1065181254a7Smrg 				}
1066181254a7Smrg 			      c[i + j * c_dim1] = f11;
1067181254a7Smrg 			      c[i + 1 + j * c_dim1] = f21;
1068181254a7Smrg 			      c[i + 2 + j * c_dim1] = f31;
1069181254a7Smrg 			      c[i + 3 + j * c_dim1] = f41;
1070181254a7Smrg 			    }
1071181254a7Smrg 			  i5 = ii + isec - 1;
1072181254a7Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1073181254a7Smrg 			    {
1074181254a7Smrg 			      f11 = c[i + j * c_dim1];
1075181254a7Smrg 			      i6 = ll + lsec - 1;
1076181254a7Smrg 			      for (l = ll; l <= i6; ++l)
1077181254a7Smrg 				{
1078181254a7Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079181254a7Smrg 					  257] * b[l + j * b_dim1];
1080181254a7Smrg 				}
1081181254a7Smrg 			      c[i + j * c_dim1] = f11;
1082181254a7Smrg 			    }
1083181254a7Smrg 			}
1084181254a7Smrg 		    }
1085181254a7Smrg 		}
1086181254a7Smrg 	    }
1087181254a7Smrg 	}
1088181254a7Smrg       free(t1);
1089181254a7Smrg       return;
1090181254a7Smrg     }
1091181254a7Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092181254a7Smrg     {
1093181254a7Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
1094181254a7Smrg 	{
1095181254a7Smrg 	  const GFC_INTEGER_2 *restrict abase_x;
1096181254a7Smrg 	  const GFC_INTEGER_2 *restrict bbase_y;
1097181254a7Smrg 	  GFC_INTEGER_2 *restrict dest_y;
1098181254a7Smrg 	  GFC_INTEGER_2 s;
1099181254a7Smrg 
1100181254a7Smrg 	  for (y = 0; y < ycount; y++)
1101181254a7Smrg 	    {
1102181254a7Smrg 	      bbase_y = &bbase[y*bystride];
1103181254a7Smrg 	      dest_y = &dest[y*rystride];
1104181254a7Smrg 	      for (x = 0; x < xcount; x++)
1105181254a7Smrg 		{
1106181254a7Smrg 		  abase_x = &abase[x*axstride];
1107181254a7Smrg 		  s = (GFC_INTEGER_2) 0;
1108181254a7Smrg 		  for (n = 0; n < count; n++)
1109181254a7Smrg 		    s += abase_x[n] * bbase_y[n];
1110181254a7Smrg 		  dest_y[x] = s;
1111181254a7Smrg 		}
1112181254a7Smrg 	    }
1113181254a7Smrg 	}
1114181254a7Smrg       else
1115181254a7Smrg 	{
1116181254a7Smrg 	  const GFC_INTEGER_2 *restrict bbase_y;
1117181254a7Smrg 	  GFC_INTEGER_2 s;
1118181254a7Smrg 
1119181254a7Smrg 	  for (y = 0; y < ycount; y++)
1120181254a7Smrg 	    {
1121181254a7Smrg 	      bbase_y = &bbase[y*bystride];
1122181254a7Smrg 	      s = (GFC_INTEGER_2) 0;
1123181254a7Smrg 	      for (n = 0; n < count; n++)
1124181254a7Smrg 		s += abase[n*axstride] * bbase_y[n];
1125181254a7Smrg 	      dest[y*rystride] = s;
1126181254a7Smrg 	    }
1127181254a7Smrg 	}
1128181254a7Smrg     }
1129181254a7Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1130181254a7Smrg     {
1131181254a7Smrg       const GFC_INTEGER_2 *restrict bbase_y;
1132181254a7Smrg       GFC_INTEGER_2 s;
1133181254a7Smrg 
1134181254a7Smrg       for (y = 0; y < ycount; y++)
1135181254a7Smrg 	{
1136181254a7Smrg 	  bbase_y = &bbase[y*bystride];
1137181254a7Smrg 	  s = (GFC_INTEGER_2) 0;
1138181254a7Smrg 	  for (n = 0; n < count; n++)
1139181254a7Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1140181254a7Smrg 	  dest[y*rxstride] = s;
1141181254a7Smrg 	}
1142181254a7Smrg     }
1143fb8a8121Smrg   else if (axstride < aystride)
1144fb8a8121Smrg     {
1145fb8a8121Smrg       for (y = 0; y < ycount; y++)
1146fb8a8121Smrg 	for (x = 0; x < xcount; x++)
1147fb8a8121Smrg 	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1148fb8a8121Smrg 
1149fb8a8121Smrg       for (y = 0; y < ycount; y++)
1150fb8a8121Smrg 	for (n = 0; n < count; n++)
1151fb8a8121Smrg 	  for (x = 0; x < xcount; x++)
1152fb8a8121Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
1153fb8a8121Smrg 	    dest[x*rxstride + y*rystride] +=
1154fb8a8121Smrg 					abase[x*axstride + n*aystride] *
1155fb8a8121Smrg 					bbase[n*bxstride + y*bystride];
1156fb8a8121Smrg     }
1157181254a7Smrg   else
1158181254a7Smrg     {
1159181254a7Smrg       const GFC_INTEGER_2 *restrict abase_x;
1160181254a7Smrg       const GFC_INTEGER_2 *restrict bbase_y;
1161181254a7Smrg       GFC_INTEGER_2 *restrict dest_y;
1162181254a7Smrg       GFC_INTEGER_2 s;
1163181254a7Smrg 
1164181254a7Smrg       for (y = 0; y < ycount; y++)
1165181254a7Smrg 	{
1166181254a7Smrg 	  bbase_y = &bbase[y*bystride];
1167181254a7Smrg 	  dest_y = &dest[y*rystride];
1168181254a7Smrg 	  for (x = 0; x < xcount; x++)
1169181254a7Smrg 	    {
1170181254a7Smrg 	      abase_x = &abase[x*axstride];
1171181254a7Smrg 	      s = (GFC_INTEGER_2) 0;
1172181254a7Smrg 	      for (n = 0; n < count; n++)
1173181254a7Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174181254a7Smrg 	      dest_y[x*rxstride] = s;
1175181254a7Smrg 	    }
1176181254a7Smrg 	}
1177181254a7Smrg     }
1178181254a7Smrg }
1179181254a7Smrg #undef POW3
1180181254a7Smrg #undef min
1181181254a7Smrg #undef max
1182181254a7Smrg 
1183181254a7Smrg #endif
1184181254a7Smrg 
1185181254a7Smrg #endif
1186181254a7Smrg 
1187