xref: /netbsd-src/external/gpl3/gcc.old/dist/libgfortran/generated/matmul_c8.c (revision 4c3eb207d36f67d31994830c0a694161fc1ca39b)
1627f7eb2Smrg /* Implementation of the MATMUL intrinsic
2*4c3eb207Smrg    Copyright (C) 2002-2020 Free Software Foundation, Inc.
3627f7eb2Smrg    Contributed by Paul Brook <paul@nowt.org>
4627f7eb2Smrg 
5627f7eb2Smrg This file is part of the GNU Fortran runtime library (libgfortran).
6627f7eb2Smrg 
7627f7eb2Smrg Libgfortran is free software; you can redistribute it and/or
8627f7eb2Smrg modify it under the terms of the GNU General Public
9627f7eb2Smrg License as published by the Free Software Foundation; either
10627f7eb2Smrg version 3 of the License, or (at your option) any later version.
11627f7eb2Smrg 
12627f7eb2Smrg Libgfortran is distributed in the hope that it will be useful,
13627f7eb2Smrg but WITHOUT ANY WARRANTY; without even the implied warranty of
14627f7eb2Smrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15627f7eb2Smrg GNU General Public License for more details.
16627f7eb2Smrg 
17627f7eb2Smrg Under Section 7 of GPL version 3, you are granted additional
18627f7eb2Smrg permissions described in the GCC Runtime Library Exception, version
19627f7eb2Smrg 3.1, as published by the Free Software Foundation.
20627f7eb2Smrg 
21627f7eb2Smrg You should have received a copy of the GNU General Public License and
22627f7eb2Smrg a copy of the GCC Runtime Library Exception along with this program;
23627f7eb2Smrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24627f7eb2Smrg <http://www.gnu.org/licenses/>.  */
25627f7eb2Smrg 
26627f7eb2Smrg #include "libgfortran.h"
27627f7eb2Smrg #include <string.h>
28627f7eb2Smrg #include <assert.h>
29627f7eb2Smrg 
30627f7eb2Smrg 
31627f7eb2Smrg #if defined (HAVE_GFC_COMPLEX_8)
32627f7eb2Smrg 
33627f7eb2Smrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34627f7eb2Smrg    passed to us by the front-end, in which case we call it for large
35627f7eb2Smrg    matrices.  */
36627f7eb2Smrg 
37627f7eb2Smrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38627f7eb2Smrg                           const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
39627f7eb2Smrg                           const int *, const GFC_COMPLEX_8 *, const int *,
40627f7eb2Smrg                           const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
41627f7eb2Smrg                           int, int);
42627f7eb2Smrg 
43627f7eb2Smrg /* The order of loops is different in the case of plain matrix
44627f7eb2Smrg    multiplication C=MATMUL(A,B), and in the frequent special case where
45627f7eb2Smrg    the argument A is the temporary result of a TRANSPOSE intrinsic:
46627f7eb2Smrg    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
47627f7eb2Smrg    looking at their strides.
48627f7eb2Smrg 
49627f7eb2Smrg    The equivalent Fortran pseudo-code is:
50627f7eb2Smrg 
51627f7eb2Smrg    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52627f7eb2Smrg    IF (.NOT.IS_TRANSPOSED(A)) THEN
53627f7eb2Smrg      C = 0
54627f7eb2Smrg      DO J=1,N
55627f7eb2Smrg        DO K=1,COUNT
56627f7eb2Smrg          DO I=1,M
57627f7eb2Smrg            C(I,J) = C(I,J)+A(I,K)*B(K,J)
58627f7eb2Smrg    ELSE
59627f7eb2Smrg      DO J=1,N
60627f7eb2Smrg        DO I=1,M
61627f7eb2Smrg          S = 0
62627f7eb2Smrg          DO K=1,COUNT
63627f7eb2Smrg            S = S+A(I,K)*B(K,J)
64627f7eb2Smrg          C(I,J) = S
65627f7eb2Smrg    ENDIF
66627f7eb2Smrg */
67627f7eb2Smrg 
68627f7eb2Smrg /* If try_blas is set to a nonzero value, then the matmul function will
69627f7eb2Smrg    see if there is a way to perform the matrix multiplication by a call
70627f7eb2Smrg    to the BLAS gemm function.  */
71627f7eb2Smrg 
72627f7eb2Smrg extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
73627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
74627f7eb2Smrg 	int blas_limit, blas_call gemm);
75627f7eb2Smrg export_proto(matmul_c8);
76627f7eb2Smrg 
77627f7eb2Smrg /* Put exhaustive list of possible architectures here here, ORed together.  */
78627f7eb2Smrg 
79627f7eb2Smrg #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80627f7eb2Smrg 
81627f7eb2Smrg #ifdef HAVE_AVX
82627f7eb2Smrg static void
83627f7eb2Smrg matmul_c8_avx (gfc_array_c8 * const restrict retarray,
84627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
85627f7eb2Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86627f7eb2Smrg static void
matmul_c8_avx(gfc_array_c8 * const restrict retarray,gfc_array_c8 * const restrict a,gfc_array_c8 * const restrict b,int try_blas,int blas_limit,blas_call gemm)87627f7eb2Smrg matmul_c8_avx (gfc_array_c8 * const restrict retarray,
88627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
89627f7eb2Smrg 	int blas_limit, blas_call gemm)
90627f7eb2Smrg {
91627f7eb2Smrg   const GFC_COMPLEX_8 * restrict abase;
92627f7eb2Smrg   const GFC_COMPLEX_8 * restrict bbase;
93627f7eb2Smrg   GFC_COMPLEX_8 * restrict dest;
94627f7eb2Smrg 
95627f7eb2Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96627f7eb2Smrg   index_type x, y, n, count, xcount, ycount;
97627f7eb2Smrg 
98627f7eb2Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
99627f7eb2Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
100627f7eb2Smrg 
101627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102627f7eb2Smrg 
103627f7eb2Smrg    Either A or B (but not both) can be rank 1:
104627f7eb2Smrg 
105627f7eb2Smrg    o One-dimensional argument A is implicitly treated as a row matrix
106627f7eb2Smrg      dimensioned [1,count], so xcount=1.
107627f7eb2Smrg 
108627f7eb2Smrg    o One-dimensional argument B is implicitly treated as a column matrix
109627f7eb2Smrg      dimensioned [count, 1], so ycount=1.
110627f7eb2Smrg */
111627f7eb2Smrg 
112627f7eb2Smrg   if (retarray->base_addr == NULL)
113627f7eb2Smrg     {
114627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
115627f7eb2Smrg         {
116627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
117627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118627f7eb2Smrg         }
119627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
120627f7eb2Smrg         {
121627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
122627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123627f7eb2Smrg         }
124627f7eb2Smrg       else
125627f7eb2Smrg         {
126627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
127627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128627f7eb2Smrg 
129627f7eb2Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
130627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131627f7eb2Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
132627f7eb2Smrg         }
133627f7eb2Smrg 
134627f7eb2Smrg       retarray->base_addr
135627f7eb2Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
136627f7eb2Smrg       retarray->offset = 0;
137627f7eb2Smrg     }
138627f7eb2Smrg   else if (unlikely (compile_options.bounds_check))
139627f7eb2Smrg     {
140627f7eb2Smrg       index_type ret_extent, arg_extent;
141627f7eb2Smrg 
142627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
143627f7eb2Smrg 	{
144627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146627f7eb2Smrg 	  if (arg_extent != ret_extent)
147627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
148627f7eb2Smrg 	    		   "array (%ld/%ld) ",
149627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
150627f7eb2Smrg 	}
151627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
152627f7eb2Smrg 	{
153627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155627f7eb2Smrg 	  if (arg_extent != ret_extent)
156627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
157627f7eb2Smrg 	    		   "array (%ld/%ld) ",
158627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
159627f7eb2Smrg 	}
160627f7eb2Smrg       else
161627f7eb2Smrg 	{
162627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164627f7eb2Smrg 	  if (arg_extent != ret_extent)
165627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
166627f7eb2Smrg 	    		   "array (%ld/%ld) ",
167627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
168627f7eb2Smrg 
169627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171627f7eb2Smrg 	  if (arg_extent != ret_extent)
172627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
173627f7eb2Smrg 	    		   "array (%ld/%ld) ",
174627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
175627f7eb2Smrg 	}
176627f7eb2Smrg     }
177627f7eb2Smrg 
178627f7eb2Smrg 
179627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180627f7eb2Smrg     {
181627f7eb2Smrg       /* One-dimensional result may be addressed in the code below
182627f7eb2Smrg 	 either as a row or a column matrix. We want both cases to
183627f7eb2Smrg 	 work. */
184627f7eb2Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185627f7eb2Smrg     }
186627f7eb2Smrg   else
187627f7eb2Smrg     {
188627f7eb2Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189627f7eb2Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190627f7eb2Smrg     }
191627f7eb2Smrg 
192627f7eb2Smrg 
193627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
194627f7eb2Smrg     {
195627f7eb2Smrg       /* Treat it as a a row matrix A[1,count]. */
196627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197627f7eb2Smrg       aystride = 1;
198627f7eb2Smrg 
199627f7eb2Smrg       xcount = 1;
200627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
201627f7eb2Smrg     }
202627f7eb2Smrg   else
203627f7eb2Smrg     {
204627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205627f7eb2Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206627f7eb2Smrg 
207627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
208627f7eb2Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209627f7eb2Smrg     }
210627f7eb2Smrg 
211627f7eb2Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212627f7eb2Smrg     {
213627f7eb2Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214627f7eb2Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215627f7eb2Smrg 		       "in dimension 1: is %ld, should be %ld",
216627f7eb2Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217627f7eb2Smrg     }
218627f7eb2Smrg 
219627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
220627f7eb2Smrg     {
221627f7eb2Smrg       /* Treat it as a column matrix B[count,1] */
222627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223627f7eb2Smrg 
224627f7eb2Smrg       /* bystride should never be used for 1-dimensional b.
225627f7eb2Smrg          The value is only used for calculation of the
226627f7eb2Smrg          memory by the buffer.  */
227627f7eb2Smrg       bystride = 256;
228627f7eb2Smrg       ycount = 1;
229627f7eb2Smrg     }
230627f7eb2Smrg   else
231627f7eb2Smrg     {
232627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233627f7eb2Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234627f7eb2Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235627f7eb2Smrg     }
236627f7eb2Smrg 
237627f7eb2Smrg   abase = a->base_addr;
238627f7eb2Smrg   bbase = b->base_addr;
239627f7eb2Smrg   dest = retarray->base_addr;
240627f7eb2Smrg 
241627f7eb2Smrg   /* Now that everything is set up, we perform the multiplication
242627f7eb2Smrg      itself.  */
243627f7eb2Smrg 
244627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
246627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
247627f7eb2Smrg 
248627f7eb2Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249627f7eb2Smrg       && (bxstride == 1 || bystride == 1)
250627f7eb2Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
251627f7eb2Smrg           > POW3(blas_limit)))
252627f7eb2Smrg     {
253627f7eb2Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
254627f7eb2Smrg       const GFC_COMPLEX_8 one = 1, zero = 0;
255627f7eb2Smrg       const int lda = (axstride == 1) ? aystride : axstride,
256627f7eb2Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
257627f7eb2Smrg 
258627f7eb2Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259627f7eb2Smrg 	{
260627f7eb2Smrg 	  assert (gemm != NULL);
261627f7eb2Smrg 	  const char *transa, *transb;
262627f7eb2Smrg 	  if (try_blas & 2)
263627f7eb2Smrg 	    transa = "C";
264627f7eb2Smrg 	  else
265627f7eb2Smrg 	    transa = axstride == 1 ? "N" : "T";
266627f7eb2Smrg 
267627f7eb2Smrg 	  if (try_blas & 4)
268627f7eb2Smrg 	    transb = "C";
269627f7eb2Smrg 	  else
270627f7eb2Smrg 	    transb = bxstride == 1 ? "N" : "T";
271627f7eb2Smrg 
272627f7eb2Smrg 	  gemm (transa, transb , &m,
273627f7eb2Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
274627f7eb2Smrg 		&ldc, 1, 1);
275627f7eb2Smrg 	  return;
276627f7eb2Smrg 	}
277627f7eb2Smrg     }
278627f7eb2Smrg 
279*4c3eb207Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
280*4c3eb207Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
281627f7eb2Smrg     {
282627f7eb2Smrg       /* This block of code implements a tuned matmul, derived from
283627f7eb2Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
284627f7eb2Smrg 
285627f7eb2Smrg                Bo Kagstrom and Per Ling
286627f7eb2Smrg                Department of Computing Science
287627f7eb2Smrg                Umea University
288627f7eb2Smrg                S-901 87 Umea, Sweden
289627f7eb2Smrg 
290627f7eb2Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
291627f7eb2Smrg 
292627f7eb2Smrg       const GFC_COMPLEX_8 *a, *b;
293627f7eb2Smrg       GFC_COMPLEX_8 *c;
294627f7eb2Smrg       const index_type m = xcount, n = ycount, k = count;
295627f7eb2Smrg 
296627f7eb2Smrg       /* System generated locals */
297627f7eb2Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
298627f7eb2Smrg 		 i1, i2, i3, i4, i5, i6;
299627f7eb2Smrg 
300627f7eb2Smrg       /* Local variables */
301627f7eb2Smrg       GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
302627f7eb2Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
303627f7eb2Smrg       index_type i, j, l, ii, jj, ll;
304627f7eb2Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
305627f7eb2Smrg       GFC_COMPLEX_8 *t1;
306627f7eb2Smrg 
307627f7eb2Smrg       a = abase;
308627f7eb2Smrg       b = bbase;
309627f7eb2Smrg       c = retarray->base_addr;
310627f7eb2Smrg 
311627f7eb2Smrg       /* Parameter adjustments */
312627f7eb2Smrg       c_dim1 = rystride;
313627f7eb2Smrg       c_offset = 1 + c_dim1;
314627f7eb2Smrg       c -= c_offset;
315627f7eb2Smrg       a_dim1 = aystride;
316627f7eb2Smrg       a_offset = 1 + a_dim1;
317627f7eb2Smrg       a -= a_offset;
318627f7eb2Smrg       b_dim1 = bystride;
319627f7eb2Smrg       b_offset = 1 + b_dim1;
320627f7eb2Smrg       b -= b_offset;
321627f7eb2Smrg 
322627f7eb2Smrg       /* Empty c first.  */
323627f7eb2Smrg       for (j=1; j<=n; j++)
324627f7eb2Smrg 	for (i=1; i<=m; i++)
325627f7eb2Smrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
326627f7eb2Smrg 
327627f7eb2Smrg       /* Early exit if possible */
328627f7eb2Smrg       if (m == 0 || n == 0 || k == 0)
329627f7eb2Smrg 	return;
330627f7eb2Smrg 
331627f7eb2Smrg       /* Adjust size of t1 to what is needed.  */
332627f7eb2Smrg       index_type t1_dim, a_sz;
333627f7eb2Smrg       if (aystride == 1)
334627f7eb2Smrg         a_sz = rystride;
335627f7eb2Smrg       else
336627f7eb2Smrg         a_sz = a_dim1;
337627f7eb2Smrg 
338627f7eb2Smrg       t1_dim = a_sz * 256 + b_dim1;
339627f7eb2Smrg       if (t1_dim > 65536)
340627f7eb2Smrg 	t1_dim = 65536;
341627f7eb2Smrg 
342627f7eb2Smrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
343627f7eb2Smrg 
344627f7eb2Smrg       /* Start turning the crank. */
345627f7eb2Smrg       i1 = n;
346627f7eb2Smrg       for (jj = 1; jj <= i1; jj += 512)
347627f7eb2Smrg 	{
348627f7eb2Smrg 	  /* Computing MIN */
349627f7eb2Smrg 	  i2 = 512;
350627f7eb2Smrg 	  i3 = n - jj + 1;
351627f7eb2Smrg 	  jsec = min(i2,i3);
352627f7eb2Smrg 	  ujsec = jsec - jsec % 4;
353627f7eb2Smrg 	  i2 = k;
354627f7eb2Smrg 	  for (ll = 1; ll <= i2; ll += 256)
355627f7eb2Smrg 	    {
356627f7eb2Smrg 	      /* Computing MIN */
357627f7eb2Smrg 	      i3 = 256;
358627f7eb2Smrg 	      i4 = k - ll + 1;
359627f7eb2Smrg 	      lsec = min(i3,i4);
360627f7eb2Smrg 	      ulsec = lsec - lsec % 2;
361627f7eb2Smrg 
362627f7eb2Smrg 	      i3 = m;
363627f7eb2Smrg 	      for (ii = 1; ii <= i3; ii += 256)
364627f7eb2Smrg 		{
365627f7eb2Smrg 		  /* Computing MIN */
366627f7eb2Smrg 		  i4 = 256;
367627f7eb2Smrg 		  i5 = m - ii + 1;
368627f7eb2Smrg 		  isec = min(i4,i5);
369627f7eb2Smrg 		  uisec = isec - isec % 2;
370627f7eb2Smrg 		  i4 = ll + ulsec - 1;
371627f7eb2Smrg 		  for (l = ll; l <= i4; l += 2)
372627f7eb2Smrg 		    {
373627f7eb2Smrg 		      i5 = ii + uisec - 1;
374627f7eb2Smrg 		      for (i = ii; i <= i5; i += 2)
375627f7eb2Smrg 			{
376627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
377627f7eb2Smrg 					a[i + l * a_dim1];
378627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
379627f7eb2Smrg 					a[i + (l + 1) * a_dim1];
380627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
381627f7eb2Smrg 					a[i + 1 + l * a_dim1];
382627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
383627f7eb2Smrg 					a[i + 1 + (l + 1) * a_dim1];
384627f7eb2Smrg 			}
385627f7eb2Smrg 		      if (uisec < isec)
386627f7eb2Smrg 			{
387627f7eb2Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
388627f7eb2Smrg 				    a[ii + isec - 1 + l * a_dim1];
389627f7eb2Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
390627f7eb2Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
391627f7eb2Smrg 			}
392627f7eb2Smrg 		    }
393627f7eb2Smrg 		  if (ulsec < lsec)
394627f7eb2Smrg 		    {
395627f7eb2Smrg 		      i4 = ii + isec - 1;
396627f7eb2Smrg 		      for (i = ii; i<= i4; ++i)
397627f7eb2Smrg 			{
398627f7eb2Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
399627f7eb2Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
400627f7eb2Smrg 			}
401627f7eb2Smrg 		    }
402627f7eb2Smrg 
403627f7eb2Smrg 		  uisec = isec - isec % 4;
404627f7eb2Smrg 		  i4 = jj + ujsec - 1;
405627f7eb2Smrg 		  for (j = jj; j <= i4; j += 4)
406627f7eb2Smrg 		    {
407627f7eb2Smrg 		      i5 = ii + uisec - 1;
408627f7eb2Smrg 		      for (i = ii; i <= i5; i += 4)
409627f7eb2Smrg 			{
410627f7eb2Smrg 			  f11 = c[i + j * c_dim1];
411627f7eb2Smrg 			  f21 = c[i + 1 + j * c_dim1];
412627f7eb2Smrg 			  f12 = c[i + (j + 1) * c_dim1];
413627f7eb2Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
414627f7eb2Smrg 			  f13 = c[i + (j + 2) * c_dim1];
415627f7eb2Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
416627f7eb2Smrg 			  f14 = c[i + (j + 3) * c_dim1];
417627f7eb2Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
418627f7eb2Smrg 			  f31 = c[i + 2 + j * c_dim1];
419627f7eb2Smrg 			  f41 = c[i + 3 + j * c_dim1];
420627f7eb2Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
421627f7eb2Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
422627f7eb2Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
423627f7eb2Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
424627f7eb2Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
425627f7eb2Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
426627f7eb2Smrg 			  i6 = ll + lsec - 1;
427627f7eb2Smrg 			  for (l = ll; l <= i6; ++l)
428627f7eb2Smrg 			    {
429627f7eb2Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430627f7eb2Smrg 				      * b[l + j * b_dim1];
431627f7eb2Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432627f7eb2Smrg 				      * b[l + j * b_dim1];
433627f7eb2Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
434627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
435627f7eb2Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
436627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
437627f7eb2Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
438627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
439627f7eb2Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
440627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
441627f7eb2Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
442627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
443627f7eb2Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
444627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
445627f7eb2Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446627f7eb2Smrg 				      * b[l + j * b_dim1];
447627f7eb2Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448627f7eb2Smrg 				      * b[l + j * b_dim1];
449627f7eb2Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
450627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
451627f7eb2Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
452627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
453627f7eb2Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
454627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
455627f7eb2Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
456627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
457627f7eb2Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
458627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
459627f7eb2Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
460627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
461627f7eb2Smrg 			    }
462627f7eb2Smrg 			  c[i + j * c_dim1] = f11;
463627f7eb2Smrg 			  c[i + 1 + j * c_dim1] = f21;
464627f7eb2Smrg 			  c[i + (j + 1) * c_dim1] = f12;
465627f7eb2Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
466627f7eb2Smrg 			  c[i + (j + 2) * c_dim1] = f13;
467627f7eb2Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
468627f7eb2Smrg 			  c[i + (j + 3) * c_dim1] = f14;
469627f7eb2Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
470627f7eb2Smrg 			  c[i + 2 + j * c_dim1] = f31;
471627f7eb2Smrg 			  c[i + 3 + j * c_dim1] = f41;
472627f7eb2Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
473627f7eb2Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
474627f7eb2Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
475627f7eb2Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
476627f7eb2Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
477627f7eb2Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
478627f7eb2Smrg 			}
479627f7eb2Smrg 		      if (uisec < isec)
480627f7eb2Smrg 			{
481627f7eb2Smrg 			  i5 = ii + isec - 1;
482627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
483627f7eb2Smrg 			    {
484627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
485627f7eb2Smrg 			      f12 = c[i + (j + 1) * c_dim1];
486627f7eb2Smrg 			      f13 = c[i + (j + 2) * c_dim1];
487627f7eb2Smrg 			      f14 = c[i + (j + 3) * c_dim1];
488627f7eb2Smrg 			      i6 = ll + lsec - 1;
489627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
490627f7eb2Smrg 				{
491627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
492627f7eb2Smrg 					  257] * b[l + j * b_dim1];
493627f7eb2Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
494627f7eb2Smrg 					  257] * b[l + (j + 1) * b_dim1];
495627f7eb2Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
496627f7eb2Smrg 					  257] * b[l + (j + 2) * b_dim1];
497627f7eb2Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498627f7eb2Smrg 					  257] * b[l + (j + 3) * b_dim1];
499627f7eb2Smrg 				}
500627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
501627f7eb2Smrg 			      c[i + (j + 1) * c_dim1] = f12;
502627f7eb2Smrg 			      c[i + (j + 2) * c_dim1] = f13;
503627f7eb2Smrg 			      c[i + (j + 3) * c_dim1] = f14;
504627f7eb2Smrg 			    }
505627f7eb2Smrg 			}
506627f7eb2Smrg 		    }
507627f7eb2Smrg 		  if (ujsec < jsec)
508627f7eb2Smrg 		    {
509627f7eb2Smrg 		      i4 = jj + jsec - 1;
510627f7eb2Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
511627f7eb2Smrg 			{
512627f7eb2Smrg 			  i5 = ii + uisec - 1;
513627f7eb2Smrg 			  for (i = ii; i <= i5; i += 4)
514627f7eb2Smrg 			    {
515627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
516627f7eb2Smrg 			      f21 = c[i + 1 + j * c_dim1];
517627f7eb2Smrg 			      f31 = c[i + 2 + j * c_dim1];
518627f7eb2Smrg 			      f41 = c[i + 3 + j * c_dim1];
519627f7eb2Smrg 			      i6 = ll + lsec - 1;
520627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
521627f7eb2Smrg 				{
522627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
523627f7eb2Smrg 					  257] * b[l + j * b_dim1];
524627f7eb2Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
525627f7eb2Smrg 					  257] * b[l + j * b_dim1];
526627f7eb2Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
527627f7eb2Smrg 					  257] * b[l + j * b_dim1];
528627f7eb2Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
529627f7eb2Smrg 					  257] * b[l + j * b_dim1];
530627f7eb2Smrg 				}
531627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
532627f7eb2Smrg 			      c[i + 1 + j * c_dim1] = f21;
533627f7eb2Smrg 			      c[i + 2 + j * c_dim1] = f31;
534627f7eb2Smrg 			      c[i + 3 + j * c_dim1] = f41;
535627f7eb2Smrg 			    }
536627f7eb2Smrg 			  i5 = ii + isec - 1;
537627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
538627f7eb2Smrg 			    {
539627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
540627f7eb2Smrg 			      i6 = ll + lsec - 1;
541627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
542627f7eb2Smrg 				{
543627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
544627f7eb2Smrg 					  257] * b[l + j * b_dim1];
545627f7eb2Smrg 				}
546627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
547627f7eb2Smrg 			    }
548627f7eb2Smrg 			}
549627f7eb2Smrg 		    }
550627f7eb2Smrg 		}
551627f7eb2Smrg 	    }
552627f7eb2Smrg 	}
553627f7eb2Smrg       free(t1);
554627f7eb2Smrg       return;
555627f7eb2Smrg     }
556627f7eb2Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
557627f7eb2Smrg     {
558627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
559627f7eb2Smrg 	{
560627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict abase_x;
561627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
562627f7eb2Smrg 	  GFC_COMPLEX_8 *restrict dest_y;
563627f7eb2Smrg 	  GFC_COMPLEX_8 s;
564627f7eb2Smrg 
565627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
566627f7eb2Smrg 	    {
567627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
568627f7eb2Smrg 	      dest_y = &dest[y*rystride];
569627f7eb2Smrg 	      for (x = 0; x < xcount; x++)
570627f7eb2Smrg 		{
571627f7eb2Smrg 		  abase_x = &abase[x*axstride];
572627f7eb2Smrg 		  s = (GFC_COMPLEX_8) 0;
573627f7eb2Smrg 		  for (n = 0; n < count; n++)
574627f7eb2Smrg 		    s += abase_x[n] * bbase_y[n];
575627f7eb2Smrg 		  dest_y[x] = s;
576627f7eb2Smrg 		}
577627f7eb2Smrg 	    }
578627f7eb2Smrg 	}
579627f7eb2Smrg       else
580627f7eb2Smrg 	{
581627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
582627f7eb2Smrg 	  GFC_COMPLEX_8 s;
583627f7eb2Smrg 
584627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
585627f7eb2Smrg 	    {
586627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
587627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
588627f7eb2Smrg 	      for (n = 0; n < count; n++)
589627f7eb2Smrg 		s += abase[n*axstride] * bbase_y[n];
590627f7eb2Smrg 	      dest[y*rystride] = s;
591627f7eb2Smrg 	    }
592627f7eb2Smrg 	}
593627f7eb2Smrg     }
594627f7eb2Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
595627f7eb2Smrg     {
596627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
597627f7eb2Smrg       GFC_COMPLEX_8 s;
598627f7eb2Smrg 
599627f7eb2Smrg       for (y = 0; y < ycount; y++)
600627f7eb2Smrg 	{
601627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
602627f7eb2Smrg 	  s = (GFC_COMPLEX_8) 0;
603627f7eb2Smrg 	  for (n = 0; n < count; n++)
604627f7eb2Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
605627f7eb2Smrg 	  dest[y*rxstride] = s;
606627f7eb2Smrg 	}
607627f7eb2Smrg     }
608*4c3eb207Smrg   else if (axstride < aystride)
609*4c3eb207Smrg     {
610*4c3eb207Smrg       for (y = 0; y < ycount; y++)
611*4c3eb207Smrg 	for (x = 0; x < xcount; x++)
612*4c3eb207Smrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
613*4c3eb207Smrg 
614*4c3eb207Smrg       for (y = 0; y < ycount; y++)
615*4c3eb207Smrg 	for (n = 0; n < count; n++)
616*4c3eb207Smrg 	  for (x = 0; x < xcount; x++)
617*4c3eb207Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
618*4c3eb207Smrg 	    dest[x*rxstride + y*rystride] +=
619*4c3eb207Smrg 					abase[x*axstride + n*aystride] *
620*4c3eb207Smrg 					bbase[n*bxstride + y*bystride];
621*4c3eb207Smrg     }
622627f7eb2Smrg   else
623627f7eb2Smrg     {
624627f7eb2Smrg       const GFC_COMPLEX_8 *restrict abase_x;
625627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
626627f7eb2Smrg       GFC_COMPLEX_8 *restrict dest_y;
627627f7eb2Smrg       GFC_COMPLEX_8 s;
628627f7eb2Smrg 
629627f7eb2Smrg       for (y = 0; y < ycount; y++)
630627f7eb2Smrg 	{
631627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
632627f7eb2Smrg 	  dest_y = &dest[y*rystride];
633627f7eb2Smrg 	  for (x = 0; x < xcount; x++)
634627f7eb2Smrg 	    {
635627f7eb2Smrg 	      abase_x = &abase[x*axstride];
636627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
637627f7eb2Smrg 	      for (n = 0; n < count; n++)
638627f7eb2Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
639627f7eb2Smrg 	      dest_y[x*rxstride] = s;
640627f7eb2Smrg 	    }
641627f7eb2Smrg 	}
642627f7eb2Smrg     }
643627f7eb2Smrg }
644627f7eb2Smrg #undef POW3
645627f7eb2Smrg #undef min
646627f7eb2Smrg #undef max
647627f7eb2Smrg 
648627f7eb2Smrg #endif /* HAVE_AVX */
649627f7eb2Smrg 
650627f7eb2Smrg #ifdef HAVE_AVX2
651627f7eb2Smrg static void
652627f7eb2Smrg matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
653627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
654627f7eb2Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
655627f7eb2Smrg static void
matmul_c8_avx2(gfc_array_c8 * const restrict retarray,gfc_array_c8 * const restrict a,gfc_array_c8 * const restrict b,int try_blas,int blas_limit,blas_call gemm)656627f7eb2Smrg matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
657627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
658627f7eb2Smrg 	int blas_limit, blas_call gemm)
659627f7eb2Smrg {
660627f7eb2Smrg   const GFC_COMPLEX_8 * restrict abase;
661627f7eb2Smrg   const GFC_COMPLEX_8 * restrict bbase;
662627f7eb2Smrg   GFC_COMPLEX_8 * restrict dest;
663627f7eb2Smrg 
664627f7eb2Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
665627f7eb2Smrg   index_type x, y, n, count, xcount, ycount;
666627f7eb2Smrg 
667627f7eb2Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
668627f7eb2Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
669627f7eb2Smrg 
670627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
671627f7eb2Smrg 
672627f7eb2Smrg    Either A or B (but not both) can be rank 1:
673627f7eb2Smrg 
674627f7eb2Smrg    o One-dimensional argument A is implicitly treated as a row matrix
675627f7eb2Smrg      dimensioned [1,count], so xcount=1.
676627f7eb2Smrg 
677627f7eb2Smrg    o One-dimensional argument B is implicitly treated as a column matrix
678627f7eb2Smrg      dimensioned [count, 1], so ycount=1.
679627f7eb2Smrg */
680627f7eb2Smrg 
681627f7eb2Smrg   if (retarray->base_addr == NULL)
682627f7eb2Smrg     {
683627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
684627f7eb2Smrg         {
685627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
686627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
687627f7eb2Smrg         }
688627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
689627f7eb2Smrg         {
690627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
691627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
692627f7eb2Smrg         }
693627f7eb2Smrg       else
694627f7eb2Smrg         {
695627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
696627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
697627f7eb2Smrg 
698627f7eb2Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
699627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
700627f7eb2Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
701627f7eb2Smrg         }
702627f7eb2Smrg 
703627f7eb2Smrg       retarray->base_addr
704627f7eb2Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
705627f7eb2Smrg       retarray->offset = 0;
706627f7eb2Smrg     }
707627f7eb2Smrg   else if (unlikely (compile_options.bounds_check))
708627f7eb2Smrg     {
709627f7eb2Smrg       index_type ret_extent, arg_extent;
710627f7eb2Smrg 
711627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
712627f7eb2Smrg 	{
713627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
715627f7eb2Smrg 	  if (arg_extent != ret_extent)
716627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
717627f7eb2Smrg 	    		   "array (%ld/%ld) ",
718627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
719627f7eb2Smrg 	}
720627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
721627f7eb2Smrg 	{
722627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
723627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
724627f7eb2Smrg 	  if (arg_extent != ret_extent)
725627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
726627f7eb2Smrg 	    		   "array (%ld/%ld) ",
727627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
728627f7eb2Smrg 	}
729627f7eb2Smrg       else
730627f7eb2Smrg 	{
731627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
732627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
733627f7eb2Smrg 	  if (arg_extent != ret_extent)
734627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
735627f7eb2Smrg 	    		   "array (%ld/%ld) ",
736627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
737627f7eb2Smrg 
738627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
739627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
740627f7eb2Smrg 	  if (arg_extent != ret_extent)
741627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
742627f7eb2Smrg 	    		   "array (%ld/%ld) ",
743627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
744627f7eb2Smrg 	}
745627f7eb2Smrg     }
746627f7eb2Smrg 
747627f7eb2Smrg 
748627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
749627f7eb2Smrg     {
750627f7eb2Smrg       /* One-dimensional result may be addressed in the code below
751627f7eb2Smrg 	 either as a row or a column matrix. We want both cases to
752627f7eb2Smrg 	 work. */
753627f7eb2Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
754627f7eb2Smrg     }
755627f7eb2Smrg   else
756627f7eb2Smrg     {
757627f7eb2Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
758627f7eb2Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
759627f7eb2Smrg     }
760627f7eb2Smrg 
761627f7eb2Smrg 
762627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
763627f7eb2Smrg     {
764627f7eb2Smrg       /* Treat it as a a row matrix A[1,count]. */
765627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
766627f7eb2Smrg       aystride = 1;
767627f7eb2Smrg 
768627f7eb2Smrg       xcount = 1;
769627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
770627f7eb2Smrg     }
771627f7eb2Smrg   else
772627f7eb2Smrg     {
773627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
774627f7eb2Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
775627f7eb2Smrg 
776627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
777627f7eb2Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
778627f7eb2Smrg     }
779627f7eb2Smrg 
780627f7eb2Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
781627f7eb2Smrg     {
782627f7eb2Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
783627f7eb2Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
784627f7eb2Smrg 		       "in dimension 1: is %ld, should be %ld",
785627f7eb2Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
786627f7eb2Smrg     }
787627f7eb2Smrg 
788627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
789627f7eb2Smrg     {
790627f7eb2Smrg       /* Treat it as a column matrix B[count,1] */
791627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
792627f7eb2Smrg 
793627f7eb2Smrg       /* bystride should never be used for 1-dimensional b.
794627f7eb2Smrg          The value is only used for calculation of the
795627f7eb2Smrg          memory by the buffer.  */
796627f7eb2Smrg       bystride = 256;
797627f7eb2Smrg       ycount = 1;
798627f7eb2Smrg     }
799627f7eb2Smrg   else
800627f7eb2Smrg     {
801627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
802627f7eb2Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
803627f7eb2Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
804627f7eb2Smrg     }
805627f7eb2Smrg 
806627f7eb2Smrg   abase = a->base_addr;
807627f7eb2Smrg   bbase = b->base_addr;
808627f7eb2Smrg   dest = retarray->base_addr;
809627f7eb2Smrg 
810627f7eb2Smrg   /* Now that everything is set up, we perform the multiplication
811627f7eb2Smrg      itself.  */
812627f7eb2Smrg 
813627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
814627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
815627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
816627f7eb2Smrg 
817627f7eb2Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
818627f7eb2Smrg       && (bxstride == 1 || bystride == 1)
819627f7eb2Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
820627f7eb2Smrg           > POW3(blas_limit)))
821627f7eb2Smrg     {
822627f7eb2Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
823627f7eb2Smrg       const GFC_COMPLEX_8 one = 1, zero = 0;
824627f7eb2Smrg       const int lda = (axstride == 1) ? aystride : axstride,
825627f7eb2Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
826627f7eb2Smrg 
827627f7eb2Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
828627f7eb2Smrg 	{
829627f7eb2Smrg 	  assert (gemm != NULL);
830627f7eb2Smrg 	  const char *transa, *transb;
831627f7eb2Smrg 	  if (try_blas & 2)
832627f7eb2Smrg 	    transa = "C";
833627f7eb2Smrg 	  else
834627f7eb2Smrg 	    transa = axstride == 1 ? "N" : "T";
835627f7eb2Smrg 
836627f7eb2Smrg 	  if (try_blas & 4)
837627f7eb2Smrg 	    transb = "C";
838627f7eb2Smrg 	  else
839627f7eb2Smrg 	    transb = bxstride == 1 ? "N" : "T";
840627f7eb2Smrg 
841627f7eb2Smrg 	  gemm (transa, transb , &m,
842627f7eb2Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
843627f7eb2Smrg 		&ldc, 1, 1);
844627f7eb2Smrg 	  return;
845627f7eb2Smrg 	}
846627f7eb2Smrg     }
847627f7eb2Smrg 
848*4c3eb207Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
849*4c3eb207Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
850627f7eb2Smrg     {
851627f7eb2Smrg       /* This block of code implements a tuned matmul, derived from
852627f7eb2Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
853627f7eb2Smrg 
854627f7eb2Smrg                Bo Kagstrom and Per Ling
855627f7eb2Smrg                Department of Computing Science
856627f7eb2Smrg                Umea University
857627f7eb2Smrg                S-901 87 Umea, Sweden
858627f7eb2Smrg 
859627f7eb2Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
860627f7eb2Smrg 
861627f7eb2Smrg       const GFC_COMPLEX_8 *a, *b;
862627f7eb2Smrg       GFC_COMPLEX_8 *c;
863627f7eb2Smrg       const index_type m = xcount, n = ycount, k = count;
864627f7eb2Smrg 
865627f7eb2Smrg       /* System generated locals */
866627f7eb2Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
867627f7eb2Smrg 		 i1, i2, i3, i4, i5, i6;
868627f7eb2Smrg 
869627f7eb2Smrg       /* Local variables */
870627f7eb2Smrg       GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
871627f7eb2Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
872627f7eb2Smrg       index_type i, j, l, ii, jj, ll;
873627f7eb2Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
874627f7eb2Smrg       GFC_COMPLEX_8 *t1;
875627f7eb2Smrg 
876627f7eb2Smrg       a = abase;
877627f7eb2Smrg       b = bbase;
878627f7eb2Smrg       c = retarray->base_addr;
879627f7eb2Smrg 
880627f7eb2Smrg       /* Parameter adjustments */
881627f7eb2Smrg       c_dim1 = rystride;
882627f7eb2Smrg       c_offset = 1 + c_dim1;
883627f7eb2Smrg       c -= c_offset;
884627f7eb2Smrg       a_dim1 = aystride;
885627f7eb2Smrg       a_offset = 1 + a_dim1;
886627f7eb2Smrg       a -= a_offset;
887627f7eb2Smrg       b_dim1 = bystride;
888627f7eb2Smrg       b_offset = 1 + b_dim1;
889627f7eb2Smrg       b -= b_offset;
890627f7eb2Smrg 
891627f7eb2Smrg       /* Empty c first.  */
892627f7eb2Smrg       for (j=1; j<=n; j++)
893627f7eb2Smrg 	for (i=1; i<=m; i++)
894627f7eb2Smrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
895627f7eb2Smrg 
896627f7eb2Smrg       /* Early exit if possible */
897627f7eb2Smrg       if (m == 0 || n == 0 || k == 0)
898627f7eb2Smrg 	return;
899627f7eb2Smrg 
900627f7eb2Smrg       /* Adjust size of t1 to what is needed.  */
901627f7eb2Smrg       index_type t1_dim, a_sz;
902627f7eb2Smrg       if (aystride == 1)
903627f7eb2Smrg         a_sz = rystride;
904627f7eb2Smrg       else
905627f7eb2Smrg         a_sz = a_dim1;
906627f7eb2Smrg 
907627f7eb2Smrg       t1_dim = a_sz * 256 + b_dim1;
908627f7eb2Smrg       if (t1_dim > 65536)
909627f7eb2Smrg 	t1_dim = 65536;
910627f7eb2Smrg 
911627f7eb2Smrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
912627f7eb2Smrg 
913627f7eb2Smrg       /* Start turning the crank. */
914627f7eb2Smrg       i1 = n;
915627f7eb2Smrg       for (jj = 1; jj <= i1; jj += 512)
916627f7eb2Smrg 	{
917627f7eb2Smrg 	  /* Computing MIN */
918627f7eb2Smrg 	  i2 = 512;
919627f7eb2Smrg 	  i3 = n - jj + 1;
920627f7eb2Smrg 	  jsec = min(i2,i3);
921627f7eb2Smrg 	  ujsec = jsec - jsec % 4;
922627f7eb2Smrg 	  i2 = k;
923627f7eb2Smrg 	  for (ll = 1; ll <= i2; ll += 256)
924627f7eb2Smrg 	    {
925627f7eb2Smrg 	      /* Computing MIN */
926627f7eb2Smrg 	      i3 = 256;
927627f7eb2Smrg 	      i4 = k - ll + 1;
928627f7eb2Smrg 	      lsec = min(i3,i4);
929627f7eb2Smrg 	      ulsec = lsec - lsec % 2;
930627f7eb2Smrg 
931627f7eb2Smrg 	      i3 = m;
932627f7eb2Smrg 	      for (ii = 1; ii <= i3; ii += 256)
933627f7eb2Smrg 		{
934627f7eb2Smrg 		  /* Computing MIN */
935627f7eb2Smrg 		  i4 = 256;
936627f7eb2Smrg 		  i5 = m - ii + 1;
937627f7eb2Smrg 		  isec = min(i4,i5);
938627f7eb2Smrg 		  uisec = isec - isec % 2;
939627f7eb2Smrg 		  i4 = ll + ulsec - 1;
940627f7eb2Smrg 		  for (l = ll; l <= i4; l += 2)
941627f7eb2Smrg 		    {
942627f7eb2Smrg 		      i5 = ii + uisec - 1;
943627f7eb2Smrg 		      for (i = ii; i <= i5; i += 2)
944627f7eb2Smrg 			{
945627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
946627f7eb2Smrg 					a[i + l * a_dim1];
947627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
948627f7eb2Smrg 					a[i + (l + 1) * a_dim1];
949627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
950627f7eb2Smrg 					a[i + 1 + l * a_dim1];
951627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
952627f7eb2Smrg 					a[i + 1 + (l + 1) * a_dim1];
953627f7eb2Smrg 			}
954627f7eb2Smrg 		      if (uisec < isec)
955627f7eb2Smrg 			{
956627f7eb2Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
957627f7eb2Smrg 				    a[ii + isec - 1 + l * a_dim1];
958627f7eb2Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
959627f7eb2Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
960627f7eb2Smrg 			}
961627f7eb2Smrg 		    }
962627f7eb2Smrg 		  if (ulsec < lsec)
963627f7eb2Smrg 		    {
964627f7eb2Smrg 		      i4 = ii + isec - 1;
965627f7eb2Smrg 		      for (i = ii; i<= i4; ++i)
966627f7eb2Smrg 			{
967627f7eb2Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
968627f7eb2Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
969627f7eb2Smrg 			}
970627f7eb2Smrg 		    }
971627f7eb2Smrg 
972627f7eb2Smrg 		  uisec = isec - isec % 4;
973627f7eb2Smrg 		  i4 = jj + ujsec - 1;
974627f7eb2Smrg 		  for (j = jj; j <= i4; j += 4)
975627f7eb2Smrg 		    {
976627f7eb2Smrg 		      i5 = ii + uisec - 1;
977627f7eb2Smrg 		      for (i = ii; i <= i5; i += 4)
978627f7eb2Smrg 			{
979627f7eb2Smrg 			  f11 = c[i + j * c_dim1];
980627f7eb2Smrg 			  f21 = c[i + 1 + j * c_dim1];
981627f7eb2Smrg 			  f12 = c[i + (j + 1) * c_dim1];
982627f7eb2Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
983627f7eb2Smrg 			  f13 = c[i + (j + 2) * c_dim1];
984627f7eb2Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
985627f7eb2Smrg 			  f14 = c[i + (j + 3) * c_dim1];
986627f7eb2Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
987627f7eb2Smrg 			  f31 = c[i + 2 + j * c_dim1];
988627f7eb2Smrg 			  f41 = c[i + 3 + j * c_dim1];
989627f7eb2Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
990627f7eb2Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
991627f7eb2Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
992627f7eb2Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
993627f7eb2Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
994627f7eb2Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
995627f7eb2Smrg 			  i6 = ll + lsec - 1;
996627f7eb2Smrg 			  for (l = ll; l <= i6; ++l)
997627f7eb2Smrg 			    {
998627f7eb2Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
999627f7eb2Smrg 				      * b[l + j * b_dim1];
1000627f7eb2Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1001627f7eb2Smrg 				      * b[l + j * b_dim1];
1002627f7eb2Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1003627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
1004627f7eb2Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1005627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
1006627f7eb2Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1007627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
1008627f7eb2Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1009627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
1010627f7eb2Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1011627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
1012627f7eb2Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1013627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
1014627f7eb2Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1015627f7eb2Smrg 				      * b[l + j * b_dim1];
1016627f7eb2Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1017627f7eb2Smrg 				      * b[l + j * b_dim1];
1018627f7eb2Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1019627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
1020627f7eb2Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1021627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
1022627f7eb2Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1023627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
1024627f7eb2Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1025627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
1026627f7eb2Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1027627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
1028627f7eb2Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1029627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
1030627f7eb2Smrg 			    }
1031627f7eb2Smrg 			  c[i + j * c_dim1] = f11;
1032627f7eb2Smrg 			  c[i + 1 + j * c_dim1] = f21;
1033627f7eb2Smrg 			  c[i + (j + 1) * c_dim1] = f12;
1034627f7eb2Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1035627f7eb2Smrg 			  c[i + (j + 2) * c_dim1] = f13;
1036627f7eb2Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1037627f7eb2Smrg 			  c[i + (j + 3) * c_dim1] = f14;
1038627f7eb2Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1039627f7eb2Smrg 			  c[i + 2 + j * c_dim1] = f31;
1040627f7eb2Smrg 			  c[i + 3 + j * c_dim1] = f41;
1041627f7eb2Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1042627f7eb2Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1043627f7eb2Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1044627f7eb2Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1045627f7eb2Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1046627f7eb2Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1047627f7eb2Smrg 			}
1048627f7eb2Smrg 		      if (uisec < isec)
1049627f7eb2Smrg 			{
1050627f7eb2Smrg 			  i5 = ii + isec - 1;
1051627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1052627f7eb2Smrg 			    {
1053627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
1054627f7eb2Smrg 			      f12 = c[i + (j + 1) * c_dim1];
1055627f7eb2Smrg 			      f13 = c[i + (j + 2) * c_dim1];
1056627f7eb2Smrg 			      f14 = c[i + (j + 3) * c_dim1];
1057627f7eb2Smrg 			      i6 = ll + lsec - 1;
1058627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
1059627f7eb2Smrg 				{
1060627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1062627f7eb2Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063627f7eb2Smrg 					  257] * b[l + (j + 1) * b_dim1];
1064627f7eb2Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065627f7eb2Smrg 					  257] * b[l + (j + 2) * b_dim1];
1066627f7eb2Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1067627f7eb2Smrg 					  257] * b[l + (j + 3) * b_dim1];
1068627f7eb2Smrg 				}
1069627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
1070627f7eb2Smrg 			      c[i + (j + 1) * c_dim1] = f12;
1071627f7eb2Smrg 			      c[i + (j + 2) * c_dim1] = f13;
1072627f7eb2Smrg 			      c[i + (j + 3) * c_dim1] = f14;
1073627f7eb2Smrg 			    }
1074627f7eb2Smrg 			}
1075627f7eb2Smrg 		    }
1076627f7eb2Smrg 		  if (ujsec < jsec)
1077627f7eb2Smrg 		    {
1078627f7eb2Smrg 		      i4 = jj + jsec - 1;
1079627f7eb2Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
1080627f7eb2Smrg 			{
1081627f7eb2Smrg 			  i5 = ii + uisec - 1;
1082627f7eb2Smrg 			  for (i = ii; i <= i5; i += 4)
1083627f7eb2Smrg 			    {
1084627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
1085627f7eb2Smrg 			      f21 = c[i + 1 + j * c_dim1];
1086627f7eb2Smrg 			      f31 = c[i + 2 + j * c_dim1];
1087627f7eb2Smrg 			      f41 = c[i + 3 + j * c_dim1];
1088627f7eb2Smrg 			      i6 = ll + lsec - 1;
1089627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
1090627f7eb2Smrg 				{
1091627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1092627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1093627f7eb2Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1094627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1095627f7eb2Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1096627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1097627f7eb2Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1098627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1099627f7eb2Smrg 				}
1100627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
1101627f7eb2Smrg 			      c[i + 1 + j * c_dim1] = f21;
1102627f7eb2Smrg 			      c[i + 2 + j * c_dim1] = f31;
1103627f7eb2Smrg 			      c[i + 3 + j * c_dim1] = f41;
1104627f7eb2Smrg 			    }
1105627f7eb2Smrg 			  i5 = ii + isec - 1;
1106627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1107627f7eb2Smrg 			    {
1108627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
1109627f7eb2Smrg 			      i6 = ll + lsec - 1;
1110627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
1111627f7eb2Smrg 				{
1112627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1113627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1114627f7eb2Smrg 				}
1115627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
1116627f7eb2Smrg 			    }
1117627f7eb2Smrg 			}
1118627f7eb2Smrg 		    }
1119627f7eb2Smrg 		}
1120627f7eb2Smrg 	    }
1121627f7eb2Smrg 	}
1122627f7eb2Smrg       free(t1);
1123627f7eb2Smrg       return;
1124627f7eb2Smrg     }
1125627f7eb2Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1126627f7eb2Smrg     {
1127627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
1128627f7eb2Smrg 	{
1129627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict abase_x;
1130627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
1131627f7eb2Smrg 	  GFC_COMPLEX_8 *restrict dest_y;
1132627f7eb2Smrg 	  GFC_COMPLEX_8 s;
1133627f7eb2Smrg 
1134627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
1135627f7eb2Smrg 	    {
1136627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
1137627f7eb2Smrg 	      dest_y = &dest[y*rystride];
1138627f7eb2Smrg 	      for (x = 0; x < xcount; x++)
1139627f7eb2Smrg 		{
1140627f7eb2Smrg 		  abase_x = &abase[x*axstride];
1141627f7eb2Smrg 		  s = (GFC_COMPLEX_8) 0;
1142627f7eb2Smrg 		  for (n = 0; n < count; n++)
1143627f7eb2Smrg 		    s += abase_x[n] * bbase_y[n];
1144627f7eb2Smrg 		  dest_y[x] = s;
1145627f7eb2Smrg 		}
1146627f7eb2Smrg 	    }
1147627f7eb2Smrg 	}
1148627f7eb2Smrg       else
1149627f7eb2Smrg 	{
1150627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
1151627f7eb2Smrg 	  GFC_COMPLEX_8 s;
1152627f7eb2Smrg 
1153627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
1154627f7eb2Smrg 	    {
1155627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
1156627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
1157627f7eb2Smrg 	      for (n = 0; n < count; n++)
1158627f7eb2Smrg 		s += abase[n*axstride] * bbase_y[n];
1159627f7eb2Smrg 	      dest[y*rystride] = s;
1160627f7eb2Smrg 	    }
1161627f7eb2Smrg 	}
1162627f7eb2Smrg     }
1163627f7eb2Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1164627f7eb2Smrg     {
1165627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
1166627f7eb2Smrg       GFC_COMPLEX_8 s;
1167627f7eb2Smrg 
1168627f7eb2Smrg       for (y = 0; y < ycount; y++)
1169627f7eb2Smrg 	{
1170627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
1171627f7eb2Smrg 	  s = (GFC_COMPLEX_8) 0;
1172627f7eb2Smrg 	  for (n = 0; n < count; n++)
1173627f7eb2Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1174627f7eb2Smrg 	  dest[y*rxstride] = s;
1175627f7eb2Smrg 	}
1176627f7eb2Smrg     }
1177*4c3eb207Smrg   else if (axstride < aystride)
1178*4c3eb207Smrg     {
1179*4c3eb207Smrg       for (y = 0; y < ycount; y++)
1180*4c3eb207Smrg 	for (x = 0; x < xcount; x++)
1181*4c3eb207Smrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
1182*4c3eb207Smrg 
1183*4c3eb207Smrg       for (y = 0; y < ycount; y++)
1184*4c3eb207Smrg 	for (n = 0; n < count; n++)
1185*4c3eb207Smrg 	  for (x = 0; x < xcount; x++)
1186*4c3eb207Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
1187*4c3eb207Smrg 	    dest[x*rxstride + y*rystride] +=
1188*4c3eb207Smrg 					abase[x*axstride + n*aystride] *
1189*4c3eb207Smrg 					bbase[n*bxstride + y*bystride];
1190*4c3eb207Smrg     }
1191627f7eb2Smrg   else
1192627f7eb2Smrg     {
1193627f7eb2Smrg       const GFC_COMPLEX_8 *restrict abase_x;
1194627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
1195627f7eb2Smrg       GFC_COMPLEX_8 *restrict dest_y;
1196627f7eb2Smrg       GFC_COMPLEX_8 s;
1197627f7eb2Smrg 
1198627f7eb2Smrg       for (y = 0; y < ycount; y++)
1199627f7eb2Smrg 	{
1200627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
1201627f7eb2Smrg 	  dest_y = &dest[y*rystride];
1202627f7eb2Smrg 	  for (x = 0; x < xcount; x++)
1203627f7eb2Smrg 	    {
1204627f7eb2Smrg 	      abase_x = &abase[x*axstride];
1205627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
1206627f7eb2Smrg 	      for (n = 0; n < count; n++)
1207627f7eb2Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1208627f7eb2Smrg 	      dest_y[x*rxstride] = s;
1209627f7eb2Smrg 	    }
1210627f7eb2Smrg 	}
1211627f7eb2Smrg     }
1212627f7eb2Smrg }
1213627f7eb2Smrg #undef POW3
1214627f7eb2Smrg #undef min
1215627f7eb2Smrg #undef max
1216627f7eb2Smrg 
1217627f7eb2Smrg #endif /* HAVE_AVX2 */
1218627f7eb2Smrg 
1219627f7eb2Smrg #ifdef HAVE_AVX512F
1220627f7eb2Smrg static void
1221627f7eb2Smrg matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
1222627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1223627f7eb2Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1224627f7eb2Smrg static void
matmul_c8_avx512f(gfc_array_c8 * const restrict retarray,gfc_array_c8 * const restrict a,gfc_array_c8 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1225627f7eb2Smrg matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
1226627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1227627f7eb2Smrg 	int blas_limit, blas_call gemm)
1228627f7eb2Smrg {
1229627f7eb2Smrg   const GFC_COMPLEX_8 * restrict abase;
1230627f7eb2Smrg   const GFC_COMPLEX_8 * restrict bbase;
1231627f7eb2Smrg   GFC_COMPLEX_8 * restrict dest;
1232627f7eb2Smrg 
1233627f7eb2Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1234627f7eb2Smrg   index_type x, y, n, count, xcount, ycount;
1235627f7eb2Smrg 
1236627f7eb2Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
1237627f7eb2Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
1238627f7eb2Smrg 
1239627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1240627f7eb2Smrg 
1241627f7eb2Smrg    Either A or B (but not both) can be rank 1:
1242627f7eb2Smrg 
1243627f7eb2Smrg    o One-dimensional argument A is implicitly treated as a row matrix
1244627f7eb2Smrg      dimensioned [1,count], so xcount=1.
1245627f7eb2Smrg 
1246627f7eb2Smrg    o One-dimensional argument B is implicitly treated as a column matrix
1247627f7eb2Smrg      dimensioned [count, 1], so ycount=1.
1248627f7eb2Smrg */
1249627f7eb2Smrg 
1250627f7eb2Smrg   if (retarray->base_addr == NULL)
1251627f7eb2Smrg     {
1252627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1253627f7eb2Smrg         {
1254627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1255627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1256627f7eb2Smrg         }
1257627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258627f7eb2Smrg         {
1259627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1260627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1261627f7eb2Smrg         }
1262627f7eb2Smrg       else
1263627f7eb2Smrg         {
1264627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1265627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1266627f7eb2Smrg 
1267627f7eb2Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
1268627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1269627f7eb2Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1270627f7eb2Smrg         }
1271627f7eb2Smrg 
1272627f7eb2Smrg       retarray->base_addr
1273627f7eb2Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
1274627f7eb2Smrg       retarray->offset = 0;
1275627f7eb2Smrg     }
1276627f7eb2Smrg   else if (unlikely (compile_options.bounds_check))
1277627f7eb2Smrg     {
1278627f7eb2Smrg       index_type ret_extent, arg_extent;
1279627f7eb2Smrg 
1280627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1281627f7eb2Smrg 	{
1282627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1283627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1284627f7eb2Smrg 	  if (arg_extent != ret_extent)
1285627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1286627f7eb2Smrg 	    		   "array (%ld/%ld) ",
1287627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
1288627f7eb2Smrg 	}
1289627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1290627f7eb2Smrg 	{
1291627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1292627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1293627f7eb2Smrg 	  if (arg_extent != ret_extent)
1294627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1295627f7eb2Smrg 	    		   "array (%ld/%ld) ",
1296627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
1297627f7eb2Smrg 	}
1298627f7eb2Smrg       else
1299627f7eb2Smrg 	{
1300627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1301627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1302627f7eb2Smrg 	  if (arg_extent != ret_extent)
1303627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1304627f7eb2Smrg 	    		   "array (%ld/%ld) ",
1305627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
1306627f7eb2Smrg 
1307627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1308627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1309627f7eb2Smrg 	  if (arg_extent != ret_extent)
1310627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
1311627f7eb2Smrg 	    		   "array (%ld/%ld) ",
1312627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
1313627f7eb2Smrg 	}
1314627f7eb2Smrg     }
1315627f7eb2Smrg 
1316627f7eb2Smrg 
1317627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1318627f7eb2Smrg     {
1319627f7eb2Smrg       /* One-dimensional result may be addressed in the code below
1320627f7eb2Smrg 	 either as a row or a column matrix. We want both cases to
1321627f7eb2Smrg 	 work. */
1322627f7eb2Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1323627f7eb2Smrg     }
1324627f7eb2Smrg   else
1325627f7eb2Smrg     {
1326627f7eb2Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1327627f7eb2Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1328627f7eb2Smrg     }
1329627f7eb2Smrg 
1330627f7eb2Smrg 
1331627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
1332627f7eb2Smrg     {
1333627f7eb2Smrg       /* Treat it as a a row matrix A[1,count]. */
1334627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1335627f7eb2Smrg       aystride = 1;
1336627f7eb2Smrg 
1337627f7eb2Smrg       xcount = 1;
1338627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
1339627f7eb2Smrg     }
1340627f7eb2Smrg   else
1341627f7eb2Smrg     {
1342627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1343627f7eb2Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1344627f7eb2Smrg 
1345627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
1346627f7eb2Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1347627f7eb2Smrg     }
1348627f7eb2Smrg 
1349627f7eb2Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1350627f7eb2Smrg     {
1351627f7eb2Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1352627f7eb2Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1353627f7eb2Smrg 		       "in dimension 1: is %ld, should be %ld",
1354627f7eb2Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1355627f7eb2Smrg     }
1356627f7eb2Smrg 
1357627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
1358627f7eb2Smrg     {
1359627f7eb2Smrg       /* Treat it as a column matrix B[count,1] */
1360627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1361627f7eb2Smrg 
1362627f7eb2Smrg       /* bystride should never be used for 1-dimensional b.
1363627f7eb2Smrg          The value is only used for calculation of the
1364627f7eb2Smrg          memory by the buffer.  */
1365627f7eb2Smrg       bystride = 256;
1366627f7eb2Smrg       ycount = 1;
1367627f7eb2Smrg     }
1368627f7eb2Smrg   else
1369627f7eb2Smrg     {
1370627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1371627f7eb2Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1372627f7eb2Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1373627f7eb2Smrg     }
1374627f7eb2Smrg 
1375627f7eb2Smrg   abase = a->base_addr;
1376627f7eb2Smrg   bbase = b->base_addr;
1377627f7eb2Smrg   dest = retarray->base_addr;
1378627f7eb2Smrg 
1379627f7eb2Smrg   /* Now that everything is set up, we perform the multiplication
1380627f7eb2Smrg      itself.  */
1381627f7eb2Smrg 
1382627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1383627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
1384627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
1385627f7eb2Smrg 
1386627f7eb2Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1387627f7eb2Smrg       && (bxstride == 1 || bystride == 1)
1388627f7eb2Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
1389627f7eb2Smrg           > POW3(blas_limit)))
1390627f7eb2Smrg     {
1391627f7eb2Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
1392627f7eb2Smrg       const GFC_COMPLEX_8 one = 1, zero = 0;
1393627f7eb2Smrg       const int lda = (axstride == 1) ? aystride : axstride,
1394627f7eb2Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
1395627f7eb2Smrg 
1396627f7eb2Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1397627f7eb2Smrg 	{
1398627f7eb2Smrg 	  assert (gemm != NULL);
1399627f7eb2Smrg 	  const char *transa, *transb;
1400627f7eb2Smrg 	  if (try_blas & 2)
1401627f7eb2Smrg 	    transa = "C";
1402627f7eb2Smrg 	  else
1403627f7eb2Smrg 	    transa = axstride == 1 ? "N" : "T";
1404627f7eb2Smrg 
1405627f7eb2Smrg 	  if (try_blas & 4)
1406627f7eb2Smrg 	    transb = "C";
1407627f7eb2Smrg 	  else
1408627f7eb2Smrg 	    transb = bxstride == 1 ? "N" : "T";
1409627f7eb2Smrg 
1410627f7eb2Smrg 	  gemm (transa, transb , &m,
1411627f7eb2Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1412627f7eb2Smrg 		&ldc, 1, 1);
1413627f7eb2Smrg 	  return;
1414627f7eb2Smrg 	}
1415627f7eb2Smrg     }
1416627f7eb2Smrg 
1417*4c3eb207Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
1418*4c3eb207Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
1419627f7eb2Smrg     {
1420627f7eb2Smrg       /* This block of code implements a tuned matmul, derived from
1421627f7eb2Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1422627f7eb2Smrg 
1423627f7eb2Smrg                Bo Kagstrom and Per Ling
1424627f7eb2Smrg                Department of Computing Science
1425627f7eb2Smrg                Umea University
1426627f7eb2Smrg                S-901 87 Umea, Sweden
1427627f7eb2Smrg 
1428627f7eb2Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
1429627f7eb2Smrg 
1430627f7eb2Smrg       const GFC_COMPLEX_8 *a, *b;
1431627f7eb2Smrg       GFC_COMPLEX_8 *c;
1432627f7eb2Smrg       const index_type m = xcount, n = ycount, k = count;
1433627f7eb2Smrg 
1434627f7eb2Smrg       /* System generated locals */
1435627f7eb2Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1436627f7eb2Smrg 		 i1, i2, i3, i4, i5, i6;
1437627f7eb2Smrg 
1438627f7eb2Smrg       /* Local variables */
1439627f7eb2Smrg       GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
1440627f7eb2Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
1441627f7eb2Smrg       index_type i, j, l, ii, jj, ll;
1442627f7eb2Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1443627f7eb2Smrg       GFC_COMPLEX_8 *t1;
1444627f7eb2Smrg 
1445627f7eb2Smrg       a = abase;
1446627f7eb2Smrg       b = bbase;
1447627f7eb2Smrg       c = retarray->base_addr;
1448627f7eb2Smrg 
1449627f7eb2Smrg       /* Parameter adjustments */
1450627f7eb2Smrg       c_dim1 = rystride;
1451627f7eb2Smrg       c_offset = 1 + c_dim1;
1452627f7eb2Smrg       c -= c_offset;
1453627f7eb2Smrg       a_dim1 = aystride;
1454627f7eb2Smrg       a_offset = 1 + a_dim1;
1455627f7eb2Smrg       a -= a_offset;
1456627f7eb2Smrg       b_dim1 = bystride;
1457627f7eb2Smrg       b_offset = 1 + b_dim1;
1458627f7eb2Smrg       b -= b_offset;
1459627f7eb2Smrg 
1460627f7eb2Smrg       /* Empty c first.  */
1461627f7eb2Smrg       for (j=1; j<=n; j++)
1462627f7eb2Smrg 	for (i=1; i<=m; i++)
1463627f7eb2Smrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
1464627f7eb2Smrg 
1465627f7eb2Smrg       /* Early exit if possible */
1466627f7eb2Smrg       if (m == 0 || n == 0 || k == 0)
1467627f7eb2Smrg 	return;
1468627f7eb2Smrg 
1469627f7eb2Smrg       /* Adjust size of t1 to what is needed.  */
1470627f7eb2Smrg       index_type t1_dim, a_sz;
1471627f7eb2Smrg       if (aystride == 1)
1472627f7eb2Smrg         a_sz = rystride;
1473627f7eb2Smrg       else
1474627f7eb2Smrg         a_sz = a_dim1;
1475627f7eb2Smrg 
1476627f7eb2Smrg       t1_dim = a_sz * 256 + b_dim1;
1477627f7eb2Smrg       if (t1_dim > 65536)
1478627f7eb2Smrg 	t1_dim = 65536;
1479627f7eb2Smrg 
1480627f7eb2Smrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
1481627f7eb2Smrg 
1482627f7eb2Smrg       /* Start turning the crank. */
1483627f7eb2Smrg       i1 = n;
1484627f7eb2Smrg       for (jj = 1; jj <= i1; jj += 512)
1485627f7eb2Smrg 	{
1486627f7eb2Smrg 	  /* Computing MIN */
1487627f7eb2Smrg 	  i2 = 512;
1488627f7eb2Smrg 	  i3 = n - jj + 1;
1489627f7eb2Smrg 	  jsec = min(i2,i3);
1490627f7eb2Smrg 	  ujsec = jsec - jsec % 4;
1491627f7eb2Smrg 	  i2 = k;
1492627f7eb2Smrg 	  for (ll = 1; ll <= i2; ll += 256)
1493627f7eb2Smrg 	    {
1494627f7eb2Smrg 	      /* Computing MIN */
1495627f7eb2Smrg 	      i3 = 256;
1496627f7eb2Smrg 	      i4 = k - ll + 1;
1497627f7eb2Smrg 	      lsec = min(i3,i4);
1498627f7eb2Smrg 	      ulsec = lsec - lsec % 2;
1499627f7eb2Smrg 
1500627f7eb2Smrg 	      i3 = m;
1501627f7eb2Smrg 	      for (ii = 1; ii <= i3; ii += 256)
1502627f7eb2Smrg 		{
1503627f7eb2Smrg 		  /* Computing MIN */
1504627f7eb2Smrg 		  i4 = 256;
1505627f7eb2Smrg 		  i5 = m - ii + 1;
1506627f7eb2Smrg 		  isec = min(i4,i5);
1507627f7eb2Smrg 		  uisec = isec - isec % 2;
1508627f7eb2Smrg 		  i4 = ll + ulsec - 1;
1509627f7eb2Smrg 		  for (l = ll; l <= i4; l += 2)
1510627f7eb2Smrg 		    {
1511627f7eb2Smrg 		      i5 = ii + uisec - 1;
1512627f7eb2Smrg 		      for (i = ii; i <= i5; i += 2)
1513627f7eb2Smrg 			{
1514627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1515627f7eb2Smrg 					a[i + l * a_dim1];
1516627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1517627f7eb2Smrg 					a[i + (l + 1) * a_dim1];
1518627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1519627f7eb2Smrg 					a[i + 1 + l * a_dim1];
1520627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1521627f7eb2Smrg 					a[i + 1 + (l + 1) * a_dim1];
1522627f7eb2Smrg 			}
1523627f7eb2Smrg 		      if (uisec < isec)
1524627f7eb2Smrg 			{
1525627f7eb2Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
1526627f7eb2Smrg 				    a[ii + isec - 1 + l * a_dim1];
1527627f7eb2Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
1528627f7eb2Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
1529627f7eb2Smrg 			}
1530627f7eb2Smrg 		    }
1531627f7eb2Smrg 		  if (ulsec < lsec)
1532627f7eb2Smrg 		    {
1533627f7eb2Smrg 		      i4 = ii + isec - 1;
1534627f7eb2Smrg 		      for (i = ii; i<= i4; ++i)
1535627f7eb2Smrg 			{
1536627f7eb2Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
1537627f7eb2Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
1538627f7eb2Smrg 			}
1539627f7eb2Smrg 		    }
1540627f7eb2Smrg 
1541627f7eb2Smrg 		  uisec = isec - isec % 4;
1542627f7eb2Smrg 		  i4 = jj + ujsec - 1;
1543627f7eb2Smrg 		  for (j = jj; j <= i4; j += 4)
1544627f7eb2Smrg 		    {
1545627f7eb2Smrg 		      i5 = ii + uisec - 1;
1546627f7eb2Smrg 		      for (i = ii; i <= i5; i += 4)
1547627f7eb2Smrg 			{
1548627f7eb2Smrg 			  f11 = c[i + j * c_dim1];
1549627f7eb2Smrg 			  f21 = c[i + 1 + j * c_dim1];
1550627f7eb2Smrg 			  f12 = c[i + (j + 1) * c_dim1];
1551627f7eb2Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
1552627f7eb2Smrg 			  f13 = c[i + (j + 2) * c_dim1];
1553627f7eb2Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
1554627f7eb2Smrg 			  f14 = c[i + (j + 3) * c_dim1];
1555627f7eb2Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
1556627f7eb2Smrg 			  f31 = c[i + 2 + j * c_dim1];
1557627f7eb2Smrg 			  f41 = c[i + 3 + j * c_dim1];
1558627f7eb2Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
1559627f7eb2Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
1560627f7eb2Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
1561627f7eb2Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
1562627f7eb2Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
1563627f7eb2Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
1564627f7eb2Smrg 			  i6 = ll + lsec - 1;
1565627f7eb2Smrg 			  for (l = ll; l <= i6; ++l)
1566627f7eb2Smrg 			    {
1567627f7eb2Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1568627f7eb2Smrg 				      * b[l + j * b_dim1];
1569627f7eb2Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1570627f7eb2Smrg 				      * b[l + j * b_dim1];
1571627f7eb2Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1572627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
1573627f7eb2Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1574627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
1575627f7eb2Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1576627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
1577627f7eb2Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1578627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
1579627f7eb2Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1580627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
1581627f7eb2Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1582627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
1583627f7eb2Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1584627f7eb2Smrg 				      * b[l + j * b_dim1];
1585627f7eb2Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1586627f7eb2Smrg 				      * b[l + j * b_dim1];
1587627f7eb2Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1588627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
1589627f7eb2Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1590627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
1591627f7eb2Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1592627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
1593627f7eb2Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1594627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
1595627f7eb2Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1596627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
1597627f7eb2Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1598627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
1599627f7eb2Smrg 			    }
1600627f7eb2Smrg 			  c[i + j * c_dim1] = f11;
1601627f7eb2Smrg 			  c[i + 1 + j * c_dim1] = f21;
1602627f7eb2Smrg 			  c[i + (j + 1) * c_dim1] = f12;
1603627f7eb2Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1604627f7eb2Smrg 			  c[i + (j + 2) * c_dim1] = f13;
1605627f7eb2Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1606627f7eb2Smrg 			  c[i + (j + 3) * c_dim1] = f14;
1607627f7eb2Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1608627f7eb2Smrg 			  c[i + 2 + j * c_dim1] = f31;
1609627f7eb2Smrg 			  c[i + 3 + j * c_dim1] = f41;
1610627f7eb2Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1611627f7eb2Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1612627f7eb2Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1613627f7eb2Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1614627f7eb2Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1615627f7eb2Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1616627f7eb2Smrg 			}
1617627f7eb2Smrg 		      if (uisec < isec)
1618627f7eb2Smrg 			{
1619627f7eb2Smrg 			  i5 = ii + isec - 1;
1620627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1621627f7eb2Smrg 			    {
1622627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
1623627f7eb2Smrg 			      f12 = c[i + (j + 1) * c_dim1];
1624627f7eb2Smrg 			      f13 = c[i + (j + 2) * c_dim1];
1625627f7eb2Smrg 			      f14 = c[i + (j + 3) * c_dim1];
1626627f7eb2Smrg 			      i6 = ll + lsec - 1;
1627627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
1628627f7eb2Smrg 				{
1629627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1630627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1631627f7eb2Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1632627f7eb2Smrg 					  257] * b[l + (j + 1) * b_dim1];
1633627f7eb2Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1634627f7eb2Smrg 					  257] * b[l + (j + 2) * b_dim1];
1635627f7eb2Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1636627f7eb2Smrg 					  257] * b[l + (j + 3) * b_dim1];
1637627f7eb2Smrg 				}
1638627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
1639627f7eb2Smrg 			      c[i + (j + 1) * c_dim1] = f12;
1640627f7eb2Smrg 			      c[i + (j + 2) * c_dim1] = f13;
1641627f7eb2Smrg 			      c[i + (j + 3) * c_dim1] = f14;
1642627f7eb2Smrg 			    }
1643627f7eb2Smrg 			}
1644627f7eb2Smrg 		    }
1645627f7eb2Smrg 		  if (ujsec < jsec)
1646627f7eb2Smrg 		    {
1647627f7eb2Smrg 		      i4 = jj + jsec - 1;
1648627f7eb2Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
1649627f7eb2Smrg 			{
1650627f7eb2Smrg 			  i5 = ii + uisec - 1;
1651627f7eb2Smrg 			  for (i = ii; i <= i5; i += 4)
1652627f7eb2Smrg 			    {
1653627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
1654627f7eb2Smrg 			      f21 = c[i + 1 + j * c_dim1];
1655627f7eb2Smrg 			      f31 = c[i + 2 + j * c_dim1];
1656627f7eb2Smrg 			      f41 = c[i + 3 + j * c_dim1];
1657627f7eb2Smrg 			      i6 = ll + lsec - 1;
1658627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
1659627f7eb2Smrg 				{
1660627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1661627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1662627f7eb2Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1663627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1664627f7eb2Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1665627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1666627f7eb2Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1667627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1668627f7eb2Smrg 				}
1669627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
1670627f7eb2Smrg 			      c[i + 1 + j * c_dim1] = f21;
1671627f7eb2Smrg 			      c[i + 2 + j * c_dim1] = f31;
1672627f7eb2Smrg 			      c[i + 3 + j * c_dim1] = f41;
1673627f7eb2Smrg 			    }
1674627f7eb2Smrg 			  i5 = ii + isec - 1;
1675627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1676627f7eb2Smrg 			    {
1677627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
1678627f7eb2Smrg 			      i6 = ll + lsec - 1;
1679627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
1680627f7eb2Smrg 				{
1681627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1682627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1683627f7eb2Smrg 				}
1684627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
1685627f7eb2Smrg 			    }
1686627f7eb2Smrg 			}
1687627f7eb2Smrg 		    }
1688627f7eb2Smrg 		}
1689627f7eb2Smrg 	    }
1690627f7eb2Smrg 	}
1691627f7eb2Smrg       free(t1);
1692627f7eb2Smrg       return;
1693627f7eb2Smrg     }
1694627f7eb2Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1695627f7eb2Smrg     {
1696627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
1697627f7eb2Smrg 	{
1698627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict abase_x;
1699627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
1700627f7eb2Smrg 	  GFC_COMPLEX_8 *restrict dest_y;
1701627f7eb2Smrg 	  GFC_COMPLEX_8 s;
1702627f7eb2Smrg 
1703627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
1704627f7eb2Smrg 	    {
1705627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
1706627f7eb2Smrg 	      dest_y = &dest[y*rystride];
1707627f7eb2Smrg 	      for (x = 0; x < xcount; x++)
1708627f7eb2Smrg 		{
1709627f7eb2Smrg 		  abase_x = &abase[x*axstride];
1710627f7eb2Smrg 		  s = (GFC_COMPLEX_8) 0;
1711627f7eb2Smrg 		  for (n = 0; n < count; n++)
1712627f7eb2Smrg 		    s += abase_x[n] * bbase_y[n];
1713627f7eb2Smrg 		  dest_y[x] = s;
1714627f7eb2Smrg 		}
1715627f7eb2Smrg 	    }
1716627f7eb2Smrg 	}
1717627f7eb2Smrg       else
1718627f7eb2Smrg 	{
1719627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
1720627f7eb2Smrg 	  GFC_COMPLEX_8 s;
1721627f7eb2Smrg 
1722627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
1723627f7eb2Smrg 	    {
1724627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
1725627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
1726627f7eb2Smrg 	      for (n = 0; n < count; n++)
1727627f7eb2Smrg 		s += abase[n*axstride] * bbase_y[n];
1728627f7eb2Smrg 	      dest[y*rystride] = s;
1729627f7eb2Smrg 	    }
1730627f7eb2Smrg 	}
1731627f7eb2Smrg     }
1732627f7eb2Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1733627f7eb2Smrg     {
1734627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
1735627f7eb2Smrg       GFC_COMPLEX_8 s;
1736627f7eb2Smrg 
1737627f7eb2Smrg       for (y = 0; y < ycount; y++)
1738627f7eb2Smrg 	{
1739627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
1740627f7eb2Smrg 	  s = (GFC_COMPLEX_8) 0;
1741627f7eb2Smrg 	  for (n = 0; n < count; n++)
1742627f7eb2Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1743627f7eb2Smrg 	  dest[y*rxstride] = s;
1744627f7eb2Smrg 	}
1745627f7eb2Smrg     }
1746*4c3eb207Smrg   else if (axstride < aystride)
1747*4c3eb207Smrg     {
1748*4c3eb207Smrg       for (y = 0; y < ycount; y++)
1749*4c3eb207Smrg 	for (x = 0; x < xcount; x++)
1750*4c3eb207Smrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
1751*4c3eb207Smrg 
1752*4c3eb207Smrg       for (y = 0; y < ycount; y++)
1753*4c3eb207Smrg 	for (n = 0; n < count; n++)
1754*4c3eb207Smrg 	  for (x = 0; x < xcount; x++)
1755*4c3eb207Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
1756*4c3eb207Smrg 	    dest[x*rxstride + y*rystride] +=
1757*4c3eb207Smrg 					abase[x*axstride + n*aystride] *
1758*4c3eb207Smrg 					bbase[n*bxstride + y*bystride];
1759*4c3eb207Smrg     }
1760627f7eb2Smrg   else
1761627f7eb2Smrg     {
1762627f7eb2Smrg       const GFC_COMPLEX_8 *restrict abase_x;
1763627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
1764627f7eb2Smrg       GFC_COMPLEX_8 *restrict dest_y;
1765627f7eb2Smrg       GFC_COMPLEX_8 s;
1766627f7eb2Smrg 
1767627f7eb2Smrg       for (y = 0; y < ycount; y++)
1768627f7eb2Smrg 	{
1769627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
1770627f7eb2Smrg 	  dest_y = &dest[y*rystride];
1771627f7eb2Smrg 	  for (x = 0; x < xcount; x++)
1772627f7eb2Smrg 	    {
1773627f7eb2Smrg 	      abase_x = &abase[x*axstride];
1774627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
1775627f7eb2Smrg 	      for (n = 0; n < count; n++)
1776627f7eb2Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1777627f7eb2Smrg 	      dest_y[x*rxstride] = s;
1778627f7eb2Smrg 	    }
1779627f7eb2Smrg 	}
1780627f7eb2Smrg     }
1781627f7eb2Smrg }
1782627f7eb2Smrg #undef POW3
1783627f7eb2Smrg #undef min
1784627f7eb2Smrg #undef max
1785627f7eb2Smrg 
1786627f7eb2Smrg #endif  /* HAVE_AVX512F */
1787627f7eb2Smrg 
1788627f7eb2Smrg /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
1789627f7eb2Smrg 
1790627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1791627f7eb2Smrg void
1792627f7eb2Smrg matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
1793627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1794627f7eb2Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1795627f7eb2Smrg internal_proto(matmul_c8_avx128_fma3);
1796627f7eb2Smrg #endif
1797627f7eb2Smrg 
1798627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1799627f7eb2Smrg void
1800627f7eb2Smrg matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
1801627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1802627f7eb2Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1803627f7eb2Smrg internal_proto(matmul_c8_avx128_fma4);
1804627f7eb2Smrg #endif
1805627f7eb2Smrg 
1806627f7eb2Smrg /* Function to fall back to if there is no special processor-specific version.  */
1807627f7eb2Smrg static void
matmul_c8_vanilla(gfc_array_c8 * const restrict retarray,gfc_array_c8 * const restrict a,gfc_array_c8 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1808627f7eb2Smrg matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
1809627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1810627f7eb2Smrg 	int blas_limit, blas_call gemm)
1811627f7eb2Smrg {
1812627f7eb2Smrg   const GFC_COMPLEX_8 * restrict abase;
1813627f7eb2Smrg   const GFC_COMPLEX_8 * restrict bbase;
1814627f7eb2Smrg   GFC_COMPLEX_8 * restrict dest;
1815627f7eb2Smrg 
1816627f7eb2Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1817627f7eb2Smrg   index_type x, y, n, count, xcount, ycount;
1818627f7eb2Smrg 
1819627f7eb2Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
1820627f7eb2Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
1821627f7eb2Smrg 
1822627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1823627f7eb2Smrg 
1824627f7eb2Smrg    Either A or B (but not both) can be rank 1:
1825627f7eb2Smrg 
1826627f7eb2Smrg    o One-dimensional argument A is implicitly treated as a row matrix
1827627f7eb2Smrg      dimensioned [1,count], so xcount=1.
1828627f7eb2Smrg 
1829627f7eb2Smrg    o One-dimensional argument B is implicitly treated as a column matrix
1830627f7eb2Smrg      dimensioned [count, 1], so ycount=1.
1831627f7eb2Smrg */
1832627f7eb2Smrg 
1833627f7eb2Smrg   if (retarray->base_addr == NULL)
1834627f7eb2Smrg     {
1835627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1836627f7eb2Smrg         {
1837627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1838627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1839627f7eb2Smrg         }
1840627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1841627f7eb2Smrg         {
1842627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1843627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1844627f7eb2Smrg         }
1845627f7eb2Smrg       else
1846627f7eb2Smrg         {
1847627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1848627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1849627f7eb2Smrg 
1850627f7eb2Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
1851627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1852627f7eb2Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1853627f7eb2Smrg         }
1854627f7eb2Smrg 
1855627f7eb2Smrg       retarray->base_addr
1856627f7eb2Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
1857627f7eb2Smrg       retarray->offset = 0;
1858627f7eb2Smrg     }
1859627f7eb2Smrg   else if (unlikely (compile_options.bounds_check))
1860627f7eb2Smrg     {
1861627f7eb2Smrg       index_type ret_extent, arg_extent;
1862627f7eb2Smrg 
1863627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
1864627f7eb2Smrg 	{
1865627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1866627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1867627f7eb2Smrg 	  if (arg_extent != ret_extent)
1868627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1869627f7eb2Smrg 	    		   "array (%ld/%ld) ",
1870627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
1871627f7eb2Smrg 	}
1872627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1873627f7eb2Smrg 	{
1874627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1875627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1876627f7eb2Smrg 	  if (arg_extent != ret_extent)
1877627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1878627f7eb2Smrg 	    		   "array (%ld/%ld) ",
1879627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
1880627f7eb2Smrg 	}
1881627f7eb2Smrg       else
1882627f7eb2Smrg 	{
1883627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1884627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1885627f7eb2Smrg 	  if (arg_extent != ret_extent)
1886627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
1887627f7eb2Smrg 	    		   "array (%ld/%ld) ",
1888627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
1889627f7eb2Smrg 
1890627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1891627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1892627f7eb2Smrg 	  if (arg_extent != ret_extent)
1893627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
1894627f7eb2Smrg 	    		   "array (%ld/%ld) ",
1895627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
1896627f7eb2Smrg 	}
1897627f7eb2Smrg     }
1898627f7eb2Smrg 
1899627f7eb2Smrg 
1900627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1901627f7eb2Smrg     {
1902627f7eb2Smrg       /* One-dimensional result may be addressed in the code below
1903627f7eb2Smrg 	 either as a row or a column matrix. We want both cases to
1904627f7eb2Smrg 	 work. */
1905627f7eb2Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1906627f7eb2Smrg     }
1907627f7eb2Smrg   else
1908627f7eb2Smrg     {
1909627f7eb2Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1910627f7eb2Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1911627f7eb2Smrg     }
1912627f7eb2Smrg 
1913627f7eb2Smrg 
1914627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
1915627f7eb2Smrg     {
1916627f7eb2Smrg       /* Treat it as a a row matrix A[1,count]. */
1917627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1918627f7eb2Smrg       aystride = 1;
1919627f7eb2Smrg 
1920627f7eb2Smrg       xcount = 1;
1921627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
1922627f7eb2Smrg     }
1923627f7eb2Smrg   else
1924627f7eb2Smrg     {
1925627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1926627f7eb2Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1927627f7eb2Smrg 
1928627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
1929627f7eb2Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1930627f7eb2Smrg     }
1931627f7eb2Smrg 
1932627f7eb2Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1933627f7eb2Smrg     {
1934627f7eb2Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1935627f7eb2Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1936627f7eb2Smrg 		       "in dimension 1: is %ld, should be %ld",
1937627f7eb2Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1938627f7eb2Smrg     }
1939627f7eb2Smrg 
1940627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
1941627f7eb2Smrg     {
1942627f7eb2Smrg       /* Treat it as a column matrix B[count,1] */
1943627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1944627f7eb2Smrg 
1945627f7eb2Smrg       /* bystride should never be used for 1-dimensional b.
1946627f7eb2Smrg          The value is only used for calculation of the
1947627f7eb2Smrg          memory by the buffer.  */
1948627f7eb2Smrg       bystride = 256;
1949627f7eb2Smrg       ycount = 1;
1950627f7eb2Smrg     }
1951627f7eb2Smrg   else
1952627f7eb2Smrg     {
1953627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1954627f7eb2Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1955627f7eb2Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1956627f7eb2Smrg     }
1957627f7eb2Smrg 
1958627f7eb2Smrg   abase = a->base_addr;
1959627f7eb2Smrg   bbase = b->base_addr;
1960627f7eb2Smrg   dest = retarray->base_addr;
1961627f7eb2Smrg 
1962627f7eb2Smrg   /* Now that everything is set up, we perform the multiplication
1963627f7eb2Smrg      itself.  */
1964627f7eb2Smrg 
1965627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1966627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
1967627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
1968627f7eb2Smrg 
1969627f7eb2Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1970627f7eb2Smrg       && (bxstride == 1 || bystride == 1)
1971627f7eb2Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
1972627f7eb2Smrg           > POW3(blas_limit)))
1973627f7eb2Smrg     {
1974627f7eb2Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
1975627f7eb2Smrg       const GFC_COMPLEX_8 one = 1, zero = 0;
1976627f7eb2Smrg       const int lda = (axstride == 1) ? aystride : axstride,
1977627f7eb2Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
1978627f7eb2Smrg 
1979627f7eb2Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1980627f7eb2Smrg 	{
1981627f7eb2Smrg 	  assert (gemm != NULL);
1982627f7eb2Smrg 	  const char *transa, *transb;
1983627f7eb2Smrg 	  if (try_blas & 2)
1984627f7eb2Smrg 	    transa = "C";
1985627f7eb2Smrg 	  else
1986627f7eb2Smrg 	    transa = axstride == 1 ? "N" : "T";
1987627f7eb2Smrg 
1988627f7eb2Smrg 	  if (try_blas & 4)
1989627f7eb2Smrg 	    transb = "C";
1990627f7eb2Smrg 	  else
1991627f7eb2Smrg 	    transb = bxstride == 1 ? "N" : "T";
1992627f7eb2Smrg 
1993627f7eb2Smrg 	  gemm (transa, transb , &m,
1994627f7eb2Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1995627f7eb2Smrg 		&ldc, 1, 1);
1996627f7eb2Smrg 	  return;
1997627f7eb2Smrg 	}
1998627f7eb2Smrg     }
1999627f7eb2Smrg 
2000*4c3eb207Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
2001*4c3eb207Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
2002627f7eb2Smrg     {
2003627f7eb2Smrg       /* This block of code implements a tuned matmul, derived from
2004627f7eb2Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2005627f7eb2Smrg 
2006627f7eb2Smrg                Bo Kagstrom and Per Ling
2007627f7eb2Smrg                Department of Computing Science
2008627f7eb2Smrg                Umea University
2009627f7eb2Smrg                S-901 87 Umea, Sweden
2010627f7eb2Smrg 
2011627f7eb2Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2012627f7eb2Smrg 
2013627f7eb2Smrg       const GFC_COMPLEX_8 *a, *b;
2014627f7eb2Smrg       GFC_COMPLEX_8 *c;
2015627f7eb2Smrg       const index_type m = xcount, n = ycount, k = count;
2016627f7eb2Smrg 
2017627f7eb2Smrg       /* System generated locals */
2018627f7eb2Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2019627f7eb2Smrg 		 i1, i2, i3, i4, i5, i6;
2020627f7eb2Smrg 
2021627f7eb2Smrg       /* Local variables */
2022627f7eb2Smrg       GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
2023627f7eb2Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
2024627f7eb2Smrg       index_type i, j, l, ii, jj, ll;
2025627f7eb2Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2026627f7eb2Smrg       GFC_COMPLEX_8 *t1;
2027627f7eb2Smrg 
2028627f7eb2Smrg       a = abase;
2029627f7eb2Smrg       b = bbase;
2030627f7eb2Smrg       c = retarray->base_addr;
2031627f7eb2Smrg 
2032627f7eb2Smrg       /* Parameter adjustments */
2033627f7eb2Smrg       c_dim1 = rystride;
2034627f7eb2Smrg       c_offset = 1 + c_dim1;
2035627f7eb2Smrg       c -= c_offset;
2036627f7eb2Smrg       a_dim1 = aystride;
2037627f7eb2Smrg       a_offset = 1 + a_dim1;
2038627f7eb2Smrg       a -= a_offset;
2039627f7eb2Smrg       b_dim1 = bystride;
2040627f7eb2Smrg       b_offset = 1 + b_dim1;
2041627f7eb2Smrg       b -= b_offset;
2042627f7eb2Smrg 
2043627f7eb2Smrg       /* Empty c first.  */
2044627f7eb2Smrg       for (j=1; j<=n; j++)
2045627f7eb2Smrg 	for (i=1; i<=m; i++)
2046627f7eb2Smrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
2047627f7eb2Smrg 
2048627f7eb2Smrg       /* Early exit if possible */
2049627f7eb2Smrg       if (m == 0 || n == 0 || k == 0)
2050627f7eb2Smrg 	return;
2051627f7eb2Smrg 
2052627f7eb2Smrg       /* Adjust size of t1 to what is needed.  */
2053627f7eb2Smrg       index_type t1_dim, a_sz;
2054627f7eb2Smrg       if (aystride == 1)
2055627f7eb2Smrg         a_sz = rystride;
2056627f7eb2Smrg       else
2057627f7eb2Smrg         a_sz = a_dim1;
2058627f7eb2Smrg 
2059627f7eb2Smrg       t1_dim = a_sz * 256 + b_dim1;
2060627f7eb2Smrg       if (t1_dim > 65536)
2061627f7eb2Smrg 	t1_dim = 65536;
2062627f7eb2Smrg 
2063627f7eb2Smrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
2064627f7eb2Smrg 
2065627f7eb2Smrg       /* Start turning the crank. */
2066627f7eb2Smrg       i1 = n;
2067627f7eb2Smrg       for (jj = 1; jj <= i1; jj += 512)
2068627f7eb2Smrg 	{
2069627f7eb2Smrg 	  /* Computing MIN */
2070627f7eb2Smrg 	  i2 = 512;
2071627f7eb2Smrg 	  i3 = n - jj + 1;
2072627f7eb2Smrg 	  jsec = min(i2,i3);
2073627f7eb2Smrg 	  ujsec = jsec - jsec % 4;
2074627f7eb2Smrg 	  i2 = k;
2075627f7eb2Smrg 	  for (ll = 1; ll <= i2; ll += 256)
2076627f7eb2Smrg 	    {
2077627f7eb2Smrg 	      /* Computing MIN */
2078627f7eb2Smrg 	      i3 = 256;
2079627f7eb2Smrg 	      i4 = k - ll + 1;
2080627f7eb2Smrg 	      lsec = min(i3,i4);
2081627f7eb2Smrg 	      ulsec = lsec - lsec % 2;
2082627f7eb2Smrg 
2083627f7eb2Smrg 	      i3 = m;
2084627f7eb2Smrg 	      for (ii = 1; ii <= i3; ii += 256)
2085627f7eb2Smrg 		{
2086627f7eb2Smrg 		  /* Computing MIN */
2087627f7eb2Smrg 		  i4 = 256;
2088627f7eb2Smrg 		  i5 = m - ii + 1;
2089627f7eb2Smrg 		  isec = min(i4,i5);
2090627f7eb2Smrg 		  uisec = isec - isec % 2;
2091627f7eb2Smrg 		  i4 = ll + ulsec - 1;
2092627f7eb2Smrg 		  for (l = ll; l <= i4; l += 2)
2093627f7eb2Smrg 		    {
2094627f7eb2Smrg 		      i5 = ii + uisec - 1;
2095627f7eb2Smrg 		      for (i = ii; i <= i5; i += 2)
2096627f7eb2Smrg 			{
2097627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2098627f7eb2Smrg 					a[i + l * a_dim1];
2099627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2100627f7eb2Smrg 					a[i + (l + 1) * a_dim1];
2101627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2102627f7eb2Smrg 					a[i + 1 + l * a_dim1];
2103627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2104627f7eb2Smrg 					a[i + 1 + (l + 1) * a_dim1];
2105627f7eb2Smrg 			}
2106627f7eb2Smrg 		      if (uisec < isec)
2107627f7eb2Smrg 			{
2108627f7eb2Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
2109627f7eb2Smrg 				    a[ii + isec - 1 + l * a_dim1];
2110627f7eb2Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
2111627f7eb2Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2112627f7eb2Smrg 			}
2113627f7eb2Smrg 		    }
2114627f7eb2Smrg 		  if (ulsec < lsec)
2115627f7eb2Smrg 		    {
2116627f7eb2Smrg 		      i4 = ii + isec - 1;
2117627f7eb2Smrg 		      for (i = ii; i<= i4; ++i)
2118627f7eb2Smrg 			{
2119627f7eb2Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2120627f7eb2Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
2121627f7eb2Smrg 			}
2122627f7eb2Smrg 		    }
2123627f7eb2Smrg 
2124627f7eb2Smrg 		  uisec = isec - isec % 4;
2125627f7eb2Smrg 		  i4 = jj + ujsec - 1;
2126627f7eb2Smrg 		  for (j = jj; j <= i4; j += 4)
2127627f7eb2Smrg 		    {
2128627f7eb2Smrg 		      i5 = ii + uisec - 1;
2129627f7eb2Smrg 		      for (i = ii; i <= i5; i += 4)
2130627f7eb2Smrg 			{
2131627f7eb2Smrg 			  f11 = c[i + j * c_dim1];
2132627f7eb2Smrg 			  f21 = c[i + 1 + j * c_dim1];
2133627f7eb2Smrg 			  f12 = c[i + (j + 1) * c_dim1];
2134627f7eb2Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2135627f7eb2Smrg 			  f13 = c[i + (j + 2) * c_dim1];
2136627f7eb2Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2137627f7eb2Smrg 			  f14 = c[i + (j + 3) * c_dim1];
2138627f7eb2Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2139627f7eb2Smrg 			  f31 = c[i + 2 + j * c_dim1];
2140627f7eb2Smrg 			  f41 = c[i + 3 + j * c_dim1];
2141627f7eb2Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2142627f7eb2Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2143627f7eb2Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2144627f7eb2Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2145627f7eb2Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2146627f7eb2Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2147627f7eb2Smrg 			  i6 = ll + lsec - 1;
2148627f7eb2Smrg 			  for (l = ll; l <= i6; ++l)
2149627f7eb2Smrg 			    {
2150627f7eb2Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151627f7eb2Smrg 				      * b[l + j * b_dim1];
2152627f7eb2Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153627f7eb2Smrg 				      * b[l + j * b_dim1];
2154627f7eb2Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
2156627f7eb2Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
2158627f7eb2Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
2160627f7eb2Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
2162627f7eb2Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2163627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
2164627f7eb2Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2165627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
2166627f7eb2Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167627f7eb2Smrg 				      * b[l + j * b_dim1];
2168627f7eb2Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169627f7eb2Smrg 				      * b[l + j * b_dim1];
2170627f7eb2Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
2172627f7eb2Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
2174627f7eb2Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
2176627f7eb2Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
2178627f7eb2Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2179627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
2180627f7eb2Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2181627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
2182627f7eb2Smrg 			    }
2183627f7eb2Smrg 			  c[i + j * c_dim1] = f11;
2184627f7eb2Smrg 			  c[i + 1 + j * c_dim1] = f21;
2185627f7eb2Smrg 			  c[i + (j + 1) * c_dim1] = f12;
2186627f7eb2Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2187627f7eb2Smrg 			  c[i + (j + 2) * c_dim1] = f13;
2188627f7eb2Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2189627f7eb2Smrg 			  c[i + (j + 3) * c_dim1] = f14;
2190627f7eb2Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2191627f7eb2Smrg 			  c[i + 2 + j * c_dim1] = f31;
2192627f7eb2Smrg 			  c[i + 3 + j * c_dim1] = f41;
2193627f7eb2Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2194627f7eb2Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2195627f7eb2Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2196627f7eb2Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2197627f7eb2Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2198627f7eb2Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2199627f7eb2Smrg 			}
2200627f7eb2Smrg 		      if (uisec < isec)
2201627f7eb2Smrg 			{
2202627f7eb2Smrg 			  i5 = ii + isec - 1;
2203627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2204627f7eb2Smrg 			    {
2205627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
2206627f7eb2Smrg 			      f12 = c[i + (j + 1) * c_dim1];
2207627f7eb2Smrg 			      f13 = c[i + (j + 2) * c_dim1];
2208627f7eb2Smrg 			      f14 = c[i + (j + 3) * c_dim1];
2209627f7eb2Smrg 			      i6 = ll + lsec - 1;
2210627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
2211627f7eb2Smrg 				{
2212627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2214627f7eb2Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215627f7eb2Smrg 					  257] * b[l + (j + 1) * b_dim1];
2216627f7eb2Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217627f7eb2Smrg 					  257] * b[l + (j + 2) * b_dim1];
2218627f7eb2Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2219627f7eb2Smrg 					  257] * b[l + (j + 3) * b_dim1];
2220627f7eb2Smrg 				}
2221627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
2222627f7eb2Smrg 			      c[i + (j + 1) * c_dim1] = f12;
2223627f7eb2Smrg 			      c[i + (j + 2) * c_dim1] = f13;
2224627f7eb2Smrg 			      c[i + (j + 3) * c_dim1] = f14;
2225627f7eb2Smrg 			    }
2226627f7eb2Smrg 			}
2227627f7eb2Smrg 		    }
2228627f7eb2Smrg 		  if (ujsec < jsec)
2229627f7eb2Smrg 		    {
2230627f7eb2Smrg 		      i4 = jj + jsec - 1;
2231627f7eb2Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
2232627f7eb2Smrg 			{
2233627f7eb2Smrg 			  i5 = ii + uisec - 1;
2234627f7eb2Smrg 			  for (i = ii; i <= i5; i += 4)
2235627f7eb2Smrg 			    {
2236627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
2237627f7eb2Smrg 			      f21 = c[i + 1 + j * c_dim1];
2238627f7eb2Smrg 			      f31 = c[i + 2 + j * c_dim1];
2239627f7eb2Smrg 			      f41 = c[i + 3 + j * c_dim1];
2240627f7eb2Smrg 			      i6 = ll + lsec - 1;
2241627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
2242627f7eb2Smrg 				{
2243627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2244627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2245627f7eb2Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2246627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2247627f7eb2Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2248627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2249627f7eb2Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2250627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2251627f7eb2Smrg 				}
2252627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
2253627f7eb2Smrg 			      c[i + 1 + j * c_dim1] = f21;
2254627f7eb2Smrg 			      c[i + 2 + j * c_dim1] = f31;
2255627f7eb2Smrg 			      c[i + 3 + j * c_dim1] = f41;
2256627f7eb2Smrg 			    }
2257627f7eb2Smrg 			  i5 = ii + isec - 1;
2258627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2259627f7eb2Smrg 			    {
2260627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
2261627f7eb2Smrg 			      i6 = ll + lsec - 1;
2262627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
2263627f7eb2Smrg 				{
2264627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2265627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2266627f7eb2Smrg 				}
2267627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
2268627f7eb2Smrg 			    }
2269627f7eb2Smrg 			}
2270627f7eb2Smrg 		    }
2271627f7eb2Smrg 		}
2272627f7eb2Smrg 	    }
2273627f7eb2Smrg 	}
2274627f7eb2Smrg       free(t1);
2275627f7eb2Smrg       return;
2276627f7eb2Smrg     }
2277627f7eb2Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2278627f7eb2Smrg     {
2279627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
2280627f7eb2Smrg 	{
2281627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict abase_x;
2282627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
2283627f7eb2Smrg 	  GFC_COMPLEX_8 *restrict dest_y;
2284627f7eb2Smrg 	  GFC_COMPLEX_8 s;
2285627f7eb2Smrg 
2286627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
2287627f7eb2Smrg 	    {
2288627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
2289627f7eb2Smrg 	      dest_y = &dest[y*rystride];
2290627f7eb2Smrg 	      for (x = 0; x < xcount; x++)
2291627f7eb2Smrg 		{
2292627f7eb2Smrg 		  abase_x = &abase[x*axstride];
2293627f7eb2Smrg 		  s = (GFC_COMPLEX_8) 0;
2294627f7eb2Smrg 		  for (n = 0; n < count; n++)
2295627f7eb2Smrg 		    s += abase_x[n] * bbase_y[n];
2296627f7eb2Smrg 		  dest_y[x] = s;
2297627f7eb2Smrg 		}
2298627f7eb2Smrg 	    }
2299627f7eb2Smrg 	}
2300627f7eb2Smrg       else
2301627f7eb2Smrg 	{
2302627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
2303627f7eb2Smrg 	  GFC_COMPLEX_8 s;
2304627f7eb2Smrg 
2305627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
2306627f7eb2Smrg 	    {
2307627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
2308627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
2309627f7eb2Smrg 	      for (n = 0; n < count; n++)
2310627f7eb2Smrg 		s += abase[n*axstride] * bbase_y[n];
2311627f7eb2Smrg 	      dest[y*rystride] = s;
2312627f7eb2Smrg 	    }
2313627f7eb2Smrg 	}
2314627f7eb2Smrg     }
2315627f7eb2Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2316627f7eb2Smrg     {
2317627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
2318627f7eb2Smrg       GFC_COMPLEX_8 s;
2319627f7eb2Smrg 
2320627f7eb2Smrg       for (y = 0; y < ycount; y++)
2321627f7eb2Smrg 	{
2322627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
2323627f7eb2Smrg 	  s = (GFC_COMPLEX_8) 0;
2324627f7eb2Smrg 	  for (n = 0; n < count; n++)
2325627f7eb2Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2326627f7eb2Smrg 	  dest[y*rxstride] = s;
2327627f7eb2Smrg 	}
2328627f7eb2Smrg     }
2329*4c3eb207Smrg   else if (axstride < aystride)
2330*4c3eb207Smrg     {
2331*4c3eb207Smrg       for (y = 0; y < ycount; y++)
2332*4c3eb207Smrg 	for (x = 0; x < xcount; x++)
2333*4c3eb207Smrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
2334*4c3eb207Smrg 
2335*4c3eb207Smrg       for (y = 0; y < ycount; y++)
2336*4c3eb207Smrg 	for (n = 0; n < count; n++)
2337*4c3eb207Smrg 	  for (x = 0; x < xcount; x++)
2338*4c3eb207Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
2339*4c3eb207Smrg 	    dest[x*rxstride + y*rystride] +=
2340*4c3eb207Smrg 					abase[x*axstride + n*aystride] *
2341*4c3eb207Smrg 					bbase[n*bxstride + y*bystride];
2342*4c3eb207Smrg     }
2343627f7eb2Smrg   else
2344627f7eb2Smrg     {
2345627f7eb2Smrg       const GFC_COMPLEX_8 *restrict abase_x;
2346627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
2347627f7eb2Smrg       GFC_COMPLEX_8 *restrict dest_y;
2348627f7eb2Smrg       GFC_COMPLEX_8 s;
2349627f7eb2Smrg 
2350627f7eb2Smrg       for (y = 0; y < ycount; y++)
2351627f7eb2Smrg 	{
2352627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
2353627f7eb2Smrg 	  dest_y = &dest[y*rystride];
2354627f7eb2Smrg 	  for (x = 0; x < xcount; x++)
2355627f7eb2Smrg 	    {
2356627f7eb2Smrg 	      abase_x = &abase[x*axstride];
2357627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
2358627f7eb2Smrg 	      for (n = 0; n < count; n++)
2359627f7eb2Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2360627f7eb2Smrg 	      dest_y[x*rxstride] = s;
2361627f7eb2Smrg 	    }
2362627f7eb2Smrg 	}
2363627f7eb2Smrg     }
2364627f7eb2Smrg }
2365627f7eb2Smrg #undef POW3
2366627f7eb2Smrg #undef min
2367627f7eb2Smrg #undef max
2368627f7eb2Smrg 
2369627f7eb2Smrg 
2370627f7eb2Smrg /* Compiling main function, with selection code for the processor.  */
2371627f7eb2Smrg 
2372627f7eb2Smrg /* Currently, this is i386 only.  Adjust for other architectures.  */
2373627f7eb2Smrg 
2374627f7eb2Smrg #include <config/i386/cpuinfo.h>
matmul_c8(gfc_array_c8 * const restrict retarray,gfc_array_c8 * const restrict a,gfc_array_c8 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2375627f7eb2Smrg void matmul_c8 (gfc_array_c8 * const restrict retarray,
2376627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2377627f7eb2Smrg 	int blas_limit, blas_call gemm)
2378627f7eb2Smrg {
2379627f7eb2Smrg   static void (*matmul_p) (gfc_array_c8 * const restrict retarray,
2380627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2381627f7eb2Smrg 	int blas_limit, blas_call gemm);
2382627f7eb2Smrg 
2383627f7eb2Smrg   void (*matmul_fn) (gfc_array_c8 * const restrict retarray,
2384627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2385627f7eb2Smrg 	int blas_limit, blas_call gemm);
2386627f7eb2Smrg 
2387627f7eb2Smrg   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2388627f7eb2Smrg   if (matmul_fn == NULL)
2389627f7eb2Smrg     {
2390627f7eb2Smrg       matmul_fn = matmul_c8_vanilla;
2391627f7eb2Smrg       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2392627f7eb2Smrg 	{
2393627f7eb2Smrg           /* Run down the available processors in order of preference.  */
2394627f7eb2Smrg #ifdef HAVE_AVX512F
2395627f7eb2Smrg       	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2396627f7eb2Smrg 	    {
2397627f7eb2Smrg 	      matmul_fn = matmul_c8_avx512f;
2398627f7eb2Smrg 	      goto store;
2399627f7eb2Smrg 	    }
2400627f7eb2Smrg 
2401627f7eb2Smrg #endif  /* HAVE_AVX512F */
2402627f7eb2Smrg 
2403627f7eb2Smrg #ifdef HAVE_AVX2
2404627f7eb2Smrg       	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2405627f7eb2Smrg 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2406627f7eb2Smrg 	    {
2407627f7eb2Smrg 	      matmul_fn = matmul_c8_avx2;
2408627f7eb2Smrg 	      goto store;
2409627f7eb2Smrg 	    }
2410627f7eb2Smrg 
2411627f7eb2Smrg #endif
2412627f7eb2Smrg 
2413627f7eb2Smrg #ifdef HAVE_AVX
2414627f7eb2Smrg       	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2415627f7eb2Smrg  	    {
2416627f7eb2Smrg               matmul_fn = matmul_c8_avx;
2417627f7eb2Smrg 	      goto store;
2418627f7eb2Smrg 	    }
2419627f7eb2Smrg #endif  /* HAVE_AVX */
2420627f7eb2Smrg         }
2421627f7eb2Smrg     else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2422627f7eb2Smrg       {
2423627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2424627f7eb2Smrg         if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2425627f7eb2Smrg 	    && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2426627f7eb2Smrg 	  {
2427627f7eb2Smrg             matmul_fn = matmul_c8_avx128_fma3;
2428627f7eb2Smrg 	    goto store;
2429627f7eb2Smrg 	  }
2430627f7eb2Smrg #endif
2431627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2432627f7eb2Smrg         if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2433627f7eb2Smrg 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2434627f7eb2Smrg 	  {
2435627f7eb2Smrg             matmul_fn = matmul_c8_avx128_fma4;
2436627f7eb2Smrg 	    goto store;
2437627f7eb2Smrg 	  }
2438627f7eb2Smrg #endif
2439627f7eb2Smrg 
2440627f7eb2Smrg       }
2441627f7eb2Smrg    store:
2442627f7eb2Smrg       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2443627f7eb2Smrg    }
2444627f7eb2Smrg 
2445627f7eb2Smrg    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2446627f7eb2Smrg }
2447627f7eb2Smrg 
2448627f7eb2Smrg #else  /* Just the vanilla function.  */
2449627f7eb2Smrg 
2450627f7eb2Smrg void
matmul_c8(gfc_array_c8 * const restrict retarray,gfc_array_c8 * const restrict a,gfc_array_c8 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2451627f7eb2Smrg matmul_c8 (gfc_array_c8 * const restrict retarray,
2452627f7eb2Smrg 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2453627f7eb2Smrg 	int blas_limit, blas_call gemm)
2454627f7eb2Smrg {
2455627f7eb2Smrg   const GFC_COMPLEX_8 * restrict abase;
2456627f7eb2Smrg   const GFC_COMPLEX_8 * restrict bbase;
2457627f7eb2Smrg   GFC_COMPLEX_8 * restrict dest;
2458627f7eb2Smrg 
2459627f7eb2Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2460627f7eb2Smrg   index_type x, y, n, count, xcount, ycount;
2461627f7eb2Smrg 
2462627f7eb2Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
2463627f7eb2Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
2464627f7eb2Smrg 
2465627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2466627f7eb2Smrg 
2467627f7eb2Smrg    Either A or B (but not both) can be rank 1:
2468627f7eb2Smrg 
2469627f7eb2Smrg    o One-dimensional argument A is implicitly treated as a row matrix
2470627f7eb2Smrg      dimensioned [1,count], so xcount=1.
2471627f7eb2Smrg 
2472627f7eb2Smrg    o One-dimensional argument B is implicitly treated as a column matrix
2473627f7eb2Smrg      dimensioned [count, 1], so ycount=1.
2474627f7eb2Smrg */
2475627f7eb2Smrg 
2476627f7eb2Smrg   if (retarray->base_addr == NULL)
2477627f7eb2Smrg     {
2478627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
2479627f7eb2Smrg         {
2480627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2481627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2482627f7eb2Smrg         }
2483627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2484627f7eb2Smrg         {
2485627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2486627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2487627f7eb2Smrg         }
2488627f7eb2Smrg       else
2489627f7eb2Smrg         {
2490627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2491627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2492627f7eb2Smrg 
2493627f7eb2Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
2494627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2495627f7eb2Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
2496627f7eb2Smrg         }
2497627f7eb2Smrg 
2498627f7eb2Smrg       retarray->base_addr
2499627f7eb2Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
2500627f7eb2Smrg       retarray->offset = 0;
2501627f7eb2Smrg     }
2502627f7eb2Smrg   else if (unlikely (compile_options.bounds_check))
2503627f7eb2Smrg     {
2504627f7eb2Smrg       index_type ret_extent, arg_extent;
2505627f7eb2Smrg 
2506627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
2507627f7eb2Smrg 	{
2508627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2509627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2510627f7eb2Smrg 	  if (arg_extent != ret_extent)
2511627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
2512627f7eb2Smrg 	    		   "array (%ld/%ld) ",
2513627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
2514627f7eb2Smrg 	}
2515627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2516627f7eb2Smrg 	{
2517627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2518627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2519627f7eb2Smrg 	  if (arg_extent != ret_extent)
2520627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
2521627f7eb2Smrg 	    		   "array (%ld/%ld) ",
2522627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
2523627f7eb2Smrg 	}
2524627f7eb2Smrg       else
2525627f7eb2Smrg 	{
2526627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2527627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2528627f7eb2Smrg 	  if (arg_extent != ret_extent)
2529627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
2530627f7eb2Smrg 	    		   "array (%ld/%ld) ",
2531627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
2532627f7eb2Smrg 
2533627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2534627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2535627f7eb2Smrg 	  if (arg_extent != ret_extent)
2536627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
2537627f7eb2Smrg 	    		   "array (%ld/%ld) ",
2538627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
2539627f7eb2Smrg 	}
2540627f7eb2Smrg     }
2541627f7eb2Smrg 
2542627f7eb2Smrg 
2543627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2544627f7eb2Smrg     {
2545627f7eb2Smrg       /* One-dimensional result may be addressed in the code below
2546627f7eb2Smrg 	 either as a row or a column matrix. We want both cases to
2547627f7eb2Smrg 	 work. */
2548627f7eb2Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2549627f7eb2Smrg     }
2550627f7eb2Smrg   else
2551627f7eb2Smrg     {
2552627f7eb2Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2553627f7eb2Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2554627f7eb2Smrg     }
2555627f7eb2Smrg 
2556627f7eb2Smrg 
2557627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
2558627f7eb2Smrg     {
2559627f7eb2Smrg       /* Treat it as a a row matrix A[1,count]. */
2560627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2561627f7eb2Smrg       aystride = 1;
2562627f7eb2Smrg 
2563627f7eb2Smrg       xcount = 1;
2564627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
2565627f7eb2Smrg     }
2566627f7eb2Smrg   else
2567627f7eb2Smrg     {
2568627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2569627f7eb2Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2570627f7eb2Smrg 
2571627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
2572627f7eb2Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2573627f7eb2Smrg     }
2574627f7eb2Smrg 
2575627f7eb2Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2576627f7eb2Smrg     {
2577627f7eb2Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2578627f7eb2Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2579627f7eb2Smrg 		       "in dimension 1: is %ld, should be %ld",
2580627f7eb2Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2581627f7eb2Smrg     }
2582627f7eb2Smrg 
2583627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
2584627f7eb2Smrg     {
2585627f7eb2Smrg       /* Treat it as a column matrix B[count,1] */
2586627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2587627f7eb2Smrg 
2588627f7eb2Smrg       /* bystride should never be used for 1-dimensional b.
2589627f7eb2Smrg          The value is only used for calculation of the
2590627f7eb2Smrg          memory by the buffer.  */
2591627f7eb2Smrg       bystride = 256;
2592627f7eb2Smrg       ycount = 1;
2593627f7eb2Smrg     }
2594627f7eb2Smrg   else
2595627f7eb2Smrg     {
2596627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2597627f7eb2Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2598627f7eb2Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2599627f7eb2Smrg     }
2600627f7eb2Smrg 
2601627f7eb2Smrg   abase = a->base_addr;
2602627f7eb2Smrg   bbase = b->base_addr;
2603627f7eb2Smrg   dest = retarray->base_addr;
2604627f7eb2Smrg 
2605627f7eb2Smrg   /* Now that everything is set up, we perform the multiplication
2606627f7eb2Smrg      itself.  */
2607627f7eb2Smrg 
2608627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2609627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
2610627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
2611627f7eb2Smrg 
2612627f7eb2Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2613627f7eb2Smrg       && (bxstride == 1 || bystride == 1)
2614627f7eb2Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
2615627f7eb2Smrg           > POW3(blas_limit)))
2616627f7eb2Smrg     {
2617627f7eb2Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
2618627f7eb2Smrg       const GFC_COMPLEX_8 one = 1, zero = 0;
2619627f7eb2Smrg       const int lda = (axstride == 1) ? aystride : axstride,
2620627f7eb2Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
2621627f7eb2Smrg 
2622627f7eb2Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2623627f7eb2Smrg 	{
2624627f7eb2Smrg 	  assert (gemm != NULL);
2625627f7eb2Smrg 	  const char *transa, *transb;
2626627f7eb2Smrg 	  if (try_blas & 2)
2627627f7eb2Smrg 	    transa = "C";
2628627f7eb2Smrg 	  else
2629627f7eb2Smrg 	    transa = axstride == 1 ? "N" : "T";
2630627f7eb2Smrg 
2631627f7eb2Smrg 	  if (try_blas & 4)
2632627f7eb2Smrg 	    transb = "C";
2633627f7eb2Smrg 	  else
2634627f7eb2Smrg 	    transb = bxstride == 1 ? "N" : "T";
2635627f7eb2Smrg 
2636627f7eb2Smrg 	  gemm (transa, transb , &m,
2637627f7eb2Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
2638627f7eb2Smrg 		&ldc, 1, 1);
2639627f7eb2Smrg 	  return;
2640627f7eb2Smrg 	}
2641627f7eb2Smrg     }
2642627f7eb2Smrg 
2643*4c3eb207Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
2644*4c3eb207Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
2645627f7eb2Smrg     {
2646627f7eb2Smrg       /* This block of code implements a tuned matmul, derived from
2647627f7eb2Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2648627f7eb2Smrg 
2649627f7eb2Smrg                Bo Kagstrom and Per Ling
2650627f7eb2Smrg                Department of Computing Science
2651627f7eb2Smrg                Umea University
2652627f7eb2Smrg                S-901 87 Umea, Sweden
2653627f7eb2Smrg 
2654627f7eb2Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2655627f7eb2Smrg 
2656627f7eb2Smrg       const GFC_COMPLEX_8 *a, *b;
2657627f7eb2Smrg       GFC_COMPLEX_8 *c;
2658627f7eb2Smrg       const index_type m = xcount, n = ycount, k = count;
2659627f7eb2Smrg 
2660627f7eb2Smrg       /* System generated locals */
2661627f7eb2Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2662627f7eb2Smrg 		 i1, i2, i3, i4, i5, i6;
2663627f7eb2Smrg 
2664627f7eb2Smrg       /* Local variables */
2665627f7eb2Smrg       GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
2666627f7eb2Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
2667627f7eb2Smrg       index_type i, j, l, ii, jj, ll;
2668627f7eb2Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2669627f7eb2Smrg       GFC_COMPLEX_8 *t1;
2670627f7eb2Smrg 
2671627f7eb2Smrg       a = abase;
2672627f7eb2Smrg       b = bbase;
2673627f7eb2Smrg       c = retarray->base_addr;
2674627f7eb2Smrg 
2675627f7eb2Smrg       /* Parameter adjustments */
2676627f7eb2Smrg       c_dim1 = rystride;
2677627f7eb2Smrg       c_offset = 1 + c_dim1;
2678627f7eb2Smrg       c -= c_offset;
2679627f7eb2Smrg       a_dim1 = aystride;
2680627f7eb2Smrg       a_offset = 1 + a_dim1;
2681627f7eb2Smrg       a -= a_offset;
2682627f7eb2Smrg       b_dim1 = bystride;
2683627f7eb2Smrg       b_offset = 1 + b_dim1;
2684627f7eb2Smrg       b -= b_offset;
2685627f7eb2Smrg 
2686627f7eb2Smrg       /* Empty c first.  */
2687627f7eb2Smrg       for (j=1; j<=n; j++)
2688627f7eb2Smrg 	for (i=1; i<=m; i++)
2689627f7eb2Smrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
2690627f7eb2Smrg 
2691627f7eb2Smrg       /* Early exit if possible */
2692627f7eb2Smrg       if (m == 0 || n == 0 || k == 0)
2693627f7eb2Smrg 	return;
2694627f7eb2Smrg 
2695627f7eb2Smrg       /* Adjust size of t1 to what is needed.  */
2696627f7eb2Smrg       index_type t1_dim, a_sz;
2697627f7eb2Smrg       if (aystride == 1)
2698627f7eb2Smrg         a_sz = rystride;
2699627f7eb2Smrg       else
2700627f7eb2Smrg         a_sz = a_dim1;
2701627f7eb2Smrg 
2702627f7eb2Smrg       t1_dim = a_sz * 256 + b_dim1;
2703627f7eb2Smrg       if (t1_dim > 65536)
2704627f7eb2Smrg 	t1_dim = 65536;
2705627f7eb2Smrg 
2706627f7eb2Smrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
2707627f7eb2Smrg 
2708627f7eb2Smrg       /* Start turning the crank. */
2709627f7eb2Smrg       i1 = n;
2710627f7eb2Smrg       for (jj = 1; jj <= i1; jj += 512)
2711627f7eb2Smrg 	{
2712627f7eb2Smrg 	  /* Computing MIN */
2713627f7eb2Smrg 	  i2 = 512;
2714627f7eb2Smrg 	  i3 = n - jj + 1;
2715627f7eb2Smrg 	  jsec = min(i2,i3);
2716627f7eb2Smrg 	  ujsec = jsec - jsec % 4;
2717627f7eb2Smrg 	  i2 = k;
2718627f7eb2Smrg 	  for (ll = 1; ll <= i2; ll += 256)
2719627f7eb2Smrg 	    {
2720627f7eb2Smrg 	      /* Computing MIN */
2721627f7eb2Smrg 	      i3 = 256;
2722627f7eb2Smrg 	      i4 = k - ll + 1;
2723627f7eb2Smrg 	      lsec = min(i3,i4);
2724627f7eb2Smrg 	      ulsec = lsec - lsec % 2;
2725627f7eb2Smrg 
2726627f7eb2Smrg 	      i3 = m;
2727627f7eb2Smrg 	      for (ii = 1; ii <= i3; ii += 256)
2728627f7eb2Smrg 		{
2729627f7eb2Smrg 		  /* Computing MIN */
2730627f7eb2Smrg 		  i4 = 256;
2731627f7eb2Smrg 		  i5 = m - ii + 1;
2732627f7eb2Smrg 		  isec = min(i4,i5);
2733627f7eb2Smrg 		  uisec = isec - isec % 2;
2734627f7eb2Smrg 		  i4 = ll + ulsec - 1;
2735627f7eb2Smrg 		  for (l = ll; l <= i4; l += 2)
2736627f7eb2Smrg 		    {
2737627f7eb2Smrg 		      i5 = ii + uisec - 1;
2738627f7eb2Smrg 		      for (i = ii; i <= i5; i += 2)
2739627f7eb2Smrg 			{
2740627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2741627f7eb2Smrg 					a[i + l * a_dim1];
2742627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2743627f7eb2Smrg 					a[i + (l + 1) * a_dim1];
2744627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2745627f7eb2Smrg 					a[i + 1 + l * a_dim1];
2746627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2747627f7eb2Smrg 					a[i + 1 + (l + 1) * a_dim1];
2748627f7eb2Smrg 			}
2749627f7eb2Smrg 		      if (uisec < isec)
2750627f7eb2Smrg 			{
2751627f7eb2Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
2752627f7eb2Smrg 				    a[ii + isec - 1 + l * a_dim1];
2753627f7eb2Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
2754627f7eb2Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2755627f7eb2Smrg 			}
2756627f7eb2Smrg 		    }
2757627f7eb2Smrg 		  if (ulsec < lsec)
2758627f7eb2Smrg 		    {
2759627f7eb2Smrg 		      i4 = ii + isec - 1;
2760627f7eb2Smrg 		      for (i = ii; i<= i4; ++i)
2761627f7eb2Smrg 			{
2762627f7eb2Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2763627f7eb2Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
2764627f7eb2Smrg 			}
2765627f7eb2Smrg 		    }
2766627f7eb2Smrg 
2767627f7eb2Smrg 		  uisec = isec - isec % 4;
2768627f7eb2Smrg 		  i4 = jj + ujsec - 1;
2769627f7eb2Smrg 		  for (j = jj; j <= i4; j += 4)
2770627f7eb2Smrg 		    {
2771627f7eb2Smrg 		      i5 = ii + uisec - 1;
2772627f7eb2Smrg 		      for (i = ii; i <= i5; i += 4)
2773627f7eb2Smrg 			{
2774627f7eb2Smrg 			  f11 = c[i + j * c_dim1];
2775627f7eb2Smrg 			  f21 = c[i + 1 + j * c_dim1];
2776627f7eb2Smrg 			  f12 = c[i + (j + 1) * c_dim1];
2777627f7eb2Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2778627f7eb2Smrg 			  f13 = c[i + (j + 2) * c_dim1];
2779627f7eb2Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2780627f7eb2Smrg 			  f14 = c[i + (j + 3) * c_dim1];
2781627f7eb2Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2782627f7eb2Smrg 			  f31 = c[i + 2 + j * c_dim1];
2783627f7eb2Smrg 			  f41 = c[i + 3 + j * c_dim1];
2784627f7eb2Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2785627f7eb2Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2786627f7eb2Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2787627f7eb2Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2788627f7eb2Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2789627f7eb2Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2790627f7eb2Smrg 			  i6 = ll + lsec - 1;
2791627f7eb2Smrg 			  for (l = ll; l <= i6; ++l)
2792627f7eb2Smrg 			    {
2793627f7eb2Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2794627f7eb2Smrg 				      * b[l + j * b_dim1];
2795627f7eb2Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2796627f7eb2Smrg 				      * b[l + j * b_dim1];
2797627f7eb2Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2798627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
2799627f7eb2Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2800627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
2801627f7eb2Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2802627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
2803627f7eb2Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2804627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
2805627f7eb2Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2806627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
2807627f7eb2Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2808627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
2809627f7eb2Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2810627f7eb2Smrg 				      * b[l + j * b_dim1];
2811627f7eb2Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2812627f7eb2Smrg 				      * b[l + j * b_dim1];
2813627f7eb2Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2814627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
2815627f7eb2Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2816627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
2817627f7eb2Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2818627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
2819627f7eb2Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2820627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
2821627f7eb2Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2822627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
2823627f7eb2Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2824627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
2825627f7eb2Smrg 			    }
2826627f7eb2Smrg 			  c[i + j * c_dim1] = f11;
2827627f7eb2Smrg 			  c[i + 1 + j * c_dim1] = f21;
2828627f7eb2Smrg 			  c[i + (j + 1) * c_dim1] = f12;
2829627f7eb2Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2830627f7eb2Smrg 			  c[i + (j + 2) * c_dim1] = f13;
2831627f7eb2Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2832627f7eb2Smrg 			  c[i + (j + 3) * c_dim1] = f14;
2833627f7eb2Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2834627f7eb2Smrg 			  c[i + 2 + j * c_dim1] = f31;
2835627f7eb2Smrg 			  c[i + 3 + j * c_dim1] = f41;
2836627f7eb2Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2837627f7eb2Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2838627f7eb2Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2839627f7eb2Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2840627f7eb2Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2841627f7eb2Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2842627f7eb2Smrg 			}
2843627f7eb2Smrg 		      if (uisec < isec)
2844627f7eb2Smrg 			{
2845627f7eb2Smrg 			  i5 = ii + isec - 1;
2846627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2847627f7eb2Smrg 			    {
2848627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
2849627f7eb2Smrg 			      f12 = c[i + (j + 1) * c_dim1];
2850627f7eb2Smrg 			      f13 = c[i + (j + 2) * c_dim1];
2851627f7eb2Smrg 			      f14 = c[i + (j + 3) * c_dim1];
2852627f7eb2Smrg 			      i6 = ll + lsec - 1;
2853627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
2854627f7eb2Smrg 				{
2855627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2856627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2857627f7eb2Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2858627f7eb2Smrg 					  257] * b[l + (j + 1) * b_dim1];
2859627f7eb2Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2860627f7eb2Smrg 					  257] * b[l + (j + 2) * b_dim1];
2861627f7eb2Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2862627f7eb2Smrg 					  257] * b[l + (j + 3) * b_dim1];
2863627f7eb2Smrg 				}
2864627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
2865627f7eb2Smrg 			      c[i + (j + 1) * c_dim1] = f12;
2866627f7eb2Smrg 			      c[i + (j + 2) * c_dim1] = f13;
2867627f7eb2Smrg 			      c[i + (j + 3) * c_dim1] = f14;
2868627f7eb2Smrg 			    }
2869627f7eb2Smrg 			}
2870627f7eb2Smrg 		    }
2871627f7eb2Smrg 		  if (ujsec < jsec)
2872627f7eb2Smrg 		    {
2873627f7eb2Smrg 		      i4 = jj + jsec - 1;
2874627f7eb2Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
2875627f7eb2Smrg 			{
2876627f7eb2Smrg 			  i5 = ii + uisec - 1;
2877627f7eb2Smrg 			  for (i = ii; i <= i5; i += 4)
2878627f7eb2Smrg 			    {
2879627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
2880627f7eb2Smrg 			      f21 = c[i + 1 + j * c_dim1];
2881627f7eb2Smrg 			      f31 = c[i + 2 + j * c_dim1];
2882627f7eb2Smrg 			      f41 = c[i + 3 + j * c_dim1];
2883627f7eb2Smrg 			      i6 = ll + lsec - 1;
2884627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
2885627f7eb2Smrg 				{
2886627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2887627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2888627f7eb2Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2889627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2890627f7eb2Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2891627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2892627f7eb2Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2893627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2894627f7eb2Smrg 				}
2895627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
2896627f7eb2Smrg 			      c[i + 1 + j * c_dim1] = f21;
2897627f7eb2Smrg 			      c[i + 2 + j * c_dim1] = f31;
2898627f7eb2Smrg 			      c[i + 3 + j * c_dim1] = f41;
2899627f7eb2Smrg 			    }
2900627f7eb2Smrg 			  i5 = ii + isec - 1;
2901627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
2902627f7eb2Smrg 			    {
2903627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
2904627f7eb2Smrg 			      i6 = ll + lsec - 1;
2905627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
2906627f7eb2Smrg 				{
2907627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2908627f7eb2Smrg 					  257] * b[l + j * b_dim1];
2909627f7eb2Smrg 				}
2910627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
2911627f7eb2Smrg 			    }
2912627f7eb2Smrg 			}
2913627f7eb2Smrg 		    }
2914627f7eb2Smrg 		}
2915627f7eb2Smrg 	    }
2916627f7eb2Smrg 	}
2917627f7eb2Smrg       free(t1);
2918627f7eb2Smrg       return;
2919627f7eb2Smrg     }
2920627f7eb2Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2921627f7eb2Smrg     {
2922627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
2923627f7eb2Smrg 	{
2924627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict abase_x;
2925627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
2926627f7eb2Smrg 	  GFC_COMPLEX_8 *restrict dest_y;
2927627f7eb2Smrg 	  GFC_COMPLEX_8 s;
2928627f7eb2Smrg 
2929627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
2930627f7eb2Smrg 	    {
2931627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
2932627f7eb2Smrg 	      dest_y = &dest[y*rystride];
2933627f7eb2Smrg 	      for (x = 0; x < xcount; x++)
2934627f7eb2Smrg 		{
2935627f7eb2Smrg 		  abase_x = &abase[x*axstride];
2936627f7eb2Smrg 		  s = (GFC_COMPLEX_8) 0;
2937627f7eb2Smrg 		  for (n = 0; n < count; n++)
2938627f7eb2Smrg 		    s += abase_x[n] * bbase_y[n];
2939627f7eb2Smrg 		  dest_y[x] = s;
2940627f7eb2Smrg 		}
2941627f7eb2Smrg 	    }
2942627f7eb2Smrg 	}
2943627f7eb2Smrg       else
2944627f7eb2Smrg 	{
2945627f7eb2Smrg 	  const GFC_COMPLEX_8 *restrict bbase_y;
2946627f7eb2Smrg 	  GFC_COMPLEX_8 s;
2947627f7eb2Smrg 
2948627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
2949627f7eb2Smrg 	    {
2950627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
2951627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
2952627f7eb2Smrg 	      for (n = 0; n < count; n++)
2953627f7eb2Smrg 		s += abase[n*axstride] * bbase_y[n];
2954627f7eb2Smrg 	      dest[y*rystride] = s;
2955627f7eb2Smrg 	    }
2956627f7eb2Smrg 	}
2957627f7eb2Smrg     }
2958627f7eb2Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2959627f7eb2Smrg     {
2960627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
2961627f7eb2Smrg       GFC_COMPLEX_8 s;
2962627f7eb2Smrg 
2963627f7eb2Smrg       for (y = 0; y < ycount; y++)
2964627f7eb2Smrg 	{
2965627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
2966627f7eb2Smrg 	  s = (GFC_COMPLEX_8) 0;
2967627f7eb2Smrg 	  for (n = 0; n < count; n++)
2968627f7eb2Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2969627f7eb2Smrg 	  dest[y*rxstride] = s;
2970627f7eb2Smrg 	}
2971627f7eb2Smrg     }
2972*4c3eb207Smrg   else if (axstride < aystride)
2973*4c3eb207Smrg     {
2974*4c3eb207Smrg       for (y = 0; y < ycount; y++)
2975*4c3eb207Smrg 	for (x = 0; x < xcount; x++)
2976*4c3eb207Smrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
2977*4c3eb207Smrg 
2978*4c3eb207Smrg       for (y = 0; y < ycount; y++)
2979*4c3eb207Smrg 	for (n = 0; n < count; n++)
2980*4c3eb207Smrg 	  for (x = 0; x < xcount; x++)
2981*4c3eb207Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
2982*4c3eb207Smrg 	    dest[x*rxstride + y*rystride] +=
2983*4c3eb207Smrg 					abase[x*axstride + n*aystride] *
2984*4c3eb207Smrg 					bbase[n*bxstride + y*bystride];
2985*4c3eb207Smrg     }
2986627f7eb2Smrg   else
2987627f7eb2Smrg     {
2988627f7eb2Smrg       const GFC_COMPLEX_8 *restrict abase_x;
2989627f7eb2Smrg       const GFC_COMPLEX_8 *restrict bbase_y;
2990627f7eb2Smrg       GFC_COMPLEX_8 *restrict dest_y;
2991627f7eb2Smrg       GFC_COMPLEX_8 s;
2992627f7eb2Smrg 
2993627f7eb2Smrg       for (y = 0; y < ycount; y++)
2994627f7eb2Smrg 	{
2995627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
2996627f7eb2Smrg 	  dest_y = &dest[y*rystride];
2997627f7eb2Smrg 	  for (x = 0; x < xcount; x++)
2998627f7eb2Smrg 	    {
2999627f7eb2Smrg 	      abase_x = &abase[x*axstride];
3000627f7eb2Smrg 	      s = (GFC_COMPLEX_8) 0;
3001627f7eb2Smrg 	      for (n = 0; n < count; n++)
3002627f7eb2Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
3003627f7eb2Smrg 	      dest_y[x*rxstride] = s;
3004627f7eb2Smrg 	    }
3005627f7eb2Smrg 	}
3006627f7eb2Smrg     }
3007627f7eb2Smrg }
3008627f7eb2Smrg #undef POW3
3009627f7eb2Smrg #undef min
3010627f7eb2Smrg #undef max
3011627f7eb2Smrg 
3012627f7eb2Smrg #endif
3013627f7eb2Smrg #endif
3014627f7eb2Smrg 
3015