xref: /netbsd-src/external/gpl3/gcc.old/dist/libgfortran/generated/matmulavx128_c16.c (revision 4c3eb207d36f67d31994830c0a694161fc1ca39b)
1627f7eb2Smrg /* Implementation of the MATMUL intrinsic
2*4c3eb207Smrg    Copyright (C) 2002-2020 Free Software Foundation, Inc.
3627f7eb2Smrg    Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4627f7eb2Smrg 
5627f7eb2Smrg This file is part of the GNU Fortran runtime library (libgfortran).
6627f7eb2Smrg 
7627f7eb2Smrg Libgfortran is free software; you can redistribute it and/or
8627f7eb2Smrg modify it under the terms of the GNU General Public
9627f7eb2Smrg License as published by the Free Software Foundation; either
10627f7eb2Smrg version 3 of the License, or (at your option) any later version.
11627f7eb2Smrg 
12627f7eb2Smrg Libgfortran is distributed in the hope that it will be useful,
13627f7eb2Smrg but WITHOUT ANY WARRANTY; without even the implied warranty of
14627f7eb2Smrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15627f7eb2Smrg GNU General Public License for more details.
16627f7eb2Smrg 
17627f7eb2Smrg Under Section 7 of GPL version 3, you are granted additional
18627f7eb2Smrg permissions described in the GCC Runtime Library Exception, version
19627f7eb2Smrg 3.1, as published by the Free Software Foundation.
20627f7eb2Smrg 
21627f7eb2Smrg You should have received a copy of the GNU General Public License and
22627f7eb2Smrg a copy of the GCC Runtime Library Exception along with this program;
23627f7eb2Smrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24627f7eb2Smrg <http://www.gnu.org/licenses/>.  */
25627f7eb2Smrg 
26627f7eb2Smrg #include "libgfortran.h"
27627f7eb2Smrg #include <string.h>
28627f7eb2Smrg #include <assert.h>
29627f7eb2Smrg 
30627f7eb2Smrg 
31627f7eb2Smrg /* These are the specific versions of matmul with -mprefer-avx128.  */
32627f7eb2Smrg 
33627f7eb2Smrg #if defined (HAVE_GFC_COMPLEX_16)
34627f7eb2Smrg 
35627f7eb2Smrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36627f7eb2Smrg    passed to us by the front-end, in which case we call it for large
37627f7eb2Smrg    matrices.  */
38627f7eb2Smrg 
39627f7eb2Smrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40627f7eb2Smrg                           const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *,
41627f7eb2Smrg                           const int *, const GFC_COMPLEX_16 *, const int *,
42627f7eb2Smrg                           const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *,
43627f7eb2Smrg                           int, int);
44627f7eb2Smrg 
45627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46627f7eb2Smrg void
47627f7eb2Smrg matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
48627f7eb2Smrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
49627f7eb2Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50627f7eb2Smrg internal_proto(matmul_c16_avx128_fma3);
51627f7eb2Smrg void
matmul_c16_avx128_fma3(gfc_array_c16 * const restrict retarray,gfc_array_c16 * const restrict a,gfc_array_c16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)52627f7eb2Smrg matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
53627f7eb2Smrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
54627f7eb2Smrg 	int blas_limit, blas_call gemm)
55627f7eb2Smrg {
56627f7eb2Smrg   const GFC_COMPLEX_16 * restrict abase;
57627f7eb2Smrg   const GFC_COMPLEX_16 * restrict bbase;
58627f7eb2Smrg   GFC_COMPLEX_16 * restrict dest;
59627f7eb2Smrg 
60627f7eb2Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61627f7eb2Smrg   index_type x, y, n, count, xcount, ycount;
62627f7eb2Smrg 
63627f7eb2Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
64627f7eb2Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
65627f7eb2Smrg 
66627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67627f7eb2Smrg 
68627f7eb2Smrg    Either A or B (but not both) can be rank 1:
69627f7eb2Smrg 
70627f7eb2Smrg    o One-dimensional argument A is implicitly treated as a row matrix
71627f7eb2Smrg      dimensioned [1,count], so xcount=1.
72627f7eb2Smrg 
73627f7eb2Smrg    o One-dimensional argument B is implicitly treated as a column matrix
74627f7eb2Smrg      dimensioned [count, 1], so ycount=1.
75627f7eb2Smrg */
76627f7eb2Smrg 
77627f7eb2Smrg   if (retarray->base_addr == NULL)
78627f7eb2Smrg     {
79627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
80627f7eb2Smrg         {
81627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
82627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83627f7eb2Smrg         }
84627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
85627f7eb2Smrg         {
86627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
87627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88627f7eb2Smrg         }
89627f7eb2Smrg       else
90627f7eb2Smrg         {
91627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
92627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93627f7eb2Smrg 
94627f7eb2Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
95627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96627f7eb2Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
97627f7eb2Smrg         }
98627f7eb2Smrg 
99627f7eb2Smrg       retarray->base_addr
100627f7eb2Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16));
101627f7eb2Smrg       retarray->offset = 0;
102627f7eb2Smrg     }
103627f7eb2Smrg   else if (unlikely (compile_options.bounds_check))
104627f7eb2Smrg     {
105627f7eb2Smrg       index_type ret_extent, arg_extent;
106627f7eb2Smrg 
107627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
108627f7eb2Smrg 	{
109627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111627f7eb2Smrg 	  if (arg_extent != ret_extent)
112627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
113627f7eb2Smrg 	    		   "array (%ld/%ld) ",
114627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
115627f7eb2Smrg 	}
116627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
117627f7eb2Smrg 	{
118627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120627f7eb2Smrg 	  if (arg_extent != ret_extent)
121627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
122627f7eb2Smrg 	    		   "array (%ld/%ld) ",
123627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
124627f7eb2Smrg 	}
125627f7eb2Smrg       else
126627f7eb2Smrg 	{
127627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129627f7eb2Smrg 	  if (arg_extent != ret_extent)
130627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
131627f7eb2Smrg 	    		   "array (%ld/%ld) ",
132627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
133627f7eb2Smrg 
134627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
135627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
136627f7eb2Smrg 	  if (arg_extent != ret_extent)
137627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
138627f7eb2Smrg 	    		   "array (%ld/%ld) ",
139627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
140627f7eb2Smrg 	}
141627f7eb2Smrg     }
142627f7eb2Smrg 
143627f7eb2Smrg 
144627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
145627f7eb2Smrg     {
146627f7eb2Smrg       /* One-dimensional result may be addressed in the code below
147627f7eb2Smrg 	 either as a row or a column matrix. We want both cases to
148627f7eb2Smrg 	 work. */
149627f7eb2Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
150627f7eb2Smrg     }
151627f7eb2Smrg   else
152627f7eb2Smrg     {
153627f7eb2Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
154627f7eb2Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
155627f7eb2Smrg     }
156627f7eb2Smrg 
157627f7eb2Smrg 
158627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
159627f7eb2Smrg     {
160627f7eb2Smrg       /* Treat it as a a row matrix A[1,count]. */
161627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
162627f7eb2Smrg       aystride = 1;
163627f7eb2Smrg 
164627f7eb2Smrg       xcount = 1;
165627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
166627f7eb2Smrg     }
167627f7eb2Smrg   else
168627f7eb2Smrg     {
169627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
170627f7eb2Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
171627f7eb2Smrg 
172627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
173627f7eb2Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
174627f7eb2Smrg     }
175627f7eb2Smrg 
176627f7eb2Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
177627f7eb2Smrg     {
178627f7eb2Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
179627f7eb2Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
180627f7eb2Smrg 		       "in dimension 1: is %ld, should be %ld",
181627f7eb2Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
182627f7eb2Smrg     }
183627f7eb2Smrg 
184627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
185627f7eb2Smrg     {
186627f7eb2Smrg       /* Treat it as a column matrix B[count,1] */
187627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188627f7eb2Smrg 
189627f7eb2Smrg       /* bystride should never be used for 1-dimensional b.
190627f7eb2Smrg          The value is only used for calculation of the
191627f7eb2Smrg          memory by the buffer.  */
192627f7eb2Smrg       bystride = 256;
193627f7eb2Smrg       ycount = 1;
194627f7eb2Smrg     }
195627f7eb2Smrg   else
196627f7eb2Smrg     {
197627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198627f7eb2Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199627f7eb2Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200627f7eb2Smrg     }
201627f7eb2Smrg 
202627f7eb2Smrg   abase = a->base_addr;
203627f7eb2Smrg   bbase = b->base_addr;
204627f7eb2Smrg   dest = retarray->base_addr;
205627f7eb2Smrg 
206627f7eb2Smrg   /* Now that everything is set up, we perform the multiplication
207627f7eb2Smrg      itself.  */
208627f7eb2Smrg 
209627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
211627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
212627f7eb2Smrg 
213627f7eb2Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214627f7eb2Smrg       && (bxstride == 1 || bystride == 1)
215627f7eb2Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
216627f7eb2Smrg           > POW3(blas_limit)))
217627f7eb2Smrg     {
218627f7eb2Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
219627f7eb2Smrg       const GFC_COMPLEX_16 one = 1, zero = 0;
220627f7eb2Smrg       const int lda = (axstride == 1) ? aystride : axstride,
221627f7eb2Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
222627f7eb2Smrg 
223627f7eb2Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224627f7eb2Smrg 	{
225627f7eb2Smrg 	  assert (gemm != NULL);
226627f7eb2Smrg 	  const char *transa, *transb;
227627f7eb2Smrg 	  if (try_blas & 2)
228627f7eb2Smrg 	    transa = "C";
229627f7eb2Smrg 	  else
230627f7eb2Smrg 	    transa = axstride == 1 ? "N" : "T";
231627f7eb2Smrg 
232627f7eb2Smrg 	  if (try_blas & 4)
233627f7eb2Smrg 	    transb = "C";
234627f7eb2Smrg 	  else
235627f7eb2Smrg 	    transb = bxstride == 1 ? "N" : "T";
236627f7eb2Smrg 
237627f7eb2Smrg 	  gemm (transa, transb , &m,
238627f7eb2Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
239627f7eb2Smrg 		&ldc, 1, 1);
240627f7eb2Smrg 	  return;
241627f7eb2Smrg 	}
242627f7eb2Smrg     }
243627f7eb2Smrg 
244*4c3eb207Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
245*4c3eb207Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
246627f7eb2Smrg     {
247627f7eb2Smrg       /* This block of code implements a tuned matmul, derived from
248627f7eb2Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
249627f7eb2Smrg 
250627f7eb2Smrg                Bo Kagstrom and Per Ling
251627f7eb2Smrg                Department of Computing Science
252627f7eb2Smrg                Umea University
253627f7eb2Smrg                S-901 87 Umea, Sweden
254627f7eb2Smrg 
255627f7eb2Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
256627f7eb2Smrg 
257627f7eb2Smrg       const GFC_COMPLEX_16 *a, *b;
258627f7eb2Smrg       GFC_COMPLEX_16 *c;
259627f7eb2Smrg       const index_type m = xcount, n = ycount, k = count;
260627f7eb2Smrg 
261627f7eb2Smrg       /* System generated locals */
262627f7eb2Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
263627f7eb2Smrg 		 i1, i2, i3, i4, i5, i6;
264627f7eb2Smrg 
265627f7eb2Smrg       /* Local variables */
266627f7eb2Smrg       GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
267627f7eb2Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
268627f7eb2Smrg       index_type i, j, l, ii, jj, ll;
269627f7eb2Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
270627f7eb2Smrg       GFC_COMPLEX_16 *t1;
271627f7eb2Smrg 
272627f7eb2Smrg       a = abase;
273627f7eb2Smrg       b = bbase;
274627f7eb2Smrg       c = retarray->base_addr;
275627f7eb2Smrg 
276627f7eb2Smrg       /* Parameter adjustments */
277627f7eb2Smrg       c_dim1 = rystride;
278627f7eb2Smrg       c_offset = 1 + c_dim1;
279627f7eb2Smrg       c -= c_offset;
280627f7eb2Smrg       a_dim1 = aystride;
281627f7eb2Smrg       a_offset = 1 + a_dim1;
282627f7eb2Smrg       a -= a_offset;
283627f7eb2Smrg       b_dim1 = bystride;
284627f7eb2Smrg       b_offset = 1 + b_dim1;
285627f7eb2Smrg       b -= b_offset;
286627f7eb2Smrg 
287627f7eb2Smrg       /* Empty c first.  */
288627f7eb2Smrg       for (j=1; j<=n; j++)
289627f7eb2Smrg 	for (i=1; i<=m; i++)
290627f7eb2Smrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
291627f7eb2Smrg 
292627f7eb2Smrg       /* Early exit if possible */
293627f7eb2Smrg       if (m == 0 || n == 0 || k == 0)
294627f7eb2Smrg 	return;
295627f7eb2Smrg 
296627f7eb2Smrg       /* Adjust size of t1 to what is needed.  */
297627f7eb2Smrg       index_type t1_dim, a_sz;
298627f7eb2Smrg       if (aystride == 1)
299627f7eb2Smrg         a_sz = rystride;
300627f7eb2Smrg       else
301627f7eb2Smrg         a_sz = a_dim1;
302627f7eb2Smrg 
303627f7eb2Smrg       t1_dim = a_sz * 256 + b_dim1;
304627f7eb2Smrg       if (t1_dim > 65536)
305627f7eb2Smrg 	t1_dim = 65536;
306627f7eb2Smrg 
307627f7eb2Smrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
308627f7eb2Smrg 
309627f7eb2Smrg       /* Start turning the crank. */
310627f7eb2Smrg       i1 = n;
311627f7eb2Smrg       for (jj = 1; jj <= i1; jj += 512)
312627f7eb2Smrg 	{
313627f7eb2Smrg 	  /* Computing MIN */
314627f7eb2Smrg 	  i2 = 512;
315627f7eb2Smrg 	  i3 = n - jj + 1;
316627f7eb2Smrg 	  jsec = min(i2,i3);
317627f7eb2Smrg 	  ujsec = jsec - jsec % 4;
318627f7eb2Smrg 	  i2 = k;
319627f7eb2Smrg 	  for (ll = 1; ll <= i2; ll += 256)
320627f7eb2Smrg 	    {
321627f7eb2Smrg 	      /* Computing MIN */
322627f7eb2Smrg 	      i3 = 256;
323627f7eb2Smrg 	      i4 = k - ll + 1;
324627f7eb2Smrg 	      lsec = min(i3,i4);
325627f7eb2Smrg 	      ulsec = lsec - lsec % 2;
326627f7eb2Smrg 
327627f7eb2Smrg 	      i3 = m;
328627f7eb2Smrg 	      for (ii = 1; ii <= i3; ii += 256)
329627f7eb2Smrg 		{
330627f7eb2Smrg 		  /* Computing MIN */
331627f7eb2Smrg 		  i4 = 256;
332627f7eb2Smrg 		  i5 = m - ii + 1;
333627f7eb2Smrg 		  isec = min(i4,i5);
334627f7eb2Smrg 		  uisec = isec - isec % 2;
335627f7eb2Smrg 		  i4 = ll + ulsec - 1;
336627f7eb2Smrg 		  for (l = ll; l <= i4; l += 2)
337627f7eb2Smrg 		    {
338627f7eb2Smrg 		      i5 = ii + uisec - 1;
339627f7eb2Smrg 		      for (i = ii; i <= i5; i += 2)
340627f7eb2Smrg 			{
341627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
342627f7eb2Smrg 					a[i + l * a_dim1];
343627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
344627f7eb2Smrg 					a[i + (l + 1) * a_dim1];
345627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
346627f7eb2Smrg 					a[i + 1 + l * a_dim1];
347627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
348627f7eb2Smrg 					a[i + 1 + (l + 1) * a_dim1];
349627f7eb2Smrg 			}
350627f7eb2Smrg 		      if (uisec < isec)
351627f7eb2Smrg 			{
352627f7eb2Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
353627f7eb2Smrg 				    a[ii + isec - 1 + l * a_dim1];
354627f7eb2Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
355627f7eb2Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
356627f7eb2Smrg 			}
357627f7eb2Smrg 		    }
358627f7eb2Smrg 		  if (ulsec < lsec)
359627f7eb2Smrg 		    {
360627f7eb2Smrg 		      i4 = ii + isec - 1;
361627f7eb2Smrg 		      for (i = ii; i<= i4; ++i)
362627f7eb2Smrg 			{
363627f7eb2Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
364627f7eb2Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
365627f7eb2Smrg 			}
366627f7eb2Smrg 		    }
367627f7eb2Smrg 
368627f7eb2Smrg 		  uisec = isec - isec % 4;
369627f7eb2Smrg 		  i4 = jj + ujsec - 1;
370627f7eb2Smrg 		  for (j = jj; j <= i4; j += 4)
371627f7eb2Smrg 		    {
372627f7eb2Smrg 		      i5 = ii + uisec - 1;
373627f7eb2Smrg 		      for (i = ii; i <= i5; i += 4)
374627f7eb2Smrg 			{
375627f7eb2Smrg 			  f11 = c[i + j * c_dim1];
376627f7eb2Smrg 			  f21 = c[i + 1 + j * c_dim1];
377627f7eb2Smrg 			  f12 = c[i + (j + 1) * c_dim1];
378627f7eb2Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
379627f7eb2Smrg 			  f13 = c[i + (j + 2) * c_dim1];
380627f7eb2Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
381627f7eb2Smrg 			  f14 = c[i + (j + 3) * c_dim1];
382627f7eb2Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
383627f7eb2Smrg 			  f31 = c[i + 2 + j * c_dim1];
384627f7eb2Smrg 			  f41 = c[i + 3 + j * c_dim1];
385627f7eb2Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
386627f7eb2Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
387627f7eb2Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
388627f7eb2Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
389627f7eb2Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
390627f7eb2Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
391627f7eb2Smrg 			  i6 = ll + lsec - 1;
392627f7eb2Smrg 			  for (l = ll; l <= i6; ++l)
393627f7eb2Smrg 			    {
394627f7eb2Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
395627f7eb2Smrg 				      * b[l + j * b_dim1];
396627f7eb2Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
397627f7eb2Smrg 				      * b[l + j * b_dim1];
398627f7eb2Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
399627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
400627f7eb2Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
401627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
402627f7eb2Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
403627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
404627f7eb2Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
405627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
406627f7eb2Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
407627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
408627f7eb2Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
409627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
410627f7eb2Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
411627f7eb2Smrg 				      * b[l + j * b_dim1];
412627f7eb2Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
413627f7eb2Smrg 				      * b[l + j * b_dim1];
414627f7eb2Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
415627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
416627f7eb2Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
417627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
418627f7eb2Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
419627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
420627f7eb2Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
421627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
422627f7eb2Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
423627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
424627f7eb2Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
425627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
426627f7eb2Smrg 			    }
427627f7eb2Smrg 			  c[i + j * c_dim1] = f11;
428627f7eb2Smrg 			  c[i + 1 + j * c_dim1] = f21;
429627f7eb2Smrg 			  c[i + (j + 1) * c_dim1] = f12;
430627f7eb2Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
431627f7eb2Smrg 			  c[i + (j + 2) * c_dim1] = f13;
432627f7eb2Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
433627f7eb2Smrg 			  c[i + (j + 3) * c_dim1] = f14;
434627f7eb2Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
435627f7eb2Smrg 			  c[i + 2 + j * c_dim1] = f31;
436627f7eb2Smrg 			  c[i + 3 + j * c_dim1] = f41;
437627f7eb2Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
438627f7eb2Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
439627f7eb2Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
440627f7eb2Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
441627f7eb2Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
442627f7eb2Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
443627f7eb2Smrg 			}
444627f7eb2Smrg 		      if (uisec < isec)
445627f7eb2Smrg 			{
446627f7eb2Smrg 			  i5 = ii + isec - 1;
447627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
448627f7eb2Smrg 			    {
449627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
450627f7eb2Smrg 			      f12 = c[i + (j + 1) * c_dim1];
451627f7eb2Smrg 			      f13 = c[i + (j + 2) * c_dim1];
452627f7eb2Smrg 			      f14 = c[i + (j + 3) * c_dim1];
453627f7eb2Smrg 			      i6 = ll + lsec - 1;
454627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
455627f7eb2Smrg 				{
456627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
457627f7eb2Smrg 					  257] * b[l + j * b_dim1];
458627f7eb2Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
459627f7eb2Smrg 					  257] * b[l + (j + 1) * b_dim1];
460627f7eb2Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
461627f7eb2Smrg 					  257] * b[l + (j + 2) * b_dim1];
462627f7eb2Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
463627f7eb2Smrg 					  257] * b[l + (j + 3) * b_dim1];
464627f7eb2Smrg 				}
465627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
466627f7eb2Smrg 			      c[i + (j + 1) * c_dim1] = f12;
467627f7eb2Smrg 			      c[i + (j + 2) * c_dim1] = f13;
468627f7eb2Smrg 			      c[i + (j + 3) * c_dim1] = f14;
469627f7eb2Smrg 			    }
470627f7eb2Smrg 			}
471627f7eb2Smrg 		    }
472627f7eb2Smrg 		  if (ujsec < jsec)
473627f7eb2Smrg 		    {
474627f7eb2Smrg 		      i4 = jj + jsec - 1;
475627f7eb2Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
476627f7eb2Smrg 			{
477627f7eb2Smrg 			  i5 = ii + uisec - 1;
478627f7eb2Smrg 			  for (i = ii; i <= i5; i += 4)
479627f7eb2Smrg 			    {
480627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
481627f7eb2Smrg 			      f21 = c[i + 1 + j * c_dim1];
482627f7eb2Smrg 			      f31 = c[i + 2 + j * c_dim1];
483627f7eb2Smrg 			      f41 = c[i + 3 + j * c_dim1];
484627f7eb2Smrg 			      i6 = ll + lsec - 1;
485627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
486627f7eb2Smrg 				{
487627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
488627f7eb2Smrg 					  257] * b[l + j * b_dim1];
489627f7eb2Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
490627f7eb2Smrg 					  257] * b[l + j * b_dim1];
491627f7eb2Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
492627f7eb2Smrg 					  257] * b[l + j * b_dim1];
493627f7eb2Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
494627f7eb2Smrg 					  257] * b[l + j * b_dim1];
495627f7eb2Smrg 				}
496627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
497627f7eb2Smrg 			      c[i + 1 + j * c_dim1] = f21;
498627f7eb2Smrg 			      c[i + 2 + j * c_dim1] = f31;
499627f7eb2Smrg 			      c[i + 3 + j * c_dim1] = f41;
500627f7eb2Smrg 			    }
501627f7eb2Smrg 			  i5 = ii + isec - 1;
502627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
503627f7eb2Smrg 			    {
504627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
505627f7eb2Smrg 			      i6 = ll + lsec - 1;
506627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
507627f7eb2Smrg 				{
508627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
509627f7eb2Smrg 					  257] * b[l + j * b_dim1];
510627f7eb2Smrg 				}
511627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
512627f7eb2Smrg 			    }
513627f7eb2Smrg 			}
514627f7eb2Smrg 		    }
515627f7eb2Smrg 		}
516627f7eb2Smrg 	    }
517627f7eb2Smrg 	}
518627f7eb2Smrg       free(t1);
519627f7eb2Smrg       return;
520627f7eb2Smrg     }
521627f7eb2Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
522627f7eb2Smrg     {
523627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
524627f7eb2Smrg 	{
525627f7eb2Smrg 	  const GFC_COMPLEX_16 *restrict abase_x;
526627f7eb2Smrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
527627f7eb2Smrg 	  GFC_COMPLEX_16 *restrict dest_y;
528627f7eb2Smrg 	  GFC_COMPLEX_16 s;
529627f7eb2Smrg 
530627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
531627f7eb2Smrg 	    {
532627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
533627f7eb2Smrg 	      dest_y = &dest[y*rystride];
534627f7eb2Smrg 	      for (x = 0; x < xcount; x++)
535627f7eb2Smrg 		{
536627f7eb2Smrg 		  abase_x = &abase[x*axstride];
537627f7eb2Smrg 		  s = (GFC_COMPLEX_16) 0;
538627f7eb2Smrg 		  for (n = 0; n < count; n++)
539627f7eb2Smrg 		    s += abase_x[n] * bbase_y[n];
540627f7eb2Smrg 		  dest_y[x] = s;
541627f7eb2Smrg 		}
542627f7eb2Smrg 	    }
543627f7eb2Smrg 	}
544627f7eb2Smrg       else
545627f7eb2Smrg 	{
546627f7eb2Smrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
547627f7eb2Smrg 	  GFC_COMPLEX_16 s;
548627f7eb2Smrg 
549627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
550627f7eb2Smrg 	    {
551627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
552627f7eb2Smrg 	      s = (GFC_COMPLEX_16) 0;
553627f7eb2Smrg 	      for (n = 0; n < count; n++)
554627f7eb2Smrg 		s += abase[n*axstride] * bbase_y[n];
555627f7eb2Smrg 	      dest[y*rystride] = s;
556627f7eb2Smrg 	    }
557627f7eb2Smrg 	}
558627f7eb2Smrg     }
559627f7eb2Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
560627f7eb2Smrg     {
561627f7eb2Smrg       const GFC_COMPLEX_16 *restrict bbase_y;
562627f7eb2Smrg       GFC_COMPLEX_16 s;
563627f7eb2Smrg 
564627f7eb2Smrg       for (y = 0; y < ycount; y++)
565627f7eb2Smrg 	{
566627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
567627f7eb2Smrg 	  s = (GFC_COMPLEX_16) 0;
568627f7eb2Smrg 	  for (n = 0; n < count; n++)
569627f7eb2Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
570627f7eb2Smrg 	  dest[y*rxstride] = s;
571627f7eb2Smrg 	}
572627f7eb2Smrg     }
573*4c3eb207Smrg   else if (axstride < aystride)
574*4c3eb207Smrg     {
575*4c3eb207Smrg       for (y = 0; y < ycount; y++)
576*4c3eb207Smrg 	for (x = 0; x < xcount; x++)
577*4c3eb207Smrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0;
578*4c3eb207Smrg 
579*4c3eb207Smrg       for (y = 0; y < ycount; y++)
580*4c3eb207Smrg 	for (n = 0; n < count; n++)
581*4c3eb207Smrg 	  for (x = 0; x < xcount; x++)
582*4c3eb207Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
583*4c3eb207Smrg 	    dest[x*rxstride + y*rystride] +=
584*4c3eb207Smrg 					abase[x*axstride + n*aystride] *
585*4c3eb207Smrg 					bbase[n*bxstride + y*bystride];
586*4c3eb207Smrg     }
587627f7eb2Smrg   else
588627f7eb2Smrg     {
589627f7eb2Smrg       const GFC_COMPLEX_16 *restrict abase_x;
590627f7eb2Smrg       const GFC_COMPLEX_16 *restrict bbase_y;
591627f7eb2Smrg       GFC_COMPLEX_16 *restrict dest_y;
592627f7eb2Smrg       GFC_COMPLEX_16 s;
593627f7eb2Smrg 
594627f7eb2Smrg       for (y = 0; y < ycount; y++)
595627f7eb2Smrg 	{
596627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
597627f7eb2Smrg 	  dest_y = &dest[y*rystride];
598627f7eb2Smrg 	  for (x = 0; x < xcount; x++)
599627f7eb2Smrg 	    {
600627f7eb2Smrg 	      abase_x = &abase[x*axstride];
601627f7eb2Smrg 	      s = (GFC_COMPLEX_16) 0;
602627f7eb2Smrg 	      for (n = 0; n < count; n++)
603627f7eb2Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
604627f7eb2Smrg 	      dest_y[x*rxstride] = s;
605627f7eb2Smrg 	    }
606627f7eb2Smrg 	}
607627f7eb2Smrg     }
608627f7eb2Smrg }
609627f7eb2Smrg #undef POW3
610627f7eb2Smrg #undef min
611627f7eb2Smrg #undef max
612627f7eb2Smrg 
613627f7eb2Smrg #endif
614627f7eb2Smrg 
615627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
616627f7eb2Smrg void
617627f7eb2Smrg matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
618627f7eb2Smrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
619627f7eb2Smrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
620627f7eb2Smrg internal_proto(matmul_c16_avx128_fma4);
621627f7eb2Smrg void
matmul_c16_avx128_fma4(gfc_array_c16 * const restrict retarray,gfc_array_c16 * const restrict a,gfc_array_c16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)622627f7eb2Smrg matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
623627f7eb2Smrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
624627f7eb2Smrg 	int blas_limit, blas_call gemm)
625627f7eb2Smrg {
626627f7eb2Smrg   const GFC_COMPLEX_16 * restrict abase;
627627f7eb2Smrg   const GFC_COMPLEX_16 * restrict bbase;
628627f7eb2Smrg   GFC_COMPLEX_16 * restrict dest;
629627f7eb2Smrg 
630627f7eb2Smrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
631627f7eb2Smrg   index_type x, y, n, count, xcount, ycount;
632627f7eb2Smrg 
633627f7eb2Smrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
634627f7eb2Smrg           || GFC_DESCRIPTOR_RANK (b) == 2);
635627f7eb2Smrg 
636627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
637627f7eb2Smrg 
638627f7eb2Smrg    Either A or B (but not both) can be rank 1:
639627f7eb2Smrg 
640627f7eb2Smrg    o One-dimensional argument A is implicitly treated as a row matrix
641627f7eb2Smrg      dimensioned [1,count], so xcount=1.
642627f7eb2Smrg 
643627f7eb2Smrg    o One-dimensional argument B is implicitly treated as a column matrix
644627f7eb2Smrg      dimensioned [count, 1], so ycount=1.
645627f7eb2Smrg */
646627f7eb2Smrg 
647627f7eb2Smrg   if (retarray->base_addr == NULL)
648627f7eb2Smrg     {
649627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
650627f7eb2Smrg         {
651627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
652627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
653627f7eb2Smrg         }
654627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
655627f7eb2Smrg         {
656627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
657627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
658627f7eb2Smrg         }
659627f7eb2Smrg       else
660627f7eb2Smrg         {
661627f7eb2Smrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
662627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
663627f7eb2Smrg 
664627f7eb2Smrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
665627f7eb2Smrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
666627f7eb2Smrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
667627f7eb2Smrg         }
668627f7eb2Smrg 
669627f7eb2Smrg       retarray->base_addr
670627f7eb2Smrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16));
671627f7eb2Smrg       retarray->offset = 0;
672627f7eb2Smrg     }
673627f7eb2Smrg   else if (unlikely (compile_options.bounds_check))
674627f7eb2Smrg     {
675627f7eb2Smrg       index_type ret_extent, arg_extent;
676627f7eb2Smrg 
677627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
678627f7eb2Smrg 	{
679627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
680627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
681627f7eb2Smrg 	  if (arg_extent != ret_extent)
682627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
683627f7eb2Smrg 	    		   "array (%ld/%ld) ",
684627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
685627f7eb2Smrg 	}
686627f7eb2Smrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
687627f7eb2Smrg 	{
688627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
689627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
690627f7eb2Smrg 	  if (arg_extent != ret_extent)
691627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
692627f7eb2Smrg 	    		   "array (%ld/%ld) ",
693627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
694627f7eb2Smrg 	}
695627f7eb2Smrg       else
696627f7eb2Smrg 	{
697627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
698627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
699627f7eb2Smrg 	  if (arg_extent != ret_extent)
700627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
701627f7eb2Smrg 	    		   "array (%ld/%ld) ",
702627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
703627f7eb2Smrg 
704627f7eb2Smrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
705627f7eb2Smrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
706627f7eb2Smrg 	  if (arg_extent != ret_extent)
707627f7eb2Smrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
708627f7eb2Smrg 	    		   "array (%ld/%ld) ",
709627f7eb2Smrg 			   (long int) ret_extent, (long int) arg_extent);
710627f7eb2Smrg 	}
711627f7eb2Smrg     }
712627f7eb2Smrg 
713627f7eb2Smrg 
714627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
715627f7eb2Smrg     {
716627f7eb2Smrg       /* One-dimensional result may be addressed in the code below
717627f7eb2Smrg 	 either as a row or a column matrix. We want both cases to
718627f7eb2Smrg 	 work. */
719627f7eb2Smrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
720627f7eb2Smrg     }
721627f7eb2Smrg   else
722627f7eb2Smrg     {
723627f7eb2Smrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
724627f7eb2Smrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
725627f7eb2Smrg     }
726627f7eb2Smrg 
727627f7eb2Smrg 
728627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
729627f7eb2Smrg     {
730627f7eb2Smrg       /* Treat it as a a row matrix A[1,count]. */
731627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
732627f7eb2Smrg       aystride = 1;
733627f7eb2Smrg 
734627f7eb2Smrg       xcount = 1;
735627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
736627f7eb2Smrg     }
737627f7eb2Smrg   else
738627f7eb2Smrg     {
739627f7eb2Smrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
740627f7eb2Smrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
741627f7eb2Smrg 
742627f7eb2Smrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
743627f7eb2Smrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
744627f7eb2Smrg     }
745627f7eb2Smrg 
746627f7eb2Smrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
747627f7eb2Smrg     {
748627f7eb2Smrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
749627f7eb2Smrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
750627f7eb2Smrg 		       "in dimension 1: is %ld, should be %ld",
751627f7eb2Smrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
752627f7eb2Smrg     }
753627f7eb2Smrg 
754627f7eb2Smrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
755627f7eb2Smrg     {
756627f7eb2Smrg       /* Treat it as a column matrix B[count,1] */
757627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
758627f7eb2Smrg 
759627f7eb2Smrg       /* bystride should never be used for 1-dimensional b.
760627f7eb2Smrg          The value is only used for calculation of the
761627f7eb2Smrg          memory by the buffer.  */
762627f7eb2Smrg       bystride = 256;
763627f7eb2Smrg       ycount = 1;
764627f7eb2Smrg     }
765627f7eb2Smrg   else
766627f7eb2Smrg     {
767627f7eb2Smrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
768627f7eb2Smrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
769627f7eb2Smrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
770627f7eb2Smrg     }
771627f7eb2Smrg 
772627f7eb2Smrg   abase = a->base_addr;
773627f7eb2Smrg   bbase = b->base_addr;
774627f7eb2Smrg   dest = retarray->base_addr;
775627f7eb2Smrg 
776627f7eb2Smrg   /* Now that everything is set up, we perform the multiplication
777627f7eb2Smrg      itself.  */
778627f7eb2Smrg 
779627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
780627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
781627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
782627f7eb2Smrg 
783627f7eb2Smrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
784627f7eb2Smrg       && (bxstride == 1 || bystride == 1)
785627f7eb2Smrg       && (((float) xcount) * ((float) ycount) * ((float) count)
786627f7eb2Smrg           > POW3(blas_limit)))
787627f7eb2Smrg     {
788627f7eb2Smrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
789627f7eb2Smrg       const GFC_COMPLEX_16 one = 1, zero = 0;
790627f7eb2Smrg       const int lda = (axstride == 1) ? aystride : axstride,
791627f7eb2Smrg 		ldb = (bxstride == 1) ? bystride : bxstride;
792627f7eb2Smrg 
793627f7eb2Smrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
794627f7eb2Smrg 	{
795627f7eb2Smrg 	  assert (gemm != NULL);
796627f7eb2Smrg 	  const char *transa, *transb;
797627f7eb2Smrg 	  if (try_blas & 2)
798627f7eb2Smrg 	    transa = "C";
799627f7eb2Smrg 	  else
800627f7eb2Smrg 	    transa = axstride == 1 ? "N" : "T";
801627f7eb2Smrg 
802627f7eb2Smrg 	  if (try_blas & 4)
803627f7eb2Smrg 	    transb = "C";
804627f7eb2Smrg 	  else
805627f7eb2Smrg 	    transb = bxstride == 1 ? "N" : "T";
806627f7eb2Smrg 
807627f7eb2Smrg 	  gemm (transa, transb , &m,
808627f7eb2Smrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
809627f7eb2Smrg 		&ldc, 1, 1);
810627f7eb2Smrg 	  return;
811627f7eb2Smrg 	}
812627f7eb2Smrg     }
813627f7eb2Smrg 
814*4c3eb207Smrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
815*4c3eb207Smrg       && GFC_DESCRIPTOR_RANK (b) != 1)
816627f7eb2Smrg     {
817627f7eb2Smrg       /* This block of code implements a tuned matmul, derived from
818627f7eb2Smrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
819627f7eb2Smrg 
820627f7eb2Smrg                Bo Kagstrom and Per Ling
821627f7eb2Smrg                Department of Computing Science
822627f7eb2Smrg                Umea University
823627f7eb2Smrg                S-901 87 Umea, Sweden
824627f7eb2Smrg 
825627f7eb2Smrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
826627f7eb2Smrg 
827627f7eb2Smrg       const GFC_COMPLEX_16 *a, *b;
828627f7eb2Smrg       GFC_COMPLEX_16 *c;
829627f7eb2Smrg       const index_type m = xcount, n = ycount, k = count;
830627f7eb2Smrg 
831627f7eb2Smrg       /* System generated locals */
832627f7eb2Smrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
833627f7eb2Smrg 		 i1, i2, i3, i4, i5, i6;
834627f7eb2Smrg 
835627f7eb2Smrg       /* Local variables */
836627f7eb2Smrg       GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
837627f7eb2Smrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
838627f7eb2Smrg       index_type i, j, l, ii, jj, ll;
839627f7eb2Smrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
840627f7eb2Smrg       GFC_COMPLEX_16 *t1;
841627f7eb2Smrg 
842627f7eb2Smrg       a = abase;
843627f7eb2Smrg       b = bbase;
844627f7eb2Smrg       c = retarray->base_addr;
845627f7eb2Smrg 
846627f7eb2Smrg       /* Parameter adjustments */
847627f7eb2Smrg       c_dim1 = rystride;
848627f7eb2Smrg       c_offset = 1 + c_dim1;
849627f7eb2Smrg       c -= c_offset;
850627f7eb2Smrg       a_dim1 = aystride;
851627f7eb2Smrg       a_offset = 1 + a_dim1;
852627f7eb2Smrg       a -= a_offset;
853627f7eb2Smrg       b_dim1 = bystride;
854627f7eb2Smrg       b_offset = 1 + b_dim1;
855627f7eb2Smrg       b -= b_offset;
856627f7eb2Smrg 
857627f7eb2Smrg       /* Empty c first.  */
858627f7eb2Smrg       for (j=1; j<=n; j++)
859627f7eb2Smrg 	for (i=1; i<=m; i++)
860627f7eb2Smrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
861627f7eb2Smrg 
862627f7eb2Smrg       /* Early exit if possible */
863627f7eb2Smrg       if (m == 0 || n == 0 || k == 0)
864627f7eb2Smrg 	return;
865627f7eb2Smrg 
866627f7eb2Smrg       /* Adjust size of t1 to what is needed.  */
867627f7eb2Smrg       index_type t1_dim, a_sz;
868627f7eb2Smrg       if (aystride == 1)
869627f7eb2Smrg         a_sz = rystride;
870627f7eb2Smrg       else
871627f7eb2Smrg         a_sz = a_dim1;
872627f7eb2Smrg 
873627f7eb2Smrg       t1_dim = a_sz * 256 + b_dim1;
874627f7eb2Smrg       if (t1_dim > 65536)
875627f7eb2Smrg 	t1_dim = 65536;
876627f7eb2Smrg 
877627f7eb2Smrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
878627f7eb2Smrg 
879627f7eb2Smrg       /* Start turning the crank. */
880627f7eb2Smrg       i1 = n;
881627f7eb2Smrg       for (jj = 1; jj <= i1; jj += 512)
882627f7eb2Smrg 	{
883627f7eb2Smrg 	  /* Computing MIN */
884627f7eb2Smrg 	  i2 = 512;
885627f7eb2Smrg 	  i3 = n - jj + 1;
886627f7eb2Smrg 	  jsec = min(i2,i3);
887627f7eb2Smrg 	  ujsec = jsec - jsec % 4;
888627f7eb2Smrg 	  i2 = k;
889627f7eb2Smrg 	  for (ll = 1; ll <= i2; ll += 256)
890627f7eb2Smrg 	    {
891627f7eb2Smrg 	      /* Computing MIN */
892627f7eb2Smrg 	      i3 = 256;
893627f7eb2Smrg 	      i4 = k - ll + 1;
894627f7eb2Smrg 	      lsec = min(i3,i4);
895627f7eb2Smrg 	      ulsec = lsec - lsec % 2;
896627f7eb2Smrg 
897627f7eb2Smrg 	      i3 = m;
898627f7eb2Smrg 	      for (ii = 1; ii <= i3; ii += 256)
899627f7eb2Smrg 		{
900627f7eb2Smrg 		  /* Computing MIN */
901627f7eb2Smrg 		  i4 = 256;
902627f7eb2Smrg 		  i5 = m - ii + 1;
903627f7eb2Smrg 		  isec = min(i4,i5);
904627f7eb2Smrg 		  uisec = isec - isec % 2;
905627f7eb2Smrg 		  i4 = ll + ulsec - 1;
906627f7eb2Smrg 		  for (l = ll; l <= i4; l += 2)
907627f7eb2Smrg 		    {
908627f7eb2Smrg 		      i5 = ii + uisec - 1;
909627f7eb2Smrg 		      for (i = ii; i <= i5; i += 2)
910627f7eb2Smrg 			{
911627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
912627f7eb2Smrg 					a[i + l * a_dim1];
913627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
914627f7eb2Smrg 					a[i + (l + 1) * a_dim1];
915627f7eb2Smrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
916627f7eb2Smrg 					a[i + 1 + l * a_dim1];
917627f7eb2Smrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
918627f7eb2Smrg 					a[i + 1 + (l + 1) * a_dim1];
919627f7eb2Smrg 			}
920627f7eb2Smrg 		      if (uisec < isec)
921627f7eb2Smrg 			{
922627f7eb2Smrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
923627f7eb2Smrg 				    a[ii + isec - 1 + l * a_dim1];
924627f7eb2Smrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
925627f7eb2Smrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
926627f7eb2Smrg 			}
927627f7eb2Smrg 		    }
928627f7eb2Smrg 		  if (ulsec < lsec)
929627f7eb2Smrg 		    {
930627f7eb2Smrg 		      i4 = ii + isec - 1;
931627f7eb2Smrg 		      for (i = ii; i<= i4; ++i)
932627f7eb2Smrg 			{
933627f7eb2Smrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
934627f7eb2Smrg 				    a[i + (ll + lsec - 1) * a_dim1];
935627f7eb2Smrg 			}
936627f7eb2Smrg 		    }
937627f7eb2Smrg 
938627f7eb2Smrg 		  uisec = isec - isec % 4;
939627f7eb2Smrg 		  i4 = jj + ujsec - 1;
940627f7eb2Smrg 		  for (j = jj; j <= i4; j += 4)
941627f7eb2Smrg 		    {
942627f7eb2Smrg 		      i5 = ii + uisec - 1;
943627f7eb2Smrg 		      for (i = ii; i <= i5; i += 4)
944627f7eb2Smrg 			{
945627f7eb2Smrg 			  f11 = c[i + j * c_dim1];
946627f7eb2Smrg 			  f21 = c[i + 1 + j * c_dim1];
947627f7eb2Smrg 			  f12 = c[i + (j + 1) * c_dim1];
948627f7eb2Smrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
949627f7eb2Smrg 			  f13 = c[i + (j + 2) * c_dim1];
950627f7eb2Smrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
951627f7eb2Smrg 			  f14 = c[i + (j + 3) * c_dim1];
952627f7eb2Smrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
953627f7eb2Smrg 			  f31 = c[i + 2 + j * c_dim1];
954627f7eb2Smrg 			  f41 = c[i + 3 + j * c_dim1];
955627f7eb2Smrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
956627f7eb2Smrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
957627f7eb2Smrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
958627f7eb2Smrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
959627f7eb2Smrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
960627f7eb2Smrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
961627f7eb2Smrg 			  i6 = ll + lsec - 1;
962627f7eb2Smrg 			  for (l = ll; l <= i6; ++l)
963627f7eb2Smrg 			    {
964627f7eb2Smrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
965627f7eb2Smrg 				      * b[l + j * b_dim1];
966627f7eb2Smrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
967627f7eb2Smrg 				      * b[l + j * b_dim1];
968627f7eb2Smrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
969627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
970627f7eb2Smrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
971627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
972627f7eb2Smrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
973627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
974627f7eb2Smrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
975627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
976627f7eb2Smrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
977627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
978627f7eb2Smrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
979627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
980627f7eb2Smrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
981627f7eb2Smrg 				      * b[l + j * b_dim1];
982627f7eb2Smrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
983627f7eb2Smrg 				      * b[l + j * b_dim1];
984627f7eb2Smrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
985627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
986627f7eb2Smrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
987627f7eb2Smrg 				      * b[l + (j + 1) * b_dim1];
988627f7eb2Smrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
989627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
990627f7eb2Smrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
991627f7eb2Smrg 				      * b[l + (j + 2) * b_dim1];
992627f7eb2Smrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
993627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
994627f7eb2Smrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
995627f7eb2Smrg 				      * b[l + (j + 3) * b_dim1];
996627f7eb2Smrg 			    }
997627f7eb2Smrg 			  c[i + j * c_dim1] = f11;
998627f7eb2Smrg 			  c[i + 1 + j * c_dim1] = f21;
999627f7eb2Smrg 			  c[i + (j + 1) * c_dim1] = f12;
1000627f7eb2Smrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1001627f7eb2Smrg 			  c[i + (j + 2) * c_dim1] = f13;
1002627f7eb2Smrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1003627f7eb2Smrg 			  c[i + (j + 3) * c_dim1] = f14;
1004627f7eb2Smrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1005627f7eb2Smrg 			  c[i + 2 + j * c_dim1] = f31;
1006627f7eb2Smrg 			  c[i + 3 + j * c_dim1] = f41;
1007627f7eb2Smrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1008627f7eb2Smrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1009627f7eb2Smrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1010627f7eb2Smrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1011627f7eb2Smrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1012627f7eb2Smrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1013627f7eb2Smrg 			}
1014627f7eb2Smrg 		      if (uisec < isec)
1015627f7eb2Smrg 			{
1016627f7eb2Smrg 			  i5 = ii + isec - 1;
1017627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1018627f7eb2Smrg 			    {
1019627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
1020627f7eb2Smrg 			      f12 = c[i + (j + 1) * c_dim1];
1021627f7eb2Smrg 			      f13 = c[i + (j + 2) * c_dim1];
1022627f7eb2Smrg 			      f14 = c[i + (j + 3) * c_dim1];
1023627f7eb2Smrg 			      i6 = ll + lsec - 1;
1024627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
1025627f7eb2Smrg 				{
1026627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1028627f7eb2Smrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029627f7eb2Smrg 					  257] * b[l + (j + 1) * b_dim1];
1030627f7eb2Smrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031627f7eb2Smrg 					  257] * b[l + (j + 2) * b_dim1];
1032627f7eb2Smrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033627f7eb2Smrg 					  257] * b[l + (j + 3) * b_dim1];
1034627f7eb2Smrg 				}
1035627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
1036627f7eb2Smrg 			      c[i + (j + 1) * c_dim1] = f12;
1037627f7eb2Smrg 			      c[i + (j + 2) * c_dim1] = f13;
1038627f7eb2Smrg 			      c[i + (j + 3) * c_dim1] = f14;
1039627f7eb2Smrg 			    }
1040627f7eb2Smrg 			}
1041627f7eb2Smrg 		    }
1042627f7eb2Smrg 		  if (ujsec < jsec)
1043627f7eb2Smrg 		    {
1044627f7eb2Smrg 		      i4 = jj + jsec - 1;
1045627f7eb2Smrg 		      for (j = jj + ujsec; j <= i4; ++j)
1046627f7eb2Smrg 			{
1047627f7eb2Smrg 			  i5 = ii + uisec - 1;
1048627f7eb2Smrg 			  for (i = ii; i <= i5; i += 4)
1049627f7eb2Smrg 			    {
1050627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
1051627f7eb2Smrg 			      f21 = c[i + 1 + j * c_dim1];
1052627f7eb2Smrg 			      f31 = c[i + 2 + j * c_dim1];
1053627f7eb2Smrg 			      f41 = c[i + 3 + j * c_dim1];
1054627f7eb2Smrg 			      i6 = ll + lsec - 1;
1055627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
1056627f7eb2Smrg 				{
1057627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1059627f7eb2Smrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1061627f7eb2Smrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1063627f7eb2Smrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1065627f7eb2Smrg 				}
1066627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
1067627f7eb2Smrg 			      c[i + 1 + j * c_dim1] = f21;
1068627f7eb2Smrg 			      c[i + 2 + j * c_dim1] = f31;
1069627f7eb2Smrg 			      c[i + 3 + j * c_dim1] = f41;
1070627f7eb2Smrg 			    }
1071627f7eb2Smrg 			  i5 = ii + isec - 1;
1072627f7eb2Smrg 			  for (i = ii + uisec; i <= i5; ++i)
1073627f7eb2Smrg 			    {
1074627f7eb2Smrg 			      f11 = c[i + j * c_dim1];
1075627f7eb2Smrg 			      i6 = ll + lsec - 1;
1076627f7eb2Smrg 			      for (l = ll; l <= i6; ++l)
1077627f7eb2Smrg 				{
1078627f7eb2Smrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079627f7eb2Smrg 					  257] * b[l + j * b_dim1];
1080627f7eb2Smrg 				}
1081627f7eb2Smrg 			      c[i + j * c_dim1] = f11;
1082627f7eb2Smrg 			    }
1083627f7eb2Smrg 			}
1084627f7eb2Smrg 		    }
1085627f7eb2Smrg 		}
1086627f7eb2Smrg 	    }
1087627f7eb2Smrg 	}
1088627f7eb2Smrg       free(t1);
1089627f7eb2Smrg       return;
1090627f7eb2Smrg     }
1091627f7eb2Smrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092627f7eb2Smrg     {
1093627f7eb2Smrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
1094627f7eb2Smrg 	{
1095627f7eb2Smrg 	  const GFC_COMPLEX_16 *restrict abase_x;
1096627f7eb2Smrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
1097627f7eb2Smrg 	  GFC_COMPLEX_16 *restrict dest_y;
1098627f7eb2Smrg 	  GFC_COMPLEX_16 s;
1099627f7eb2Smrg 
1100627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
1101627f7eb2Smrg 	    {
1102627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
1103627f7eb2Smrg 	      dest_y = &dest[y*rystride];
1104627f7eb2Smrg 	      for (x = 0; x < xcount; x++)
1105627f7eb2Smrg 		{
1106627f7eb2Smrg 		  abase_x = &abase[x*axstride];
1107627f7eb2Smrg 		  s = (GFC_COMPLEX_16) 0;
1108627f7eb2Smrg 		  for (n = 0; n < count; n++)
1109627f7eb2Smrg 		    s += abase_x[n] * bbase_y[n];
1110627f7eb2Smrg 		  dest_y[x] = s;
1111627f7eb2Smrg 		}
1112627f7eb2Smrg 	    }
1113627f7eb2Smrg 	}
1114627f7eb2Smrg       else
1115627f7eb2Smrg 	{
1116627f7eb2Smrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
1117627f7eb2Smrg 	  GFC_COMPLEX_16 s;
1118627f7eb2Smrg 
1119627f7eb2Smrg 	  for (y = 0; y < ycount; y++)
1120627f7eb2Smrg 	    {
1121627f7eb2Smrg 	      bbase_y = &bbase[y*bystride];
1122627f7eb2Smrg 	      s = (GFC_COMPLEX_16) 0;
1123627f7eb2Smrg 	      for (n = 0; n < count; n++)
1124627f7eb2Smrg 		s += abase[n*axstride] * bbase_y[n];
1125627f7eb2Smrg 	      dest[y*rystride] = s;
1126627f7eb2Smrg 	    }
1127627f7eb2Smrg 	}
1128627f7eb2Smrg     }
1129627f7eb2Smrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1130627f7eb2Smrg     {
1131627f7eb2Smrg       const GFC_COMPLEX_16 *restrict bbase_y;
1132627f7eb2Smrg       GFC_COMPLEX_16 s;
1133627f7eb2Smrg 
1134627f7eb2Smrg       for (y = 0; y < ycount; y++)
1135627f7eb2Smrg 	{
1136627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
1137627f7eb2Smrg 	  s = (GFC_COMPLEX_16) 0;
1138627f7eb2Smrg 	  for (n = 0; n < count; n++)
1139627f7eb2Smrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1140627f7eb2Smrg 	  dest[y*rxstride] = s;
1141627f7eb2Smrg 	}
1142627f7eb2Smrg     }
1143*4c3eb207Smrg   else if (axstride < aystride)
1144*4c3eb207Smrg     {
1145*4c3eb207Smrg       for (y = 0; y < ycount; y++)
1146*4c3eb207Smrg 	for (x = 0; x < xcount; x++)
1147*4c3eb207Smrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0;
1148*4c3eb207Smrg 
1149*4c3eb207Smrg       for (y = 0; y < ycount; y++)
1150*4c3eb207Smrg 	for (n = 0; n < count; n++)
1151*4c3eb207Smrg 	  for (x = 0; x < xcount; x++)
1152*4c3eb207Smrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
1153*4c3eb207Smrg 	    dest[x*rxstride + y*rystride] +=
1154*4c3eb207Smrg 					abase[x*axstride + n*aystride] *
1155*4c3eb207Smrg 					bbase[n*bxstride + y*bystride];
1156*4c3eb207Smrg     }
1157627f7eb2Smrg   else
1158627f7eb2Smrg     {
1159627f7eb2Smrg       const GFC_COMPLEX_16 *restrict abase_x;
1160627f7eb2Smrg       const GFC_COMPLEX_16 *restrict bbase_y;
1161627f7eb2Smrg       GFC_COMPLEX_16 *restrict dest_y;
1162627f7eb2Smrg       GFC_COMPLEX_16 s;
1163627f7eb2Smrg 
1164627f7eb2Smrg       for (y = 0; y < ycount; y++)
1165627f7eb2Smrg 	{
1166627f7eb2Smrg 	  bbase_y = &bbase[y*bystride];
1167627f7eb2Smrg 	  dest_y = &dest[y*rystride];
1168627f7eb2Smrg 	  for (x = 0; x < xcount; x++)
1169627f7eb2Smrg 	    {
1170627f7eb2Smrg 	      abase_x = &abase[x*axstride];
1171627f7eb2Smrg 	      s = (GFC_COMPLEX_16) 0;
1172627f7eb2Smrg 	      for (n = 0; n < count; n++)
1173627f7eb2Smrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174627f7eb2Smrg 	      dest_y[x*rxstride] = s;
1175627f7eb2Smrg 	    }
1176627f7eb2Smrg 	}
1177627f7eb2Smrg     }
1178627f7eb2Smrg }
1179627f7eb2Smrg #undef POW3
1180627f7eb2Smrg #undef min
1181627f7eb2Smrg #undef max
1182627f7eb2Smrg 
1183627f7eb2Smrg #endif
1184627f7eb2Smrg 
1185627f7eb2Smrg #endif
1186627f7eb2Smrg 
1187