xref: /netbsd-src/external/gpl3/gcc.old/dist/libgfortran/m4/matmul_internal.m4 (revision 4c3eb207d36f67d31994830c0a694161fc1ca39b)
1627f7eb2Smrg`void
2627f7eb2Smrg'matmul_name` ('rtype` * const restrict retarray,
3627f7eb2Smrg	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
4627f7eb2Smrg	int blas_limit, blas_call gemm)
5627f7eb2Smrg{
6627f7eb2Smrg  const 'rtype_name` * restrict abase;
7627f7eb2Smrg  const 'rtype_name` * restrict bbase;
8627f7eb2Smrg  'rtype_name` * restrict dest;
9627f7eb2Smrg
10627f7eb2Smrg  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
11627f7eb2Smrg  index_type x, y, n, count, xcount, ycount;
12627f7eb2Smrg
13627f7eb2Smrg  assert (GFC_DESCRIPTOR_RANK (a) == 2
14627f7eb2Smrg          || GFC_DESCRIPTOR_RANK (b) == 2);
15627f7eb2Smrg
16627f7eb2Smrg/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
17627f7eb2Smrg
18627f7eb2Smrg   Either A or B (but not both) can be rank 1:
19627f7eb2Smrg
20627f7eb2Smrg   o One-dimensional argument A is implicitly treated as a row matrix
21627f7eb2Smrg     dimensioned [1,count], so xcount=1.
22627f7eb2Smrg
23627f7eb2Smrg   o One-dimensional argument B is implicitly treated as a column matrix
24627f7eb2Smrg     dimensioned [count, 1], so ycount=1.
25627f7eb2Smrg*/
26627f7eb2Smrg
27627f7eb2Smrg  if (retarray->base_addr == NULL)
28627f7eb2Smrg    {
29627f7eb2Smrg      if (GFC_DESCRIPTOR_RANK (a) == 1)
30627f7eb2Smrg        {
31627f7eb2Smrg	  GFC_DIMENSION_SET(retarray->dim[0], 0,
32627f7eb2Smrg	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
33627f7eb2Smrg        }
34627f7eb2Smrg      else if (GFC_DESCRIPTOR_RANK (b) == 1)
35627f7eb2Smrg        {
36627f7eb2Smrg	  GFC_DIMENSION_SET(retarray->dim[0], 0,
37627f7eb2Smrg	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
38627f7eb2Smrg        }
39627f7eb2Smrg      else
40627f7eb2Smrg        {
41627f7eb2Smrg	  GFC_DIMENSION_SET(retarray->dim[0], 0,
42627f7eb2Smrg	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
43627f7eb2Smrg
44627f7eb2Smrg          GFC_DIMENSION_SET(retarray->dim[1], 0,
45627f7eb2Smrg	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
46627f7eb2Smrg			    GFC_DESCRIPTOR_EXTENT(retarray,0));
47627f7eb2Smrg        }
48627f7eb2Smrg
49627f7eb2Smrg      retarray->base_addr
50627f7eb2Smrg	= xmallocarray (size0 ((array_t *) retarray), sizeof ('rtype_name`));
51627f7eb2Smrg      retarray->offset = 0;
52627f7eb2Smrg    }
53627f7eb2Smrg  else if (unlikely (compile_options.bounds_check))
54627f7eb2Smrg    {
55627f7eb2Smrg      index_type ret_extent, arg_extent;
56627f7eb2Smrg
57627f7eb2Smrg      if (GFC_DESCRIPTOR_RANK (a) == 1)
58627f7eb2Smrg	{
59627f7eb2Smrg	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
60627f7eb2Smrg	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
61627f7eb2Smrg	  if (arg_extent != ret_extent)
62627f7eb2Smrg	    runtime_error ("Array bound mismatch for dimension 1 of "
63627f7eb2Smrg	    		   "array (%ld/%ld) ",
64627f7eb2Smrg			   (long int) ret_extent, (long int) arg_extent);
65627f7eb2Smrg	}
66627f7eb2Smrg      else if (GFC_DESCRIPTOR_RANK (b) == 1)
67627f7eb2Smrg	{
68627f7eb2Smrg	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
69627f7eb2Smrg	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
70627f7eb2Smrg	  if (arg_extent != ret_extent)
71627f7eb2Smrg	    runtime_error ("Array bound mismatch for dimension 1 of "
72627f7eb2Smrg	    		   "array (%ld/%ld) ",
73627f7eb2Smrg			   (long int) ret_extent, (long int) arg_extent);
74627f7eb2Smrg	}
75627f7eb2Smrg      else
76627f7eb2Smrg	{
77627f7eb2Smrg	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
78627f7eb2Smrg	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
79627f7eb2Smrg	  if (arg_extent != ret_extent)
80627f7eb2Smrg	    runtime_error ("Array bound mismatch for dimension 1 of "
81627f7eb2Smrg	    		   "array (%ld/%ld) ",
82627f7eb2Smrg			   (long int) ret_extent, (long int) arg_extent);
83627f7eb2Smrg
84627f7eb2Smrg	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
85627f7eb2Smrg	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
86627f7eb2Smrg	  if (arg_extent != ret_extent)
87627f7eb2Smrg	    runtime_error ("Array bound mismatch for dimension 2 of "
88627f7eb2Smrg	    		   "array (%ld/%ld) ",
89627f7eb2Smrg			   (long int) ret_extent, (long int) arg_extent);
90627f7eb2Smrg	}
91627f7eb2Smrg    }
92627f7eb2Smrg'
93627f7eb2Smrgsinclude(`matmul_asm_'rtype_code`.m4')dnl
94627f7eb2Smrg`
95627f7eb2Smrg  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
96627f7eb2Smrg    {
97627f7eb2Smrg      /* One-dimensional result may be addressed in the code below
98627f7eb2Smrg	 either as a row or a column matrix. We want both cases to
99627f7eb2Smrg	 work. */
100627f7eb2Smrg      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
101627f7eb2Smrg    }
102627f7eb2Smrg  else
103627f7eb2Smrg    {
104627f7eb2Smrg      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
105627f7eb2Smrg      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
106627f7eb2Smrg    }
107627f7eb2Smrg
108627f7eb2Smrg
109627f7eb2Smrg  if (GFC_DESCRIPTOR_RANK (a) == 1)
110627f7eb2Smrg    {
111627f7eb2Smrg      /* Treat it as a a row matrix A[1,count]. */
112627f7eb2Smrg      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
113627f7eb2Smrg      aystride = 1;
114627f7eb2Smrg
115627f7eb2Smrg      xcount = 1;
116627f7eb2Smrg      count = GFC_DESCRIPTOR_EXTENT(a,0);
117627f7eb2Smrg    }
118627f7eb2Smrg  else
119627f7eb2Smrg    {
120627f7eb2Smrg      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
121627f7eb2Smrg      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
122627f7eb2Smrg
123627f7eb2Smrg      count = GFC_DESCRIPTOR_EXTENT(a,1);
124627f7eb2Smrg      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
125627f7eb2Smrg    }
126627f7eb2Smrg
127627f7eb2Smrg  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
128627f7eb2Smrg    {
129627f7eb2Smrg      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
130627f7eb2Smrg	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
131627f7eb2Smrg		       "in dimension 1: is %ld, should be %ld",
132627f7eb2Smrg		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
133627f7eb2Smrg    }
134627f7eb2Smrg
135627f7eb2Smrg  if (GFC_DESCRIPTOR_RANK (b) == 1)
136627f7eb2Smrg    {
137627f7eb2Smrg      /* Treat it as a column matrix B[count,1] */
138627f7eb2Smrg      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
139627f7eb2Smrg
140627f7eb2Smrg      /* bystride should never be used for 1-dimensional b.
141627f7eb2Smrg         The value is only used for calculation of the
142627f7eb2Smrg         memory by the buffer.  */
143627f7eb2Smrg      bystride = 256;
144627f7eb2Smrg      ycount = 1;
145627f7eb2Smrg    }
146627f7eb2Smrg  else
147627f7eb2Smrg    {
148627f7eb2Smrg      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
149627f7eb2Smrg      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
150627f7eb2Smrg      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
151627f7eb2Smrg    }
152627f7eb2Smrg
153627f7eb2Smrg  abase = a->base_addr;
154627f7eb2Smrg  bbase = b->base_addr;
155627f7eb2Smrg  dest = retarray->base_addr;
156627f7eb2Smrg
157627f7eb2Smrg  /* Now that everything is set up, we perform the multiplication
158627f7eb2Smrg     itself.  */
159627f7eb2Smrg
160627f7eb2Smrg#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
161627f7eb2Smrg#define min(a,b) ((a) <= (b) ? (a) : (b))
162627f7eb2Smrg#define max(a,b) ((a) >= (b) ? (a) : (b))
163627f7eb2Smrg
164627f7eb2Smrg  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
165627f7eb2Smrg      && (bxstride == 1 || bystride == 1)
166627f7eb2Smrg      && (((float) xcount) * ((float) ycount) * ((float) count)
167627f7eb2Smrg          > POW3(blas_limit)))
168627f7eb2Smrg    {
169627f7eb2Smrg      const int m = xcount, n = ycount, k = count, ldc = rystride;
170627f7eb2Smrg      const 'rtype_name` one = 1, zero = 0;
171627f7eb2Smrg      const int lda = (axstride == 1) ? aystride : axstride,
172627f7eb2Smrg		ldb = (bxstride == 1) ? bystride : bxstride;
173627f7eb2Smrg
174627f7eb2Smrg      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
175627f7eb2Smrg	{
176627f7eb2Smrg	  assert (gemm != NULL);
177627f7eb2Smrg	  const char *transa, *transb;
178627f7eb2Smrg	  if (try_blas & 2)
179627f7eb2Smrg	    transa = "C";
180627f7eb2Smrg	  else
181627f7eb2Smrg	    transa = axstride == 1 ? "N" : "T";
182627f7eb2Smrg
183627f7eb2Smrg	  if (try_blas & 4)
184627f7eb2Smrg	    transb = "C";
185627f7eb2Smrg	  else
186627f7eb2Smrg	    transb = bxstride == 1 ? "N" : "T";
187627f7eb2Smrg
188627f7eb2Smrg	  gemm (transa, transb , &m,
189627f7eb2Smrg		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
190627f7eb2Smrg		&ldc, 1, 1);
191627f7eb2Smrg	  return;
192627f7eb2Smrg	}
193627f7eb2Smrg    }
194627f7eb2Smrg
195*4c3eb207Smrg  if (rxstride == 1 && axstride == 1 && bxstride == 1
196*4c3eb207Smrg      && GFC_DESCRIPTOR_RANK (b) != 1)
197627f7eb2Smrg    {
198627f7eb2Smrg      /* This block of code implements a tuned matmul, derived from
199627f7eb2Smrg         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
200627f7eb2Smrg
201627f7eb2Smrg               Bo Kagstrom and Per Ling
202627f7eb2Smrg               Department of Computing Science
203627f7eb2Smrg               Umea University
204627f7eb2Smrg               S-901 87 Umea, Sweden
205627f7eb2Smrg
206627f7eb2Smrg	 from netlib.org, translated to C, and modified for matmul.m4.  */
207627f7eb2Smrg
208627f7eb2Smrg      const 'rtype_name` *a, *b;
209627f7eb2Smrg      'rtype_name` *c;
210627f7eb2Smrg      const index_type m = xcount, n = ycount, k = count;
211627f7eb2Smrg
212627f7eb2Smrg      /* System generated locals */
213627f7eb2Smrg      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
214627f7eb2Smrg		 i1, i2, i3, i4, i5, i6;
215627f7eb2Smrg
216627f7eb2Smrg      /* Local variables */
217627f7eb2Smrg      'rtype_name` f11, f12, f21, f22, f31, f32, f41, f42,
218627f7eb2Smrg		 f13, f14, f23, f24, f33, f34, f43, f44;
219627f7eb2Smrg      index_type i, j, l, ii, jj, ll;
220627f7eb2Smrg      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
221627f7eb2Smrg      'rtype_name` *t1;
222627f7eb2Smrg
223627f7eb2Smrg      a = abase;
224627f7eb2Smrg      b = bbase;
225627f7eb2Smrg      c = retarray->base_addr;
226627f7eb2Smrg
227627f7eb2Smrg      /* Parameter adjustments */
228627f7eb2Smrg      c_dim1 = rystride;
229627f7eb2Smrg      c_offset = 1 + c_dim1;
230627f7eb2Smrg      c -= c_offset;
231627f7eb2Smrg      a_dim1 = aystride;
232627f7eb2Smrg      a_offset = 1 + a_dim1;
233627f7eb2Smrg      a -= a_offset;
234627f7eb2Smrg      b_dim1 = bystride;
235627f7eb2Smrg      b_offset = 1 + b_dim1;
236627f7eb2Smrg      b -= b_offset;
237627f7eb2Smrg
238627f7eb2Smrg      /* Empty c first.  */
239627f7eb2Smrg      for (j=1; j<=n; j++)
240627f7eb2Smrg	for (i=1; i<=m; i++)
241627f7eb2Smrg	  c[i + j * c_dim1] = ('rtype_name`)0;
242627f7eb2Smrg
243627f7eb2Smrg      /* Early exit if possible */
244627f7eb2Smrg      if (m == 0 || n == 0 || k == 0)
245627f7eb2Smrg	return;
246627f7eb2Smrg
247627f7eb2Smrg      /* Adjust size of t1 to what is needed.  */
248627f7eb2Smrg      index_type t1_dim, a_sz;
249627f7eb2Smrg      if (aystride == 1)
250627f7eb2Smrg        a_sz = rystride;
251627f7eb2Smrg      else
252627f7eb2Smrg        a_sz = a_dim1;
253627f7eb2Smrg
254627f7eb2Smrg      t1_dim = a_sz * 256 + b_dim1;
255627f7eb2Smrg      if (t1_dim > 65536)
256627f7eb2Smrg	t1_dim = 65536;
257627f7eb2Smrg
258627f7eb2Smrg      t1 = malloc (t1_dim * sizeof('rtype_name`));
259627f7eb2Smrg
260627f7eb2Smrg      /* Start turning the crank. */
261627f7eb2Smrg      i1 = n;
262627f7eb2Smrg      for (jj = 1; jj <= i1; jj += 512)
263627f7eb2Smrg	{
264627f7eb2Smrg	  /* Computing MIN */
265627f7eb2Smrg	  i2 = 512;
266627f7eb2Smrg	  i3 = n - jj + 1;
267627f7eb2Smrg	  jsec = min(i2,i3);
268627f7eb2Smrg	  ujsec = jsec - jsec % 4;
269627f7eb2Smrg	  i2 = k;
270627f7eb2Smrg	  for (ll = 1; ll <= i2; ll += 256)
271627f7eb2Smrg	    {
272627f7eb2Smrg	      /* Computing MIN */
273627f7eb2Smrg	      i3 = 256;
274627f7eb2Smrg	      i4 = k - ll + 1;
275627f7eb2Smrg	      lsec = min(i3,i4);
276627f7eb2Smrg	      ulsec = lsec - lsec % 2;
277627f7eb2Smrg
278627f7eb2Smrg	      i3 = m;
279627f7eb2Smrg	      for (ii = 1; ii <= i3; ii += 256)
280627f7eb2Smrg		{
281627f7eb2Smrg		  /* Computing MIN */
282627f7eb2Smrg		  i4 = 256;
283627f7eb2Smrg		  i5 = m - ii + 1;
284627f7eb2Smrg		  isec = min(i4,i5);
285627f7eb2Smrg		  uisec = isec - isec % 2;
286627f7eb2Smrg		  i4 = ll + ulsec - 1;
287627f7eb2Smrg		  for (l = ll; l <= i4; l += 2)
288627f7eb2Smrg		    {
289627f7eb2Smrg		      i5 = ii + uisec - 1;
290627f7eb2Smrg		      for (i = ii; i <= i5; i += 2)
291627f7eb2Smrg			{
292627f7eb2Smrg			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
293627f7eb2Smrg					a[i + l * a_dim1];
294627f7eb2Smrg			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
295627f7eb2Smrg					a[i + (l + 1) * a_dim1];
296627f7eb2Smrg			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
297627f7eb2Smrg					a[i + 1 + l * a_dim1];
298627f7eb2Smrg			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
299627f7eb2Smrg					a[i + 1 + (l + 1) * a_dim1];
300627f7eb2Smrg			}
301627f7eb2Smrg		      if (uisec < isec)
302627f7eb2Smrg			{
303627f7eb2Smrg			  t1[l - ll + 1 + (isec << 8) - 257] =
304627f7eb2Smrg				    a[ii + isec - 1 + l * a_dim1];
305627f7eb2Smrg			  t1[l - ll + 2 + (isec << 8) - 257] =
306627f7eb2Smrg				    a[ii + isec - 1 + (l + 1) * a_dim1];
307627f7eb2Smrg			}
308627f7eb2Smrg		    }
309627f7eb2Smrg		  if (ulsec < lsec)
310627f7eb2Smrg		    {
311627f7eb2Smrg		      i4 = ii + isec - 1;
312627f7eb2Smrg		      for (i = ii; i<= i4; ++i)
313627f7eb2Smrg			{
314627f7eb2Smrg			  t1[lsec + ((i - ii + 1) << 8) - 257] =
315627f7eb2Smrg				    a[i + (ll + lsec - 1) * a_dim1];
316627f7eb2Smrg			}
317627f7eb2Smrg		    }
318627f7eb2Smrg
319627f7eb2Smrg		  uisec = isec - isec % 4;
320627f7eb2Smrg		  i4 = jj + ujsec - 1;
321627f7eb2Smrg		  for (j = jj; j <= i4; j += 4)
322627f7eb2Smrg		    {
323627f7eb2Smrg		      i5 = ii + uisec - 1;
324627f7eb2Smrg		      for (i = ii; i <= i5; i += 4)
325627f7eb2Smrg			{
326627f7eb2Smrg			  f11 = c[i + j * c_dim1];
327627f7eb2Smrg			  f21 = c[i + 1 + j * c_dim1];
328627f7eb2Smrg			  f12 = c[i + (j + 1) * c_dim1];
329627f7eb2Smrg			  f22 = c[i + 1 + (j + 1) * c_dim1];
330627f7eb2Smrg			  f13 = c[i + (j + 2) * c_dim1];
331627f7eb2Smrg			  f23 = c[i + 1 + (j + 2) * c_dim1];
332627f7eb2Smrg			  f14 = c[i + (j + 3) * c_dim1];
333627f7eb2Smrg			  f24 = c[i + 1 + (j + 3) * c_dim1];
334627f7eb2Smrg			  f31 = c[i + 2 + j * c_dim1];
335627f7eb2Smrg			  f41 = c[i + 3 + j * c_dim1];
336627f7eb2Smrg			  f32 = c[i + 2 + (j + 1) * c_dim1];
337627f7eb2Smrg			  f42 = c[i + 3 + (j + 1) * c_dim1];
338627f7eb2Smrg			  f33 = c[i + 2 + (j + 2) * c_dim1];
339627f7eb2Smrg			  f43 = c[i + 3 + (j + 2) * c_dim1];
340627f7eb2Smrg			  f34 = c[i + 2 + (j + 3) * c_dim1];
341627f7eb2Smrg			  f44 = c[i + 3 + (j + 3) * c_dim1];
342627f7eb2Smrg			  i6 = ll + lsec - 1;
343627f7eb2Smrg			  for (l = ll; l <= i6; ++l)
344627f7eb2Smrg			    {
345627f7eb2Smrg			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
346627f7eb2Smrg				      * b[l + j * b_dim1];
347627f7eb2Smrg			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
348627f7eb2Smrg				      * b[l + j * b_dim1];
349627f7eb2Smrg			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
350627f7eb2Smrg				      * b[l + (j + 1) * b_dim1];
351627f7eb2Smrg			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
352627f7eb2Smrg				      * b[l + (j + 1) * b_dim1];
353627f7eb2Smrg			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
354627f7eb2Smrg				      * b[l + (j + 2) * b_dim1];
355627f7eb2Smrg			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
356627f7eb2Smrg				      * b[l + (j + 2) * b_dim1];
357627f7eb2Smrg			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
358627f7eb2Smrg				      * b[l + (j + 3) * b_dim1];
359627f7eb2Smrg			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
360627f7eb2Smrg				      * b[l + (j + 3) * b_dim1];
361627f7eb2Smrg			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
362627f7eb2Smrg				      * b[l + j * b_dim1];
363627f7eb2Smrg			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
364627f7eb2Smrg				      * b[l + j * b_dim1];
365627f7eb2Smrg			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
366627f7eb2Smrg				      * b[l + (j + 1) * b_dim1];
367627f7eb2Smrg			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
368627f7eb2Smrg				      * b[l + (j + 1) * b_dim1];
369627f7eb2Smrg			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
370627f7eb2Smrg				      * b[l + (j + 2) * b_dim1];
371627f7eb2Smrg			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
372627f7eb2Smrg				      * b[l + (j + 2) * b_dim1];
373627f7eb2Smrg			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
374627f7eb2Smrg				      * b[l + (j + 3) * b_dim1];
375627f7eb2Smrg			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
376627f7eb2Smrg				      * b[l + (j + 3) * b_dim1];
377627f7eb2Smrg			    }
378627f7eb2Smrg			  c[i + j * c_dim1] = f11;
379627f7eb2Smrg			  c[i + 1 + j * c_dim1] = f21;
380627f7eb2Smrg			  c[i + (j + 1) * c_dim1] = f12;
381627f7eb2Smrg			  c[i + 1 + (j + 1) * c_dim1] = f22;
382627f7eb2Smrg			  c[i + (j + 2) * c_dim1] = f13;
383627f7eb2Smrg			  c[i + 1 + (j + 2) * c_dim1] = f23;
384627f7eb2Smrg			  c[i + (j + 3) * c_dim1] = f14;
385627f7eb2Smrg			  c[i + 1 + (j + 3) * c_dim1] = f24;
386627f7eb2Smrg			  c[i + 2 + j * c_dim1] = f31;
387627f7eb2Smrg			  c[i + 3 + j * c_dim1] = f41;
388627f7eb2Smrg			  c[i + 2 + (j + 1) * c_dim1] = f32;
389627f7eb2Smrg			  c[i + 3 + (j + 1) * c_dim1] = f42;
390627f7eb2Smrg			  c[i + 2 + (j + 2) * c_dim1] = f33;
391627f7eb2Smrg			  c[i + 3 + (j + 2) * c_dim1] = f43;
392627f7eb2Smrg			  c[i + 2 + (j + 3) * c_dim1] = f34;
393627f7eb2Smrg			  c[i + 3 + (j + 3) * c_dim1] = f44;
394627f7eb2Smrg			}
395627f7eb2Smrg		      if (uisec < isec)
396627f7eb2Smrg			{
397627f7eb2Smrg			  i5 = ii + isec - 1;
398627f7eb2Smrg			  for (i = ii + uisec; i <= i5; ++i)
399627f7eb2Smrg			    {
400627f7eb2Smrg			      f11 = c[i + j * c_dim1];
401627f7eb2Smrg			      f12 = c[i + (j + 1) * c_dim1];
402627f7eb2Smrg			      f13 = c[i + (j + 2) * c_dim1];
403627f7eb2Smrg			      f14 = c[i + (j + 3) * c_dim1];
404627f7eb2Smrg			      i6 = ll + lsec - 1;
405627f7eb2Smrg			      for (l = ll; l <= i6; ++l)
406627f7eb2Smrg				{
407627f7eb2Smrg				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
408627f7eb2Smrg					  257] * b[l + j * b_dim1];
409627f7eb2Smrg				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
410627f7eb2Smrg					  257] * b[l + (j + 1) * b_dim1];
411627f7eb2Smrg				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
412627f7eb2Smrg					  257] * b[l + (j + 2) * b_dim1];
413627f7eb2Smrg				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
414627f7eb2Smrg					  257] * b[l + (j + 3) * b_dim1];
415627f7eb2Smrg				}
416627f7eb2Smrg			      c[i + j * c_dim1] = f11;
417627f7eb2Smrg			      c[i + (j + 1) * c_dim1] = f12;
418627f7eb2Smrg			      c[i + (j + 2) * c_dim1] = f13;
419627f7eb2Smrg			      c[i + (j + 3) * c_dim1] = f14;
420627f7eb2Smrg			    }
421627f7eb2Smrg			}
422627f7eb2Smrg		    }
423627f7eb2Smrg		  if (ujsec < jsec)
424627f7eb2Smrg		    {
425627f7eb2Smrg		      i4 = jj + jsec - 1;
426627f7eb2Smrg		      for (j = jj + ujsec; j <= i4; ++j)
427627f7eb2Smrg			{
428627f7eb2Smrg			  i5 = ii + uisec - 1;
429627f7eb2Smrg			  for (i = ii; i <= i5; i += 4)
430627f7eb2Smrg			    {
431627f7eb2Smrg			      f11 = c[i + j * c_dim1];
432627f7eb2Smrg			      f21 = c[i + 1 + j * c_dim1];
433627f7eb2Smrg			      f31 = c[i + 2 + j * c_dim1];
434627f7eb2Smrg			      f41 = c[i + 3 + j * c_dim1];
435627f7eb2Smrg			      i6 = ll + lsec - 1;
436627f7eb2Smrg			      for (l = ll; l <= i6; ++l)
437627f7eb2Smrg				{
438627f7eb2Smrg				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
439627f7eb2Smrg					  257] * b[l + j * b_dim1];
440627f7eb2Smrg				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
441627f7eb2Smrg					  257] * b[l + j * b_dim1];
442627f7eb2Smrg				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
443627f7eb2Smrg					  257] * b[l + j * b_dim1];
444627f7eb2Smrg				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
445627f7eb2Smrg					  257] * b[l + j * b_dim1];
446627f7eb2Smrg				}
447627f7eb2Smrg			      c[i + j * c_dim1] = f11;
448627f7eb2Smrg			      c[i + 1 + j * c_dim1] = f21;
449627f7eb2Smrg			      c[i + 2 + j * c_dim1] = f31;
450627f7eb2Smrg			      c[i + 3 + j * c_dim1] = f41;
451627f7eb2Smrg			    }
452627f7eb2Smrg			  i5 = ii + isec - 1;
453627f7eb2Smrg			  for (i = ii + uisec; i <= i5; ++i)
454627f7eb2Smrg			    {
455627f7eb2Smrg			      f11 = c[i + j * c_dim1];
456627f7eb2Smrg			      i6 = ll + lsec - 1;
457627f7eb2Smrg			      for (l = ll; l <= i6; ++l)
458627f7eb2Smrg				{
459627f7eb2Smrg				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
460627f7eb2Smrg					  257] * b[l + j * b_dim1];
461627f7eb2Smrg				}
462627f7eb2Smrg			      c[i + j * c_dim1] = f11;
463627f7eb2Smrg			    }
464627f7eb2Smrg			}
465627f7eb2Smrg		    }
466627f7eb2Smrg		}
467627f7eb2Smrg	    }
468627f7eb2Smrg	}
469627f7eb2Smrg      free(t1);
470627f7eb2Smrg      return;
471627f7eb2Smrg    }
472627f7eb2Smrg  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
473627f7eb2Smrg    {
474627f7eb2Smrg      if (GFC_DESCRIPTOR_RANK (a) != 1)
475627f7eb2Smrg	{
476627f7eb2Smrg	  const 'rtype_name` *restrict abase_x;
477627f7eb2Smrg	  const 'rtype_name` *restrict bbase_y;
478627f7eb2Smrg	  'rtype_name` *restrict dest_y;
479627f7eb2Smrg	  'rtype_name` s;
480627f7eb2Smrg
481627f7eb2Smrg	  for (y = 0; y < ycount; y++)
482627f7eb2Smrg	    {
483627f7eb2Smrg	      bbase_y = &bbase[y*bystride];
484627f7eb2Smrg	      dest_y = &dest[y*rystride];
485627f7eb2Smrg	      for (x = 0; x < xcount; x++)
486627f7eb2Smrg		{
487627f7eb2Smrg		  abase_x = &abase[x*axstride];
488627f7eb2Smrg		  s = ('rtype_name`) 0;
489627f7eb2Smrg		  for (n = 0; n < count; n++)
490627f7eb2Smrg		    s += abase_x[n] * bbase_y[n];
491627f7eb2Smrg		  dest_y[x] = s;
492627f7eb2Smrg		}
493627f7eb2Smrg	    }
494627f7eb2Smrg	}
495627f7eb2Smrg      else
496627f7eb2Smrg	{
497627f7eb2Smrg	  const 'rtype_name` *restrict bbase_y;
498627f7eb2Smrg	  'rtype_name` s;
499627f7eb2Smrg
500627f7eb2Smrg	  for (y = 0; y < ycount; y++)
501627f7eb2Smrg	    {
502627f7eb2Smrg	      bbase_y = &bbase[y*bystride];
503627f7eb2Smrg	      s = ('rtype_name`) 0;
504627f7eb2Smrg	      for (n = 0; n < count; n++)
505627f7eb2Smrg		s += abase[n*axstride] * bbase_y[n];
506627f7eb2Smrg	      dest[y*rystride] = s;
507627f7eb2Smrg	    }
508627f7eb2Smrg	}
509627f7eb2Smrg    }
510627f7eb2Smrg  else if (GFC_DESCRIPTOR_RANK (a) == 1)
511627f7eb2Smrg    {
512627f7eb2Smrg      const 'rtype_name` *restrict bbase_y;
513627f7eb2Smrg      'rtype_name` s;
514627f7eb2Smrg
515627f7eb2Smrg      for (y = 0; y < ycount; y++)
516627f7eb2Smrg	{
517627f7eb2Smrg	  bbase_y = &bbase[y*bystride];
518627f7eb2Smrg	  s = ('rtype_name`) 0;
519627f7eb2Smrg	  for (n = 0; n < count; n++)
520627f7eb2Smrg	    s += abase[n*axstride] * bbase_y[n*bxstride];
521627f7eb2Smrg	  dest[y*rxstride] = s;
522627f7eb2Smrg	}
523627f7eb2Smrg    }
524*4c3eb207Smrg  else if (axstride < aystride)
525*4c3eb207Smrg    {
526*4c3eb207Smrg      for (y = 0; y < ycount; y++)
527*4c3eb207Smrg	for (x = 0; x < xcount; x++)
528*4c3eb207Smrg	  dest[x*rxstride + y*rystride] = ('rtype_name`)0;
529*4c3eb207Smrg
530*4c3eb207Smrg      for (y = 0; y < ycount; y++)
531*4c3eb207Smrg	for (n = 0; n < count; n++)
532*4c3eb207Smrg	  for (x = 0; x < xcount; x++)
533*4c3eb207Smrg	    /* dest[x,y] += a[x,n] * b[n,y] */
534*4c3eb207Smrg	    dest[x*rxstride + y*rystride] +=
535*4c3eb207Smrg					abase[x*axstride + n*aystride] *
536*4c3eb207Smrg					bbase[n*bxstride + y*bystride];
537*4c3eb207Smrg    }
538627f7eb2Smrg  else
539627f7eb2Smrg    {
540627f7eb2Smrg      const 'rtype_name` *restrict abase_x;
541627f7eb2Smrg      const 'rtype_name` *restrict bbase_y;
542627f7eb2Smrg      'rtype_name` *restrict dest_y;
543627f7eb2Smrg      'rtype_name` s;
544627f7eb2Smrg
545627f7eb2Smrg      for (y = 0; y < ycount; y++)
546627f7eb2Smrg	{
547627f7eb2Smrg	  bbase_y = &bbase[y*bystride];
548627f7eb2Smrg	  dest_y = &dest[y*rystride];
549627f7eb2Smrg	  for (x = 0; x < xcount; x++)
550627f7eb2Smrg	    {
551627f7eb2Smrg	      abase_x = &abase[x*axstride];
552627f7eb2Smrg	      s = ('rtype_name`) 0;
553627f7eb2Smrg	      for (n = 0; n < count; n++)
554627f7eb2Smrg		s += abase_x[n*aystride] * bbase_y[n*bxstride];
555627f7eb2Smrg	      dest_y[x*rxstride] = s;
556627f7eb2Smrg	    }
557627f7eb2Smrg	}
558627f7eb2Smrg    }
559627f7eb2Smrg}
560627f7eb2Smrg#undef POW3
561627f7eb2Smrg#undef min
562627f7eb2Smrg#undef max
563627f7eb2Smrg'
564