1181254a7Smrg /* Implementation of the MATMUL intrinsic
2*b1e83836Smrg Copyright (C) 2002-2022 Free Software Foundation, Inc.
3181254a7Smrg Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4181254a7Smrg
5181254a7Smrg This file is part of the GNU Fortran runtime library (libgfortran).
6181254a7Smrg
7181254a7Smrg Libgfortran is free software; you can redistribute it and/or
8181254a7Smrg modify it under the terms of the GNU General Public
9181254a7Smrg License as published by the Free Software Foundation; either
10181254a7Smrg version 3 of the License, or (at your option) any later version.
11181254a7Smrg
12181254a7Smrg Libgfortran is distributed in the hope that it will be useful,
13181254a7Smrg but WITHOUT ANY WARRANTY; without even the implied warranty of
14181254a7Smrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15181254a7Smrg GNU General Public License for more details.
16181254a7Smrg
17181254a7Smrg Under Section 7 of GPL version 3, you are granted additional
18181254a7Smrg permissions described in the GCC Runtime Library Exception, version
19181254a7Smrg 3.1, as published by the Free Software Foundation.
20181254a7Smrg
21181254a7Smrg You should have received a copy of the GNU General Public License and
22181254a7Smrg a copy of the GCC Runtime Library Exception along with this program;
23181254a7Smrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24181254a7Smrg <http://www.gnu.org/licenses/>. */
25181254a7Smrg
26181254a7Smrg #include "libgfortran.h"
27181254a7Smrg #include <string.h>
28181254a7Smrg #include <assert.h>
29181254a7Smrg
30181254a7Smrg
31181254a7Smrg /* These are the specific versions of matmul with -mprefer-avx128. */
32181254a7Smrg
33181254a7Smrg #if defined (HAVE_GFC_INTEGER_2)
34181254a7Smrg
35181254a7Smrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36181254a7Smrg passed to us by the front-end, in which case we call it for large
37181254a7Smrg matrices. */
38181254a7Smrg
39181254a7Smrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40181254a7Smrg const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
41181254a7Smrg const int *, const GFC_INTEGER_2 *, const int *,
42181254a7Smrg const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
43181254a7Smrg int, int);
44181254a7Smrg
45181254a7Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46181254a7Smrg void
47181254a7Smrg matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
48181254a7Smrg gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
49181254a7Smrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50181254a7Smrg internal_proto(matmul_i2_avx128_fma3);
51181254a7Smrg void
matmul_i2_avx128_fma3(gfc_array_i2 * const restrict retarray,gfc_array_i2 * const restrict a,gfc_array_i2 * const restrict b,int try_blas,int blas_limit,blas_call gemm)52181254a7Smrg matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
53181254a7Smrg gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
54181254a7Smrg int blas_limit, blas_call gemm)
55181254a7Smrg {
56181254a7Smrg const GFC_INTEGER_2 * restrict abase;
57181254a7Smrg const GFC_INTEGER_2 * restrict bbase;
58181254a7Smrg GFC_INTEGER_2 * restrict dest;
59181254a7Smrg
60181254a7Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61181254a7Smrg index_type x, y, n, count, xcount, ycount;
62181254a7Smrg
63181254a7Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2
64181254a7Smrg || GFC_DESCRIPTOR_RANK (b) == 2);
65181254a7Smrg
66181254a7Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67181254a7Smrg
68181254a7Smrg Either A or B (but not both) can be rank 1:
69181254a7Smrg
70181254a7Smrg o One-dimensional argument A is implicitly treated as a row matrix
71181254a7Smrg dimensioned [1,count], so xcount=1.
72181254a7Smrg
73181254a7Smrg o One-dimensional argument B is implicitly treated as a column matrix
74181254a7Smrg dimensioned [count, 1], so ycount=1.
75181254a7Smrg */
76181254a7Smrg
77181254a7Smrg if (retarray->base_addr == NULL)
78181254a7Smrg {
79181254a7Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
80181254a7Smrg {
81181254a7Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
82181254a7Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83181254a7Smrg }
84181254a7Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
85181254a7Smrg {
86181254a7Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
87181254a7Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88181254a7Smrg }
89181254a7Smrg else
90181254a7Smrg {
91181254a7Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
92181254a7Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93181254a7Smrg
94181254a7Smrg GFC_DIMENSION_SET(retarray->dim[1], 0,
95181254a7Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96181254a7Smrg GFC_DESCRIPTOR_EXTENT(retarray,0));
97181254a7Smrg }
98181254a7Smrg
99181254a7Smrg retarray->base_addr
100181254a7Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
101181254a7Smrg retarray->offset = 0;
102181254a7Smrg }
103181254a7Smrg else if (unlikely (compile_options.bounds_check))
104181254a7Smrg {
105181254a7Smrg index_type ret_extent, arg_extent;
106181254a7Smrg
107181254a7Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
108181254a7Smrg {
109181254a7Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110181254a7Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111181254a7Smrg if (arg_extent != ret_extent)
112181254a7Smrg runtime_error ("Array bound mismatch for dimension 1 of "
113181254a7Smrg "array (%ld/%ld) ",
114181254a7Smrg (long int) ret_extent, (long int) arg_extent);
115181254a7Smrg }
116181254a7Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
117181254a7Smrg {
118181254a7Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119181254a7Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120181254a7Smrg if (arg_extent != ret_extent)
121181254a7Smrg runtime_error ("Array bound mismatch for dimension 1 of "
122181254a7Smrg "array (%ld/%ld) ",
123181254a7Smrg (long int) ret_extent, (long int) arg_extent);
124181254a7Smrg }
125181254a7Smrg else
126181254a7Smrg {
127181254a7Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128181254a7Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129181254a7Smrg if (arg_extent != ret_extent)
130181254a7Smrg runtime_error ("Array bound mismatch for dimension 1 of "
131181254a7Smrg "array (%ld/%ld) ",
132181254a7Smrg (long int) ret_extent, (long int) arg_extent);
133181254a7Smrg
134181254a7Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
135181254a7Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
136181254a7Smrg if (arg_extent != ret_extent)
137181254a7Smrg runtime_error ("Array bound mismatch for dimension 2 of "
138181254a7Smrg "array (%ld/%ld) ",
139181254a7Smrg (long int) ret_extent, (long int) arg_extent);
140181254a7Smrg }
141181254a7Smrg }
142181254a7Smrg
143181254a7Smrg
144181254a7Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1)
145181254a7Smrg {
146181254a7Smrg /* One-dimensional result may be addressed in the code below
147181254a7Smrg either as a row or a column matrix. We want both cases to
148181254a7Smrg work. */
149181254a7Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
150181254a7Smrg }
151181254a7Smrg else
152181254a7Smrg {
153181254a7Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
154181254a7Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
155181254a7Smrg }
156181254a7Smrg
157181254a7Smrg
158181254a7Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
159181254a7Smrg {
160181254a7Smrg /* Treat it as a a row matrix A[1,count]. */
161181254a7Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
162181254a7Smrg aystride = 1;
163181254a7Smrg
164181254a7Smrg xcount = 1;
165181254a7Smrg count = GFC_DESCRIPTOR_EXTENT(a,0);
166181254a7Smrg }
167181254a7Smrg else
168181254a7Smrg {
169181254a7Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
170181254a7Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1);
171181254a7Smrg
172181254a7Smrg count = GFC_DESCRIPTOR_EXTENT(a,1);
173181254a7Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0);
174181254a7Smrg }
175181254a7Smrg
176181254a7Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0))
177181254a7Smrg {
178181254a7Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
179181254a7Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
180181254a7Smrg "in dimension 1: is %ld, should be %ld",
181181254a7Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
182181254a7Smrg }
183181254a7Smrg
184181254a7Smrg if (GFC_DESCRIPTOR_RANK (b) == 1)
185181254a7Smrg {
186181254a7Smrg /* Treat it as a column matrix B[count,1] */
187181254a7Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188181254a7Smrg
189181254a7Smrg /* bystride should never be used for 1-dimensional b.
190181254a7Smrg The value is only used for calculation of the
191181254a7Smrg memory by the buffer. */
192181254a7Smrg bystride = 256;
193181254a7Smrg ycount = 1;
194181254a7Smrg }
195181254a7Smrg else
196181254a7Smrg {
197181254a7Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198181254a7Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199181254a7Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200181254a7Smrg }
201181254a7Smrg
202181254a7Smrg abase = a->base_addr;
203181254a7Smrg bbase = b->base_addr;
204181254a7Smrg dest = retarray->base_addr;
205181254a7Smrg
206181254a7Smrg /* Now that everything is set up, we perform the multiplication
207181254a7Smrg itself. */
208181254a7Smrg
209181254a7Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210181254a7Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
211181254a7Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
212181254a7Smrg
213181254a7Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214181254a7Smrg && (bxstride == 1 || bystride == 1)
215181254a7Smrg && (((float) xcount) * ((float) ycount) * ((float) count)
216181254a7Smrg > POW3(blas_limit)))
217181254a7Smrg {
218181254a7Smrg const int m = xcount, n = ycount, k = count, ldc = rystride;
219181254a7Smrg const GFC_INTEGER_2 one = 1, zero = 0;
220181254a7Smrg const int lda = (axstride == 1) ? aystride : axstride,
221181254a7Smrg ldb = (bxstride == 1) ? bystride : bxstride;
222181254a7Smrg
223181254a7Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224181254a7Smrg {
225181254a7Smrg assert (gemm != NULL);
226181254a7Smrg const char *transa, *transb;
227181254a7Smrg if (try_blas & 2)
228181254a7Smrg transa = "C";
229181254a7Smrg else
230181254a7Smrg transa = axstride == 1 ? "N" : "T";
231181254a7Smrg
232181254a7Smrg if (try_blas & 4)
233181254a7Smrg transb = "C";
234181254a7Smrg else
235181254a7Smrg transb = bxstride == 1 ? "N" : "T";
236181254a7Smrg
237181254a7Smrg gemm (transa, transb , &m,
238181254a7Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
239181254a7Smrg &ldc, 1, 1);
240181254a7Smrg return;
241181254a7Smrg }
242181254a7Smrg }
243181254a7Smrg
244fb8a8121Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1
245fb8a8121Smrg && GFC_DESCRIPTOR_RANK (b) != 1)
246181254a7Smrg {
247181254a7Smrg /* This block of code implements a tuned matmul, derived from
248181254a7Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1
249181254a7Smrg
250181254a7Smrg Bo Kagstrom and Per Ling
251181254a7Smrg Department of Computing Science
252181254a7Smrg Umea University
253181254a7Smrg S-901 87 Umea, Sweden
254181254a7Smrg
255181254a7Smrg from netlib.org, translated to C, and modified for matmul.m4. */
256181254a7Smrg
257181254a7Smrg const GFC_INTEGER_2 *a, *b;
258181254a7Smrg GFC_INTEGER_2 *c;
259181254a7Smrg const index_type m = xcount, n = ycount, k = count;
260181254a7Smrg
261181254a7Smrg /* System generated locals */
262181254a7Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
263181254a7Smrg i1, i2, i3, i4, i5, i6;
264181254a7Smrg
265181254a7Smrg /* Local variables */
266181254a7Smrg GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
267181254a7Smrg f13, f14, f23, f24, f33, f34, f43, f44;
268181254a7Smrg index_type i, j, l, ii, jj, ll;
269181254a7Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec;
270181254a7Smrg GFC_INTEGER_2 *t1;
271181254a7Smrg
272181254a7Smrg a = abase;
273181254a7Smrg b = bbase;
274181254a7Smrg c = retarray->base_addr;
275181254a7Smrg
276181254a7Smrg /* Parameter adjustments */
277181254a7Smrg c_dim1 = rystride;
278181254a7Smrg c_offset = 1 + c_dim1;
279181254a7Smrg c -= c_offset;
280181254a7Smrg a_dim1 = aystride;
281181254a7Smrg a_offset = 1 + a_dim1;
282181254a7Smrg a -= a_offset;
283181254a7Smrg b_dim1 = bystride;
284181254a7Smrg b_offset = 1 + b_dim1;
285181254a7Smrg b -= b_offset;
286181254a7Smrg
287181254a7Smrg /* Empty c first. */
288181254a7Smrg for (j=1; j<=n; j++)
289181254a7Smrg for (i=1; i<=m; i++)
290181254a7Smrg c[i + j * c_dim1] = (GFC_INTEGER_2)0;
291181254a7Smrg
292181254a7Smrg /* Early exit if possible */
293181254a7Smrg if (m == 0 || n == 0 || k == 0)
294181254a7Smrg return;
295181254a7Smrg
296181254a7Smrg /* Adjust size of t1 to what is needed. */
297181254a7Smrg index_type t1_dim, a_sz;
298181254a7Smrg if (aystride == 1)
299181254a7Smrg a_sz = rystride;
300181254a7Smrg else
301181254a7Smrg a_sz = a_dim1;
302181254a7Smrg
303181254a7Smrg t1_dim = a_sz * 256 + b_dim1;
304181254a7Smrg if (t1_dim > 65536)
305181254a7Smrg t1_dim = 65536;
306181254a7Smrg
307181254a7Smrg t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
308181254a7Smrg
309181254a7Smrg /* Start turning the crank. */
310181254a7Smrg i1 = n;
311181254a7Smrg for (jj = 1; jj <= i1; jj += 512)
312181254a7Smrg {
313181254a7Smrg /* Computing MIN */
314181254a7Smrg i2 = 512;
315181254a7Smrg i3 = n - jj + 1;
316181254a7Smrg jsec = min(i2,i3);
317181254a7Smrg ujsec = jsec - jsec % 4;
318181254a7Smrg i2 = k;
319181254a7Smrg for (ll = 1; ll <= i2; ll += 256)
320181254a7Smrg {
321181254a7Smrg /* Computing MIN */
322181254a7Smrg i3 = 256;
323181254a7Smrg i4 = k - ll + 1;
324181254a7Smrg lsec = min(i3,i4);
325181254a7Smrg ulsec = lsec - lsec % 2;
326181254a7Smrg
327181254a7Smrg i3 = m;
328181254a7Smrg for (ii = 1; ii <= i3; ii += 256)
329181254a7Smrg {
330181254a7Smrg /* Computing MIN */
331181254a7Smrg i4 = 256;
332181254a7Smrg i5 = m - ii + 1;
333181254a7Smrg isec = min(i4,i5);
334181254a7Smrg uisec = isec - isec % 2;
335181254a7Smrg i4 = ll + ulsec - 1;
336181254a7Smrg for (l = ll; l <= i4; l += 2)
337181254a7Smrg {
338181254a7Smrg i5 = ii + uisec - 1;
339181254a7Smrg for (i = ii; i <= i5; i += 2)
340181254a7Smrg {
341181254a7Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
342181254a7Smrg a[i + l * a_dim1];
343181254a7Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
344181254a7Smrg a[i + (l + 1) * a_dim1];
345181254a7Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
346181254a7Smrg a[i + 1 + l * a_dim1];
347181254a7Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
348181254a7Smrg a[i + 1 + (l + 1) * a_dim1];
349181254a7Smrg }
350181254a7Smrg if (uisec < isec)
351181254a7Smrg {
352181254a7Smrg t1[l - ll + 1 + (isec << 8) - 257] =
353181254a7Smrg a[ii + isec - 1 + l * a_dim1];
354181254a7Smrg t1[l - ll + 2 + (isec << 8) - 257] =
355181254a7Smrg a[ii + isec - 1 + (l + 1) * a_dim1];
356181254a7Smrg }
357181254a7Smrg }
358181254a7Smrg if (ulsec < lsec)
359181254a7Smrg {
360181254a7Smrg i4 = ii + isec - 1;
361181254a7Smrg for (i = ii; i<= i4; ++i)
362181254a7Smrg {
363181254a7Smrg t1[lsec + ((i - ii + 1) << 8) - 257] =
364181254a7Smrg a[i + (ll + lsec - 1) * a_dim1];
365181254a7Smrg }
366181254a7Smrg }
367181254a7Smrg
368181254a7Smrg uisec = isec - isec % 4;
369181254a7Smrg i4 = jj + ujsec - 1;
370181254a7Smrg for (j = jj; j <= i4; j += 4)
371181254a7Smrg {
372181254a7Smrg i5 = ii + uisec - 1;
373181254a7Smrg for (i = ii; i <= i5; i += 4)
374181254a7Smrg {
375181254a7Smrg f11 = c[i + j * c_dim1];
376181254a7Smrg f21 = c[i + 1 + j * c_dim1];
377181254a7Smrg f12 = c[i + (j + 1) * c_dim1];
378181254a7Smrg f22 = c[i + 1 + (j + 1) * c_dim1];
379181254a7Smrg f13 = c[i + (j + 2) * c_dim1];
380181254a7Smrg f23 = c[i + 1 + (j + 2) * c_dim1];
381181254a7Smrg f14 = c[i + (j + 3) * c_dim1];
382181254a7Smrg f24 = c[i + 1 + (j + 3) * c_dim1];
383181254a7Smrg f31 = c[i + 2 + j * c_dim1];
384181254a7Smrg f41 = c[i + 3 + j * c_dim1];
385181254a7Smrg f32 = c[i + 2 + (j + 1) * c_dim1];
386181254a7Smrg f42 = c[i + 3 + (j + 1) * c_dim1];
387181254a7Smrg f33 = c[i + 2 + (j + 2) * c_dim1];
388181254a7Smrg f43 = c[i + 3 + (j + 2) * c_dim1];
389181254a7Smrg f34 = c[i + 2 + (j + 3) * c_dim1];
390181254a7Smrg f44 = c[i + 3 + (j + 3) * c_dim1];
391181254a7Smrg i6 = ll + lsec - 1;
392181254a7Smrg for (l = ll; l <= i6; ++l)
393181254a7Smrg {
394181254a7Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
395181254a7Smrg * b[l + j * b_dim1];
396181254a7Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
397181254a7Smrg * b[l + j * b_dim1];
398181254a7Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
399181254a7Smrg * b[l + (j + 1) * b_dim1];
400181254a7Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
401181254a7Smrg * b[l + (j + 1) * b_dim1];
402181254a7Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
403181254a7Smrg * b[l + (j + 2) * b_dim1];
404181254a7Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
405181254a7Smrg * b[l + (j + 2) * b_dim1];
406181254a7Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
407181254a7Smrg * b[l + (j + 3) * b_dim1];
408181254a7Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
409181254a7Smrg * b[l + (j + 3) * b_dim1];
410181254a7Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
411181254a7Smrg * b[l + j * b_dim1];
412181254a7Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
413181254a7Smrg * b[l + j * b_dim1];
414181254a7Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
415181254a7Smrg * b[l + (j + 1) * b_dim1];
416181254a7Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
417181254a7Smrg * b[l + (j + 1) * b_dim1];
418181254a7Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
419181254a7Smrg * b[l + (j + 2) * b_dim1];
420181254a7Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
421181254a7Smrg * b[l + (j + 2) * b_dim1];
422181254a7Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
423181254a7Smrg * b[l + (j + 3) * b_dim1];
424181254a7Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
425181254a7Smrg * b[l + (j + 3) * b_dim1];
426181254a7Smrg }
427181254a7Smrg c[i + j * c_dim1] = f11;
428181254a7Smrg c[i + 1 + j * c_dim1] = f21;
429181254a7Smrg c[i + (j + 1) * c_dim1] = f12;
430181254a7Smrg c[i + 1 + (j + 1) * c_dim1] = f22;
431181254a7Smrg c[i + (j + 2) * c_dim1] = f13;
432181254a7Smrg c[i + 1 + (j + 2) * c_dim1] = f23;
433181254a7Smrg c[i + (j + 3) * c_dim1] = f14;
434181254a7Smrg c[i + 1 + (j + 3) * c_dim1] = f24;
435181254a7Smrg c[i + 2 + j * c_dim1] = f31;
436181254a7Smrg c[i + 3 + j * c_dim1] = f41;
437181254a7Smrg c[i + 2 + (j + 1) * c_dim1] = f32;
438181254a7Smrg c[i + 3 + (j + 1) * c_dim1] = f42;
439181254a7Smrg c[i + 2 + (j + 2) * c_dim1] = f33;
440181254a7Smrg c[i + 3 + (j + 2) * c_dim1] = f43;
441181254a7Smrg c[i + 2 + (j + 3) * c_dim1] = f34;
442181254a7Smrg c[i + 3 + (j + 3) * c_dim1] = f44;
443181254a7Smrg }
444181254a7Smrg if (uisec < isec)
445181254a7Smrg {
446181254a7Smrg i5 = ii + isec - 1;
447181254a7Smrg for (i = ii + uisec; i <= i5; ++i)
448181254a7Smrg {
449181254a7Smrg f11 = c[i + j * c_dim1];
450181254a7Smrg f12 = c[i + (j + 1) * c_dim1];
451181254a7Smrg f13 = c[i + (j + 2) * c_dim1];
452181254a7Smrg f14 = c[i + (j + 3) * c_dim1];
453181254a7Smrg i6 = ll + lsec - 1;
454181254a7Smrg for (l = ll; l <= i6; ++l)
455181254a7Smrg {
456181254a7Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
457181254a7Smrg 257] * b[l + j * b_dim1];
458181254a7Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
459181254a7Smrg 257] * b[l + (j + 1) * b_dim1];
460181254a7Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
461181254a7Smrg 257] * b[l + (j + 2) * b_dim1];
462181254a7Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
463181254a7Smrg 257] * b[l + (j + 3) * b_dim1];
464181254a7Smrg }
465181254a7Smrg c[i + j * c_dim1] = f11;
466181254a7Smrg c[i + (j + 1) * c_dim1] = f12;
467181254a7Smrg c[i + (j + 2) * c_dim1] = f13;
468181254a7Smrg c[i + (j + 3) * c_dim1] = f14;
469181254a7Smrg }
470181254a7Smrg }
471181254a7Smrg }
472181254a7Smrg if (ujsec < jsec)
473181254a7Smrg {
474181254a7Smrg i4 = jj + jsec - 1;
475181254a7Smrg for (j = jj + ujsec; j <= i4; ++j)
476181254a7Smrg {
477181254a7Smrg i5 = ii + uisec - 1;
478181254a7Smrg for (i = ii; i <= i5; i += 4)
479181254a7Smrg {
480181254a7Smrg f11 = c[i + j * c_dim1];
481181254a7Smrg f21 = c[i + 1 + j * c_dim1];
482181254a7Smrg f31 = c[i + 2 + j * c_dim1];
483181254a7Smrg f41 = c[i + 3 + j * c_dim1];
484181254a7Smrg i6 = ll + lsec - 1;
485181254a7Smrg for (l = ll; l <= i6; ++l)
486181254a7Smrg {
487181254a7Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
488181254a7Smrg 257] * b[l + j * b_dim1];
489181254a7Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
490181254a7Smrg 257] * b[l + j * b_dim1];
491181254a7Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
492181254a7Smrg 257] * b[l + j * b_dim1];
493181254a7Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
494181254a7Smrg 257] * b[l + j * b_dim1];
495181254a7Smrg }
496181254a7Smrg c[i + j * c_dim1] = f11;
497181254a7Smrg c[i + 1 + j * c_dim1] = f21;
498181254a7Smrg c[i + 2 + j * c_dim1] = f31;
499181254a7Smrg c[i + 3 + j * c_dim1] = f41;
500181254a7Smrg }
501181254a7Smrg i5 = ii + isec - 1;
502181254a7Smrg for (i = ii + uisec; i <= i5; ++i)
503181254a7Smrg {
504181254a7Smrg f11 = c[i + j * c_dim1];
505181254a7Smrg i6 = ll + lsec - 1;
506181254a7Smrg for (l = ll; l <= i6; ++l)
507181254a7Smrg {
508181254a7Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
509181254a7Smrg 257] * b[l + j * b_dim1];
510181254a7Smrg }
511181254a7Smrg c[i + j * c_dim1] = f11;
512181254a7Smrg }
513181254a7Smrg }
514181254a7Smrg }
515181254a7Smrg }
516181254a7Smrg }
517181254a7Smrg }
518181254a7Smrg free(t1);
519181254a7Smrg return;
520181254a7Smrg }
521181254a7Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1)
522181254a7Smrg {
523181254a7Smrg if (GFC_DESCRIPTOR_RANK (a) != 1)
524181254a7Smrg {
525181254a7Smrg const GFC_INTEGER_2 *restrict abase_x;
526181254a7Smrg const GFC_INTEGER_2 *restrict bbase_y;
527181254a7Smrg GFC_INTEGER_2 *restrict dest_y;
528181254a7Smrg GFC_INTEGER_2 s;
529181254a7Smrg
530181254a7Smrg for (y = 0; y < ycount; y++)
531181254a7Smrg {
532181254a7Smrg bbase_y = &bbase[y*bystride];
533181254a7Smrg dest_y = &dest[y*rystride];
534181254a7Smrg for (x = 0; x < xcount; x++)
535181254a7Smrg {
536181254a7Smrg abase_x = &abase[x*axstride];
537181254a7Smrg s = (GFC_INTEGER_2) 0;
538181254a7Smrg for (n = 0; n < count; n++)
539181254a7Smrg s += abase_x[n] * bbase_y[n];
540181254a7Smrg dest_y[x] = s;
541181254a7Smrg }
542181254a7Smrg }
543181254a7Smrg }
544181254a7Smrg else
545181254a7Smrg {
546181254a7Smrg const GFC_INTEGER_2 *restrict bbase_y;
547181254a7Smrg GFC_INTEGER_2 s;
548181254a7Smrg
549181254a7Smrg for (y = 0; y < ycount; y++)
550181254a7Smrg {
551181254a7Smrg bbase_y = &bbase[y*bystride];
552181254a7Smrg s = (GFC_INTEGER_2) 0;
553181254a7Smrg for (n = 0; n < count; n++)
554181254a7Smrg s += abase[n*axstride] * bbase_y[n];
555181254a7Smrg dest[y*rystride] = s;
556181254a7Smrg }
557181254a7Smrg }
558181254a7Smrg }
559181254a7Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1)
560181254a7Smrg {
561181254a7Smrg const GFC_INTEGER_2 *restrict bbase_y;
562181254a7Smrg GFC_INTEGER_2 s;
563181254a7Smrg
564181254a7Smrg for (y = 0; y < ycount; y++)
565181254a7Smrg {
566181254a7Smrg bbase_y = &bbase[y*bystride];
567181254a7Smrg s = (GFC_INTEGER_2) 0;
568181254a7Smrg for (n = 0; n < count; n++)
569181254a7Smrg s += abase[n*axstride] * bbase_y[n*bxstride];
570181254a7Smrg dest[y*rxstride] = s;
571181254a7Smrg }
572181254a7Smrg }
573fb8a8121Smrg else if (axstride < aystride)
574fb8a8121Smrg {
575fb8a8121Smrg for (y = 0; y < ycount; y++)
576fb8a8121Smrg for (x = 0; x < xcount; x++)
577fb8a8121Smrg dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
578fb8a8121Smrg
579fb8a8121Smrg for (y = 0; y < ycount; y++)
580fb8a8121Smrg for (n = 0; n < count; n++)
581fb8a8121Smrg for (x = 0; x < xcount; x++)
582fb8a8121Smrg /* dest[x,y] += a[x,n] * b[n,y] */
583fb8a8121Smrg dest[x*rxstride + y*rystride] +=
584fb8a8121Smrg abase[x*axstride + n*aystride] *
585fb8a8121Smrg bbase[n*bxstride + y*bystride];
586fb8a8121Smrg }
587181254a7Smrg else
588181254a7Smrg {
589181254a7Smrg const GFC_INTEGER_2 *restrict abase_x;
590181254a7Smrg const GFC_INTEGER_2 *restrict bbase_y;
591181254a7Smrg GFC_INTEGER_2 *restrict dest_y;
592181254a7Smrg GFC_INTEGER_2 s;
593181254a7Smrg
594181254a7Smrg for (y = 0; y < ycount; y++)
595181254a7Smrg {
596181254a7Smrg bbase_y = &bbase[y*bystride];
597181254a7Smrg dest_y = &dest[y*rystride];
598181254a7Smrg for (x = 0; x < xcount; x++)
599181254a7Smrg {
600181254a7Smrg abase_x = &abase[x*axstride];
601181254a7Smrg s = (GFC_INTEGER_2) 0;
602181254a7Smrg for (n = 0; n < count; n++)
603181254a7Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride];
604181254a7Smrg dest_y[x*rxstride] = s;
605181254a7Smrg }
606181254a7Smrg }
607181254a7Smrg }
608181254a7Smrg }
609181254a7Smrg #undef POW3
610181254a7Smrg #undef min
611181254a7Smrg #undef max
612181254a7Smrg
613181254a7Smrg #endif
614181254a7Smrg
615181254a7Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
616181254a7Smrg void
617181254a7Smrg matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
618181254a7Smrg gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
619181254a7Smrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
620181254a7Smrg internal_proto(matmul_i2_avx128_fma4);
621181254a7Smrg void
matmul_i2_avx128_fma4(gfc_array_i2 * const restrict retarray,gfc_array_i2 * const restrict a,gfc_array_i2 * const restrict b,int try_blas,int blas_limit,blas_call gemm)622181254a7Smrg matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
623181254a7Smrg gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
624181254a7Smrg int blas_limit, blas_call gemm)
625181254a7Smrg {
626181254a7Smrg const GFC_INTEGER_2 * restrict abase;
627181254a7Smrg const GFC_INTEGER_2 * restrict bbase;
628181254a7Smrg GFC_INTEGER_2 * restrict dest;
629181254a7Smrg
630181254a7Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
631181254a7Smrg index_type x, y, n, count, xcount, ycount;
632181254a7Smrg
633181254a7Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2
634181254a7Smrg || GFC_DESCRIPTOR_RANK (b) == 2);
635181254a7Smrg
636181254a7Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
637181254a7Smrg
638181254a7Smrg Either A or B (but not both) can be rank 1:
639181254a7Smrg
640181254a7Smrg o One-dimensional argument A is implicitly treated as a row matrix
641181254a7Smrg dimensioned [1,count], so xcount=1.
642181254a7Smrg
643181254a7Smrg o One-dimensional argument B is implicitly treated as a column matrix
644181254a7Smrg dimensioned [count, 1], so ycount=1.
645181254a7Smrg */
646181254a7Smrg
647181254a7Smrg if (retarray->base_addr == NULL)
648181254a7Smrg {
649181254a7Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
650181254a7Smrg {
651181254a7Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
652181254a7Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
653181254a7Smrg }
654181254a7Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
655181254a7Smrg {
656181254a7Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
657181254a7Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
658181254a7Smrg }
659181254a7Smrg else
660181254a7Smrg {
661181254a7Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
662181254a7Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
663181254a7Smrg
664181254a7Smrg GFC_DIMENSION_SET(retarray->dim[1], 0,
665181254a7Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1,
666181254a7Smrg GFC_DESCRIPTOR_EXTENT(retarray,0));
667181254a7Smrg }
668181254a7Smrg
669181254a7Smrg retarray->base_addr
670181254a7Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
671181254a7Smrg retarray->offset = 0;
672181254a7Smrg }
673181254a7Smrg else if (unlikely (compile_options.bounds_check))
674181254a7Smrg {
675181254a7Smrg index_type ret_extent, arg_extent;
676181254a7Smrg
677181254a7Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
678181254a7Smrg {
679181254a7Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
680181254a7Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
681181254a7Smrg if (arg_extent != ret_extent)
682181254a7Smrg runtime_error ("Array bound mismatch for dimension 1 of "
683181254a7Smrg "array (%ld/%ld) ",
684181254a7Smrg (long int) ret_extent, (long int) arg_extent);
685181254a7Smrg }
686181254a7Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
687181254a7Smrg {
688181254a7Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
689181254a7Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
690181254a7Smrg if (arg_extent != ret_extent)
691181254a7Smrg runtime_error ("Array bound mismatch for dimension 1 of "
692181254a7Smrg "array (%ld/%ld) ",
693181254a7Smrg (long int) ret_extent, (long int) arg_extent);
694181254a7Smrg }
695181254a7Smrg else
696181254a7Smrg {
697181254a7Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
698181254a7Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
699181254a7Smrg if (arg_extent != ret_extent)
700181254a7Smrg runtime_error ("Array bound mismatch for dimension 1 of "
701181254a7Smrg "array (%ld/%ld) ",
702181254a7Smrg (long int) ret_extent, (long int) arg_extent);
703181254a7Smrg
704181254a7Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
705181254a7Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
706181254a7Smrg if (arg_extent != ret_extent)
707181254a7Smrg runtime_error ("Array bound mismatch for dimension 2 of "
708181254a7Smrg "array (%ld/%ld) ",
709181254a7Smrg (long int) ret_extent, (long int) arg_extent);
710181254a7Smrg }
711181254a7Smrg }
712181254a7Smrg
713181254a7Smrg
714181254a7Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1)
715181254a7Smrg {
716181254a7Smrg /* One-dimensional result may be addressed in the code below
717181254a7Smrg either as a row or a column matrix. We want both cases to
718181254a7Smrg work. */
719181254a7Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
720181254a7Smrg }
721181254a7Smrg else
722181254a7Smrg {
723181254a7Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
724181254a7Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
725181254a7Smrg }
726181254a7Smrg
727181254a7Smrg
728181254a7Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
729181254a7Smrg {
730181254a7Smrg /* Treat it as a a row matrix A[1,count]. */
731181254a7Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
732181254a7Smrg aystride = 1;
733181254a7Smrg
734181254a7Smrg xcount = 1;
735181254a7Smrg count = GFC_DESCRIPTOR_EXTENT(a,0);
736181254a7Smrg }
737181254a7Smrg else
738181254a7Smrg {
739181254a7Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
740181254a7Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1);
741181254a7Smrg
742181254a7Smrg count = GFC_DESCRIPTOR_EXTENT(a,1);
743181254a7Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0);
744181254a7Smrg }
745181254a7Smrg
746181254a7Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0))
747181254a7Smrg {
748181254a7Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
749181254a7Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
750181254a7Smrg "in dimension 1: is %ld, should be %ld",
751181254a7Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
752181254a7Smrg }
753181254a7Smrg
754181254a7Smrg if (GFC_DESCRIPTOR_RANK (b) == 1)
755181254a7Smrg {
756181254a7Smrg /* Treat it as a column matrix B[count,1] */
757181254a7Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
758181254a7Smrg
759181254a7Smrg /* bystride should never be used for 1-dimensional b.
760181254a7Smrg The value is only used for calculation of the
761181254a7Smrg memory by the buffer. */
762181254a7Smrg bystride = 256;
763181254a7Smrg ycount = 1;
764181254a7Smrg }
765181254a7Smrg else
766181254a7Smrg {
767181254a7Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
768181254a7Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1);
769181254a7Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1);
770181254a7Smrg }
771181254a7Smrg
772181254a7Smrg abase = a->base_addr;
773181254a7Smrg bbase = b->base_addr;
774181254a7Smrg dest = retarray->base_addr;
775181254a7Smrg
776181254a7Smrg /* Now that everything is set up, we perform the multiplication
777181254a7Smrg itself. */
778181254a7Smrg
779181254a7Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
780181254a7Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
781181254a7Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
782181254a7Smrg
783181254a7Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
784181254a7Smrg && (bxstride == 1 || bystride == 1)
785181254a7Smrg && (((float) xcount) * ((float) ycount) * ((float) count)
786181254a7Smrg > POW3(blas_limit)))
787181254a7Smrg {
788181254a7Smrg const int m = xcount, n = ycount, k = count, ldc = rystride;
789181254a7Smrg const GFC_INTEGER_2 one = 1, zero = 0;
790181254a7Smrg const int lda = (axstride == 1) ? aystride : axstride,
791181254a7Smrg ldb = (bxstride == 1) ? bystride : bxstride;
792181254a7Smrg
793181254a7Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
794181254a7Smrg {
795181254a7Smrg assert (gemm != NULL);
796181254a7Smrg const char *transa, *transb;
797181254a7Smrg if (try_blas & 2)
798181254a7Smrg transa = "C";
799181254a7Smrg else
800181254a7Smrg transa = axstride == 1 ? "N" : "T";
801181254a7Smrg
802181254a7Smrg if (try_blas & 4)
803181254a7Smrg transb = "C";
804181254a7Smrg else
805181254a7Smrg transb = bxstride == 1 ? "N" : "T";
806181254a7Smrg
807181254a7Smrg gemm (transa, transb , &m,
808181254a7Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
809181254a7Smrg &ldc, 1, 1);
810181254a7Smrg return;
811181254a7Smrg }
812181254a7Smrg }
813181254a7Smrg
814fb8a8121Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1
815fb8a8121Smrg && GFC_DESCRIPTOR_RANK (b) != 1)
816181254a7Smrg {
817181254a7Smrg /* This block of code implements a tuned matmul, derived from
818181254a7Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1
819181254a7Smrg
820181254a7Smrg Bo Kagstrom and Per Ling
821181254a7Smrg Department of Computing Science
822181254a7Smrg Umea University
823181254a7Smrg S-901 87 Umea, Sweden
824181254a7Smrg
825181254a7Smrg from netlib.org, translated to C, and modified for matmul.m4. */
826181254a7Smrg
827181254a7Smrg const GFC_INTEGER_2 *a, *b;
828181254a7Smrg GFC_INTEGER_2 *c;
829181254a7Smrg const index_type m = xcount, n = ycount, k = count;
830181254a7Smrg
831181254a7Smrg /* System generated locals */
832181254a7Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
833181254a7Smrg i1, i2, i3, i4, i5, i6;
834181254a7Smrg
835181254a7Smrg /* Local variables */
836181254a7Smrg GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
837181254a7Smrg f13, f14, f23, f24, f33, f34, f43, f44;
838181254a7Smrg index_type i, j, l, ii, jj, ll;
839181254a7Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec;
840181254a7Smrg GFC_INTEGER_2 *t1;
841181254a7Smrg
842181254a7Smrg a = abase;
843181254a7Smrg b = bbase;
844181254a7Smrg c = retarray->base_addr;
845181254a7Smrg
846181254a7Smrg /* Parameter adjustments */
847181254a7Smrg c_dim1 = rystride;
848181254a7Smrg c_offset = 1 + c_dim1;
849181254a7Smrg c -= c_offset;
850181254a7Smrg a_dim1 = aystride;
851181254a7Smrg a_offset = 1 + a_dim1;
852181254a7Smrg a -= a_offset;
853181254a7Smrg b_dim1 = bystride;
854181254a7Smrg b_offset = 1 + b_dim1;
855181254a7Smrg b -= b_offset;
856181254a7Smrg
857181254a7Smrg /* Empty c first. */
858181254a7Smrg for (j=1; j<=n; j++)
859181254a7Smrg for (i=1; i<=m; i++)
860181254a7Smrg c[i + j * c_dim1] = (GFC_INTEGER_2)0;
861181254a7Smrg
862181254a7Smrg /* Early exit if possible */
863181254a7Smrg if (m == 0 || n == 0 || k == 0)
864181254a7Smrg return;
865181254a7Smrg
866181254a7Smrg /* Adjust size of t1 to what is needed. */
867181254a7Smrg index_type t1_dim, a_sz;
868181254a7Smrg if (aystride == 1)
869181254a7Smrg a_sz = rystride;
870181254a7Smrg else
871181254a7Smrg a_sz = a_dim1;
872181254a7Smrg
873181254a7Smrg t1_dim = a_sz * 256 + b_dim1;
874181254a7Smrg if (t1_dim > 65536)
875181254a7Smrg t1_dim = 65536;
876181254a7Smrg
877181254a7Smrg t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
878181254a7Smrg
879181254a7Smrg /* Start turning the crank. */
880181254a7Smrg i1 = n;
881181254a7Smrg for (jj = 1; jj <= i1; jj += 512)
882181254a7Smrg {
883181254a7Smrg /* Computing MIN */
884181254a7Smrg i2 = 512;
885181254a7Smrg i3 = n - jj + 1;
886181254a7Smrg jsec = min(i2,i3);
887181254a7Smrg ujsec = jsec - jsec % 4;
888181254a7Smrg i2 = k;
889181254a7Smrg for (ll = 1; ll <= i2; ll += 256)
890181254a7Smrg {
891181254a7Smrg /* Computing MIN */
892181254a7Smrg i3 = 256;
893181254a7Smrg i4 = k - ll + 1;
894181254a7Smrg lsec = min(i3,i4);
895181254a7Smrg ulsec = lsec - lsec % 2;
896181254a7Smrg
897181254a7Smrg i3 = m;
898181254a7Smrg for (ii = 1; ii <= i3; ii += 256)
899181254a7Smrg {
900181254a7Smrg /* Computing MIN */
901181254a7Smrg i4 = 256;
902181254a7Smrg i5 = m - ii + 1;
903181254a7Smrg isec = min(i4,i5);
904181254a7Smrg uisec = isec - isec % 2;
905181254a7Smrg i4 = ll + ulsec - 1;
906181254a7Smrg for (l = ll; l <= i4; l += 2)
907181254a7Smrg {
908181254a7Smrg i5 = ii + uisec - 1;
909181254a7Smrg for (i = ii; i <= i5; i += 2)
910181254a7Smrg {
911181254a7Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
912181254a7Smrg a[i + l * a_dim1];
913181254a7Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
914181254a7Smrg a[i + (l + 1) * a_dim1];
915181254a7Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
916181254a7Smrg a[i + 1 + l * a_dim1];
917181254a7Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
918181254a7Smrg a[i + 1 + (l + 1) * a_dim1];
919181254a7Smrg }
920181254a7Smrg if (uisec < isec)
921181254a7Smrg {
922181254a7Smrg t1[l - ll + 1 + (isec << 8) - 257] =
923181254a7Smrg a[ii + isec - 1 + l * a_dim1];
924181254a7Smrg t1[l - ll + 2 + (isec << 8) - 257] =
925181254a7Smrg a[ii + isec - 1 + (l + 1) * a_dim1];
926181254a7Smrg }
927181254a7Smrg }
928181254a7Smrg if (ulsec < lsec)
929181254a7Smrg {
930181254a7Smrg i4 = ii + isec - 1;
931181254a7Smrg for (i = ii; i<= i4; ++i)
932181254a7Smrg {
933181254a7Smrg t1[lsec + ((i - ii + 1) << 8) - 257] =
934181254a7Smrg a[i + (ll + lsec - 1) * a_dim1];
935181254a7Smrg }
936181254a7Smrg }
937181254a7Smrg
938181254a7Smrg uisec = isec - isec % 4;
939181254a7Smrg i4 = jj + ujsec - 1;
940181254a7Smrg for (j = jj; j <= i4; j += 4)
941181254a7Smrg {
942181254a7Smrg i5 = ii + uisec - 1;
943181254a7Smrg for (i = ii; i <= i5; i += 4)
944181254a7Smrg {
945181254a7Smrg f11 = c[i + j * c_dim1];
946181254a7Smrg f21 = c[i + 1 + j * c_dim1];
947181254a7Smrg f12 = c[i + (j + 1) * c_dim1];
948181254a7Smrg f22 = c[i + 1 + (j + 1) * c_dim1];
949181254a7Smrg f13 = c[i + (j + 2) * c_dim1];
950181254a7Smrg f23 = c[i + 1 + (j + 2) * c_dim1];
951181254a7Smrg f14 = c[i + (j + 3) * c_dim1];
952181254a7Smrg f24 = c[i + 1 + (j + 3) * c_dim1];
953181254a7Smrg f31 = c[i + 2 + j * c_dim1];
954181254a7Smrg f41 = c[i + 3 + j * c_dim1];
955181254a7Smrg f32 = c[i + 2 + (j + 1) * c_dim1];
956181254a7Smrg f42 = c[i + 3 + (j + 1) * c_dim1];
957181254a7Smrg f33 = c[i + 2 + (j + 2) * c_dim1];
958181254a7Smrg f43 = c[i + 3 + (j + 2) * c_dim1];
959181254a7Smrg f34 = c[i + 2 + (j + 3) * c_dim1];
960181254a7Smrg f44 = c[i + 3 + (j + 3) * c_dim1];
961181254a7Smrg i6 = ll + lsec - 1;
962181254a7Smrg for (l = ll; l <= i6; ++l)
963181254a7Smrg {
964181254a7Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
965181254a7Smrg * b[l + j * b_dim1];
966181254a7Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
967181254a7Smrg * b[l + j * b_dim1];
968181254a7Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
969181254a7Smrg * b[l + (j + 1) * b_dim1];
970181254a7Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
971181254a7Smrg * b[l + (j + 1) * b_dim1];
972181254a7Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
973181254a7Smrg * b[l + (j + 2) * b_dim1];
974181254a7Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
975181254a7Smrg * b[l + (j + 2) * b_dim1];
976181254a7Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
977181254a7Smrg * b[l + (j + 3) * b_dim1];
978181254a7Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
979181254a7Smrg * b[l + (j + 3) * b_dim1];
980181254a7Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
981181254a7Smrg * b[l + j * b_dim1];
982181254a7Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
983181254a7Smrg * b[l + j * b_dim1];
984181254a7Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
985181254a7Smrg * b[l + (j + 1) * b_dim1];
986181254a7Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
987181254a7Smrg * b[l + (j + 1) * b_dim1];
988181254a7Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
989181254a7Smrg * b[l + (j + 2) * b_dim1];
990181254a7Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
991181254a7Smrg * b[l + (j + 2) * b_dim1];
992181254a7Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
993181254a7Smrg * b[l + (j + 3) * b_dim1];
994181254a7Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
995181254a7Smrg * b[l + (j + 3) * b_dim1];
996181254a7Smrg }
997181254a7Smrg c[i + j * c_dim1] = f11;
998181254a7Smrg c[i + 1 + j * c_dim1] = f21;
999181254a7Smrg c[i + (j + 1) * c_dim1] = f12;
1000181254a7Smrg c[i + 1 + (j + 1) * c_dim1] = f22;
1001181254a7Smrg c[i + (j + 2) * c_dim1] = f13;
1002181254a7Smrg c[i + 1 + (j + 2) * c_dim1] = f23;
1003181254a7Smrg c[i + (j + 3) * c_dim1] = f14;
1004181254a7Smrg c[i + 1 + (j + 3) * c_dim1] = f24;
1005181254a7Smrg c[i + 2 + j * c_dim1] = f31;
1006181254a7Smrg c[i + 3 + j * c_dim1] = f41;
1007181254a7Smrg c[i + 2 + (j + 1) * c_dim1] = f32;
1008181254a7Smrg c[i + 3 + (j + 1) * c_dim1] = f42;
1009181254a7Smrg c[i + 2 + (j + 2) * c_dim1] = f33;
1010181254a7Smrg c[i + 3 + (j + 2) * c_dim1] = f43;
1011181254a7Smrg c[i + 2 + (j + 3) * c_dim1] = f34;
1012181254a7Smrg c[i + 3 + (j + 3) * c_dim1] = f44;
1013181254a7Smrg }
1014181254a7Smrg if (uisec < isec)
1015181254a7Smrg {
1016181254a7Smrg i5 = ii + isec - 1;
1017181254a7Smrg for (i = ii + uisec; i <= i5; ++i)
1018181254a7Smrg {
1019181254a7Smrg f11 = c[i + j * c_dim1];
1020181254a7Smrg f12 = c[i + (j + 1) * c_dim1];
1021181254a7Smrg f13 = c[i + (j + 2) * c_dim1];
1022181254a7Smrg f14 = c[i + (j + 3) * c_dim1];
1023181254a7Smrg i6 = ll + lsec - 1;
1024181254a7Smrg for (l = ll; l <= i6; ++l)
1025181254a7Smrg {
1026181254a7Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027181254a7Smrg 257] * b[l + j * b_dim1];
1028181254a7Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029181254a7Smrg 257] * b[l + (j + 1) * b_dim1];
1030181254a7Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031181254a7Smrg 257] * b[l + (j + 2) * b_dim1];
1032181254a7Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033181254a7Smrg 257] * b[l + (j + 3) * b_dim1];
1034181254a7Smrg }
1035181254a7Smrg c[i + j * c_dim1] = f11;
1036181254a7Smrg c[i + (j + 1) * c_dim1] = f12;
1037181254a7Smrg c[i + (j + 2) * c_dim1] = f13;
1038181254a7Smrg c[i + (j + 3) * c_dim1] = f14;
1039181254a7Smrg }
1040181254a7Smrg }
1041181254a7Smrg }
1042181254a7Smrg if (ujsec < jsec)
1043181254a7Smrg {
1044181254a7Smrg i4 = jj + jsec - 1;
1045181254a7Smrg for (j = jj + ujsec; j <= i4; ++j)
1046181254a7Smrg {
1047181254a7Smrg i5 = ii + uisec - 1;
1048181254a7Smrg for (i = ii; i <= i5; i += 4)
1049181254a7Smrg {
1050181254a7Smrg f11 = c[i + j * c_dim1];
1051181254a7Smrg f21 = c[i + 1 + j * c_dim1];
1052181254a7Smrg f31 = c[i + 2 + j * c_dim1];
1053181254a7Smrg f41 = c[i + 3 + j * c_dim1];
1054181254a7Smrg i6 = ll + lsec - 1;
1055181254a7Smrg for (l = ll; l <= i6; ++l)
1056181254a7Smrg {
1057181254a7Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058181254a7Smrg 257] * b[l + j * b_dim1];
1059181254a7Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060181254a7Smrg 257] * b[l + j * b_dim1];
1061181254a7Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062181254a7Smrg 257] * b[l + j * b_dim1];
1063181254a7Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064181254a7Smrg 257] * b[l + j * b_dim1];
1065181254a7Smrg }
1066181254a7Smrg c[i + j * c_dim1] = f11;
1067181254a7Smrg c[i + 1 + j * c_dim1] = f21;
1068181254a7Smrg c[i + 2 + j * c_dim1] = f31;
1069181254a7Smrg c[i + 3 + j * c_dim1] = f41;
1070181254a7Smrg }
1071181254a7Smrg i5 = ii + isec - 1;
1072181254a7Smrg for (i = ii + uisec; i <= i5; ++i)
1073181254a7Smrg {
1074181254a7Smrg f11 = c[i + j * c_dim1];
1075181254a7Smrg i6 = ll + lsec - 1;
1076181254a7Smrg for (l = ll; l <= i6; ++l)
1077181254a7Smrg {
1078181254a7Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079181254a7Smrg 257] * b[l + j * b_dim1];
1080181254a7Smrg }
1081181254a7Smrg c[i + j * c_dim1] = f11;
1082181254a7Smrg }
1083181254a7Smrg }
1084181254a7Smrg }
1085181254a7Smrg }
1086181254a7Smrg }
1087181254a7Smrg }
1088181254a7Smrg free(t1);
1089181254a7Smrg return;
1090181254a7Smrg }
1091181254a7Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092181254a7Smrg {
1093181254a7Smrg if (GFC_DESCRIPTOR_RANK (a) != 1)
1094181254a7Smrg {
1095181254a7Smrg const GFC_INTEGER_2 *restrict abase_x;
1096181254a7Smrg const GFC_INTEGER_2 *restrict bbase_y;
1097181254a7Smrg GFC_INTEGER_2 *restrict dest_y;
1098181254a7Smrg GFC_INTEGER_2 s;
1099181254a7Smrg
1100181254a7Smrg for (y = 0; y < ycount; y++)
1101181254a7Smrg {
1102181254a7Smrg bbase_y = &bbase[y*bystride];
1103181254a7Smrg dest_y = &dest[y*rystride];
1104181254a7Smrg for (x = 0; x < xcount; x++)
1105181254a7Smrg {
1106181254a7Smrg abase_x = &abase[x*axstride];
1107181254a7Smrg s = (GFC_INTEGER_2) 0;
1108181254a7Smrg for (n = 0; n < count; n++)
1109181254a7Smrg s += abase_x[n] * bbase_y[n];
1110181254a7Smrg dest_y[x] = s;
1111181254a7Smrg }
1112181254a7Smrg }
1113181254a7Smrg }
1114181254a7Smrg else
1115181254a7Smrg {
1116181254a7Smrg const GFC_INTEGER_2 *restrict bbase_y;
1117181254a7Smrg GFC_INTEGER_2 s;
1118181254a7Smrg
1119181254a7Smrg for (y = 0; y < ycount; y++)
1120181254a7Smrg {
1121181254a7Smrg bbase_y = &bbase[y*bystride];
1122181254a7Smrg s = (GFC_INTEGER_2) 0;
1123181254a7Smrg for (n = 0; n < count; n++)
1124181254a7Smrg s += abase[n*axstride] * bbase_y[n];
1125181254a7Smrg dest[y*rystride] = s;
1126181254a7Smrg }
1127181254a7Smrg }
1128181254a7Smrg }
1129181254a7Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1)
1130181254a7Smrg {
1131181254a7Smrg const GFC_INTEGER_2 *restrict bbase_y;
1132181254a7Smrg GFC_INTEGER_2 s;
1133181254a7Smrg
1134181254a7Smrg for (y = 0; y < ycount; y++)
1135181254a7Smrg {
1136181254a7Smrg bbase_y = &bbase[y*bystride];
1137181254a7Smrg s = (GFC_INTEGER_2) 0;
1138181254a7Smrg for (n = 0; n < count; n++)
1139181254a7Smrg s += abase[n*axstride] * bbase_y[n*bxstride];
1140181254a7Smrg dest[y*rxstride] = s;
1141181254a7Smrg }
1142181254a7Smrg }
1143fb8a8121Smrg else if (axstride < aystride)
1144fb8a8121Smrg {
1145fb8a8121Smrg for (y = 0; y < ycount; y++)
1146fb8a8121Smrg for (x = 0; x < xcount; x++)
1147fb8a8121Smrg dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1148fb8a8121Smrg
1149fb8a8121Smrg for (y = 0; y < ycount; y++)
1150fb8a8121Smrg for (n = 0; n < count; n++)
1151fb8a8121Smrg for (x = 0; x < xcount; x++)
1152fb8a8121Smrg /* dest[x,y] += a[x,n] * b[n,y] */
1153fb8a8121Smrg dest[x*rxstride + y*rystride] +=
1154fb8a8121Smrg abase[x*axstride + n*aystride] *
1155fb8a8121Smrg bbase[n*bxstride + y*bystride];
1156fb8a8121Smrg }
1157181254a7Smrg else
1158181254a7Smrg {
1159181254a7Smrg const GFC_INTEGER_2 *restrict abase_x;
1160181254a7Smrg const GFC_INTEGER_2 *restrict bbase_y;
1161181254a7Smrg GFC_INTEGER_2 *restrict dest_y;
1162181254a7Smrg GFC_INTEGER_2 s;
1163181254a7Smrg
1164181254a7Smrg for (y = 0; y < ycount; y++)
1165181254a7Smrg {
1166181254a7Smrg bbase_y = &bbase[y*bystride];
1167181254a7Smrg dest_y = &dest[y*rystride];
1168181254a7Smrg for (x = 0; x < xcount; x++)
1169181254a7Smrg {
1170181254a7Smrg abase_x = &abase[x*axstride];
1171181254a7Smrg s = (GFC_INTEGER_2) 0;
1172181254a7Smrg for (n = 0; n < count; n++)
1173181254a7Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174181254a7Smrg dest_y[x*rxstride] = s;
1175181254a7Smrg }
1176181254a7Smrg }
1177181254a7Smrg }
1178181254a7Smrg }
1179181254a7Smrg #undef POW3
1180181254a7Smrg #undef min
1181181254a7Smrg #undef max
1182181254a7Smrg
1183181254a7Smrg #endif
1184181254a7Smrg
1185181254a7Smrg #endif
1186181254a7Smrg
1187