1627f7eb2Smrg /* Implementation of the MATMUL intrinsic
2*4c3eb207Smrg Copyright (C) 2002-2020 Free Software Foundation, Inc.
3627f7eb2Smrg Contributed by Paul Brook <paul@nowt.org>
4627f7eb2Smrg
5627f7eb2Smrg This file is part of the GNU Fortran runtime library (libgfortran).
6627f7eb2Smrg
7627f7eb2Smrg Libgfortran is free software; you can redistribute it and/or
8627f7eb2Smrg modify it under the terms of the GNU General Public
9627f7eb2Smrg License as published by the Free Software Foundation; either
10627f7eb2Smrg version 3 of the License, or (at your option) any later version.
11627f7eb2Smrg
12627f7eb2Smrg Libgfortran is distributed in the hope that it will be useful,
13627f7eb2Smrg but WITHOUT ANY WARRANTY; without even the implied warranty of
14627f7eb2Smrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15627f7eb2Smrg GNU General Public License for more details.
16627f7eb2Smrg
17627f7eb2Smrg Under Section 7 of GPL version 3, you are granted additional
18627f7eb2Smrg permissions described in the GCC Runtime Library Exception, version
19627f7eb2Smrg 3.1, as published by the Free Software Foundation.
20627f7eb2Smrg
21627f7eb2Smrg You should have received a copy of the GNU General Public License and
22627f7eb2Smrg a copy of the GCC Runtime Library Exception along with this program;
23627f7eb2Smrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24627f7eb2Smrg <http://www.gnu.org/licenses/>. */
25627f7eb2Smrg
26627f7eb2Smrg #include "libgfortran.h"
27627f7eb2Smrg #include <string.h>
28627f7eb2Smrg #include <assert.h>
29627f7eb2Smrg
30627f7eb2Smrg
31627f7eb2Smrg #if defined (HAVE_GFC_COMPLEX_4)
32627f7eb2Smrg
33627f7eb2Smrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34627f7eb2Smrg passed to us by the front-end, in which case we call it for large
35627f7eb2Smrg matrices. */
36627f7eb2Smrg
37627f7eb2Smrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38627f7eb2Smrg const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
39627f7eb2Smrg const int *, const GFC_COMPLEX_4 *, const int *,
40627f7eb2Smrg const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
41627f7eb2Smrg int, int);
42627f7eb2Smrg
43627f7eb2Smrg /* The order of loops is different in the case of plain matrix
44627f7eb2Smrg multiplication C=MATMUL(A,B), and in the frequent special case where
45627f7eb2Smrg the argument A is the temporary result of a TRANSPOSE intrinsic:
46627f7eb2Smrg C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47627f7eb2Smrg looking at their strides.
48627f7eb2Smrg
49627f7eb2Smrg The equivalent Fortran pseudo-code is:
50627f7eb2Smrg
51627f7eb2Smrg DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52627f7eb2Smrg IF (.NOT.IS_TRANSPOSED(A)) THEN
53627f7eb2Smrg C = 0
54627f7eb2Smrg DO J=1,N
55627f7eb2Smrg DO K=1,COUNT
56627f7eb2Smrg DO I=1,M
57627f7eb2Smrg C(I,J) = C(I,J)+A(I,K)*B(K,J)
58627f7eb2Smrg ELSE
59627f7eb2Smrg DO J=1,N
60627f7eb2Smrg DO I=1,M
61627f7eb2Smrg S = 0
62627f7eb2Smrg DO K=1,COUNT
63627f7eb2Smrg S = S+A(I,K)*B(K,J)
64627f7eb2Smrg C(I,J) = S
65627f7eb2Smrg ENDIF
66627f7eb2Smrg */
67627f7eb2Smrg
68627f7eb2Smrg /* If try_blas is set to a nonzero value, then the matmul function will
69627f7eb2Smrg see if there is a way to perform the matrix multiplication by a call
70627f7eb2Smrg to the BLAS gemm function. */
71627f7eb2Smrg
72627f7eb2Smrg extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
73627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
74627f7eb2Smrg int blas_limit, blas_call gemm);
75627f7eb2Smrg export_proto(matmul_c4);
76627f7eb2Smrg
77627f7eb2Smrg /* Put exhaustive list of possible architectures here here, ORed together. */
78627f7eb2Smrg
79627f7eb2Smrg #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80627f7eb2Smrg
81627f7eb2Smrg #ifdef HAVE_AVX
82627f7eb2Smrg static void
83627f7eb2Smrg matmul_c4_avx (gfc_array_c4 * const restrict retarray,
84627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
85627f7eb2Smrg int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86627f7eb2Smrg static void
matmul_c4_avx(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)87627f7eb2Smrg matmul_c4_avx (gfc_array_c4 * const restrict retarray,
88627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
89627f7eb2Smrg int blas_limit, blas_call gemm)
90627f7eb2Smrg {
91627f7eb2Smrg const GFC_COMPLEX_4 * restrict abase;
92627f7eb2Smrg const GFC_COMPLEX_4 * restrict bbase;
93627f7eb2Smrg GFC_COMPLEX_4 * restrict dest;
94627f7eb2Smrg
95627f7eb2Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96627f7eb2Smrg index_type x, y, n, count, xcount, ycount;
97627f7eb2Smrg
98627f7eb2Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2
99627f7eb2Smrg || GFC_DESCRIPTOR_RANK (b) == 2);
100627f7eb2Smrg
101627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102627f7eb2Smrg
103627f7eb2Smrg Either A or B (but not both) can be rank 1:
104627f7eb2Smrg
105627f7eb2Smrg o One-dimensional argument A is implicitly treated as a row matrix
106627f7eb2Smrg dimensioned [1,count], so xcount=1.
107627f7eb2Smrg
108627f7eb2Smrg o One-dimensional argument B is implicitly treated as a column matrix
109627f7eb2Smrg dimensioned [count, 1], so ycount=1.
110627f7eb2Smrg */
111627f7eb2Smrg
112627f7eb2Smrg if (retarray->base_addr == NULL)
113627f7eb2Smrg {
114627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
115627f7eb2Smrg {
116627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
117627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118627f7eb2Smrg }
119627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
120627f7eb2Smrg {
121627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
122627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123627f7eb2Smrg }
124627f7eb2Smrg else
125627f7eb2Smrg {
126627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
127627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128627f7eb2Smrg
129627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[1], 0,
130627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(retarray,0));
132627f7eb2Smrg }
133627f7eb2Smrg
134627f7eb2Smrg retarray->base_addr
135627f7eb2Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
136627f7eb2Smrg retarray->offset = 0;
137627f7eb2Smrg }
138627f7eb2Smrg else if (unlikely (compile_options.bounds_check))
139627f7eb2Smrg {
140627f7eb2Smrg index_type ret_extent, arg_extent;
141627f7eb2Smrg
142627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
143627f7eb2Smrg {
144627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146627f7eb2Smrg if (arg_extent != ret_extent)
147627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
148627f7eb2Smrg "array (%ld/%ld) ",
149627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
150627f7eb2Smrg }
151627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
152627f7eb2Smrg {
153627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155627f7eb2Smrg if (arg_extent != ret_extent)
156627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
157627f7eb2Smrg "array (%ld/%ld) ",
158627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
159627f7eb2Smrg }
160627f7eb2Smrg else
161627f7eb2Smrg {
162627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164627f7eb2Smrg if (arg_extent != ret_extent)
165627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
166627f7eb2Smrg "array (%ld/%ld) ",
167627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
168627f7eb2Smrg
169627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171627f7eb2Smrg if (arg_extent != ret_extent)
172627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 2 of "
173627f7eb2Smrg "array (%ld/%ld) ",
174627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
175627f7eb2Smrg }
176627f7eb2Smrg }
177627f7eb2Smrg
178627f7eb2Smrg
179627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180627f7eb2Smrg {
181627f7eb2Smrg /* One-dimensional result may be addressed in the code below
182627f7eb2Smrg either as a row or a column matrix. We want both cases to
183627f7eb2Smrg work. */
184627f7eb2Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185627f7eb2Smrg }
186627f7eb2Smrg else
187627f7eb2Smrg {
188627f7eb2Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189627f7eb2Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190627f7eb2Smrg }
191627f7eb2Smrg
192627f7eb2Smrg
193627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
194627f7eb2Smrg {
195627f7eb2Smrg /* Treat it as a a row matrix A[1,count]. */
196627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197627f7eb2Smrg aystride = 1;
198627f7eb2Smrg
199627f7eb2Smrg xcount = 1;
200627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,0);
201627f7eb2Smrg }
202627f7eb2Smrg else
203627f7eb2Smrg {
204627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205627f7eb2Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206627f7eb2Smrg
207627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,1);
208627f7eb2Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209627f7eb2Smrg }
210627f7eb2Smrg
211627f7eb2Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212627f7eb2Smrg {
213627f7eb2Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214627f7eb2Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215627f7eb2Smrg "in dimension 1: is %ld, should be %ld",
216627f7eb2Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217627f7eb2Smrg }
218627f7eb2Smrg
219627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (b) == 1)
220627f7eb2Smrg {
221627f7eb2Smrg /* Treat it as a column matrix B[count,1] */
222627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223627f7eb2Smrg
224627f7eb2Smrg /* bystride should never be used for 1-dimensional b.
225627f7eb2Smrg The value is only used for calculation of the
226627f7eb2Smrg memory by the buffer. */
227627f7eb2Smrg bystride = 256;
228627f7eb2Smrg ycount = 1;
229627f7eb2Smrg }
230627f7eb2Smrg else
231627f7eb2Smrg {
232627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233627f7eb2Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234627f7eb2Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235627f7eb2Smrg }
236627f7eb2Smrg
237627f7eb2Smrg abase = a->base_addr;
238627f7eb2Smrg bbase = b->base_addr;
239627f7eb2Smrg dest = retarray->base_addr;
240627f7eb2Smrg
241627f7eb2Smrg /* Now that everything is set up, we perform the multiplication
242627f7eb2Smrg itself. */
243627f7eb2Smrg
244627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
246627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
247627f7eb2Smrg
248627f7eb2Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249627f7eb2Smrg && (bxstride == 1 || bystride == 1)
250627f7eb2Smrg && (((float) xcount) * ((float) ycount) * ((float) count)
251627f7eb2Smrg > POW3(blas_limit)))
252627f7eb2Smrg {
253627f7eb2Smrg const int m = xcount, n = ycount, k = count, ldc = rystride;
254627f7eb2Smrg const GFC_COMPLEX_4 one = 1, zero = 0;
255627f7eb2Smrg const int lda = (axstride == 1) ? aystride : axstride,
256627f7eb2Smrg ldb = (bxstride == 1) ? bystride : bxstride;
257627f7eb2Smrg
258627f7eb2Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259627f7eb2Smrg {
260627f7eb2Smrg assert (gemm != NULL);
261627f7eb2Smrg const char *transa, *transb;
262627f7eb2Smrg if (try_blas & 2)
263627f7eb2Smrg transa = "C";
264627f7eb2Smrg else
265627f7eb2Smrg transa = axstride == 1 ? "N" : "T";
266627f7eb2Smrg
267627f7eb2Smrg if (try_blas & 4)
268627f7eb2Smrg transb = "C";
269627f7eb2Smrg else
270627f7eb2Smrg transb = bxstride == 1 ? "N" : "T";
271627f7eb2Smrg
272627f7eb2Smrg gemm (transa, transb , &m,
273627f7eb2Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
274627f7eb2Smrg &ldc, 1, 1);
275627f7eb2Smrg return;
276627f7eb2Smrg }
277627f7eb2Smrg }
278627f7eb2Smrg
279*4c3eb207Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1
280*4c3eb207Smrg && GFC_DESCRIPTOR_RANK (b) != 1)
281627f7eb2Smrg {
282627f7eb2Smrg /* This block of code implements a tuned matmul, derived from
283627f7eb2Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1
284627f7eb2Smrg
285627f7eb2Smrg Bo Kagstrom and Per Ling
286627f7eb2Smrg Department of Computing Science
287627f7eb2Smrg Umea University
288627f7eb2Smrg S-901 87 Umea, Sweden
289627f7eb2Smrg
290627f7eb2Smrg from netlib.org, translated to C, and modified for matmul.m4. */
291627f7eb2Smrg
292627f7eb2Smrg const GFC_COMPLEX_4 *a, *b;
293627f7eb2Smrg GFC_COMPLEX_4 *c;
294627f7eb2Smrg const index_type m = xcount, n = ycount, k = count;
295627f7eb2Smrg
296627f7eb2Smrg /* System generated locals */
297627f7eb2Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
298627f7eb2Smrg i1, i2, i3, i4, i5, i6;
299627f7eb2Smrg
300627f7eb2Smrg /* Local variables */
301627f7eb2Smrg GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
302627f7eb2Smrg f13, f14, f23, f24, f33, f34, f43, f44;
303627f7eb2Smrg index_type i, j, l, ii, jj, ll;
304627f7eb2Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec;
305627f7eb2Smrg GFC_COMPLEX_4 *t1;
306627f7eb2Smrg
307627f7eb2Smrg a = abase;
308627f7eb2Smrg b = bbase;
309627f7eb2Smrg c = retarray->base_addr;
310627f7eb2Smrg
311627f7eb2Smrg /* Parameter adjustments */
312627f7eb2Smrg c_dim1 = rystride;
313627f7eb2Smrg c_offset = 1 + c_dim1;
314627f7eb2Smrg c -= c_offset;
315627f7eb2Smrg a_dim1 = aystride;
316627f7eb2Smrg a_offset = 1 + a_dim1;
317627f7eb2Smrg a -= a_offset;
318627f7eb2Smrg b_dim1 = bystride;
319627f7eb2Smrg b_offset = 1 + b_dim1;
320627f7eb2Smrg b -= b_offset;
321627f7eb2Smrg
322627f7eb2Smrg /* Empty c first. */
323627f7eb2Smrg for (j=1; j<=n; j++)
324627f7eb2Smrg for (i=1; i<=m; i++)
325627f7eb2Smrg c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
326627f7eb2Smrg
327627f7eb2Smrg /* Early exit if possible */
328627f7eb2Smrg if (m == 0 || n == 0 || k == 0)
329627f7eb2Smrg return;
330627f7eb2Smrg
331627f7eb2Smrg /* Adjust size of t1 to what is needed. */
332627f7eb2Smrg index_type t1_dim, a_sz;
333627f7eb2Smrg if (aystride == 1)
334627f7eb2Smrg a_sz = rystride;
335627f7eb2Smrg else
336627f7eb2Smrg a_sz = a_dim1;
337627f7eb2Smrg
338627f7eb2Smrg t1_dim = a_sz * 256 + b_dim1;
339627f7eb2Smrg if (t1_dim > 65536)
340627f7eb2Smrg t1_dim = 65536;
341627f7eb2Smrg
342627f7eb2Smrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
343627f7eb2Smrg
344627f7eb2Smrg /* Start turning the crank. */
345627f7eb2Smrg i1 = n;
346627f7eb2Smrg for (jj = 1; jj <= i1; jj += 512)
347627f7eb2Smrg {
348627f7eb2Smrg /* Computing MIN */
349627f7eb2Smrg i2 = 512;
350627f7eb2Smrg i3 = n - jj + 1;
351627f7eb2Smrg jsec = min(i2,i3);
352627f7eb2Smrg ujsec = jsec - jsec % 4;
353627f7eb2Smrg i2 = k;
354627f7eb2Smrg for (ll = 1; ll <= i2; ll += 256)
355627f7eb2Smrg {
356627f7eb2Smrg /* Computing MIN */
357627f7eb2Smrg i3 = 256;
358627f7eb2Smrg i4 = k - ll + 1;
359627f7eb2Smrg lsec = min(i3,i4);
360627f7eb2Smrg ulsec = lsec - lsec % 2;
361627f7eb2Smrg
362627f7eb2Smrg i3 = m;
363627f7eb2Smrg for (ii = 1; ii <= i3; ii += 256)
364627f7eb2Smrg {
365627f7eb2Smrg /* Computing MIN */
366627f7eb2Smrg i4 = 256;
367627f7eb2Smrg i5 = m - ii + 1;
368627f7eb2Smrg isec = min(i4,i5);
369627f7eb2Smrg uisec = isec - isec % 2;
370627f7eb2Smrg i4 = ll + ulsec - 1;
371627f7eb2Smrg for (l = ll; l <= i4; l += 2)
372627f7eb2Smrg {
373627f7eb2Smrg i5 = ii + uisec - 1;
374627f7eb2Smrg for (i = ii; i <= i5; i += 2)
375627f7eb2Smrg {
376627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
377627f7eb2Smrg a[i + l * a_dim1];
378627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
379627f7eb2Smrg a[i + (l + 1) * a_dim1];
380627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
381627f7eb2Smrg a[i + 1 + l * a_dim1];
382627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
383627f7eb2Smrg a[i + 1 + (l + 1) * a_dim1];
384627f7eb2Smrg }
385627f7eb2Smrg if (uisec < isec)
386627f7eb2Smrg {
387627f7eb2Smrg t1[l - ll + 1 + (isec << 8) - 257] =
388627f7eb2Smrg a[ii + isec - 1 + l * a_dim1];
389627f7eb2Smrg t1[l - ll + 2 + (isec << 8) - 257] =
390627f7eb2Smrg a[ii + isec - 1 + (l + 1) * a_dim1];
391627f7eb2Smrg }
392627f7eb2Smrg }
393627f7eb2Smrg if (ulsec < lsec)
394627f7eb2Smrg {
395627f7eb2Smrg i4 = ii + isec - 1;
396627f7eb2Smrg for (i = ii; i<= i4; ++i)
397627f7eb2Smrg {
398627f7eb2Smrg t1[lsec + ((i - ii + 1) << 8) - 257] =
399627f7eb2Smrg a[i + (ll + lsec - 1) * a_dim1];
400627f7eb2Smrg }
401627f7eb2Smrg }
402627f7eb2Smrg
403627f7eb2Smrg uisec = isec - isec % 4;
404627f7eb2Smrg i4 = jj + ujsec - 1;
405627f7eb2Smrg for (j = jj; j <= i4; j += 4)
406627f7eb2Smrg {
407627f7eb2Smrg i5 = ii + uisec - 1;
408627f7eb2Smrg for (i = ii; i <= i5; i += 4)
409627f7eb2Smrg {
410627f7eb2Smrg f11 = c[i + j * c_dim1];
411627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
412627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
413627f7eb2Smrg f22 = c[i + 1 + (j + 1) * c_dim1];
414627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
415627f7eb2Smrg f23 = c[i + 1 + (j + 2) * c_dim1];
416627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
417627f7eb2Smrg f24 = c[i + 1 + (j + 3) * c_dim1];
418627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
419627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
420627f7eb2Smrg f32 = c[i + 2 + (j + 1) * c_dim1];
421627f7eb2Smrg f42 = c[i + 3 + (j + 1) * c_dim1];
422627f7eb2Smrg f33 = c[i + 2 + (j + 2) * c_dim1];
423627f7eb2Smrg f43 = c[i + 3 + (j + 2) * c_dim1];
424627f7eb2Smrg f34 = c[i + 2 + (j + 3) * c_dim1];
425627f7eb2Smrg f44 = c[i + 3 + (j + 3) * c_dim1];
426627f7eb2Smrg i6 = ll + lsec - 1;
427627f7eb2Smrg for (l = ll; l <= i6; ++l)
428627f7eb2Smrg {
429627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430627f7eb2Smrg * b[l + j * b_dim1];
431627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432627f7eb2Smrg * b[l + j * b_dim1];
433627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
434627f7eb2Smrg * b[l + (j + 1) * b_dim1];
435627f7eb2Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
436627f7eb2Smrg * b[l + (j + 1) * b_dim1];
437627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
438627f7eb2Smrg * b[l + (j + 2) * b_dim1];
439627f7eb2Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
440627f7eb2Smrg * b[l + (j + 2) * b_dim1];
441627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
442627f7eb2Smrg * b[l + (j + 3) * b_dim1];
443627f7eb2Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
444627f7eb2Smrg * b[l + (j + 3) * b_dim1];
445627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446627f7eb2Smrg * b[l + j * b_dim1];
447627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448627f7eb2Smrg * b[l + j * b_dim1];
449627f7eb2Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
450627f7eb2Smrg * b[l + (j + 1) * b_dim1];
451627f7eb2Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
452627f7eb2Smrg * b[l + (j + 1) * b_dim1];
453627f7eb2Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
454627f7eb2Smrg * b[l + (j + 2) * b_dim1];
455627f7eb2Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
456627f7eb2Smrg * b[l + (j + 2) * b_dim1];
457627f7eb2Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
458627f7eb2Smrg * b[l + (j + 3) * b_dim1];
459627f7eb2Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
460627f7eb2Smrg * b[l + (j + 3) * b_dim1];
461627f7eb2Smrg }
462627f7eb2Smrg c[i + j * c_dim1] = f11;
463627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
464627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
465627f7eb2Smrg c[i + 1 + (j + 1) * c_dim1] = f22;
466627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
467627f7eb2Smrg c[i + 1 + (j + 2) * c_dim1] = f23;
468627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
469627f7eb2Smrg c[i + 1 + (j + 3) * c_dim1] = f24;
470627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
471627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
472627f7eb2Smrg c[i + 2 + (j + 1) * c_dim1] = f32;
473627f7eb2Smrg c[i + 3 + (j + 1) * c_dim1] = f42;
474627f7eb2Smrg c[i + 2 + (j + 2) * c_dim1] = f33;
475627f7eb2Smrg c[i + 3 + (j + 2) * c_dim1] = f43;
476627f7eb2Smrg c[i + 2 + (j + 3) * c_dim1] = f34;
477627f7eb2Smrg c[i + 3 + (j + 3) * c_dim1] = f44;
478627f7eb2Smrg }
479627f7eb2Smrg if (uisec < isec)
480627f7eb2Smrg {
481627f7eb2Smrg i5 = ii + isec - 1;
482627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
483627f7eb2Smrg {
484627f7eb2Smrg f11 = c[i + j * c_dim1];
485627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
486627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
487627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
488627f7eb2Smrg i6 = ll + lsec - 1;
489627f7eb2Smrg for (l = ll; l <= i6; ++l)
490627f7eb2Smrg {
491627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
492627f7eb2Smrg 257] * b[l + j * b_dim1];
493627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
494627f7eb2Smrg 257] * b[l + (j + 1) * b_dim1];
495627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
496627f7eb2Smrg 257] * b[l + (j + 2) * b_dim1];
497627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498627f7eb2Smrg 257] * b[l + (j + 3) * b_dim1];
499627f7eb2Smrg }
500627f7eb2Smrg c[i + j * c_dim1] = f11;
501627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
502627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
503627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
504627f7eb2Smrg }
505627f7eb2Smrg }
506627f7eb2Smrg }
507627f7eb2Smrg if (ujsec < jsec)
508627f7eb2Smrg {
509627f7eb2Smrg i4 = jj + jsec - 1;
510627f7eb2Smrg for (j = jj + ujsec; j <= i4; ++j)
511627f7eb2Smrg {
512627f7eb2Smrg i5 = ii + uisec - 1;
513627f7eb2Smrg for (i = ii; i <= i5; i += 4)
514627f7eb2Smrg {
515627f7eb2Smrg f11 = c[i + j * c_dim1];
516627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
517627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
518627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
519627f7eb2Smrg i6 = ll + lsec - 1;
520627f7eb2Smrg for (l = ll; l <= i6; ++l)
521627f7eb2Smrg {
522627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
523627f7eb2Smrg 257] * b[l + j * b_dim1];
524627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
525627f7eb2Smrg 257] * b[l + j * b_dim1];
526627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
527627f7eb2Smrg 257] * b[l + j * b_dim1];
528627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
529627f7eb2Smrg 257] * b[l + j * b_dim1];
530627f7eb2Smrg }
531627f7eb2Smrg c[i + j * c_dim1] = f11;
532627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
533627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
534627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
535627f7eb2Smrg }
536627f7eb2Smrg i5 = ii + isec - 1;
537627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
538627f7eb2Smrg {
539627f7eb2Smrg f11 = c[i + j * c_dim1];
540627f7eb2Smrg i6 = ll + lsec - 1;
541627f7eb2Smrg for (l = ll; l <= i6; ++l)
542627f7eb2Smrg {
543627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
544627f7eb2Smrg 257] * b[l + j * b_dim1];
545627f7eb2Smrg }
546627f7eb2Smrg c[i + j * c_dim1] = f11;
547627f7eb2Smrg }
548627f7eb2Smrg }
549627f7eb2Smrg }
550627f7eb2Smrg }
551627f7eb2Smrg }
552627f7eb2Smrg }
553627f7eb2Smrg free(t1);
554627f7eb2Smrg return;
555627f7eb2Smrg }
556627f7eb2Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1)
557627f7eb2Smrg {
558627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) != 1)
559627f7eb2Smrg {
560627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
561627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
562627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
563627f7eb2Smrg GFC_COMPLEX_4 s;
564627f7eb2Smrg
565627f7eb2Smrg for (y = 0; y < ycount; y++)
566627f7eb2Smrg {
567627f7eb2Smrg bbase_y = &bbase[y*bystride];
568627f7eb2Smrg dest_y = &dest[y*rystride];
569627f7eb2Smrg for (x = 0; x < xcount; x++)
570627f7eb2Smrg {
571627f7eb2Smrg abase_x = &abase[x*axstride];
572627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
573627f7eb2Smrg for (n = 0; n < count; n++)
574627f7eb2Smrg s += abase_x[n] * bbase_y[n];
575627f7eb2Smrg dest_y[x] = s;
576627f7eb2Smrg }
577627f7eb2Smrg }
578627f7eb2Smrg }
579627f7eb2Smrg else
580627f7eb2Smrg {
581627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
582627f7eb2Smrg GFC_COMPLEX_4 s;
583627f7eb2Smrg
584627f7eb2Smrg for (y = 0; y < ycount; y++)
585627f7eb2Smrg {
586627f7eb2Smrg bbase_y = &bbase[y*bystride];
587627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
588627f7eb2Smrg for (n = 0; n < count; n++)
589627f7eb2Smrg s += abase[n*axstride] * bbase_y[n];
590627f7eb2Smrg dest[y*rystride] = s;
591627f7eb2Smrg }
592627f7eb2Smrg }
593627f7eb2Smrg }
594627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1)
595627f7eb2Smrg {
596627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
597627f7eb2Smrg GFC_COMPLEX_4 s;
598627f7eb2Smrg
599627f7eb2Smrg for (y = 0; y < ycount; y++)
600627f7eb2Smrg {
601627f7eb2Smrg bbase_y = &bbase[y*bystride];
602627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
603627f7eb2Smrg for (n = 0; n < count; n++)
604627f7eb2Smrg s += abase[n*axstride] * bbase_y[n*bxstride];
605627f7eb2Smrg dest[y*rxstride] = s;
606627f7eb2Smrg }
607627f7eb2Smrg }
608*4c3eb207Smrg else if (axstride < aystride)
609*4c3eb207Smrg {
610*4c3eb207Smrg for (y = 0; y < ycount; y++)
611*4c3eb207Smrg for (x = 0; x < xcount; x++)
612*4c3eb207Smrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
613*4c3eb207Smrg
614*4c3eb207Smrg for (y = 0; y < ycount; y++)
615*4c3eb207Smrg for (n = 0; n < count; n++)
616*4c3eb207Smrg for (x = 0; x < xcount; x++)
617*4c3eb207Smrg /* dest[x,y] += a[x,n] * b[n,y] */
618*4c3eb207Smrg dest[x*rxstride + y*rystride] +=
619*4c3eb207Smrg abase[x*axstride + n*aystride] *
620*4c3eb207Smrg bbase[n*bxstride + y*bystride];
621*4c3eb207Smrg }
622627f7eb2Smrg else
623627f7eb2Smrg {
624627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
625627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
626627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
627627f7eb2Smrg GFC_COMPLEX_4 s;
628627f7eb2Smrg
629627f7eb2Smrg for (y = 0; y < ycount; y++)
630627f7eb2Smrg {
631627f7eb2Smrg bbase_y = &bbase[y*bystride];
632627f7eb2Smrg dest_y = &dest[y*rystride];
633627f7eb2Smrg for (x = 0; x < xcount; x++)
634627f7eb2Smrg {
635627f7eb2Smrg abase_x = &abase[x*axstride];
636627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
637627f7eb2Smrg for (n = 0; n < count; n++)
638627f7eb2Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride];
639627f7eb2Smrg dest_y[x*rxstride] = s;
640627f7eb2Smrg }
641627f7eb2Smrg }
642627f7eb2Smrg }
643627f7eb2Smrg }
644627f7eb2Smrg #undef POW3
645627f7eb2Smrg #undef min
646627f7eb2Smrg #undef max
647627f7eb2Smrg
648627f7eb2Smrg #endif /* HAVE_AVX */
649627f7eb2Smrg
650627f7eb2Smrg #ifdef HAVE_AVX2
651627f7eb2Smrg static void
652627f7eb2Smrg matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
653627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
654627f7eb2Smrg int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
655627f7eb2Smrg static void
matmul_c4_avx2(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)656627f7eb2Smrg matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
657627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
658627f7eb2Smrg int blas_limit, blas_call gemm)
659627f7eb2Smrg {
660627f7eb2Smrg const GFC_COMPLEX_4 * restrict abase;
661627f7eb2Smrg const GFC_COMPLEX_4 * restrict bbase;
662627f7eb2Smrg GFC_COMPLEX_4 * restrict dest;
663627f7eb2Smrg
664627f7eb2Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
665627f7eb2Smrg index_type x, y, n, count, xcount, ycount;
666627f7eb2Smrg
667627f7eb2Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2
668627f7eb2Smrg || GFC_DESCRIPTOR_RANK (b) == 2);
669627f7eb2Smrg
670627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
671627f7eb2Smrg
672627f7eb2Smrg Either A or B (but not both) can be rank 1:
673627f7eb2Smrg
674627f7eb2Smrg o One-dimensional argument A is implicitly treated as a row matrix
675627f7eb2Smrg dimensioned [1,count], so xcount=1.
676627f7eb2Smrg
677627f7eb2Smrg o One-dimensional argument B is implicitly treated as a column matrix
678627f7eb2Smrg dimensioned [count, 1], so ycount=1.
679627f7eb2Smrg */
680627f7eb2Smrg
681627f7eb2Smrg if (retarray->base_addr == NULL)
682627f7eb2Smrg {
683627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
684627f7eb2Smrg {
685627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
686627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
687627f7eb2Smrg }
688627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
689627f7eb2Smrg {
690627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
691627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
692627f7eb2Smrg }
693627f7eb2Smrg else
694627f7eb2Smrg {
695627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
696627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
697627f7eb2Smrg
698627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[1], 0,
699627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1,
700627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(retarray,0));
701627f7eb2Smrg }
702627f7eb2Smrg
703627f7eb2Smrg retarray->base_addr
704627f7eb2Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
705627f7eb2Smrg retarray->offset = 0;
706627f7eb2Smrg }
707627f7eb2Smrg else if (unlikely (compile_options.bounds_check))
708627f7eb2Smrg {
709627f7eb2Smrg index_type ret_extent, arg_extent;
710627f7eb2Smrg
711627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
712627f7eb2Smrg {
713627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
715627f7eb2Smrg if (arg_extent != ret_extent)
716627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
717627f7eb2Smrg "array (%ld/%ld) ",
718627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
719627f7eb2Smrg }
720627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
721627f7eb2Smrg {
722627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
723627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
724627f7eb2Smrg if (arg_extent != ret_extent)
725627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
726627f7eb2Smrg "array (%ld/%ld) ",
727627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
728627f7eb2Smrg }
729627f7eb2Smrg else
730627f7eb2Smrg {
731627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
732627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
733627f7eb2Smrg if (arg_extent != ret_extent)
734627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
735627f7eb2Smrg "array (%ld/%ld) ",
736627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
737627f7eb2Smrg
738627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
739627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
740627f7eb2Smrg if (arg_extent != ret_extent)
741627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 2 of "
742627f7eb2Smrg "array (%ld/%ld) ",
743627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
744627f7eb2Smrg }
745627f7eb2Smrg }
746627f7eb2Smrg
747627f7eb2Smrg
748627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1)
749627f7eb2Smrg {
750627f7eb2Smrg /* One-dimensional result may be addressed in the code below
751627f7eb2Smrg either as a row or a column matrix. We want both cases to
752627f7eb2Smrg work. */
753627f7eb2Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
754627f7eb2Smrg }
755627f7eb2Smrg else
756627f7eb2Smrg {
757627f7eb2Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
758627f7eb2Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
759627f7eb2Smrg }
760627f7eb2Smrg
761627f7eb2Smrg
762627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
763627f7eb2Smrg {
764627f7eb2Smrg /* Treat it as a a row matrix A[1,count]. */
765627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
766627f7eb2Smrg aystride = 1;
767627f7eb2Smrg
768627f7eb2Smrg xcount = 1;
769627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,0);
770627f7eb2Smrg }
771627f7eb2Smrg else
772627f7eb2Smrg {
773627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
774627f7eb2Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1);
775627f7eb2Smrg
776627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,1);
777627f7eb2Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0);
778627f7eb2Smrg }
779627f7eb2Smrg
780627f7eb2Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0))
781627f7eb2Smrg {
782627f7eb2Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
783627f7eb2Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
784627f7eb2Smrg "in dimension 1: is %ld, should be %ld",
785627f7eb2Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
786627f7eb2Smrg }
787627f7eb2Smrg
788627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (b) == 1)
789627f7eb2Smrg {
790627f7eb2Smrg /* Treat it as a column matrix B[count,1] */
791627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
792627f7eb2Smrg
793627f7eb2Smrg /* bystride should never be used for 1-dimensional b.
794627f7eb2Smrg The value is only used for calculation of the
795627f7eb2Smrg memory by the buffer. */
796627f7eb2Smrg bystride = 256;
797627f7eb2Smrg ycount = 1;
798627f7eb2Smrg }
799627f7eb2Smrg else
800627f7eb2Smrg {
801627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
802627f7eb2Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1);
803627f7eb2Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1);
804627f7eb2Smrg }
805627f7eb2Smrg
806627f7eb2Smrg abase = a->base_addr;
807627f7eb2Smrg bbase = b->base_addr;
808627f7eb2Smrg dest = retarray->base_addr;
809627f7eb2Smrg
810627f7eb2Smrg /* Now that everything is set up, we perform the multiplication
811627f7eb2Smrg itself. */
812627f7eb2Smrg
813627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
814627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
815627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
816627f7eb2Smrg
817627f7eb2Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
818627f7eb2Smrg && (bxstride == 1 || bystride == 1)
819627f7eb2Smrg && (((float) xcount) * ((float) ycount) * ((float) count)
820627f7eb2Smrg > POW3(blas_limit)))
821627f7eb2Smrg {
822627f7eb2Smrg const int m = xcount, n = ycount, k = count, ldc = rystride;
823627f7eb2Smrg const GFC_COMPLEX_4 one = 1, zero = 0;
824627f7eb2Smrg const int lda = (axstride == 1) ? aystride : axstride,
825627f7eb2Smrg ldb = (bxstride == 1) ? bystride : bxstride;
826627f7eb2Smrg
827627f7eb2Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
828627f7eb2Smrg {
829627f7eb2Smrg assert (gemm != NULL);
830627f7eb2Smrg const char *transa, *transb;
831627f7eb2Smrg if (try_blas & 2)
832627f7eb2Smrg transa = "C";
833627f7eb2Smrg else
834627f7eb2Smrg transa = axstride == 1 ? "N" : "T";
835627f7eb2Smrg
836627f7eb2Smrg if (try_blas & 4)
837627f7eb2Smrg transb = "C";
838627f7eb2Smrg else
839627f7eb2Smrg transb = bxstride == 1 ? "N" : "T";
840627f7eb2Smrg
841627f7eb2Smrg gemm (transa, transb , &m,
842627f7eb2Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
843627f7eb2Smrg &ldc, 1, 1);
844627f7eb2Smrg return;
845627f7eb2Smrg }
846627f7eb2Smrg }
847627f7eb2Smrg
848*4c3eb207Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1
849*4c3eb207Smrg && GFC_DESCRIPTOR_RANK (b) != 1)
850627f7eb2Smrg {
851627f7eb2Smrg /* This block of code implements a tuned matmul, derived from
852627f7eb2Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1
853627f7eb2Smrg
854627f7eb2Smrg Bo Kagstrom and Per Ling
855627f7eb2Smrg Department of Computing Science
856627f7eb2Smrg Umea University
857627f7eb2Smrg S-901 87 Umea, Sweden
858627f7eb2Smrg
859627f7eb2Smrg from netlib.org, translated to C, and modified for matmul.m4. */
860627f7eb2Smrg
861627f7eb2Smrg const GFC_COMPLEX_4 *a, *b;
862627f7eb2Smrg GFC_COMPLEX_4 *c;
863627f7eb2Smrg const index_type m = xcount, n = ycount, k = count;
864627f7eb2Smrg
865627f7eb2Smrg /* System generated locals */
866627f7eb2Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
867627f7eb2Smrg i1, i2, i3, i4, i5, i6;
868627f7eb2Smrg
869627f7eb2Smrg /* Local variables */
870627f7eb2Smrg GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
871627f7eb2Smrg f13, f14, f23, f24, f33, f34, f43, f44;
872627f7eb2Smrg index_type i, j, l, ii, jj, ll;
873627f7eb2Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec;
874627f7eb2Smrg GFC_COMPLEX_4 *t1;
875627f7eb2Smrg
876627f7eb2Smrg a = abase;
877627f7eb2Smrg b = bbase;
878627f7eb2Smrg c = retarray->base_addr;
879627f7eb2Smrg
880627f7eb2Smrg /* Parameter adjustments */
881627f7eb2Smrg c_dim1 = rystride;
882627f7eb2Smrg c_offset = 1 + c_dim1;
883627f7eb2Smrg c -= c_offset;
884627f7eb2Smrg a_dim1 = aystride;
885627f7eb2Smrg a_offset = 1 + a_dim1;
886627f7eb2Smrg a -= a_offset;
887627f7eb2Smrg b_dim1 = bystride;
888627f7eb2Smrg b_offset = 1 + b_dim1;
889627f7eb2Smrg b -= b_offset;
890627f7eb2Smrg
891627f7eb2Smrg /* Empty c first. */
892627f7eb2Smrg for (j=1; j<=n; j++)
893627f7eb2Smrg for (i=1; i<=m; i++)
894627f7eb2Smrg c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
895627f7eb2Smrg
896627f7eb2Smrg /* Early exit if possible */
897627f7eb2Smrg if (m == 0 || n == 0 || k == 0)
898627f7eb2Smrg return;
899627f7eb2Smrg
900627f7eb2Smrg /* Adjust size of t1 to what is needed. */
901627f7eb2Smrg index_type t1_dim, a_sz;
902627f7eb2Smrg if (aystride == 1)
903627f7eb2Smrg a_sz = rystride;
904627f7eb2Smrg else
905627f7eb2Smrg a_sz = a_dim1;
906627f7eb2Smrg
907627f7eb2Smrg t1_dim = a_sz * 256 + b_dim1;
908627f7eb2Smrg if (t1_dim > 65536)
909627f7eb2Smrg t1_dim = 65536;
910627f7eb2Smrg
911627f7eb2Smrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
912627f7eb2Smrg
913627f7eb2Smrg /* Start turning the crank. */
914627f7eb2Smrg i1 = n;
915627f7eb2Smrg for (jj = 1; jj <= i1; jj += 512)
916627f7eb2Smrg {
917627f7eb2Smrg /* Computing MIN */
918627f7eb2Smrg i2 = 512;
919627f7eb2Smrg i3 = n - jj + 1;
920627f7eb2Smrg jsec = min(i2,i3);
921627f7eb2Smrg ujsec = jsec - jsec % 4;
922627f7eb2Smrg i2 = k;
923627f7eb2Smrg for (ll = 1; ll <= i2; ll += 256)
924627f7eb2Smrg {
925627f7eb2Smrg /* Computing MIN */
926627f7eb2Smrg i3 = 256;
927627f7eb2Smrg i4 = k - ll + 1;
928627f7eb2Smrg lsec = min(i3,i4);
929627f7eb2Smrg ulsec = lsec - lsec % 2;
930627f7eb2Smrg
931627f7eb2Smrg i3 = m;
932627f7eb2Smrg for (ii = 1; ii <= i3; ii += 256)
933627f7eb2Smrg {
934627f7eb2Smrg /* Computing MIN */
935627f7eb2Smrg i4 = 256;
936627f7eb2Smrg i5 = m - ii + 1;
937627f7eb2Smrg isec = min(i4,i5);
938627f7eb2Smrg uisec = isec - isec % 2;
939627f7eb2Smrg i4 = ll + ulsec - 1;
940627f7eb2Smrg for (l = ll; l <= i4; l += 2)
941627f7eb2Smrg {
942627f7eb2Smrg i5 = ii + uisec - 1;
943627f7eb2Smrg for (i = ii; i <= i5; i += 2)
944627f7eb2Smrg {
945627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
946627f7eb2Smrg a[i + l * a_dim1];
947627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
948627f7eb2Smrg a[i + (l + 1) * a_dim1];
949627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
950627f7eb2Smrg a[i + 1 + l * a_dim1];
951627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
952627f7eb2Smrg a[i + 1 + (l + 1) * a_dim1];
953627f7eb2Smrg }
954627f7eb2Smrg if (uisec < isec)
955627f7eb2Smrg {
956627f7eb2Smrg t1[l - ll + 1 + (isec << 8) - 257] =
957627f7eb2Smrg a[ii + isec - 1 + l * a_dim1];
958627f7eb2Smrg t1[l - ll + 2 + (isec << 8) - 257] =
959627f7eb2Smrg a[ii + isec - 1 + (l + 1) * a_dim1];
960627f7eb2Smrg }
961627f7eb2Smrg }
962627f7eb2Smrg if (ulsec < lsec)
963627f7eb2Smrg {
964627f7eb2Smrg i4 = ii + isec - 1;
965627f7eb2Smrg for (i = ii; i<= i4; ++i)
966627f7eb2Smrg {
967627f7eb2Smrg t1[lsec + ((i - ii + 1) << 8) - 257] =
968627f7eb2Smrg a[i + (ll + lsec - 1) * a_dim1];
969627f7eb2Smrg }
970627f7eb2Smrg }
971627f7eb2Smrg
972627f7eb2Smrg uisec = isec - isec % 4;
973627f7eb2Smrg i4 = jj + ujsec - 1;
974627f7eb2Smrg for (j = jj; j <= i4; j += 4)
975627f7eb2Smrg {
976627f7eb2Smrg i5 = ii + uisec - 1;
977627f7eb2Smrg for (i = ii; i <= i5; i += 4)
978627f7eb2Smrg {
979627f7eb2Smrg f11 = c[i + j * c_dim1];
980627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
981627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
982627f7eb2Smrg f22 = c[i + 1 + (j + 1) * c_dim1];
983627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
984627f7eb2Smrg f23 = c[i + 1 + (j + 2) * c_dim1];
985627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
986627f7eb2Smrg f24 = c[i + 1 + (j + 3) * c_dim1];
987627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
988627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
989627f7eb2Smrg f32 = c[i + 2 + (j + 1) * c_dim1];
990627f7eb2Smrg f42 = c[i + 3 + (j + 1) * c_dim1];
991627f7eb2Smrg f33 = c[i + 2 + (j + 2) * c_dim1];
992627f7eb2Smrg f43 = c[i + 3 + (j + 2) * c_dim1];
993627f7eb2Smrg f34 = c[i + 2 + (j + 3) * c_dim1];
994627f7eb2Smrg f44 = c[i + 3 + (j + 3) * c_dim1];
995627f7eb2Smrg i6 = ll + lsec - 1;
996627f7eb2Smrg for (l = ll; l <= i6; ++l)
997627f7eb2Smrg {
998627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
999627f7eb2Smrg * b[l + j * b_dim1];
1000627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1001627f7eb2Smrg * b[l + j * b_dim1];
1002627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1003627f7eb2Smrg * b[l + (j + 1) * b_dim1];
1004627f7eb2Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1005627f7eb2Smrg * b[l + (j + 1) * b_dim1];
1006627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1007627f7eb2Smrg * b[l + (j + 2) * b_dim1];
1008627f7eb2Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1009627f7eb2Smrg * b[l + (j + 2) * b_dim1];
1010627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1011627f7eb2Smrg * b[l + (j + 3) * b_dim1];
1012627f7eb2Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1013627f7eb2Smrg * b[l + (j + 3) * b_dim1];
1014627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1015627f7eb2Smrg * b[l + j * b_dim1];
1016627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1017627f7eb2Smrg * b[l + j * b_dim1];
1018627f7eb2Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1019627f7eb2Smrg * b[l + (j + 1) * b_dim1];
1020627f7eb2Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1021627f7eb2Smrg * b[l + (j + 1) * b_dim1];
1022627f7eb2Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1023627f7eb2Smrg * b[l + (j + 2) * b_dim1];
1024627f7eb2Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1025627f7eb2Smrg * b[l + (j + 2) * b_dim1];
1026627f7eb2Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1027627f7eb2Smrg * b[l + (j + 3) * b_dim1];
1028627f7eb2Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1029627f7eb2Smrg * b[l + (j + 3) * b_dim1];
1030627f7eb2Smrg }
1031627f7eb2Smrg c[i + j * c_dim1] = f11;
1032627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
1033627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
1034627f7eb2Smrg c[i + 1 + (j + 1) * c_dim1] = f22;
1035627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
1036627f7eb2Smrg c[i + 1 + (j + 2) * c_dim1] = f23;
1037627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
1038627f7eb2Smrg c[i + 1 + (j + 3) * c_dim1] = f24;
1039627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
1040627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
1041627f7eb2Smrg c[i + 2 + (j + 1) * c_dim1] = f32;
1042627f7eb2Smrg c[i + 3 + (j + 1) * c_dim1] = f42;
1043627f7eb2Smrg c[i + 2 + (j + 2) * c_dim1] = f33;
1044627f7eb2Smrg c[i + 3 + (j + 2) * c_dim1] = f43;
1045627f7eb2Smrg c[i + 2 + (j + 3) * c_dim1] = f34;
1046627f7eb2Smrg c[i + 3 + (j + 3) * c_dim1] = f44;
1047627f7eb2Smrg }
1048627f7eb2Smrg if (uisec < isec)
1049627f7eb2Smrg {
1050627f7eb2Smrg i5 = ii + isec - 1;
1051627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
1052627f7eb2Smrg {
1053627f7eb2Smrg f11 = c[i + j * c_dim1];
1054627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
1055627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
1056627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
1057627f7eb2Smrg i6 = ll + lsec - 1;
1058627f7eb2Smrg for (l = ll; l <= i6; ++l)
1059627f7eb2Smrg {
1060627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061627f7eb2Smrg 257] * b[l + j * b_dim1];
1062627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063627f7eb2Smrg 257] * b[l + (j + 1) * b_dim1];
1064627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065627f7eb2Smrg 257] * b[l + (j + 2) * b_dim1];
1066627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1067627f7eb2Smrg 257] * b[l + (j + 3) * b_dim1];
1068627f7eb2Smrg }
1069627f7eb2Smrg c[i + j * c_dim1] = f11;
1070627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
1071627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
1072627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
1073627f7eb2Smrg }
1074627f7eb2Smrg }
1075627f7eb2Smrg }
1076627f7eb2Smrg if (ujsec < jsec)
1077627f7eb2Smrg {
1078627f7eb2Smrg i4 = jj + jsec - 1;
1079627f7eb2Smrg for (j = jj + ujsec; j <= i4; ++j)
1080627f7eb2Smrg {
1081627f7eb2Smrg i5 = ii + uisec - 1;
1082627f7eb2Smrg for (i = ii; i <= i5; i += 4)
1083627f7eb2Smrg {
1084627f7eb2Smrg f11 = c[i + j * c_dim1];
1085627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
1086627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
1087627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
1088627f7eb2Smrg i6 = ll + lsec - 1;
1089627f7eb2Smrg for (l = ll; l <= i6; ++l)
1090627f7eb2Smrg {
1091627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1092627f7eb2Smrg 257] * b[l + j * b_dim1];
1093627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1094627f7eb2Smrg 257] * b[l + j * b_dim1];
1095627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1096627f7eb2Smrg 257] * b[l + j * b_dim1];
1097627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1098627f7eb2Smrg 257] * b[l + j * b_dim1];
1099627f7eb2Smrg }
1100627f7eb2Smrg c[i + j * c_dim1] = f11;
1101627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
1102627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
1103627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
1104627f7eb2Smrg }
1105627f7eb2Smrg i5 = ii + isec - 1;
1106627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
1107627f7eb2Smrg {
1108627f7eb2Smrg f11 = c[i + j * c_dim1];
1109627f7eb2Smrg i6 = ll + lsec - 1;
1110627f7eb2Smrg for (l = ll; l <= i6; ++l)
1111627f7eb2Smrg {
1112627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1113627f7eb2Smrg 257] * b[l + j * b_dim1];
1114627f7eb2Smrg }
1115627f7eb2Smrg c[i + j * c_dim1] = f11;
1116627f7eb2Smrg }
1117627f7eb2Smrg }
1118627f7eb2Smrg }
1119627f7eb2Smrg }
1120627f7eb2Smrg }
1121627f7eb2Smrg }
1122627f7eb2Smrg free(t1);
1123627f7eb2Smrg return;
1124627f7eb2Smrg }
1125627f7eb2Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1126627f7eb2Smrg {
1127627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) != 1)
1128627f7eb2Smrg {
1129627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
1130627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
1131627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
1132627f7eb2Smrg GFC_COMPLEX_4 s;
1133627f7eb2Smrg
1134627f7eb2Smrg for (y = 0; y < ycount; y++)
1135627f7eb2Smrg {
1136627f7eb2Smrg bbase_y = &bbase[y*bystride];
1137627f7eb2Smrg dest_y = &dest[y*rystride];
1138627f7eb2Smrg for (x = 0; x < xcount; x++)
1139627f7eb2Smrg {
1140627f7eb2Smrg abase_x = &abase[x*axstride];
1141627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
1142627f7eb2Smrg for (n = 0; n < count; n++)
1143627f7eb2Smrg s += abase_x[n] * bbase_y[n];
1144627f7eb2Smrg dest_y[x] = s;
1145627f7eb2Smrg }
1146627f7eb2Smrg }
1147627f7eb2Smrg }
1148627f7eb2Smrg else
1149627f7eb2Smrg {
1150627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
1151627f7eb2Smrg GFC_COMPLEX_4 s;
1152627f7eb2Smrg
1153627f7eb2Smrg for (y = 0; y < ycount; y++)
1154627f7eb2Smrg {
1155627f7eb2Smrg bbase_y = &bbase[y*bystride];
1156627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
1157627f7eb2Smrg for (n = 0; n < count; n++)
1158627f7eb2Smrg s += abase[n*axstride] * bbase_y[n];
1159627f7eb2Smrg dest[y*rystride] = s;
1160627f7eb2Smrg }
1161627f7eb2Smrg }
1162627f7eb2Smrg }
1163627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1)
1164627f7eb2Smrg {
1165627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
1166627f7eb2Smrg GFC_COMPLEX_4 s;
1167627f7eb2Smrg
1168627f7eb2Smrg for (y = 0; y < ycount; y++)
1169627f7eb2Smrg {
1170627f7eb2Smrg bbase_y = &bbase[y*bystride];
1171627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
1172627f7eb2Smrg for (n = 0; n < count; n++)
1173627f7eb2Smrg s += abase[n*axstride] * bbase_y[n*bxstride];
1174627f7eb2Smrg dest[y*rxstride] = s;
1175627f7eb2Smrg }
1176627f7eb2Smrg }
1177*4c3eb207Smrg else if (axstride < aystride)
1178*4c3eb207Smrg {
1179*4c3eb207Smrg for (y = 0; y < ycount; y++)
1180*4c3eb207Smrg for (x = 0; x < xcount; x++)
1181*4c3eb207Smrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
1182*4c3eb207Smrg
1183*4c3eb207Smrg for (y = 0; y < ycount; y++)
1184*4c3eb207Smrg for (n = 0; n < count; n++)
1185*4c3eb207Smrg for (x = 0; x < xcount; x++)
1186*4c3eb207Smrg /* dest[x,y] += a[x,n] * b[n,y] */
1187*4c3eb207Smrg dest[x*rxstride + y*rystride] +=
1188*4c3eb207Smrg abase[x*axstride + n*aystride] *
1189*4c3eb207Smrg bbase[n*bxstride + y*bystride];
1190*4c3eb207Smrg }
1191627f7eb2Smrg else
1192627f7eb2Smrg {
1193627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
1194627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
1195627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
1196627f7eb2Smrg GFC_COMPLEX_4 s;
1197627f7eb2Smrg
1198627f7eb2Smrg for (y = 0; y < ycount; y++)
1199627f7eb2Smrg {
1200627f7eb2Smrg bbase_y = &bbase[y*bystride];
1201627f7eb2Smrg dest_y = &dest[y*rystride];
1202627f7eb2Smrg for (x = 0; x < xcount; x++)
1203627f7eb2Smrg {
1204627f7eb2Smrg abase_x = &abase[x*axstride];
1205627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
1206627f7eb2Smrg for (n = 0; n < count; n++)
1207627f7eb2Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride];
1208627f7eb2Smrg dest_y[x*rxstride] = s;
1209627f7eb2Smrg }
1210627f7eb2Smrg }
1211627f7eb2Smrg }
1212627f7eb2Smrg }
1213627f7eb2Smrg #undef POW3
1214627f7eb2Smrg #undef min
1215627f7eb2Smrg #undef max
1216627f7eb2Smrg
1217627f7eb2Smrg #endif /* HAVE_AVX2 */
1218627f7eb2Smrg
1219627f7eb2Smrg #ifdef HAVE_AVX512F
1220627f7eb2Smrg static void
1221627f7eb2Smrg matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
1222627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1223627f7eb2Smrg int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1224627f7eb2Smrg static void
matmul_c4_avx512f(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1225627f7eb2Smrg matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
1226627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1227627f7eb2Smrg int blas_limit, blas_call gemm)
1228627f7eb2Smrg {
1229627f7eb2Smrg const GFC_COMPLEX_4 * restrict abase;
1230627f7eb2Smrg const GFC_COMPLEX_4 * restrict bbase;
1231627f7eb2Smrg GFC_COMPLEX_4 * restrict dest;
1232627f7eb2Smrg
1233627f7eb2Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1234627f7eb2Smrg index_type x, y, n, count, xcount, ycount;
1235627f7eb2Smrg
1236627f7eb2Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2
1237627f7eb2Smrg || GFC_DESCRIPTOR_RANK (b) == 2);
1238627f7eb2Smrg
1239627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1240627f7eb2Smrg
1241627f7eb2Smrg Either A or B (but not both) can be rank 1:
1242627f7eb2Smrg
1243627f7eb2Smrg o One-dimensional argument A is implicitly treated as a row matrix
1244627f7eb2Smrg dimensioned [1,count], so xcount=1.
1245627f7eb2Smrg
1246627f7eb2Smrg o One-dimensional argument B is implicitly treated as a column matrix
1247627f7eb2Smrg dimensioned [count, 1], so ycount=1.
1248627f7eb2Smrg */
1249627f7eb2Smrg
1250627f7eb2Smrg if (retarray->base_addr == NULL)
1251627f7eb2Smrg {
1252627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
1253627f7eb2Smrg {
1254627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
1255627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1256627f7eb2Smrg }
1257627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258627f7eb2Smrg {
1259627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
1260627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1261627f7eb2Smrg }
1262627f7eb2Smrg else
1263627f7eb2Smrg {
1264627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
1265627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1266627f7eb2Smrg
1267627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[1], 0,
1268627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1269627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(retarray,0));
1270627f7eb2Smrg }
1271627f7eb2Smrg
1272627f7eb2Smrg retarray->base_addr
1273627f7eb2Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
1274627f7eb2Smrg retarray->offset = 0;
1275627f7eb2Smrg }
1276627f7eb2Smrg else if (unlikely (compile_options.bounds_check))
1277627f7eb2Smrg {
1278627f7eb2Smrg index_type ret_extent, arg_extent;
1279627f7eb2Smrg
1280627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
1281627f7eb2Smrg {
1282627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1283627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1284627f7eb2Smrg if (arg_extent != ret_extent)
1285627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
1286627f7eb2Smrg "array (%ld/%ld) ",
1287627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
1288627f7eb2Smrg }
1289627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
1290627f7eb2Smrg {
1291627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1292627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1293627f7eb2Smrg if (arg_extent != ret_extent)
1294627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
1295627f7eb2Smrg "array (%ld/%ld) ",
1296627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
1297627f7eb2Smrg }
1298627f7eb2Smrg else
1299627f7eb2Smrg {
1300627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1301627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1302627f7eb2Smrg if (arg_extent != ret_extent)
1303627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
1304627f7eb2Smrg "array (%ld/%ld) ",
1305627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
1306627f7eb2Smrg
1307627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1308627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1309627f7eb2Smrg if (arg_extent != ret_extent)
1310627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 2 of "
1311627f7eb2Smrg "array (%ld/%ld) ",
1312627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
1313627f7eb2Smrg }
1314627f7eb2Smrg }
1315627f7eb2Smrg
1316627f7eb2Smrg
1317627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1318627f7eb2Smrg {
1319627f7eb2Smrg /* One-dimensional result may be addressed in the code below
1320627f7eb2Smrg either as a row or a column matrix. We want both cases to
1321627f7eb2Smrg work. */
1322627f7eb2Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1323627f7eb2Smrg }
1324627f7eb2Smrg else
1325627f7eb2Smrg {
1326627f7eb2Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1327627f7eb2Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1328627f7eb2Smrg }
1329627f7eb2Smrg
1330627f7eb2Smrg
1331627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
1332627f7eb2Smrg {
1333627f7eb2Smrg /* Treat it as a a row matrix A[1,count]. */
1334627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1335627f7eb2Smrg aystride = 1;
1336627f7eb2Smrg
1337627f7eb2Smrg xcount = 1;
1338627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,0);
1339627f7eb2Smrg }
1340627f7eb2Smrg else
1341627f7eb2Smrg {
1342627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1343627f7eb2Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1344627f7eb2Smrg
1345627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,1);
1346627f7eb2Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1347627f7eb2Smrg }
1348627f7eb2Smrg
1349627f7eb2Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1350627f7eb2Smrg {
1351627f7eb2Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1352627f7eb2Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1353627f7eb2Smrg "in dimension 1: is %ld, should be %ld",
1354627f7eb2Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1355627f7eb2Smrg }
1356627f7eb2Smrg
1357627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (b) == 1)
1358627f7eb2Smrg {
1359627f7eb2Smrg /* Treat it as a column matrix B[count,1] */
1360627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1361627f7eb2Smrg
1362627f7eb2Smrg /* bystride should never be used for 1-dimensional b.
1363627f7eb2Smrg The value is only used for calculation of the
1364627f7eb2Smrg memory by the buffer. */
1365627f7eb2Smrg bystride = 256;
1366627f7eb2Smrg ycount = 1;
1367627f7eb2Smrg }
1368627f7eb2Smrg else
1369627f7eb2Smrg {
1370627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1371627f7eb2Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1372627f7eb2Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1373627f7eb2Smrg }
1374627f7eb2Smrg
1375627f7eb2Smrg abase = a->base_addr;
1376627f7eb2Smrg bbase = b->base_addr;
1377627f7eb2Smrg dest = retarray->base_addr;
1378627f7eb2Smrg
1379627f7eb2Smrg /* Now that everything is set up, we perform the multiplication
1380627f7eb2Smrg itself. */
1381627f7eb2Smrg
1382627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1383627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
1384627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
1385627f7eb2Smrg
1386627f7eb2Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1387627f7eb2Smrg && (bxstride == 1 || bystride == 1)
1388627f7eb2Smrg && (((float) xcount) * ((float) ycount) * ((float) count)
1389627f7eb2Smrg > POW3(blas_limit)))
1390627f7eb2Smrg {
1391627f7eb2Smrg const int m = xcount, n = ycount, k = count, ldc = rystride;
1392627f7eb2Smrg const GFC_COMPLEX_4 one = 1, zero = 0;
1393627f7eb2Smrg const int lda = (axstride == 1) ? aystride : axstride,
1394627f7eb2Smrg ldb = (bxstride == 1) ? bystride : bxstride;
1395627f7eb2Smrg
1396627f7eb2Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1397627f7eb2Smrg {
1398627f7eb2Smrg assert (gemm != NULL);
1399627f7eb2Smrg const char *transa, *transb;
1400627f7eb2Smrg if (try_blas & 2)
1401627f7eb2Smrg transa = "C";
1402627f7eb2Smrg else
1403627f7eb2Smrg transa = axstride == 1 ? "N" : "T";
1404627f7eb2Smrg
1405627f7eb2Smrg if (try_blas & 4)
1406627f7eb2Smrg transb = "C";
1407627f7eb2Smrg else
1408627f7eb2Smrg transb = bxstride == 1 ? "N" : "T";
1409627f7eb2Smrg
1410627f7eb2Smrg gemm (transa, transb , &m,
1411627f7eb2Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1412627f7eb2Smrg &ldc, 1, 1);
1413627f7eb2Smrg return;
1414627f7eb2Smrg }
1415627f7eb2Smrg }
1416627f7eb2Smrg
1417*4c3eb207Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1
1418*4c3eb207Smrg && GFC_DESCRIPTOR_RANK (b) != 1)
1419627f7eb2Smrg {
1420627f7eb2Smrg /* This block of code implements a tuned matmul, derived from
1421627f7eb2Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1422627f7eb2Smrg
1423627f7eb2Smrg Bo Kagstrom and Per Ling
1424627f7eb2Smrg Department of Computing Science
1425627f7eb2Smrg Umea University
1426627f7eb2Smrg S-901 87 Umea, Sweden
1427627f7eb2Smrg
1428627f7eb2Smrg from netlib.org, translated to C, and modified for matmul.m4. */
1429627f7eb2Smrg
1430627f7eb2Smrg const GFC_COMPLEX_4 *a, *b;
1431627f7eb2Smrg GFC_COMPLEX_4 *c;
1432627f7eb2Smrg const index_type m = xcount, n = ycount, k = count;
1433627f7eb2Smrg
1434627f7eb2Smrg /* System generated locals */
1435627f7eb2Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1436627f7eb2Smrg i1, i2, i3, i4, i5, i6;
1437627f7eb2Smrg
1438627f7eb2Smrg /* Local variables */
1439627f7eb2Smrg GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
1440627f7eb2Smrg f13, f14, f23, f24, f33, f34, f43, f44;
1441627f7eb2Smrg index_type i, j, l, ii, jj, ll;
1442627f7eb2Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1443627f7eb2Smrg GFC_COMPLEX_4 *t1;
1444627f7eb2Smrg
1445627f7eb2Smrg a = abase;
1446627f7eb2Smrg b = bbase;
1447627f7eb2Smrg c = retarray->base_addr;
1448627f7eb2Smrg
1449627f7eb2Smrg /* Parameter adjustments */
1450627f7eb2Smrg c_dim1 = rystride;
1451627f7eb2Smrg c_offset = 1 + c_dim1;
1452627f7eb2Smrg c -= c_offset;
1453627f7eb2Smrg a_dim1 = aystride;
1454627f7eb2Smrg a_offset = 1 + a_dim1;
1455627f7eb2Smrg a -= a_offset;
1456627f7eb2Smrg b_dim1 = bystride;
1457627f7eb2Smrg b_offset = 1 + b_dim1;
1458627f7eb2Smrg b -= b_offset;
1459627f7eb2Smrg
1460627f7eb2Smrg /* Empty c first. */
1461627f7eb2Smrg for (j=1; j<=n; j++)
1462627f7eb2Smrg for (i=1; i<=m; i++)
1463627f7eb2Smrg c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
1464627f7eb2Smrg
1465627f7eb2Smrg /* Early exit if possible */
1466627f7eb2Smrg if (m == 0 || n == 0 || k == 0)
1467627f7eb2Smrg return;
1468627f7eb2Smrg
1469627f7eb2Smrg /* Adjust size of t1 to what is needed. */
1470627f7eb2Smrg index_type t1_dim, a_sz;
1471627f7eb2Smrg if (aystride == 1)
1472627f7eb2Smrg a_sz = rystride;
1473627f7eb2Smrg else
1474627f7eb2Smrg a_sz = a_dim1;
1475627f7eb2Smrg
1476627f7eb2Smrg t1_dim = a_sz * 256 + b_dim1;
1477627f7eb2Smrg if (t1_dim > 65536)
1478627f7eb2Smrg t1_dim = 65536;
1479627f7eb2Smrg
1480627f7eb2Smrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
1481627f7eb2Smrg
1482627f7eb2Smrg /* Start turning the crank. */
1483627f7eb2Smrg i1 = n;
1484627f7eb2Smrg for (jj = 1; jj <= i1; jj += 512)
1485627f7eb2Smrg {
1486627f7eb2Smrg /* Computing MIN */
1487627f7eb2Smrg i2 = 512;
1488627f7eb2Smrg i3 = n - jj + 1;
1489627f7eb2Smrg jsec = min(i2,i3);
1490627f7eb2Smrg ujsec = jsec - jsec % 4;
1491627f7eb2Smrg i2 = k;
1492627f7eb2Smrg for (ll = 1; ll <= i2; ll += 256)
1493627f7eb2Smrg {
1494627f7eb2Smrg /* Computing MIN */
1495627f7eb2Smrg i3 = 256;
1496627f7eb2Smrg i4 = k - ll + 1;
1497627f7eb2Smrg lsec = min(i3,i4);
1498627f7eb2Smrg ulsec = lsec - lsec % 2;
1499627f7eb2Smrg
1500627f7eb2Smrg i3 = m;
1501627f7eb2Smrg for (ii = 1; ii <= i3; ii += 256)
1502627f7eb2Smrg {
1503627f7eb2Smrg /* Computing MIN */
1504627f7eb2Smrg i4 = 256;
1505627f7eb2Smrg i5 = m - ii + 1;
1506627f7eb2Smrg isec = min(i4,i5);
1507627f7eb2Smrg uisec = isec - isec % 2;
1508627f7eb2Smrg i4 = ll + ulsec - 1;
1509627f7eb2Smrg for (l = ll; l <= i4; l += 2)
1510627f7eb2Smrg {
1511627f7eb2Smrg i5 = ii + uisec - 1;
1512627f7eb2Smrg for (i = ii; i <= i5; i += 2)
1513627f7eb2Smrg {
1514627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1515627f7eb2Smrg a[i + l * a_dim1];
1516627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1517627f7eb2Smrg a[i + (l + 1) * a_dim1];
1518627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1519627f7eb2Smrg a[i + 1 + l * a_dim1];
1520627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1521627f7eb2Smrg a[i + 1 + (l + 1) * a_dim1];
1522627f7eb2Smrg }
1523627f7eb2Smrg if (uisec < isec)
1524627f7eb2Smrg {
1525627f7eb2Smrg t1[l - ll + 1 + (isec << 8) - 257] =
1526627f7eb2Smrg a[ii + isec - 1 + l * a_dim1];
1527627f7eb2Smrg t1[l - ll + 2 + (isec << 8) - 257] =
1528627f7eb2Smrg a[ii + isec - 1 + (l + 1) * a_dim1];
1529627f7eb2Smrg }
1530627f7eb2Smrg }
1531627f7eb2Smrg if (ulsec < lsec)
1532627f7eb2Smrg {
1533627f7eb2Smrg i4 = ii + isec - 1;
1534627f7eb2Smrg for (i = ii; i<= i4; ++i)
1535627f7eb2Smrg {
1536627f7eb2Smrg t1[lsec + ((i - ii + 1) << 8) - 257] =
1537627f7eb2Smrg a[i + (ll + lsec - 1) * a_dim1];
1538627f7eb2Smrg }
1539627f7eb2Smrg }
1540627f7eb2Smrg
1541627f7eb2Smrg uisec = isec - isec % 4;
1542627f7eb2Smrg i4 = jj + ujsec - 1;
1543627f7eb2Smrg for (j = jj; j <= i4; j += 4)
1544627f7eb2Smrg {
1545627f7eb2Smrg i5 = ii + uisec - 1;
1546627f7eb2Smrg for (i = ii; i <= i5; i += 4)
1547627f7eb2Smrg {
1548627f7eb2Smrg f11 = c[i + j * c_dim1];
1549627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
1550627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
1551627f7eb2Smrg f22 = c[i + 1 + (j + 1) * c_dim1];
1552627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
1553627f7eb2Smrg f23 = c[i + 1 + (j + 2) * c_dim1];
1554627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
1555627f7eb2Smrg f24 = c[i + 1 + (j + 3) * c_dim1];
1556627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
1557627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
1558627f7eb2Smrg f32 = c[i + 2 + (j + 1) * c_dim1];
1559627f7eb2Smrg f42 = c[i + 3 + (j + 1) * c_dim1];
1560627f7eb2Smrg f33 = c[i + 2 + (j + 2) * c_dim1];
1561627f7eb2Smrg f43 = c[i + 3 + (j + 2) * c_dim1];
1562627f7eb2Smrg f34 = c[i + 2 + (j + 3) * c_dim1];
1563627f7eb2Smrg f44 = c[i + 3 + (j + 3) * c_dim1];
1564627f7eb2Smrg i6 = ll + lsec - 1;
1565627f7eb2Smrg for (l = ll; l <= i6; ++l)
1566627f7eb2Smrg {
1567627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1568627f7eb2Smrg * b[l + j * b_dim1];
1569627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1570627f7eb2Smrg * b[l + j * b_dim1];
1571627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1572627f7eb2Smrg * b[l + (j + 1) * b_dim1];
1573627f7eb2Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1574627f7eb2Smrg * b[l + (j + 1) * b_dim1];
1575627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1576627f7eb2Smrg * b[l + (j + 2) * b_dim1];
1577627f7eb2Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1578627f7eb2Smrg * b[l + (j + 2) * b_dim1];
1579627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1580627f7eb2Smrg * b[l + (j + 3) * b_dim1];
1581627f7eb2Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1582627f7eb2Smrg * b[l + (j + 3) * b_dim1];
1583627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1584627f7eb2Smrg * b[l + j * b_dim1];
1585627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1586627f7eb2Smrg * b[l + j * b_dim1];
1587627f7eb2Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1588627f7eb2Smrg * b[l + (j + 1) * b_dim1];
1589627f7eb2Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1590627f7eb2Smrg * b[l + (j + 1) * b_dim1];
1591627f7eb2Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1592627f7eb2Smrg * b[l + (j + 2) * b_dim1];
1593627f7eb2Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1594627f7eb2Smrg * b[l + (j + 2) * b_dim1];
1595627f7eb2Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1596627f7eb2Smrg * b[l + (j + 3) * b_dim1];
1597627f7eb2Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1598627f7eb2Smrg * b[l + (j + 3) * b_dim1];
1599627f7eb2Smrg }
1600627f7eb2Smrg c[i + j * c_dim1] = f11;
1601627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
1602627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
1603627f7eb2Smrg c[i + 1 + (j + 1) * c_dim1] = f22;
1604627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
1605627f7eb2Smrg c[i + 1 + (j + 2) * c_dim1] = f23;
1606627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
1607627f7eb2Smrg c[i + 1 + (j + 3) * c_dim1] = f24;
1608627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
1609627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
1610627f7eb2Smrg c[i + 2 + (j + 1) * c_dim1] = f32;
1611627f7eb2Smrg c[i + 3 + (j + 1) * c_dim1] = f42;
1612627f7eb2Smrg c[i + 2 + (j + 2) * c_dim1] = f33;
1613627f7eb2Smrg c[i + 3 + (j + 2) * c_dim1] = f43;
1614627f7eb2Smrg c[i + 2 + (j + 3) * c_dim1] = f34;
1615627f7eb2Smrg c[i + 3 + (j + 3) * c_dim1] = f44;
1616627f7eb2Smrg }
1617627f7eb2Smrg if (uisec < isec)
1618627f7eb2Smrg {
1619627f7eb2Smrg i5 = ii + isec - 1;
1620627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
1621627f7eb2Smrg {
1622627f7eb2Smrg f11 = c[i + j * c_dim1];
1623627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
1624627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
1625627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
1626627f7eb2Smrg i6 = ll + lsec - 1;
1627627f7eb2Smrg for (l = ll; l <= i6; ++l)
1628627f7eb2Smrg {
1629627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1630627f7eb2Smrg 257] * b[l + j * b_dim1];
1631627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1632627f7eb2Smrg 257] * b[l + (j + 1) * b_dim1];
1633627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1634627f7eb2Smrg 257] * b[l + (j + 2) * b_dim1];
1635627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1636627f7eb2Smrg 257] * b[l + (j + 3) * b_dim1];
1637627f7eb2Smrg }
1638627f7eb2Smrg c[i + j * c_dim1] = f11;
1639627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
1640627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
1641627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
1642627f7eb2Smrg }
1643627f7eb2Smrg }
1644627f7eb2Smrg }
1645627f7eb2Smrg if (ujsec < jsec)
1646627f7eb2Smrg {
1647627f7eb2Smrg i4 = jj + jsec - 1;
1648627f7eb2Smrg for (j = jj + ujsec; j <= i4; ++j)
1649627f7eb2Smrg {
1650627f7eb2Smrg i5 = ii + uisec - 1;
1651627f7eb2Smrg for (i = ii; i <= i5; i += 4)
1652627f7eb2Smrg {
1653627f7eb2Smrg f11 = c[i + j * c_dim1];
1654627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
1655627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
1656627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
1657627f7eb2Smrg i6 = ll + lsec - 1;
1658627f7eb2Smrg for (l = ll; l <= i6; ++l)
1659627f7eb2Smrg {
1660627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1661627f7eb2Smrg 257] * b[l + j * b_dim1];
1662627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1663627f7eb2Smrg 257] * b[l + j * b_dim1];
1664627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1665627f7eb2Smrg 257] * b[l + j * b_dim1];
1666627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1667627f7eb2Smrg 257] * b[l + j * b_dim1];
1668627f7eb2Smrg }
1669627f7eb2Smrg c[i + j * c_dim1] = f11;
1670627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
1671627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
1672627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
1673627f7eb2Smrg }
1674627f7eb2Smrg i5 = ii + isec - 1;
1675627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
1676627f7eb2Smrg {
1677627f7eb2Smrg f11 = c[i + j * c_dim1];
1678627f7eb2Smrg i6 = ll + lsec - 1;
1679627f7eb2Smrg for (l = ll; l <= i6; ++l)
1680627f7eb2Smrg {
1681627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1682627f7eb2Smrg 257] * b[l + j * b_dim1];
1683627f7eb2Smrg }
1684627f7eb2Smrg c[i + j * c_dim1] = f11;
1685627f7eb2Smrg }
1686627f7eb2Smrg }
1687627f7eb2Smrg }
1688627f7eb2Smrg }
1689627f7eb2Smrg }
1690627f7eb2Smrg }
1691627f7eb2Smrg free(t1);
1692627f7eb2Smrg return;
1693627f7eb2Smrg }
1694627f7eb2Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1695627f7eb2Smrg {
1696627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) != 1)
1697627f7eb2Smrg {
1698627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
1699627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
1700627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
1701627f7eb2Smrg GFC_COMPLEX_4 s;
1702627f7eb2Smrg
1703627f7eb2Smrg for (y = 0; y < ycount; y++)
1704627f7eb2Smrg {
1705627f7eb2Smrg bbase_y = &bbase[y*bystride];
1706627f7eb2Smrg dest_y = &dest[y*rystride];
1707627f7eb2Smrg for (x = 0; x < xcount; x++)
1708627f7eb2Smrg {
1709627f7eb2Smrg abase_x = &abase[x*axstride];
1710627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
1711627f7eb2Smrg for (n = 0; n < count; n++)
1712627f7eb2Smrg s += abase_x[n] * bbase_y[n];
1713627f7eb2Smrg dest_y[x] = s;
1714627f7eb2Smrg }
1715627f7eb2Smrg }
1716627f7eb2Smrg }
1717627f7eb2Smrg else
1718627f7eb2Smrg {
1719627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
1720627f7eb2Smrg GFC_COMPLEX_4 s;
1721627f7eb2Smrg
1722627f7eb2Smrg for (y = 0; y < ycount; y++)
1723627f7eb2Smrg {
1724627f7eb2Smrg bbase_y = &bbase[y*bystride];
1725627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
1726627f7eb2Smrg for (n = 0; n < count; n++)
1727627f7eb2Smrg s += abase[n*axstride] * bbase_y[n];
1728627f7eb2Smrg dest[y*rystride] = s;
1729627f7eb2Smrg }
1730627f7eb2Smrg }
1731627f7eb2Smrg }
1732627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1)
1733627f7eb2Smrg {
1734627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
1735627f7eb2Smrg GFC_COMPLEX_4 s;
1736627f7eb2Smrg
1737627f7eb2Smrg for (y = 0; y < ycount; y++)
1738627f7eb2Smrg {
1739627f7eb2Smrg bbase_y = &bbase[y*bystride];
1740627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
1741627f7eb2Smrg for (n = 0; n < count; n++)
1742627f7eb2Smrg s += abase[n*axstride] * bbase_y[n*bxstride];
1743627f7eb2Smrg dest[y*rxstride] = s;
1744627f7eb2Smrg }
1745627f7eb2Smrg }
1746*4c3eb207Smrg else if (axstride < aystride)
1747*4c3eb207Smrg {
1748*4c3eb207Smrg for (y = 0; y < ycount; y++)
1749*4c3eb207Smrg for (x = 0; x < xcount; x++)
1750*4c3eb207Smrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
1751*4c3eb207Smrg
1752*4c3eb207Smrg for (y = 0; y < ycount; y++)
1753*4c3eb207Smrg for (n = 0; n < count; n++)
1754*4c3eb207Smrg for (x = 0; x < xcount; x++)
1755*4c3eb207Smrg /* dest[x,y] += a[x,n] * b[n,y] */
1756*4c3eb207Smrg dest[x*rxstride + y*rystride] +=
1757*4c3eb207Smrg abase[x*axstride + n*aystride] *
1758*4c3eb207Smrg bbase[n*bxstride + y*bystride];
1759*4c3eb207Smrg }
1760627f7eb2Smrg else
1761627f7eb2Smrg {
1762627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
1763627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
1764627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
1765627f7eb2Smrg GFC_COMPLEX_4 s;
1766627f7eb2Smrg
1767627f7eb2Smrg for (y = 0; y < ycount; y++)
1768627f7eb2Smrg {
1769627f7eb2Smrg bbase_y = &bbase[y*bystride];
1770627f7eb2Smrg dest_y = &dest[y*rystride];
1771627f7eb2Smrg for (x = 0; x < xcount; x++)
1772627f7eb2Smrg {
1773627f7eb2Smrg abase_x = &abase[x*axstride];
1774627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
1775627f7eb2Smrg for (n = 0; n < count; n++)
1776627f7eb2Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride];
1777627f7eb2Smrg dest_y[x*rxstride] = s;
1778627f7eb2Smrg }
1779627f7eb2Smrg }
1780627f7eb2Smrg }
1781627f7eb2Smrg }
1782627f7eb2Smrg #undef POW3
1783627f7eb2Smrg #undef min
1784627f7eb2Smrg #undef max
1785627f7eb2Smrg
1786627f7eb2Smrg #endif /* HAVE_AVX512F */
1787627f7eb2Smrg
1788627f7eb2Smrg /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
1789627f7eb2Smrg
1790627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1791627f7eb2Smrg void
1792627f7eb2Smrg matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
1793627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1794627f7eb2Smrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1795627f7eb2Smrg internal_proto(matmul_c4_avx128_fma3);
1796627f7eb2Smrg #endif
1797627f7eb2Smrg
1798627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1799627f7eb2Smrg void
1800627f7eb2Smrg matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
1801627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1802627f7eb2Smrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1803627f7eb2Smrg internal_proto(matmul_c4_avx128_fma4);
1804627f7eb2Smrg #endif
1805627f7eb2Smrg
1806627f7eb2Smrg /* Function to fall back to if there is no special processor-specific version. */
1807627f7eb2Smrg static void
matmul_c4_vanilla(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1808627f7eb2Smrg matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
1809627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1810627f7eb2Smrg int blas_limit, blas_call gemm)
1811627f7eb2Smrg {
1812627f7eb2Smrg const GFC_COMPLEX_4 * restrict abase;
1813627f7eb2Smrg const GFC_COMPLEX_4 * restrict bbase;
1814627f7eb2Smrg GFC_COMPLEX_4 * restrict dest;
1815627f7eb2Smrg
1816627f7eb2Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1817627f7eb2Smrg index_type x, y, n, count, xcount, ycount;
1818627f7eb2Smrg
1819627f7eb2Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2
1820627f7eb2Smrg || GFC_DESCRIPTOR_RANK (b) == 2);
1821627f7eb2Smrg
1822627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1823627f7eb2Smrg
1824627f7eb2Smrg Either A or B (but not both) can be rank 1:
1825627f7eb2Smrg
1826627f7eb2Smrg o One-dimensional argument A is implicitly treated as a row matrix
1827627f7eb2Smrg dimensioned [1,count], so xcount=1.
1828627f7eb2Smrg
1829627f7eb2Smrg o One-dimensional argument B is implicitly treated as a column matrix
1830627f7eb2Smrg dimensioned [count, 1], so ycount=1.
1831627f7eb2Smrg */
1832627f7eb2Smrg
1833627f7eb2Smrg if (retarray->base_addr == NULL)
1834627f7eb2Smrg {
1835627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
1836627f7eb2Smrg {
1837627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
1838627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1839627f7eb2Smrg }
1840627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
1841627f7eb2Smrg {
1842627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
1843627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1844627f7eb2Smrg }
1845627f7eb2Smrg else
1846627f7eb2Smrg {
1847627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
1848627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1849627f7eb2Smrg
1850627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[1], 0,
1851627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1852627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(retarray,0));
1853627f7eb2Smrg }
1854627f7eb2Smrg
1855627f7eb2Smrg retarray->base_addr
1856627f7eb2Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
1857627f7eb2Smrg retarray->offset = 0;
1858627f7eb2Smrg }
1859627f7eb2Smrg else if (unlikely (compile_options.bounds_check))
1860627f7eb2Smrg {
1861627f7eb2Smrg index_type ret_extent, arg_extent;
1862627f7eb2Smrg
1863627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
1864627f7eb2Smrg {
1865627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1866627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1867627f7eb2Smrg if (arg_extent != ret_extent)
1868627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
1869627f7eb2Smrg "array (%ld/%ld) ",
1870627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
1871627f7eb2Smrg }
1872627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
1873627f7eb2Smrg {
1874627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1875627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1876627f7eb2Smrg if (arg_extent != ret_extent)
1877627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
1878627f7eb2Smrg "array (%ld/%ld) ",
1879627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
1880627f7eb2Smrg }
1881627f7eb2Smrg else
1882627f7eb2Smrg {
1883627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1884627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1885627f7eb2Smrg if (arg_extent != ret_extent)
1886627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
1887627f7eb2Smrg "array (%ld/%ld) ",
1888627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
1889627f7eb2Smrg
1890627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1891627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1892627f7eb2Smrg if (arg_extent != ret_extent)
1893627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 2 of "
1894627f7eb2Smrg "array (%ld/%ld) ",
1895627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
1896627f7eb2Smrg }
1897627f7eb2Smrg }
1898627f7eb2Smrg
1899627f7eb2Smrg
1900627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1901627f7eb2Smrg {
1902627f7eb2Smrg /* One-dimensional result may be addressed in the code below
1903627f7eb2Smrg either as a row or a column matrix. We want both cases to
1904627f7eb2Smrg work. */
1905627f7eb2Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1906627f7eb2Smrg }
1907627f7eb2Smrg else
1908627f7eb2Smrg {
1909627f7eb2Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1910627f7eb2Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1911627f7eb2Smrg }
1912627f7eb2Smrg
1913627f7eb2Smrg
1914627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
1915627f7eb2Smrg {
1916627f7eb2Smrg /* Treat it as a a row matrix A[1,count]. */
1917627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1918627f7eb2Smrg aystride = 1;
1919627f7eb2Smrg
1920627f7eb2Smrg xcount = 1;
1921627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,0);
1922627f7eb2Smrg }
1923627f7eb2Smrg else
1924627f7eb2Smrg {
1925627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1926627f7eb2Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1927627f7eb2Smrg
1928627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,1);
1929627f7eb2Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1930627f7eb2Smrg }
1931627f7eb2Smrg
1932627f7eb2Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1933627f7eb2Smrg {
1934627f7eb2Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1935627f7eb2Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1936627f7eb2Smrg "in dimension 1: is %ld, should be %ld",
1937627f7eb2Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1938627f7eb2Smrg }
1939627f7eb2Smrg
1940627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (b) == 1)
1941627f7eb2Smrg {
1942627f7eb2Smrg /* Treat it as a column matrix B[count,1] */
1943627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1944627f7eb2Smrg
1945627f7eb2Smrg /* bystride should never be used for 1-dimensional b.
1946627f7eb2Smrg The value is only used for calculation of the
1947627f7eb2Smrg memory by the buffer. */
1948627f7eb2Smrg bystride = 256;
1949627f7eb2Smrg ycount = 1;
1950627f7eb2Smrg }
1951627f7eb2Smrg else
1952627f7eb2Smrg {
1953627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1954627f7eb2Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1955627f7eb2Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1956627f7eb2Smrg }
1957627f7eb2Smrg
1958627f7eb2Smrg abase = a->base_addr;
1959627f7eb2Smrg bbase = b->base_addr;
1960627f7eb2Smrg dest = retarray->base_addr;
1961627f7eb2Smrg
1962627f7eb2Smrg /* Now that everything is set up, we perform the multiplication
1963627f7eb2Smrg itself. */
1964627f7eb2Smrg
1965627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1966627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
1967627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
1968627f7eb2Smrg
1969627f7eb2Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1970627f7eb2Smrg && (bxstride == 1 || bystride == 1)
1971627f7eb2Smrg && (((float) xcount) * ((float) ycount) * ((float) count)
1972627f7eb2Smrg > POW3(blas_limit)))
1973627f7eb2Smrg {
1974627f7eb2Smrg const int m = xcount, n = ycount, k = count, ldc = rystride;
1975627f7eb2Smrg const GFC_COMPLEX_4 one = 1, zero = 0;
1976627f7eb2Smrg const int lda = (axstride == 1) ? aystride : axstride,
1977627f7eb2Smrg ldb = (bxstride == 1) ? bystride : bxstride;
1978627f7eb2Smrg
1979627f7eb2Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1980627f7eb2Smrg {
1981627f7eb2Smrg assert (gemm != NULL);
1982627f7eb2Smrg const char *transa, *transb;
1983627f7eb2Smrg if (try_blas & 2)
1984627f7eb2Smrg transa = "C";
1985627f7eb2Smrg else
1986627f7eb2Smrg transa = axstride == 1 ? "N" : "T";
1987627f7eb2Smrg
1988627f7eb2Smrg if (try_blas & 4)
1989627f7eb2Smrg transb = "C";
1990627f7eb2Smrg else
1991627f7eb2Smrg transb = bxstride == 1 ? "N" : "T";
1992627f7eb2Smrg
1993627f7eb2Smrg gemm (transa, transb , &m,
1994627f7eb2Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1995627f7eb2Smrg &ldc, 1, 1);
1996627f7eb2Smrg return;
1997627f7eb2Smrg }
1998627f7eb2Smrg }
1999627f7eb2Smrg
2000*4c3eb207Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1
2001*4c3eb207Smrg && GFC_DESCRIPTOR_RANK (b) != 1)
2002627f7eb2Smrg {
2003627f7eb2Smrg /* This block of code implements a tuned matmul, derived from
2004627f7eb2Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2005627f7eb2Smrg
2006627f7eb2Smrg Bo Kagstrom and Per Ling
2007627f7eb2Smrg Department of Computing Science
2008627f7eb2Smrg Umea University
2009627f7eb2Smrg S-901 87 Umea, Sweden
2010627f7eb2Smrg
2011627f7eb2Smrg from netlib.org, translated to C, and modified for matmul.m4. */
2012627f7eb2Smrg
2013627f7eb2Smrg const GFC_COMPLEX_4 *a, *b;
2014627f7eb2Smrg GFC_COMPLEX_4 *c;
2015627f7eb2Smrg const index_type m = xcount, n = ycount, k = count;
2016627f7eb2Smrg
2017627f7eb2Smrg /* System generated locals */
2018627f7eb2Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2019627f7eb2Smrg i1, i2, i3, i4, i5, i6;
2020627f7eb2Smrg
2021627f7eb2Smrg /* Local variables */
2022627f7eb2Smrg GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
2023627f7eb2Smrg f13, f14, f23, f24, f33, f34, f43, f44;
2024627f7eb2Smrg index_type i, j, l, ii, jj, ll;
2025627f7eb2Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2026627f7eb2Smrg GFC_COMPLEX_4 *t1;
2027627f7eb2Smrg
2028627f7eb2Smrg a = abase;
2029627f7eb2Smrg b = bbase;
2030627f7eb2Smrg c = retarray->base_addr;
2031627f7eb2Smrg
2032627f7eb2Smrg /* Parameter adjustments */
2033627f7eb2Smrg c_dim1 = rystride;
2034627f7eb2Smrg c_offset = 1 + c_dim1;
2035627f7eb2Smrg c -= c_offset;
2036627f7eb2Smrg a_dim1 = aystride;
2037627f7eb2Smrg a_offset = 1 + a_dim1;
2038627f7eb2Smrg a -= a_offset;
2039627f7eb2Smrg b_dim1 = bystride;
2040627f7eb2Smrg b_offset = 1 + b_dim1;
2041627f7eb2Smrg b -= b_offset;
2042627f7eb2Smrg
2043627f7eb2Smrg /* Empty c first. */
2044627f7eb2Smrg for (j=1; j<=n; j++)
2045627f7eb2Smrg for (i=1; i<=m; i++)
2046627f7eb2Smrg c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
2047627f7eb2Smrg
2048627f7eb2Smrg /* Early exit if possible */
2049627f7eb2Smrg if (m == 0 || n == 0 || k == 0)
2050627f7eb2Smrg return;
2051627f7eb2Smrg
2052627f7eb2Smrg /* Adjust size of t1 to what is needed. */
2053627f7eb2Smrg index_type t1_dim, a_sz;
2054627f7eb2Smrg if (aystride == 1)
2055627f7eb2Smrg a_sz = rystride;
2056627f7eb2Smrg else
2057627f7eb2Smrg a_sz = a_dim1;
2058627f7eb2Smrg
2059627f7eb2Smrg t1_dim = a_sz * 256 + b_dim1;
2060627f7eb2Smrg if (t1_dim > 65536)
2061627f7eb2Smrg t1_dim = 65536;
2062627f7eb2Smrg
2063627f7eb2Smrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
2064627f7eb2Smrg
2065627f7eb2Smrg /* Start turning the crank. */
2066627f7eb2Smrg i1 = n;
2067627f7eb2Smrg for (jj = 1; jj <= i1; jj += 512)
2068627f7eb2Smrg {
2069627f7eb2Smrg /* Computing MIN */
2070627f7eb2Smrg i2 = 512;
2071627f7eb2Smrg i3 = n - jj + 1;
2072627f7eb2Smrg jsec = min(i2,i3);
2073627f7eb2Smrg ujsec = jsec - jsec % 4;
2074627f7eb2Smrg i2 = k;
2075627f7eb2Smrg for (ll = 1; ll <= i2; ll += 256)
2076627f7eb2Smrg {
2077627f7eb2Smrg /* Computing MIN */
2078627f7eb2Smrg i3 = 256;
2079627f7eb2Smrg i4 = k - ll + 1;
2080627f7eb2Smrg lsec = min(i3,i4);
2081627f7eb2Smrg ulsec = lsec - lsec % 2;
2082627f7eb2Smrg
2083627f7eb2Smrg i3 = m;
2084627f7eb2Smrg for (ii = 1; ii <= i3; ii += 256)
2085627f7eb2Smrg {
2086627f7eb2Smrg /* Computing MIN */
2087627f7eb2Smrg i4 = 256;
2088627f7eb2Smrg i5 = m - ii + 1;
2089627f7eb2Smrg isec = min(i4,i5);
2090627f7eb2Smrg uisec = isec - isec % 2;
2091627f7eb2Smrg i4 = ll + ulsec - 1;
2092627f7eb2Smrg for (l = ll; l <= i4; l += 2)
2093627f7eb2Smrg {
2094627f7eb2Smrg i5 = ii + uisec - 1;
2095627f7eb2Smrg for (i = ii; i <= i5; i += 2)
2096627f7eb2Smrg {
2097627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2098627f7eb2Smrg a[i + l * a_dim1];
2099627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2100627f7eb2Smrg a[i + (l + 1) * a_dim1];
2101627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2102627f7eb2Smrg a[i + 1 + l * a_dim1];
2103627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2104627f7eb2Smrg a[i + 1 + (l + 1) * a_dim1];
2105627f7eb2Smrg }
2106627f7eb2Smrg if (uisec < isec)
2107627f7eb2Smrg {
2108627f7eb2Smrg t1[l - ll + 1 + (isec << 8) - 257] =
2109627f7eb2Smrg a[ii + isec - 1 + l * a_dim1];
2110627f7eb2Smrg t1[l - ll + 2 + (isec << 8) - 257] =
2111627f7eb2Smrg a[ii + isec - 1 + (l + 1) * a_dim1];
2112627f7eb2Smrg }
2113627f7eb2Smrg }
2114627f7eb2Smrg if (ulsec < lsec)
2115627f7eb2Smrg {
2116627f7eb2Smrg i4 = ii + isec - 1;
2117627f7eb2Smrg for (i = ii; i<= i4; ++i)
2118627f7eb2Smrg {
2119627f7eb2Smrg t1[lsec + ((i - ii + 1) << 8) - 257] =
2120627f7eb2Smrg a[i + (ll + lsec - 1) * a_dim1];
2121627f7eb2Smrg }
2122627f7eb2Smrg }
2123627f7eb2Smrg
2124627f7eb2Smrg uisec = isec - isec % 4;
2125627f7eb2Smrg i4 = jj + ujsec - 1;
2126627f7eb2Smrg for (j = jj; j <= i4; j += 4)
2127627f7eb2Smrg {
2128627f7eb2Smrg i5 = ii + uisec - 1;
2129627f7eb2Smrg for (i = ii; i <= i5; i += 4)
2130627f7eb2Smrg {
2131627f7eb2Smrg f11 = c[i + j * c_dim1];
2132627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
2133627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
2134627f7eb2Smrg f22 = c[i + 1 + (j + 1) * c_dim1];
2135627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
2136627f7eb2Smrg f23 = c[i + 1 + (j + 2) * c_dim1];
2137627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
2138627f7eb2Smrg f24 = c[i + 1 + (j + 3) * c_dim1];
2139627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
2140627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
2141627f7eb2Smrg f32 = c[i + 2 + (j + 1) * c_dim1];
2142627f7eb2Smrg f42 = c[i + 3 + (j + 1) * c_dim1];
2143627f7eb2Smrg f33 = c[i + 2 + (j + 2) * c_dim1];
2144627f7eb2Smrg f43 = c[i + 3 + (j + 2) * c_dim1];
2145627f7eb2Smrg f34 = c[i + 2 + (j + 3) * c_dim1];
2146627f7eb2Smrg f44 = c[i + 3 + (j + 3) * c_dim1];
2147627f7eb2Smrg i6 = ll + lsec - 1;
2148627f7eb2Smrg for (l = ll; l <= i6; ++l)
2149627f7eb2Smrg {
2150627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151627f7eb2Smrg * b[l + j * b_dim1];
2152627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153627f7eb2Smrg * b[l + j * b_dim1];
2154627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155627f7eb2Smrg * b[l + (j + 1) * b_dim1];
2156627f7eb2Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157627f7eb2Smrg * b[l + (j + 1) * b_dim1];
2158627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159627f7eb2Smrg * b[l + (j + 2) * b_dim1];
2160627f7eb2Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161627f7eb2Smrg * b[l + (j + 2) * b_dim1];
2162627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2163627f7eb2Smrg * b[l + (j + 3) * b_dim1];
2164627f7eb2Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2165627f7eb2Smrg * b[l + (j + 3) * b_dim1];
2166627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167627f7eb2Smrg * b[l + j * b_dim1];
2168627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169627f7eb2Smrg * b[l + j * b_dim1];
2170627f7eb2Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171627f7eb2Smrg * b[l + (j + 1) * b_dim1];
2172627f7eb2Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173627f7eb2Smrg * b[l + (j + 1) * b_dim1];
2174627f7eb2Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175627f7eb2Smrg * b[l + (j + 2) * b_dim1];
2176627f7eb2Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177627f7eb2Smrg * b[l + (j + 2) * b_dim1];
2178627f7eb2Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2179627f7eb2Smrg * b[l + (j + 3) * b_dim1];
2180627f7eb2Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2181627f7eb2Smrg * b[l + (j + 3) * b_dim1];
2182627f7eb2Smrg }
2183627f7eb2Smrg c[i + j * c_dim1] = f11;
2184627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
2185627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
2186627f7eb2Smrg c[i + 1 + (j + 1) * c_dim1] = f22;
2187627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
2188627f7eb2Smrg c[i + 1 + (j + 2) * c_dim1] = f23;
2189627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
2190627f7eb2Smrg c[i + 1 + (j + 3) * c_dim1] = f24;
2191627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
2192627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
2193627f7eb2Smrg c[i + 2 + (j + 1) * c_dim1] = f32;
2194627f7eb2Smrg c[i + 3 + (j + 1) * c_dim1] = f42;
2195627f7eb2Smrg c[i + 2 + (j + 2) * c_dim1] = f33;
2196627f7eb2Smrg c[i + 3 + (j + 2) * c_dim1] = f43;
2197627f7eb2Smrg c[i + 2 + (j + 3) * c_dim1] = f34;
2198627f7eb2Smrg c[i + 3 + (j + 3) * c_dim1] = f44;
2199627f7eb2Smrg }
2200627f7eb2Smrg if (uisec < isec)
2201627f7eb2Smrg {
2202627f7eb2Smrg i5 = ii + isec - 1;
2203627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
2204627f7eb2Smrg {
2205627f7eb2Smrg f11 = c[i + j * c_dim1];
2206627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
2207627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
2208627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
2209627f7eb2Smrg i6 = ll + lsec - 1;
2210627f7eb2Smrg for (l = ll; l <= i6; ++l)
2211627f7eb2Smrg {
2212627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213627f7eb2Smrg 257] * b[l + j * b_dim1];
2214627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215627f7eb2Smrg 257] * b[l + (j + 1) * b_dim1];
2216627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217627f7eb2Smrg 257] * b[l + (j + 2) * b_dim1];
2218627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2219627f7eb2Smrg 257] * b[l + (j + 3) * b_dim1];
2220627f7eb2Smrg }
2221627f7eb2Smrg c[i + j * c_dim1] = f11;
2222627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
2223627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
2224627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
2225627f7eb2Smrg }
2226627f7eb2Smrg }
2227627f7eb2Smrg }
2228627f7eb2Smrg if (ujsec < jsec)
2229627f7eb2Smrg {
2230627f7eb2Smrg i4 = jj + jsec - 1;
2231627f7eb2Smrg for (j = jj + ujsec; j <= i4; ++j)
2232627f7eb2Smrg {
2233627f7eb2Smrg i5 = ii + uisec - 1;
2234627f7eb2Smrg for (i = ii; i <= i5; i += 4)
2235627f7eb2Smrg {
2236627f7eb2Smrg f11 = c[i + j * c_dim1];
2237627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
2238627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
2239627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
2240627f7eb2Smrg i6 = ll + lsec - 1;
2241627f7eb2Smrg for (l = ll; l <= i6; ++l)
2242627f7eb2Smrg {
2243627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2244627f7eb2Smrg 257] * b[l + j * b_dim1];
2245627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2246627f7eb2Smrg 257] * b[l + j * b_dim1];
2247627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2248627f7eb2Smrg 257] * b[l + j * b_dim1];
2249627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2250627f7eb2Smrg 257] * b[l + j * b_dim1];
2251627f7eb2Smrg }
2252627f7eb2Smrg c[i + j * c_dim1] = f11;
2253627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
2254627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
2255627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
2256627f7eb2Smrg }
2257627f7eb2Smrg i5 = ii + isec - 1;
2258627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
2259627f7eb2Smrg {
2260627f7eb2Smrg f11 = c[i + j * c_dim1];
2261627f7eb2Smrg i6 = ll + lsec - 1;
2262627f7eb2Smrg for (l = ll; l <= i6; ++l)
2263627f7eb2Smrg {
2264627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2265627f7eb2Smrg 257] * b[l + j * b_dim1];
2266627f7eb2Smrg }
2267627f7eb2Smrg c[i + j * c_dim1] = f11;
2268627f7eb2Smrg }
2269627f7eb2Smrg }
2270627f7eb2Smrg }
2271627f7eb2Smrg }
2272627f7eb2Smrg }
2273627f7eb2Smrg }
2274627f7eb2Smrg free(t1);
2275627f7eb2Smrg return;
2276627f7eb2Smrg }
2277627f7eb2Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2278627f7eb2Smrg {
2279627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) != 1)
2280627f7eb2Smrg {
2281627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
2282627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
2283627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
2284627f7eb2Smrg GFC_COMPLEX_4 s;
2285627f7eb2Smrg
2286627f7eb2Smrg for (y = 0; y < ycount; y++)
2287627f7eb2Smrg {
2288627f7eb2Smrg bbase_y = &bbase[y*bystride];
2289627f7eb2Smrg dest_y = &dest[y*rystride];
2290627f7eb2Smrg for (x = 0; x < xcount; x++)
2291627f7eb2Smrg {
2292627f7eb2Smrg abase_x = &abase[x*axstride];
2293627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
2294627f7eb2Smrg for (n = 0; n < count; n++)
2295627f7eb2Smrg s += abase_x[n] * bbase_y[n];
2296627f7eb2Smrg dest_y[x] = s;
2297627f7eb2Smrg }
2298627f7eb2Smrg }
2299627f7eb2Smrg }
2300627f7eb2Smrg else
2301627f7eb2Smrg {
2302627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
2303627f7eb2Smrg GFC_COMPLEX_4 s;
2304627f7eb2Smrg
2305627f7eb2Smrg for (y = 0; y < ycount; y++)
2306627f7eb2Smrg {
2307627f7eb2Smrg bbase_y = &bbase[y*bystride];
2308627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
2309627f7eb2Smrg for (n = 0; n < count; n++)
2310627f7eb2Smrg s += abase[n*axstride] * bbase_y[n];
2311627f7eb2Smrg dest[y*rystride] = s;
2312627f7eb2Smrg }
2313627f7eb2Smrg }
2314627f7eb2Smrg }
2315627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1)
2316627f7eb2Smrg {
2317627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
2318627f7eb2Smrg GFC_COMPLEX_4 s;
2319627f7eb2Smrg
2320627f7eb2Smrg for (y = 0; y < ycount; y++)
2321627f7eb2Smrg {
2322627f7eb2Smrg bbase_y = &bbase[y*bystride];
2323627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
2324627f7eb2Smrg for (n = 0; n < count; n++)
2325627f7eb2Smrg s += abase[n*axstride] * bbase_y[n*bxstride];
2326627f7eb2Smrg dest[y*rxstride] = s;
2327627f7eb2Smrg }
2328627f7eb2Smrg }
2329*4c3eb207Smrg else if (axstride < aystride)
2330*4c3eb207Smrg {
2331*4c3eb207Smrg for (y = 0; y < ycount; y++)
2332*4c3eb207Smrg for (x = 0; x < xcount; x++)
2333*4c3eb207Smrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
2334*4c3eb207Smrg
2335*4c3eb207Smrg for (y = 0; y < ycount; y++)
2336*4c3eb207Smrg for (n = 0; n < count; n++)
2337*4c3eb207Smrg for (x = 0; x < xcount; x++)
2338*4c3eb207Smrg /* dest[x,y] += a[x,n] * b[n,y] */
2339*4c3eb207Smrg dest[x*rxstride + y*rystride] +=
2340*4c3eb207Smrg abase[x*axstride + n*aystride] *
2341*4c3eb207Smrg bbase[n*bxstride + y*bystride];
2342*4c3eb207Smrg }
2343627f7eb2Smrg else
2344627f7eb2Smrg {
2345627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
2346627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
2347627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
2348627f7eb2Smrg GFC_COMPLEX_4 s;
2349627f7eb2Smrg
2350627f7eb2Smrg for (y = 0; y < ycount; y++)
2351627f7eb2Smrg {
2352627f7eb2Smrg bbase_y = &bbase[y*bystride];
2353627f7eb2Smrg dest_y = &dest[y*rystride];
2354627f7eb2Smrg for (x = 0; x < xcount; x++)
2355627f7eb2Smrg {
2356627f7eb2Smrg abase_x = &abase[x*axstride];
2357627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
2358627f7eb2Smrg for (n = 0; n < count; n++)
2359627f7eb2Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride];
2360627f7eb2Smrg dest_y[x*rxstride] = s;
2361627f7eb2Smrg }
2362627f7eb2Smrg }
2363627f7eb2Smrg }
2364627f7eb2Smrg }
2365627f7eb2Smrg #undef POW3
2366627f7eb2Smrg #undef min
2367627f7eb2Smrg #undef max
2368627f7eb2Smrg
2369627f7eb2Smrg
2370627f7eb2Smrg /* Compiling main function, with selection code for the processor. */
2371627f7eb2Smrg
2372627f7eb2Smrg /* Currently, this is i386 only. Adjust for other architectures. */
2373627f7eb2Smrg
2374627f7eb2Smrg #include <config/i386/cpuinfo.h>
matmul_c4(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2375627f7eb2Smrg void matmul_c4 (gfc_array_c4 * const restrict retarray,
2376627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2377627f7eb2Smrg int blas_limit, blas_call gemm)
2378627f7eb2Smrg {
2379627f7eb2Smrg static void (*matmul_p) (gfc_array_c4 * const restrict retarray,
2380627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2381627f7eb2Smrg int blas_limit, blas_call gemm);
2382627f7eb2Smrg
2383627f7eb2Smrg void (*matmul_fn) (gfc_array_c4 * const restrict retarray,
2384627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2385627f7eb2Smrg int blas_limit, blas_call gemm);
2386627f7eb2Smrg
2387627f7eb2Smrg matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2388627f7eb2Smrg if (matmul_fn == NULL)
2389627f7eb2Smrg {
2390627f7eb2Smrg matmul_fn = matmul_c4_vanilla;
2391627f7eb2Smrg if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2392627f7eb2Smrg {
2393627f7eb2Smrg /* Run down the available processors in order of preference. */
2394627f7eb2Smrg #ifdef HAVE_AVX512F
2395627f7eb2Smrg if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2396627f7eb2Smrg {
2397627f7eb2Smrg matmul_fn = matmul_c4_avx512f;
2398627f7eb2Smrg goto store;
2399627f7eb2Smrg }
2400627f7eb2Smrg
2401627f7eb2Smrg #endif /* HAVE_AVX512F */
2402627f7eb2Smrg
2403627f7eb2Smrg #ifdef HAVE_AVX2
2404627f7eb2Smrg if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2405627f7eb2Smrg && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2406627f7eb2Smrg {
2407627f7eb2Smrg matmul_fn = matmul_c4_avx2;
2408627f7eb2Smrg goto store;
2409627f7eb2Smrg }
2410627f7eb2Smrg
2411627f7eb2Smrg #endif
2412627f7eb2Smrg
2413627f7eb2Smrg #ifdef HAVE_AVX
2414627f7eb2Smrg if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2415627f7eb2Smrg {
2416627f7eb2Smrg matmul_fn = matmul_c4_avx;
2417627f7eb2Smrg goto store;
2418627f7eb2Smrg }
2419627f7eb2Smrg #endif /* HAVE_AVX */
2420627f7eb2Smrg }
2421627f7eb2Smrg else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2422627f7eb2Smrg {
2423627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2424627f7eb2Smrg if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2425627f7eb2Smrg && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2426627f7eb2Smrg {
2427627f7eb2Smrg matmul_fn = matmul_c4_avx128_fma3;
2428627f7eb2Smrg goto store;
2429627f7eb2Smrg }
2430627f7eb2Smrg #endif
2431627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2432627f7eb2Smrg if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2433627f7eb2Smrg && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2434627f7eb2Smrg {
2435627f7eb2Smrg matmul_fn = matmul_c4_avx128_fma4;
2436627f7eb2Smrg goto store;
2437627f7eb2Smrg }
2438627f7eb2Smrg #endif
2439627f7eb2Smrg
2440627f7eb2Smrg }
2441627f7eb2Smrg store:
2442627f7eb2Smrg __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2443627f7eb2Smrg }
2444627f7eb2Smrg
2445627f7eb2Smrg (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2446627f7eb2Smrg }
2447627f7eb2Smrg
2448627f7eb2Smrg #else /* Just the vanilla function. */
2449627f7eb2Smrg
2450627f7eb2Smrg void
matmul_c4(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2451627f7eb2Smrg matmul_c4 (gfc_array_c4 * const restrict retarray,
2452627f7eb2Smrg gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2453627f7eb2Smrg int blas_limit, blas_call gemm)
2454627f7eb2Smrg {
2455627f7eb2Smrg const GFC_COMPLEX_4 * restrict abase;
2456627f7eb2Smrg const GFC_COMPLEX_4 * restrict bbase;
2457627f7eb2Smrg GFC_COMPLEX_4 * restrict dest;
2458627f7eb2Smrg
2459627f7eb2Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2460627f7eb2Smrg index_type x, y, n, count, xcount, ycount;
2461627f7eb2Smrg
2462627f7eb2Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2
2463627f7eb2Smrg || GFC_DESCRIPTOR_RANK (b) == 2);
2464627f7eb2Smrg
2465627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2466627f7eb2Smrg
2467627f7eb2Smrg Either A or B (but not both) can be rank 1:
2468627f7eb2Smrg
2469627f7eb2Smrg o One-dimensional argument A is implicitly treated as a row matrix
2470627f7eb2Smrg dimensioned [1,count], so xcount=1.
2471627f7eb2Smrg
2472627f7eb2Smrg o One-dimensional argument B is implicitly treated as a column matrix
2473627f7eb2Smrg dimensioned [count, 1], so ycount=1.
2474627f7eb2Smrg */
2475627f7eb2Smrg
2476627f7eb2Smrg if (retarray->base_addr == NULL)
2477627f7eb2Smrg {
2478627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
2479627f7eb2Smrg {
2480627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
2481627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2482627f7eb2Smrg }
2483627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
2484627f7eb2Smrg {
2485627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
2486627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2487627f7eb2Smrg }
2488627f7eb2Smrg else
2489627f7eb2Smrg {
2490627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
2491627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2492627f7eb2Smrg
2493627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[1], 0,
2494627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2495627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(retarray,0));
2496627f7eb2Smrg }
2497627f7eb2Smrg
2498627f7eb2Smrg retarray->base_addr
2499627f7eb2Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
2500627f7eb2Smrg retarray->offset = 0;
2501627f7eb2Smrg }
2502627f7eb2Smrg else if (unlikely (compile_options.bounds_check))
2503627f7eb2Smrg {
2504627f7eb2Smrg index_type ret_extent, arg_extent;
2505627f7eb2Smrg
2506627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
2507627f7eb2Smrg {
2508627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2509627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2510627f7eb2Smrg if (arg_extent != ret_extent)
2511627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
2512627f7eb2Smrg "array (%ld/%ld) ",
2513627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
2514627f7eb2Smrg }
2515627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
2516627f7eb2Smrg {
2517627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2518627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2519627f7eb2Smrg if (arg_extent != ret_extent)
2520627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
2521627f7eb2Smrg "array (%ld/%ld) ",
2522627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
2523627f7eb2Smrg }
2524627f7eb2Smrg else
2525627f7eb2Smrg {
2526627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2527627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2528627f7eb2Smrg if (arg_extent != ret_extent)
2529627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
2530627f7eb2Smrg "array (%ld/%ld) ",
2531627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
2532627f7eb2Smrg
2533627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2534627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2535627f7eb2Smrg if (arg_extent != ret_extent)
2536627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 2 of "
2537627f7eb2Smrg "array (%ld/%ld) ",
2538627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
2539627f7eb2Smrg }
2540627f7eb2Smrg }
2541627f7eb2Smrg
2542627f7eb2Smrg
2543627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2544627f7eb2Smrg {
2545627f7eb2Smrg /* One-dimensional result may be addressed in the code below
2546627f7eb2Smrg either as a row or a column matrix. We want both cases to
2547627f7eb2Smrg work. */
2548627f7eb2Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2549627f7eb2Smrg }
2550627f7eb2Smrg else
2551627f7eb2Smrg {
2552627f7eb2Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2553627f7eb2Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2554627f7eb2Smrg }
2555627f7eb2Smrg
2556627f7eb2Smrg
2557627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
2558627f7eb2Smrg {
2559627f7eb2Smrg /* Treat it as a a row matrix A[1,count]. */
2560627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2561627f7eb2Smrg aystride = 1;
2562627f7eb2Smrg
2563627f7eb2Smrg xcount = 1;
2564627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,0);
2565627f7eb2Smrg }
2566627f7eb2Smrg else
2567627f7eb2Smrg {
2568627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2569627f7eb2Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2570627f7eb2Smrg
2571627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,1);
2572627f7eb2Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2573627f7eb2Smrg }
2574627f7eb2Smrg
2575627f7eb2Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2576627f7eb2Smrg {
2577627f7eb2Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2578627f7eb2Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2579627f7eb2Smrg "in dimension 1: is %ld, should be %ld",
2580627f7eb2Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2581627f7eb2Smrg }
2582627f7eb2Smrg
2583627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (b) == 1)
2584627f7eb2Smrg {
2585627f7eb2Smrg /* Treat it as a column matrix B[count,1] */
2586627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2587627f7eb2Smrg
2588627f7eb2Smrg /* bystride should never be used for 1-dimensional b.
2589627f7eb2Smrg The value is only used for calculation of the
2590627f7eb2Smrg memory by the buffer. */
2591627f7eb2Smrg bystride = 256;
2592627f7eb2Smrg ycount = 1;
2593627f7eb2Smrg }
2594627f7eb2Smrg else
2595627f7eb2Smrg {
2596627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2597627f7eb2Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2598627f7eb2Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2599627f7eb2Smrg }
2600627f7eb2Smrg
2601627f7eb2Smrg abase = a->base_addr;
2602627f7eb2Smrg bbase = b->base_addr;
2603627f7eb2Smrg dest = retarray->base_addr;
2604627f7eb2Smrg
2605627f7eb2Smrg /* Now that everything is set up, we perform the multiplication
2606627f7eb2Smrg itself. */
2607627f7eb2Smrg
2608627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2609627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
2610627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
2611627f7eb2Smrg
2612627f7eb2Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2613627f7eb2Smrg && (bxstride == 1 || bystride == 1)
2614627f7eb2Smrg && (((float) xcount) * ((float) ycount) * ((float) count)
2615627f7eb2Smrg > POW3(blas_limit)))
2616627f7eb2Smrg {
2617627f7eb2Smrg const int m = xcount, n = ycount, k = count, ldc = rystride;
2618627f7eb2Smrg const GFC_COMPLEX_4 one = 1, zero = 0;
2619627f7eb2Smrg const int lda = (axstride == 1) ? aystride : axstride,
2620627f7eb2Smrg ldb = (bxstride == 1) ? bystride : bxstride;
2621627f7eb2Smrg
2622627f7eb2Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2623627f7eb2Smrg {
2624627f7eb2Smrg assert (gemm != NULL);
2625627f7eb2Smrg const char *transa, *transb;
2626627f7eb2Smrg if (try_blas & 2)
2627627f7eb2Smrg transa = "C";
2628627f7eb2Smrg else
2629627f7eb2Smrg transa = axstride == 1 ? "N" : "T";
2630627f7eb2Smrg
2631627f7eb2Smrg if (try_blas & 4)
2632627f7eb2Smrg transb = "C";
2633627f7eb2Smrg else
2634627f7eb2Smrg transb = bxstride == 1 ? "N" : "T";
2635627f7eb2Smrg
2636627f7eb2Smrg gemm (transa, transb , &m,
2637627f7eb2Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2638627f7eb2Smrg &ldc, 1, 1);
2639627f7eb2Smrg return;
2640627f7eb2Smrg }
2641627f7eb2Smrg }
2642627f7eb2Smrg
2643*4c3eb207Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1
2644*4c3eb207Smrg && GFC_DESCRIPTOR_RANK (b) != 1)
2645627f7eb2Smrg {
2646627f7eb2Smrg /* This block of code implements a tuned matmul, derived from
2647627f7eb2Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2648627f7eb2Smrg
2649627f7eb2Smrg Bo Kagstrom and Per Ling
2650627f7eb2Smrg Department of Computing Science
2651627f7eb2Smrg Umea University
2652627f7eb2Smrg S-901 87 Umea, Sweden
2653627f7eb2Smrg
2654627f7eb2Smrg from netlib.org, translated to C, and modified for matmul.m4. */
2655627f7eb2Smrg
2656627f7eb2Smrg const GFC_COMPLEX_4 *a, *b;
2657627f7eb2Smrg GFC_COMPLEX_4 *c;
2658627f7eb2Smrg const index_type m = xcount, n = ycount, k = count;
2659627f7eb2Smrg
2660627f7eb2Smrg /* System generated locals */
2661627f7eb2Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2662627f7eb2Smrg i1, i2, i3, i4, i5, i6;
2663627f7eb2Smrg
2664627f7eb2Smrg /* Local variables */
2665627f7eb2Smrg GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
2666627f7eb2Smrg f13, f14, f23, f24, f33, f34, f43, f44;
2667627f7eb2Smrg index_type i, j, l, ii, jj, ll;
2668627f7eb2Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2669627f7eb2Smrg GFC_COMPLEX_4 *t1;
2670627f7eb2Smrg
2671627f7eb2Smrg a = abase;
2672627f7eb2Smrg b = bbase;
2673627f7eb2Smrg c = retarray->base_addr;
2674627f7eb2Smrg
2675627f7eb2Smrg /* Parameter adjustments */
2676627f7eb2Smrg c_dim1 = rystride;
2677627f7eb2Smrg c_offset = 1 + c_dim1;
2678627f7eb2Smrg c -= c_offset;
2679627f7eb2Smrg a_dim1 = aystride;
2680627f7eb2Smrg a_offset = 1 + a_dim1;
2681627f7eb2Smrg a -= a_offset;
2682627f7eb2Smrg b_dim1 = bystride;
2683627f7eb2Smrg b_offset = 1 + b_dim1;
2684627f7eb2Smrg b -= b_offset;
2685627f7eb2Smrg
2686627f7eb2Smrg /* Empty c first. */
2687627f7eb2Smrg for (j=1; j<=n; j++)
2688627f7eb2Smrg for (i=1; i<=m; i++)
2689627f7eb2Smrg c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
2690627f7eb2Smrg
2691627f7eb2Smrg /* Early exit if possible */
2692627f7eb2Smrg if (m == 0 || n == 0 || k == 0)
2693627f7eb2Smrg return;
2694627f7eb2Smrg
2695627f7eb2Smrg /* Adjust size of t1 to what is needed. */
2696627f7eb2Smrg index_type t1_dim, a_sz;
2697627f7eb2Smrg if (aystride == 1)
2698627f7eb2Smrg a_sz = rystride;
2699627f7eb2Smrg else
2700627f7eb2Smrg a_sz = a_dim1;
2701627f7eb2Smrg
2702627f7eb2Smrg t1_dim = a_sz * 256 + b_dim1;
2703627f7eb2Smrg if (t1_dim > 65536)
2704627f7eb2Smrg t1_dim = 65536;
2705627f7eb2Smrg
2706627f7eb2Smrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
2707627f7eb2Smrg
2708627f7eb2Smrg /* Start turning the crank. */
2709627f7eb2Smrg i1 = n;
2710627f7eb2Smrg for (jj = 1; jj <= i1; jj += 512)
2711627f7eb2Smrg {
2712627f7eb2Smrg /* Computing MIN */
2713627f7eb2Smrg i2 = 512;
2714627f7eb2Smrg i3 = n - jj + 1;
2715627f7eb2Smrg jsec = min(i2,i3);
2716627f7eb2Smrg ujsec = jsec - jsec % 4;
2717627f7eb2Smrg i2 = k;
2718627f7eb2Smrg for (ll = 1; ll <= i2; ll += 256)
2719627f7eb2Smrg {
2720627f7eb2Smrg /* Computing MIN */
2721627f7eb2Smrg i3 = 256;
2722627f7eb2Smrg i4 = k - ll + 1;
2723627f7eb2Smrg lsec = min(i3,i4);
2724627f7eb2Smrg ulsec = lsec - lsec % 2;
2725627f7eb2Smrg
2726627f7eb2Smrg i3 = m;
2727627f7eb2Smrg for (ii = 1; ii <= i3; ii += 256)
2728627f7eb2Smrg {
2729627f7eb2Smrg /* Computing MIN */
2730627f7eb2Smrg i4 = 256;
2731627f7eb2Smrg i5 = m - ii + 1;
2732627f7eb2Smrg isec = min(i4,i5);
2733627f7eb2Smrg uisec = isec - isec % 2;
2734627f7eb2Smrg i4 = ll + ulsec - 1;
2735627f7eb2Smrg for (l = ll; l <= i4; l += 2)
2736627f7eb2Smrg {
2737627f7eb2Smrg i5 = ii + uisec - 1;
2738627f7eb2Smrg for (i = ii; i <= i5; i += 2)
2739627f7eb2Smrg {
2740627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2741627f7eb2Smrg a[i + l * a_dim1];
2742627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2743627f7eb2Smrg a[i + (l + 1) * a_dim1];
2744627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2745627f7eb2Smrg a[i + 1 + l * a_dim1];
2746627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2747627f7eb2Smrg a[i + 1 + (l + 1) * a_dim1];
2748627f7eb2Smrg }
2749627f7eb2Smrg if (uisec < isec)
2750627f7eb2Smrg {
2751627f7eb2Smrg t1[l - ll + 1 + (isec << 8) - 257] =
2752627f7eb2Smrg a[ii + isec - 1 + l * a_dim1];
2753627f7eb2Smrg t1[l - ll + 2 + (isec << 8) - 257] =
2754627f7eb2Smrg a[ii + isec - 1 + (l + 1) * a_dim1];
2755627f7eb2Smrg }
2756627f7eb2Smrg }
2757627f7eb2Smrg if (ulsec < lsec)
2758627f7eb2Smrg {
2759627f7eb2Smrg i4 = ii + isec - 1;
2760627f7eb2Smrg for (i = ii; i<= i4; ++i)
2761627f7eb2Smrg {
2762627f7eb2Smrg t1[lsec + ((i - ii + 1) << 8) - 257] =
2763627f7eb2Smrg a[i + (ll + lsec - 1) * a_dim1];
2764627f7eb2Smrg }
2765627f7eb2Smrg }
2766627f7eb2Smrg
2767627f7eb2Smrg uisec = isec - isec % 4;
2768627f7eb2Smrg i4 = jj + ujsec - 1;
2769627f7eb2Smrg for (j = jj; j <= i4; j += 4)
2770627f7eb2Smrg {
2771627f7eb2Smrg i5 = ii + uisec - 1;
2772627f7eb2Smrg for (i = ii; i <= i5; i += 4)
2773627f7eb2Smrg {
2774627f7eb2Smrg f11 = c[i + j * c_dim1];
2775627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
2776627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
2777627f7eb2Smrg f22 = c[i + 1 + (j + 1) * c_dim1];
2778627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
2779627f7eb2Smrg f23 = c[i + 1 + (j + 2) * c_dim1];
2780627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
2781627f7eb2Smrg f24 = c[i + 1 + (j + 3) * c_dim1];
2782627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
2783627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
2784627f7eb2Smrg f32 = c[i + 2 + (j + 1) * c_dim1];
2785627f7eb2Smrg f42 = c[i + 3 + (j + 1) * c_dim1];
2786627f7eb2Smrg f33 = c[i + 2 + (j + 2) * c_dim1];
2787627f7eb2Smrg f43 = c[i + 3 + (j + 2) * c_dim1];
2788627f7eb2Smrg f34 = c[i + 2 + (j + 3) * c_dim1];
2789627f7eb2Smrg f44 = c[i + 3 + (j + 3) * c_dim1];
2790627f7eb2Smrg i6 = ll + lsec - 1;
2791627f7eb2Smrg for (l = ll; l <= i6; ++l)
2792627f7eb2Smrg {
2793627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2794627f7eb2Smrg * b[l + j * b_dim1];
2795627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2796627f7eb2Smrg * b[l + j * b_dim1];
2797627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2798627f7eb2Smrg * b[l + (j + 1) * b_dim1];
2799627f7eb2Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2800627f7eb2Smrg * b[l + (j + 1) * b_dim1];
2801627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2802627f7eb2Smrg * b[l + (j + 2) * b_dim1];
2803627f7eb2Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2804627f7eb2Smrg * b[l + (j + 2) * b_dim1];
2805627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2806627f7eb2Smrg * b[l + (j + 3) * b_dim1];
2807627f7eb2Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2808627f7eb2Smrg * b[l + (j + 3) * b_dim1];
2809627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2810627f7eb2Smrg * b[l + j * b_dim1];
2811627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2812627f7eb2Smrg * b[l + j * b_dim1];
2813627f7eb2Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2814627f7eb2Smrg * b[l + (j + 1) * b_dim1];
2815627f7eb2Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2816627f7eb2Smrg * b[l + (j + 1) * b_dim1];
2817627f7eb2Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2818627f7eb2Smrg * b[l + (j + 2) * b_dim1];
2819627f7eb2Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2820627f7eb2Smrg * b[l + (j + 2) * b_dim1];
2821627f7eb2Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2822627f7eb2Smrg * b[l + (j + 3) * b_dim1];
2823627f7eb2Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2824627f7eb2Smrg * b[l + (j + 3) * b_dim1];
2825627f7eb2Smrg }
2826627f7eb2Smrg c[i + j * c_dim1] = f11;
2827627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
2828627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
2829627f7eb2Smrg c[i + 1 + (j + 1) * c_dim1] = f22;
2830627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
2831627f7eb2Smrg c[i + 1 + (j + 2) * c_dim1] = f23;
2832627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
2833627f7eb2Smrg c[i + 1 + (j + 3) * c_dim1] = f24;
2834627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
2835627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
2836627f7eb2Smrg c[i + 2 + (j + 1) * c_dim1] = f32;
2837627f7eb2Smrg c[i + 3 + (j + 1) * c_dim1] = f42;
2838627f7eb2Smrg c[i + 2 + (j + 2) * c_dim1] = f33;
2839627f7eb2Smrg c[i + 3 + (j + 2) * c_dim1] = f43;
2840627f7eb2Smrg c[i + 2 + (j + 3) * c_dim1] = f34;
2841627f7eb2Smrg c[i + 3 + (j + 3) * c_dim1] = f44;
2842627f7eb2Smrg }
2843627f7eb2Smrg if (uisec < isec)
2844627f7eb2Smrg {
2845627f7eb2Smrg i5 = ii + isec - 1;
2846627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
2847627f7eb2Smrg {
2848627f7eb2Smrg f11 = c[i + j * c_dim1];
2849627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
2850627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
2851627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
2852627f7eb2Smrg i6 = ll + lsec - 1;
2853627f7eb2Smrg for (l = ll; l <= i6; ++l)
2854627f7eb2Smrg {
2855627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2856627f7eb2Smrg 257] * b[l + j * b_dim1];
2857627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2858627f7eb2Smrg 257] * b[l + (j + 1) * b_dim1];
2859627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2860627f7eb2Smrg 257] * b[l + (j + 2) * b_dim1];
2861627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2862627f7eb2Smrg 257] * b[l + (j + 3) * b_dim1];
2863627f7eb2Smrg }
2864627f7eb2Smrg c[i + j * c_dim1] = f11;
2865627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
2866627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
2867627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
2868627f7eb2Smrg }
2869627f7eb2Smrg }
2870627f7eb2Smrg }
2871627f7eb2Smrg if (ujsec < jsec)
2872627f7eb2Smrg {
2873627f7eb2Smrg i4 = jj + jsec - 1;
2874627f7eb2Smrg for (j = jj + ujsec; j <= i4; ++j)
2875627f7eb2Smrg {
2876627f7eb2Smrg i5 = ii + uisec - 1;
2877627f7eb2Smrg for (i = ii; i <= i5; i += 4)
2878627f7eb2Smrg {
2879627f7eb2Smrg f11 = c[i + j * c_dim1];
2880627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
2881627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
2882627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
2883627f7eb2Smrg i6 = ll + lsec - 1;
2884627f7eb2Smrg for (l = ll; l <= i6; ++l)
2885627f7eb2Smrg {
2886627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2887627f7eb2Smrg 257] * b[l + j * b_dim1];
2888627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2889627f7eb2Smrg 257] * b[l + j * b_dim1];
2890627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2891627f7eb2Smrg 257] * b[l + j * b_dim1];
2892627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2893627f7eb2Smrg 257] * b[l + j * b_dim1];
2894627f7eb2Smrg }
2895627f7eb2Smrg c[i + j * c_dim1] = f11;
2896627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
2897627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
2898627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
2899627f7eb2Smrg }
2900627f7eb2Smrg i5 = ii + isec - 1;
2901627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
2902627f7eb2Smrg {
2903627f7eb2Smrg f11 = c[i + j * c_dim1];
2904627f7eb2Smrg i6 = ll + lsec - 1;
2905627f7eb2Smrg for (l = ll; l <= i6; ++l)
2906627f7eb2Smrg {
2907627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2908627f7eb2Smrg 257] * b[l + j * b_dim1];
2909627f7eb2Smrg }
2910627f7eb2Smrg c[i + j * c_dim1] = f11;
2911627f7eb2Smrg }
2912627f7eb2Smrg }
2913627f7eb2Smrg }
2914627f7eb2Smrg }
2915627f7eb2Smrg }
2916627f7eb2Smrg }
2917627f7eb2Smrg free(t1);
2918627f7eb2Smrg return;
2919627f7eb2Smrg }
2920627f7eb2Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2921627f7eb2Smrg {
2922627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) != 1)
2923627f7eb2Smrg {
2924627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
2925627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
2926627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
2927627f7eb2Smrg GFC_COMPLEX_4 s;
2928627f7eb2Smrg
2929627f7eb2Smrg for (y = 0; y < ycount; y++)
2930627f7eb2Smrg {
2931627f7eb2Smrg bbase_y = &bbase[y*bystride];
2932627f7eb2Smrg dest_y = &dest[y*rystride];
2933627f7eb2Smrg for (x = 0; x < xcount; x++)
2934627f7eb2Smrg {
2935627f7eb2Smrg abase_x = &abase[x*axstride];
2936627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
2937627f7eb2Smrg for (n = 0; n < count; n++)
2938627f7eb2Smrg s += abase_x[n] * bbase_y[n];
2939627f7eb2Smrg dest_y[x] = s;
2940627f7eb2Smrg }
2941627f7eb2Smrg }
2942627f7eb2Smrg }
2943627f7eb2Smrg else
2944627f7eb2Smrg {
2945627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
2946627f7eb2Smrg GFC_COMPLEX_4 s;
2947627f7eb2Smrg
2948627f7eb2Smrg for (y = 0; y < ycount; y++)
2949627f7eb2Smrg {
2950627f7eb2Smrg bbase_y = &bbase[y*bystride];
2951627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
2952627f7eb2Smrg for (n = 0; n < count; n++)
2953627f7eb2Smrg s += abase[n*axstride] * bbase_y[n];
2954627f7eb2Smrg dest[y*rystride] = s;
2955627f7eb2Smrg }
2956627f7eb2Smrg }
2957627f7eb2Smrg }
2958627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1)
2959627f7eb2Smrg {
2960627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
2961627f7eb2Smrg GFC_COMPLEX_4 s;
2962627f7eb2Smrg
2963627f7eb2Smrg for (y = 0; y < ycount; y++)
2964627f7eb2Smrg {
2965627f7eb2Smrg bbase_y = &bbase[y*bystride];
2966627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
2967627f7eb2Smrg for (n = 0; n < count; n++)
2968627f7eb2Smrg s += abase[n*axstride] * bbase_y[n*bxstride];
2969627f7eb2Smrg dest[y*rxstride] = s;
2970627f7eb2Smrg }
2971627f7eb2Smrg }
2972*4c3eb207Smrg else if (axstride < aystride)
2973*4c3eb207Smrg {
2974*4c3eb207Smrg for (y = 0; y < ycount; y++)
2975*4c3eb207Smrg for (x = 0; x < xcount; x++)
2976*4c3eb207Smrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
2977*4c3eb207Smrg
2978*4c3eb207Smrg for (y = 0; y < ycount; y++)
2979*4c3eb207Smrg for (n = 0; n < count; n++)
2980*4c3eb207Smrg for (x = 0; x < xcount; x++)
2981*4c3eb207Smrg /* dest[x,y] += a[x,n] * b[n,y] */
2982*4c3eb207Smrg dest[x*rxstride + y*rystride] +=
2983*4c3eb207Smrg abase[x*axstride + n*aystride] *
2984*4c3eb207Smrg bbase[n*bxstride + y*bystride];
2985*4c3eb207Smrg }
2986627f7eb2Smrg else
2987627f7eb2Smrg {
2988627f7eb2Smrg const GFC_COMPLEX_4 *restrict abase_x;
2989627f7eb2Smrg const GFC_COMPLEX_4 *restrict bbase_y;
2990627f7eb2Smrg GFC_COMPLEX_4 *restrict dest_y;
2991627f7eb2Smrg GFC_COMPLEX_4 s;
2992627f7eb2Smrg
2993627f7eb2Smrg for (y = 0; y < ycount; y++)
2994627f7eb2Smrg {
2995627f7eb2Smrg bbase_y = &bbase[y*bystride];
2996627f7eb2Smrg dest_y = &dest[y*rystride];
2997627f7eb2Smrg for (x = 0; x < xcount; x++)
2998627f7eb2Smrg {
2999627f7eb2Smrg abase_x = &abase[x*axstride];
3000627f7eb2Smrg s = (GFC_COMPLEX_4) 0;
3001627f7eb2Smrg for (n = 0; n < count; n++)
3002627f7eb2Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride];
3003627f7eb2Smrg dest_y[x*rxstride] = s;
3004627f7eb2Smrg }
3005627f7eb2Smrg }
3006627f7eb2Smrg }
3007627f7eb2Smrg }
3008627f7eb2Smrg #undef POW3
3009627f7eb2Smrg #undef min
3010627f7eb2Smrg #undef max
3011627f7eb2Smrg
3012627f7eb2Smrg #endif
3013627f7eb2Smrg #endif
3014627f7eb2Smrg
3015