1627f7eb2Smrg /* Implementation of the MATMUL intrinsic
2*4c3eb207Smrg Copyright (C) 2002-2020 Free Software Foundation, Inc.
3627f7eb2Smrg Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4627f7eb2Smrg
5627f7eb2Smrg This file is part of the GNU Fortran runtime library (libgfortran).
6627f7eb2Smrg
7627f7eb2Smrg Libgfortran is free software; you can redistribute it and/or
8627f7eb2Smrg modify it under the terms of the GNU General Public
9627f7eb2Smrg License as published by the Free Software Foundation; either
10627f7eb2Smrg version 3 of the License, or (at your option) any later version.
11627f7eb2Smrg
12627f7eb2Smrg Libgfortran is distributed in the hope that it will be useful,
13627f7eb2Smrg but WITHOUT ANY WARRANTY; without even the implied warranty of
14627f7eb2Smrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15627f7eb2Smrg GNU General Public License for more details.
16627f7eb2Smrg
17627f7eb2Smrg Under Section 7 of GPL version 3, you are granted additional
18627f7eb2Smrg permissions described in the GCC Runtime Library Exception, version
19627f7eb2Smrg 3.1, as published by the Free Software Foundation.
20627f7eb2Smrg
21627f7eb2Smrg You should have received a copy of the GNU General Public License and
22627f7eb2Smrg a copy of the GCC Runtime Library Exception along with this program;
23627f7eb2Smrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24627f7eb2Smrg <http://www.gnu.org/licenses/>. */
25627f7eb2Smrg
26627f7eb2Smrg #include "libgfortran.h"
27627f7eb2Smrg #include <string.h>
28627f7eb2Smrg #include <assert.h>
29627f7eb2Smrg
30627f7eb2Smrg
31627f7eb2Smrg /* These are the specific versions of matmul with -mprefer-avx128. */
32627f7eb2Smrg
33627f7eb2Smrg #if defined (HAVE_GFC_REAL_10)
34627f7eb2Smrg
35627f7eb2Smrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36627f7eb2Smrg passed to us by the front-end, in which case we call it for large
37627f7eb2Smrg matrices. */
38627f7eb2Smrg
39627f7eb2Smrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40627f7eb2Smrg const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
41627f7eb2Smrg const int *, const GFC_REAL_10 *, const int *,
42627f7eb2Smrg const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
43627f7eb2Smrg int, int);
44627f7eb2Smrg
45627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46627f7eb2Smrg void
47627f7eb2Smrg matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
48627f7eb2Smrg gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
49627f7eb2Smrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50627f7eb2Smrg internal_proto(matmul_r10_avx128_fma3);
51627f7eb2Smrg void
matmul_r10_avx128_fma3(gfc_array_r10 * const restrict retarray,gfc_array_r10 * const restrict a,gfc_array_r10 * const restrict b,int try_blas,int blas_limit,blas_call gemm)52627f7eb2Smrg matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
53627f7eb2Smrg gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
54627f7eb2Smrg int blas_limit, blas_call gemm)
55627f7eb2Smrg {
56627f7eb2Smrg const GFC_REAL_10 * restrict abase;
57627f7eb2Smrg const GFC_REAL_10 * restrict bbase;
58627f7eb2Smrg GFC_REAL_10 * restrict dest;
59627f7eb2Smrg
60627f7eb2Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61627f7eb2Smrg index_type x, y, n, count, xcount, ycount;
62627f7eb2Smrg
63627f7eb2Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2
64627f7eb2Smrg || GFC_DESCRIPTOR_RANK (b) == 2);
65627f7eb2Smrg
66627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67627f7eb2Smrg
68627f7eb2Smrg Either A or B (but not both) can be rank 1:
69627f7eb2Smrg
70627f7eb2Smrg o One-dimensional argument A is implicitly treated as a row matrix
71627f7eb2Smrg dimensioned [1,count], so xcount=1.
72627f7eb2Smrg
73627f7eb2Smrg o One-dimensional argument B is implicitly treated as a column matrix
74627f7eb2Smrg dimensioned [count, 1], so ycount=1.
75627f7eb2Smrg */
76627f7eb2Smrg
77627f7eb2Smrg if (retarray->base_addr == NULL)
78627f7eb2Smrg {
79627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
80627f7eb2Smrg {
81627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
82627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83627f7eb2Smrg }
84627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
85627f7eb2Smrg {
86627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
87627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88627f7eb2Smrg }
89627f7eb2Smrg else
90627f7eb2Smrg {
91627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
92627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93627f7eb2Smrg
94627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[1], 0,
95627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(retarray,0));
97627f7eb2Smrg }
98627f7eb2Smrg
99627f7eb2Smrg retarray->base_addr
100627f7eb2Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
101627f7eb2Smrg retarray->offset = 0;
102627f7eb2Smrg }
103627f7eb2Smrg else if (unlikely (compile_options.bounds_check))
104627f7eb2Smrg {
105627f7eb2Smrg index_type ret_extent, arg_extent;
106627f7eb2Smrg
107627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
108627f7eb2Smrg {
109627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111627f7eb2Smrg if (arg_extent != ret_extent)
112627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
113627f7eb2Smrg "array (%ld/%ld) ",
114627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
115627f7eb2Smrg }
116627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
117627f7eb2Smrg {
118627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120627f7eb2Smrg if (arg_extent != ret_extent)
121627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
122627f7eb2Smrg "array (%ld/%ld) ",
123627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
124627f7eb2Smrg }
125627f7eb2Smrg else
126627f7eb2Smrg {
127627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129627f7eb2Smrg if (arg_extent != ret_extent)
130627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
131627f7eb2Smrg "array (%ld/%ld) ",
132627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
133627f7eb2Smrg
134627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
135627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
136627f7eb2Smrg if (arg_extent != ret_extent)
137627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 2 of "
138627f7eb2Smrg "array (%ld/%ld) ",
139627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
140627f7eb2Smrg }
141627f7eb2Smrg }
142627f7eb2Smrg
143627f7eb2Smrg
144627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1)
145627f7eb2Smrg {
146627f7eb2Smrg /* One-dimensional result may be addressed in the code below
147627f7eb2Smrg either as a row or a column matrix. We want both cases to
148627f7eb2Smrg work. */
149627f7eb2Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
150627f7eb2Smrg }
151627f7eb2Smrg else
152627f7eb2Smrg {
153627f7eb2Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
154627f7eb2Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
155627f7eb2Smrg }
156627f7eb2Smrg
157627f7eb2Smrg
158627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
159627f7eb2Smrg {
160627f7eb2Smrg /* Treat it as a a row matrix A[1,count]. */
161627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
162627f7eb2Smrg aystride = 1;
163627f7eb2Smrg
164627f7eb2Smrg xcount = 1;
165627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,0);
166627f7eb2Smrg }
167627f7eb2Smrg else
168627f7eb2Smrg {
169627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
170627f7eb2Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1);
171627f7eb2Smrg
172627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,1);
173627f7eb2Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0);
174627f7eb2Smrg }
175627f7eb2Smrg
176627f7eb2Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0))
177627f7eb2Smrg {
178627f7eb2Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
179627f7eb2Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
180627f7eb2Smrg "in dimension 1: is %ld, should be %ld",
181627f7eb2Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
182627f7eb2Smrg }
183627f7eb2Smrg
184627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (b) == 1)
185627f7eb2Smrg {
186627f7eb2Smrg /* Treat it as a column matrix B[count,1] */
187627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188627f7eb2Smrg
189627f7eb2Smrg /* bystride should never be used for 1-dimensional b.
190627f7eb2Smrg The value is only used for calculation of the
191627f7eb2Smrg memory by the buffer. */
192627f7eb2Smrg bystride = 256;
193627f7eb2Smrg ycount = 1;
194627f7eb2Smrg }
195627f7eb2Smrg else
196627f7eb2Smrg {
197627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198627f7eb2Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199627f7eb2Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200627f7eb2Smrg }
201627f7eb2Smrg
202627f7eb2Smrg abase = a->base_addr;
203627f7eb2Smrg bbase = b->base_addr;
204627f7eb2Smrg dest = retarray->base_addr;
205627f7eb2Smrg
206627f7eb2Smrg /* Now that everything is set up, we perform the multiplication
207627f7eb2Smrg itself. */
208627f7eb2Smrg
209627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
211627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
212627f7eb2Smrg
213627f7eb2Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214627f7eb2Smrg && (bxstride == 1 || bystride == 1)
215627f7eb2Smrg && (((float) xcount) * ((float) ycount) * ((float) count)
216627f7eb2Smrg > POW3(blas_limit)))
217627f7eb2Smrg {
218627f7eb2Smrg const int m = xcount, n = ycount, k = count, ldc = rystride;
219627f7eb2Smrg const GFC_REAL_10 one = 1, zero = 0;
220627f7eb2Smrg const int lda = (axstride == 1) ? aystride : axstride,
221627f7eb2Smrg ldb = (bxstride == 1) ? bystride : bxstride;
222627f7eb2Smrg
223627f7eb2Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224627f7eb2Smrg {
225627f7eb2Smrg assert (gemm != NULL);
226627f7eb2Smrg const char *transa, *transb;
227627f7eb2Smrg if (try_blas & 2)
228627f7eb2Smrg transa = "C";
229627f7eb2Smrg else
230627f7eb2Smrg transa = axstride == 1 ? "N" : "T";
231627f7eb2Smrg
232627f7eb2Smrg if (try_blas & 4)
233627f7eb2Smrg transb = "C";
234627f7eb2Smrg else
235627f7eb2Smrg transb = bxstride == 1 ? "N" : "T";
236627f7eb2Smrg
237627f7eb2Smrg gemm (transa, transb , &m,
238627f7eb2Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
239627f7eb2Smrg &ldc, 1, 1);
240627f7eb2Smrg return;
241627f7eb2Smrg }
242627f7eb2Smrg }
243627f7eb2Smrg
244*4c3eb207Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1
245*4c3eb207Smrg && GFC_DESCRIPTOR_RANK (b) != 1)
246627f7eb2Smrg {
247627f7eb2Smrg /* This block of code implements a tuned matmul, derived from
248627f7eb2Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1
249627f7eb2Smrg
250627f7eb2Smrg Bo Kagstrom and Per Ling
251627f7eb2Smrg Department of Computing Science
252627f7eb2Smrg Umea University
253627f7eb2Smrg S-901 87 Umea, Sweden
254627f7eb2Smrg
255627f7eb2Smrg from netlib.org, translated to C, and modified for matmul.m4. */
256627f7eb2Smrg
257627f7eb2Smrg const GFC_REAL_10 *a, *b;
258627f7eb2Smrg GFC_REAL_10 *c;
259627f7eb2Smrg const index_type m = xcount, n = ycount, k = count;
260627f7eb2Smrg
261627f7eb2Smrg /* System generated locals */
262627f7eb2Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
263627f7eb2Smrg i1, i2, i3, i4, i5, i6;
264627f7eb2Smrg
265627f7eb2Smrg /* Local variables */
266627f7eb2Smrg GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
267627f7eb2Smrg f13, f14, f23, f24, f33, f34, f43, f44;
268627f7eb2Smrg index_type i, j, l, ii, jj, ll;
269627f7eb2Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec;
270627f7eb2Smrg GFC_REAL_10 *t1;
271627f7eb2Smrg
272627f7eb2Smrg a = abase;
273627f7eb2Smrg b = bbase;
274627f7eb2Smrg c = retarray->base_addr;
275627f7eb2Smrg
276627f7eb2Smrg /* Parameter adjustments */
277627f7eb2Smrg c_dim1 = rystride;
278627f7eb2Smrg c_offset = 1 + c_dim1;
279627f7eb2Smrg c -= c_offset;
280627f7eb2Smrg a_dim1 = aystride;
281627f7eb2Smrg a_offset = 1 + a_dim1;
282627f7eb2Smrg a -= a_offset;
283627f7eb2Smrg b_dim1 = bystride;
284627f7eb2Smrg b_offset = 1 + b_dim1;
285627f7eb2Smrg b -= b_offset;
286627f7eb2Smrg
287627f7eb2Smrg /* Empty c first. */
288627f7eb2Smrg for (j=1; j<=n; j++)
289627f7eb2Smrg for (i=1; i<=m; i++)
290627f7eb2Smrg c[i + j * c_dim1] = (GFC_REAL_10)0;
291627f7eb2Smrg
292627f7eb2Smrg /* Early exit if possible */
293627f7eb2Smrg if (m == 0 || n == 0 || k == 0)
294627f7eb2Smrg return;
295627f7eb2Smrg
296627f7eb2Smrg /* Adjust size of t1 to what is needed. */
297627f7eb2Smrg index_type t1_dim, a_sz;
298627f7eb2Smrg if (aystride == 1)
299627f7eb2Smrg a_sz = rystride;
300627f7eb2Smrg else
301627f7eb2Smrg a_sz = a_dim1;
302627f7eb2Smrg
303627f7eb2Smrg t1_dim = a_sz * 256 + b_dim1;
304627f7eb2Smrg if (t1_dim > 65536)
305627f7eb2Smrg t1_dim = 65536;
306627f7eb2Smrg
307627f7eb2Smrg t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
308627f7eb2Smrg
309627f7eb2Smrg /* Start turning the crank. */
310627f7eb2Smrg i1 = n;
311627f7eb2Smrg for (jj = 1; jj <= i1; jj += 512)
312627f7eb2Smrg {
313627f7eb2Smrg /* Computing MIN */
314627f7eb2Smrg i2 = 512;
315627f7eb2Smrg i3 = n - jj + 1;
316627f7eb2Smrg jsec = min(i2,i3);
317627f7eb2Smrg ujsec = jsec - jsec % 4;
318627f7eb2Smrg i2 = k;
319627f7eb2Smrg for (ll = 1; ll <= i2; ll += 256)
320627f7eb2Smrg {
321627f7eb2Smrg /* Computing MIN */
322627f7eb2Smrg i3 = 256;
323627f7eb2Smrg i4 = k - ll + 1;
324627f7eb2Smrg lsec = min(i3,i4);
325627f7eb2Smrg ulsec = lsec - lsec % 2;
326627f7eb2Smrg
327627f7eb2Smrg i3 = m;
328627f7eb2Smrg for (ii = 1; ii <= i3; ii += 256)
329627f7eb2Smrg {
330627f7eb2Smrg /* Computing MIN */
331627f7eb2Smrg i4 = 256;
332627f7eb2Smrg i5 = m - ii + 1;
333627f7eb2Smrg isec = min(i4,i5);
334627f7eb2Smrg uisec = isec - isec % 2;
335627f7eb2Smrg i4 = ll + ulsec - 1;
336627f7eb2Smrg for (l = ll; l <= i4; l += 2)
337627f7eb2Smrg {
338627f7eb2Smrg i5 = ii + uisec - 1;
339627f7eb2Smrg for (i = ii; i <= i5; i += 2)
340627f7eb2Smrg {
341627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
342627f7eb2Smrg a[i + l * a_dim1];
343627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
344627f7eb2Smrg a[i + (l + 1) * a_dim1];
345627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
346627f7eb2Smrg a[i + 1 + l * a_dim1];
347627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
348627f7eb2Smrg a[i + 1 + (l + 1) * a_dim1];
349627f7eb2Smrg }
350627f7eb2Smrg if (uisec < isec)
351627f7eb2Smrg {
352627f7eb2Smrg t1[l - ll + 1 + (isec << 8) - 257] =
353627f7eb2Smrg a[ii + isec - 1 + l * a_dim1];
354627f7eb2Smrg t1[l - ll + 2 + (isec << 8) - 257] =
355627f7eb2Smrg a[ii + isec - 1 + (l + 1) * a_dim1];
356627f7eb2Smrg }
357627f7eb2Smrg }
358627f7eb2Smrg if (ulsec < lsec)
359627f7eb2Smrg {
360627f7eb2Smrg i4 = ii + isec - 1;
361627f7eb2Smrg for (i = ii; i<= i4; ++i)
362627f7eb2Smrg {
363627f7eb2Smrg t1[lsec + ((i - ii + 1) << 8) - 257] =
364627f7eb2Smrg a[i + (ll + lsec - 1) * a_dim1];
365627f7eb2Smrg }
366627f7eb2Smrg }
367627f7eb2Smrg
368627f7eb2Smrg uisec = isec - isec % 4;
369627f7eb2Smrg i4 = jj + ujsec - 1;
370627f7eb2Smrg for (j = jj; j <= i4; j += 4)
371627f7eb2Smrg {
372627f7eb2Smrg i5 = ii + uisec - 1;
373627f7eb2Smrg for (i = ii; i <= i5; i += 4)
374627f7eb2Smrg {
375627f7eb2Smrg f11 = c[i + j * c_dim1];
376627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
377627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
378627f7eb2Smrg f22 = c[i + 1 + (j + 1) * c_dim1];
379627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
380627f7eb2Smrg f23 = c[i + 1 + (j + 2) * c_dim1];
381627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
382627f7eb2Smrg f24 = c[i + 1 + (j + 3) * c_dim1];
383627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
384627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
385627f7eb2Smrg f32 = c[i + 2 + (j + 1) * c_dim1];
386627f7eb2Smrg f42 = c[i + 3 + (j + 1) * c_dim1];
387627f7eb2Smrg f33 = c[i + 2 + (j + 2) * c_dim1];
388627f7eb2Smrg f43 = c[i + 3 + (j + 2) * c_dim1];
389627f7eb2Smrg f34 = c[i + 2 + (j + 3) * c_dim1];
390627f7eb2Smrg f44 = c[i + 3 + (j + 3) * c_dim1];
391627f7eb2Smrg i6 = ll + lsec - 1;
392627f7eb2Smrg for (l = ll; l <= i6; ++l)
393627f7eb2Smrg {
394627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
395627f7eb2Smrg * b[l + j * b_dim1];
396627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
397627f7eb2Smrg * b[l + j * b_dim1];
398627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
399627f7eb2Smrg * b[l + (j + 1) * b_dim1];
400627f7eb2Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
401627f7eb2Smrg * b[l + (j + 1) * b_dim1];
402627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
403627f7eb2Smrg * b[l + (j + 2) * b_dim1];
404627f7eb2Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
405627f7eb2Smrg * b[l + (j + 2) * b_dim1];
406627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
407627f7eb2Smrg * b[l + (j + 3) * b_dim1];
408627f7eb2Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
409627f7eb2Smrg * b[l + (j + 3) * b_dim1];
410627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
411627f7eb2Smrg * b[l + j * b_dim1];
412627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
413627f7eb2Smrg * b[l + j * b_dim1];
414627f7eb2Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
415627f7eb2Smrg * b[l + (j + 1) * b_dim1];
416627f7eb2Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
417627f7eb2Smrg * b[l + (j + 1) * b_dim1];
418627f7eb2Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
419627f7eb2Smrg * b[l + (j + 2) * b_dim1];
420627f7eb2Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
421627f7eb2Smrg * b[l + (j + 2) * b_dim1];
422627f7eb2Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
423627f7eb2Smrg * b[l + (j + 3) * b_dim1];
424627f7eb2Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
425627f7eb2Smrg * b[l + (j + 3) * b_dim1];
426627f7eb2Smrg }
427627f7eb2Smrg c[i + j * c_dim1] = f11;
428627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
429627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
430627f7eb2Smrg c[i + 1 + (j + 1) * c_dim1] = f22;
431627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
432627f7eb2Smrg c[i + 1 + (j + 2) * c_dim1] = f23;
433627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
434627f7eb2Smrg c[i + 1 + (j + 3) * c_dim1] = f24;
435627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
436627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
437627f7eb2Smrg c[i + 2 + (j + 1) * c_dim1] = f32;
438627f7eb2Smrg c[i + 3 + (j + 1) * c_dim1] = f42;
439627f7eb2Smrg c[i + 2 + (j + 2) * c_dim1] = f33;
440627f7eb2Smrg c[i + 3 + (j + 2) * c_dim1] = f43;
441627f7eb2Smrg c[i + 2 + (j + 3) * c_dim1] = f34;
442627f7eb2Smrg c[i + 3 + (j + 3) * c_dim1] = f44;
443627f7eb2Smrg }
444627f7eb2Smrg if (uisec < isec)
445627f7eb2Smrg {
446627f7eb2Smrg i5 = ii + isec - 1;
447627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
448627f7eb2Smrg {
449627f7eb2Smrg f11 = c[i + j * c_dim1];
450627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
451627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
452627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
453627f7eb2Smrg i6 = ll + lsec - 1;
454627f7eb2Smrg for (l = ll; l <= i6; ++l)
455627f7eb2Smrg {
456627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
457627f7eb2Smrg 257] * b[l + j * b_dim1];
458627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
459627f7eb2Smrg 257] * b[l + (j + 1) * b_dim1];
460627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
461627f7eb2Smrg 257] * b[l + (j + 2) * b_dim1];
462627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
463627f7eb2Smrg 257] * b[l + (j + 3) * b_dim1];
464627f7eb2Smrg }
465627f7eb2Smrg c[i + j * c_dim1] = f11;
466627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
467627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
468627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
469627f7eb2Smrg }
470627f7eb2Smrg }
471627f7eb2Smrg }
472627f7eb2Smrg if (ujsec < jsec)
473627f7eb2Smrg {
474627f7eb2Smrg i4 = jj + jsec - 1;
475627f7eb2Smrg for (j = jj + ujsec; j <= i4; ++j)
476627f7eb2Smrg {
477627f7eb2Smrg i5 = ii + uisec - 1;
478627f7eb2Smrg for (i = ii; i <= i5; i += 4)
479627f7eb2Smrg {
480627f7eb2Smrg f11 = c[i + j * c_dim1];
481627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
482627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
483627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
484627f7eb2Smrg i6 = ll + lsec - 1;
485627f7eb2Smrg for (l = ll; l <= i6; ++l)
486627f7eb2Smrg {
487627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
488627f7eb2Smrg 257] * b[l + j * b_dim1];
489627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
490627f7eb2Smrg 257] * b[l + j * b_dim1];
491627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
492627f7eb2Smrg 257] * b[l + j * b_dim1];
493627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
494627f7eb2Smrg 257] * b[l + j * b_dim1];
495627f7eb2Smrg }
496627f7eb2Smrg c[i + j * c_dim1] = f11;
497627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
498627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
499627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
500627f7eb2Smrg }
501627f7eb2Smrg i5 = ii + isec - 1;
502627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
503627f7eb2Smrg {
504627f7eb2Smrg f11 = c[i + j * c_dim1];
505627f7eb2Smrg i6 = ll + lsec - 1;
506627f7eb2Smrg for (l = ll; l <= i6; ++l)
507627f7eb2Smrg {
508627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
509627f7eb2Smrg 257] * b[l + j * b_dim1];
510627f7eb2Smrg }
511627f7eb2Smrg c[i + j * c_dim1] = f11;
512627f7eb2Smrg }
513627f7eb2Smrg }
514627f7eb2Smrg }
515627f7eb2Smrg }
516627f7eb2Smrg }
517627f7eb2Smrg }
518627f7eb2Smrg free(t1);
519627f7eb2Smrg return;
520627f7eb2Smrg }
521627f7eb2Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1)
522627f7eb2Smrg {
523627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) != 1)
524627f7eb2Smrg {
525627f7eb2Smrg const GFC_REAL_10 *restrict abase_x;
526627f7eb2Smrg const GFC_REAL_10 *restrict bbase_y;
527627f7eb2Smrg GFC_REAL_10 *restrict dest_y;
528627f7eb2Smrg GFC_REAL_10 s;
529627f7eb2Smrg
530627f7eb2Smrg for (y = 0; y < ycount; y++)
531627f7eb2Smrg {
532627f7eb2Smrg bbase_y = &bbase[y*bystride];
533627f7eb2Smrg dest_y = &dest[y*rystride];
534627f7eb2Smrg for (x = 0; x < xcount; x++)
535627f7eb2Smrg {
536627f7eb2Smrg abase_x = &abase[x*axstride];
537627f7eb2Smrg s = (GFC_REAL_10) 0;
538627f7eb2Smrg for (n = 0; n < count; n++)
539627f7eb2Smrg s += abase_x[n] * bbase_y[n];
540627f7eb2Smrg dest_y[x] = s;
541627f7eb2Smrg }
542627f7eb2Smrg }
543627f7eb2Smrg }
544627f7eb2Smrg else
545627f7eb2Smrg {
546627f7eb2Smrg const GFC_REAL_10 *restrict bbase_y;
547627f7eb2Smrg GFC_REAL_10 s;
548627f7eb2Smrg
549627f7eb2Smrg for (y = 0; y < ycount; y++)
550627f7eb2Smrg {
551627f7eb2Smrg bbase_y = &bbase[y*bystride];
552627f7eb2Smrg s = (GFC_REAL_10) 0;
553627f7eb2Smrg for (n = 0; n < count; n++)
554627f7eb2Smrg s += abase[n*axstride] * bbase_y[n];
555627f7eb2Smrg dest[y*rystride] = s;
556627f7eb2Smrg }
557627f7eb2Smrg }
558627f7eb2Smrg }
559627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1)
560627f7eb2Smrg {
561627f7eb2Smrg const GFC_REAL_10 *restrict bbase_y;
562627f7eb2Smrg GFC_REAL_10 s;
563627f7eb2Smrg
564627f7eb2Smrg for (y = 0; y < ycount; y++)
565627f7eb2Smrg {
566627f7eb2Smrg bbase_y = &bbase[y*bystride];
567627f7eb2Smrg s = (GFC_REAL_10) 0;
568627f7eb2Smrg for (n = 0; n < count; n++)
569627f7eb2Smrg s += abase[n*axstride] * bbase_y[n*bxstride];
570627f7eb2Smrg dest[y*rxstride] = s;
571627f7eb2Smrg }
572627f7eb2Smrg }
573*4c3eb207Smrg else if (axstride < aystride)
574*4c3eb207Smrg {
575*4c3eb207Smrg for (y = 0; y < ycount; y++)
576*4c3eb207Smrg for (x = 0; x < xcount; x++)
577*4c3eb207Smrg dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
578*4c3eb207Smrg
579*4c3eb207Smrg for (y = 0; y < ycount; y++)
580*4c3eb207Smrg for (n = 0; n < count; n++)
581*4c3eb207Smrg for (x = 0; x < xcount; x++)
582*4c3eb207Smrg /* dest[x,y] += a[x,n] * b[n,y] */
583*4c3eb207Smrg dest[x*rxstride + y*rystride] +=
584*4c3eb207Smrg abase[x*axstride + n*aystride] *
585*4c3eb207Smrg bbase[n*bxstride + y*bystride];
586*4c3eb207Smrg }
587627f7eb2Smrg else
588627f7eb2Smrg {
589627f7eb2Smrg const GFC_REAL_10 *restrict abase_x;
590627f7eb2Smrg const GFC_REAL_10 *restrict bbase_y;
591627f7eb2Smrg GFC_REAL_10 *restrict dest_y;
592627f7eb2Smrg GFC_REAL_10 s;
593627f7eb2Smrg
594627f7eb2Smrg for (y = 0; y < ycount; y++)
595627f7eb2Smrg {
596627f7eb2Smrg bbase_y = &bbase[y*bystride];
597627f7eb2Smrg dest_y = &dest[y*rystride];
598627f7eb2Smrg for (x = 0; x < xcount; x++)
599627f7eb2Smrg {
600627f7eb2Smrg abase_x = &abase[x*axstride];
601627f7eb2Smrg s = (GFC_REAL_10) 0;
602627f7eb2Smrg for (n = 0; n < count; n++)
603627f7eb2Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride];
604627f7eb2Smrg dest_y[x*rxstride] = s;
605627f7eb2Smrg }
606627f7eb2Smrg }
607627f7eb2Smrg }
608627f7eb2Smrg }
609627f7eb2Smrg #undef POW3
610627f7eb2Smrg #undef min
611627f7eb2Smrg #undef max
612627f7eb2Smrg
613627f7eb2Smrg #endif
614627f7eb2Smrg
615627f7eb2Smrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
616627f7eb2Smrg void
617627f7eb2Smrg matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
618627f7eb2Smrg gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
619627f7eb2Smrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
620627f7eb2Smrg internal_proto(matmul_r10_avx128_fma4);
621627f7eb2Smrg void
matmul_r10_avx128_fma4(gfc_array_r10 * const restrict retarray,gfc_array_r10 * const restrict a,gfc_array_r10 * const restrict b,int try_blas,int blas_limit,blas_call gemm)622627f7eb2Smrg matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
623627f7eb2Smrg gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
624627f7eb2Smrg int blas_limit, blas_call gemm)
625627f7eb2Smrg {
626627f7eb2Smrg const GFC_REAL_10 * restrict abase;
627627f7eb2Smrg const GFC_REAL_10 * restrict bbase;
628627f7eb2Smrg GFC_REAL_10 * restrict dest;
629627f7eb2Smrg
630627f7eb2Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
631627f7eb2Smrg index_type x, y, n, count, xcount, ycount;
632627f7eb2Smrg
633627f7eb2Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2
634627f7eb2Smrg || GFC_DESCRIPTOR_RANK (b) == 2);
635627f7eb2Smrg
636627f7eb2Smrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
637627f7eb2Smrg
638627f7eb2Smrg Either A or B (but not both) can be rank 1:
639627f7eb2Smrg
640627f7eb2Smrg o One-dimensional argument A is implicitly treated as a row matrix
641627f7eb2Smrg dimensioned [1,count], so xcount=1.
642627f7eb2Smrg
643627f7eb2Smrg o One-dimensional argument B is implicitly treated as a column matrix
644627f7eb2Smrg dimensioned [count, 1], so ycount=1.
645627f7eb2Smrg */
646627f7eb2Smrg
647627f7eb2Smrg if (retarray->base_addr == NULL)
648627f7eb2Smrg {
649627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
650627f7eb2Smrg {
651627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
652627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
653627f7eb2Smrg }
654627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
655627f7eb2Smrg {
656627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
657627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
658627f7eb2Smrg }
659627f7eb2Smrg else
660627f7eb2Smrg {
661627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0,
662627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
663627f7eb2Smrg
664627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[1], 0,
665627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1,
666627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(retarray,0));
667627f7eb2Smrg }
668627f7eb2Smrg
669627f7eb2Smrg retarray->base_addr
670627f7eb2Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
671627f7eb2Smrg retarray->offset = 0;
672627f7eb2Smrg }
673627f7eb2Smrg else if (unlikely (compile_options.bounds_check))
674627f7eb2Smrg {
675627f7eb2Smrg index_type ret_extent, arg_extent;
676627f7eb2Smrg
677627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
678627f7eb2Smrg {
679627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
680627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
681627f7eb2Smrg if (arg_extent != ret_extent)
682627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
683627f7eb2Smrg "array (%ld/%ld) ",
684627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
685627f7eb2Smrg }
686627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1)
687627f7eb2Smrg {
688627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
689627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
690627f7eb2Smrg if (arg_extent != ret_extent)
691627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
692627f7eb2Smrg "array (%ld/%ld) ",
693627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
694627f7eb2Smrg }
695627f7eb2Smrg else
696627f7eb2Smrg {
697627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
698627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
699627f7eb2Smrg if (arg_extent != ret_extent)
700627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of "
701627f7eb2Smrg "array (%ld/%ld) ",
702627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
703627f7eb2Smrg
704627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
705627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
706627f7eb2Smrg if (arg_extent != ret_extent)
707627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 2 of "
708627f7eb2Smrg "array (%ld/%ld) ",
709627f7eb2Smrg (long int) ret_extent, (long int) arg_extent);
710627f7eb2Smrg }
711627f7eb2Smrg }
712627f7eb2Smrg
713627f7eb2Smrg
714627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1)
715627f7eb2Smrg {
716627f7eb2Smrg /* One-dimensional result may be addressed in the code below
717627f7eb2Smrg either as a row or a column matrix. We want both cases to
718627f7eb2Smrg work. */
719627f7eb2Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
720627f7eb2Smrg }
721627f7eb2Smrg else
722627f7eb2Smrg {
723627f7eb2Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
724627f7eb2Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
725627f7eb2Smrg }
726627f7eb2Smrg
727627f7eb2Smrg
728627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1)
729627f7eb2Smrg {
730627f7eb2Smrg /* Treat it as a a row matrix A[1,count]. */
731627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
732627f7eb2Smrg aystride = 1;
733627f7eb2Smrg
734627f7eb2Smrg xcount = 1;
735627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,0);
736627f7eb2Smrg }
737627f7eb2Smrg else
738627f7eb2Smrg {
739627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0);
740627f7eb2Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1);
741627f7eb2Smrg
742627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,1);
743627f7eb2Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0);
744627f7eb2Smrg }
745627f7eb2Smrg
746627f7eb2Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0))
747627f7eb2Smrg {
748627f7eb2Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
749627f7eb2Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
750627f7eb2Smrg "in dimension 1: is %ld, should be %ld",
751627f7eb2Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
752627f7eb2Smrg }
753627f7eb2Smrg
754627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (b) == 1)
755627f7eb2Smrg {
756627f7eb2Smrg /* Treat it as a column matrix B[count,1] */
757627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
758627f7eb2Smrg
759627f7eb2Smrg /* bystride should never be used for 1-dimensional b.
760627f7eb2Smrg The value is only used for calculation of the
761627f7eb2Smrg memory by the buffer. */
762627f7eb2Smrg bystride = 256;
763627f7eb2Smrg ycount = 1;
764627f7eb2Smrg }
765627f7eb2Smrg else
766627f7eb2Smrg {
767627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
768627f7eb2Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1);
769627f7eb2Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1);
770627f7eb2Smrg }
771627f7eb2Smrg
772627f7eb2Smrg abase = a->base_addr;
773627f7eb2Smrg bbase = b->base_addr;
774627f7eb2Smrg dest = retarray->base_addr;
775627f7eb2Smrg
776627f7eb2Smrg /* Now that everything is set up, we perform the multiplication
777627f7eb2Smrg itself. */
778627f7eb2Smrg
779627f7eb2Smrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
780627f7eb2Smrg #define min(a,b) ((a) <= (b) ? (a) : (b))
781627f7eb2Smrg #define max(a,b) ((a) >= (b) ? (a) : (b))
782627f7eb2Smrg
783627f7eb2Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
784627f7eb2Smrg && (bxstride == 1 || bystride == 1)
785627f7eb2Smrg && (((float) xcount) * ((float) ycount) * ((float) count)
786627f7eb2Smrg > POW3(blas_limit)))
787627f7eb2Smrg {
788627f7eb2Smrg const int m = xcount, n = ycount, k = count, ldc = rystride;
789627f7eb2Smrg const GFC_REAL_10 one = 1, zero = 0;
790627f7eb2Smrg const int lda = (axstride == 1) ? aystride : axstride,
791627f7eb2Smrg ldb = (bxstride == 1) ? bystride : bxstride;
792627f7eb2Smrg
793627f7eb2Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
794627f7eb2Smrg {
795627f7eb2Smrg assert (gemm != NULL);
796627f7eb2Smrg const char *transa, *transb;
797627f7eb2Smrg if (try_blas & 2)
798627f7eb2Smrg transa = "C";
799627f7eb2Smrg else
800627f7eb2Smrg transa = axstride == 1 ? "N" : "T";
801627f7eb2Smrg
802627f7eb2Smrg if (try_blas & 4)
803627f7eb2Smrg transb = "C";
804627f7eb2Smrg else
805627f7eb2Smrg transb = bxstride == 1 ? "N" : "T";
806627f7eb2Smrg
807627f7eb2Smrg gemm (transa, transb , &m,
808627f7eb2Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
809627f7eb2Smrg &ldc, 1, 1);
810627f7eb2Smrg return;
811627f7eb2Smrg }
812627f7eb2Smrg }
813627f7eb2Smrg
814*4c3eb207Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1
815*4c3eb207Smrg && GFC_DESCRIPTOR_RANK (b) != 1)
816627f7eb2Smrg {
817627f7eb2Smrg /* This block of code implements a tuned matmul, derived from
818627f7eb2Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1
819627f7eb2Smrg
820627f7eb2Smrg Bo Kagstrom and Per Ling
821627f7eb2Smrg Department of Computing Science
822627f7eb2Smrg Umea University
823627f7eb2Smrg S-901 87 Umea, Sweden
824627f7eb2Smrg
825627f7eb2Smrg from netlib.org, translated to C, and modified for matmul.m4. */
826627f7eb2Smrg
827627f7eb2Smrg const GFC_REAL_10 *a, *b;
828627f7eb2Smrg GFC_REAL_10 *c;
829627f7eb2Smrg const index_type m = xcount, n = ycount, k = count;
830627f7eb2Smrg
831627f7eb2Smrg /* System generated locals */
832627f7eb2Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
833627f7eb2Smrg i1, i2, i3, i4, i5, i6;
834627f7eb2Smrg
835627f7eb2Smrg /* Local variables */
836627f7eb2Smrg GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
837627f7eb2Smrg f13, f14, f23, f24, f33, f34, f43, f44;
838627f7eb2Smrg index_type i, j, l, ii, jj, ll;
839627f7eb2Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec;
840627f7eb2Smrg GFC_REAL_10 *t1;
841627f7eb2Smrg
842627f7eb2Smrg a = abase;
843627f7eb2Smrg b = bbase;
844627f7eb2Smrg c = retarray->base_addr;
845627f7eb2Smrg
846627f7eb2Smrg /* Parameter adjustments */
847627f7eb2Smrg c_dim1 = rystride;
848627f7eb2Smrg c_offset = 1 + c_dim1;
849627f7eb2Smrg c -= c_offset;
850627f7eb2Smrg a_dim1 = aystride;
851627f7eb2Smrg a_offset = 1 + a_dim1;
852627f7eb2Smrg a -= a_offset;
853627f7eb2Smrg b_dim1 = bystride;
854627f7eb2Smrg b_offset = 1 + b_dim1;
855627f7eb2Smrg b -= b_offset;
856627f7eb2Smrg
857627f7eb2Smrg /* Empty c first. */
858627f7eb2Smrg for (j=1; j<=n; j++)
859627f7eb2Smrg for (i=1; i<=m; i++)
860627f7eb2Smrg c[i + j * c_dim1] = (GFC_REAL_10)0;
861627f7eb2Smrg
862627f7eb2Smrg /* Early exit if possible */
863627f7eb2Smrg if (m == 0 || n == 0 || k == 0)
864627f7eb2Smrg return;
865627f7eb2Smrg
866627f7eb2Smrg /* Adjust size of t1 to what is needed. */
867627f7eb2Smrg index_type t1_dim, a_sz;
868627f7eb2Smrg if (aystride == 1)
869627f7eb2Smrg a_sz = rystride;
870627f7eb2Smrg else
871627f7eb2Smrg a_sz = a_dim1;
872627f7eb2Smrg
873627f7eb2Smrg t1_dim = a_sz * 256 + b_dim1;
874627f7eb2Smrg if (t1_dim > 65536)
875627f7eb2Smrg t1_dim = 65536;
876627f7eb2Smrg
877627f7eb2Smrg t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
878627f7eb2Smrg
879627f7eb2Smrg /* Start turning the crank. */
880627f7eb2Smrg i1 = n;
881627f7eb2Smrg for (jj = 1; jj <= i1; jj += 512)
882627f7eb2Smrg {
883627f7eb2Smrg /* Computing MIN */
884627f7eb2Smrg i2 = 512;
885627f7eb2Smrg i3 = n - jj + 1;
886627f7eb2Smrg jsec = min(i2,i3);
887627f7eb2Smrg ujsec = jsec - jsec % 4;
888627f7eb2Smrg i2 = k;
889627f7eb2Smrg for (ll = 1; ll <= i2; ll += 256)
890627f7eb2Smrg {
891627f7eb2Smrg /* Computing MIN */
892627f7eb2Smrg i3 = 256;
893627f7eb2Smrg i4 = k - ll + 1;
894627f7eb2Smrg lsec = min(i3,i4);
895627f7eb2Smrg ulsec = lsec - lsec % 2;
896627f7eb2Smrg
897627f7eb2Smrg i3 = m;
898627f7eb2Smrg for (ii = 1; ii <= i3; ii += 256)
899627f7eb2Smrg {
900627f7eb2Smrg /* Computing MIN */
901627f7eb2Smrg i4 = 256;
902627f7eb2Smrg i5 = m - ii + 1;
903627f7eb2Smrg isec = min(i4,i5);
904627f7eb2Smrg uisec = isec - isec % 2;
905627f7eb2Smrg i4 = ll + ulsec - 1;
906627f7eb2Smrg for (l = ll; l <= i4; l += 2)
907627f7eb2Smrg {
908627f7eb2Smrg i5 = ii + uisec - 1;
909627f7eb2Smrg for (i = ii; i <= i5; i += 2)
910627f7eb2Smrg {
911627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
912627f7eb2Smrg a[i + l * a_dim1];
913627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
914627f7eb2Smrg a[i + (l + 1) * a_dim1];
915627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
916627f7eb2Smrg a[i + 1 + l * a_dim1];
917627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
918627f7eb2Smrg a[i + 1 + (l + 1) * a_dim1];
919627f7eb2Smrg }
920627f7eb2Smrg if (uisec < isec)
921627f7eb2Smrg {
922627f7eb2Smrg t1[l - ll + 1 + (isec << 8) - 257] =
923627f7eb2Smrg a[ii + isec - 1 + l * a_dim1];
924627f7eb2Smrg t1[l - ll + 2 + (isec << 8) - 257] =
925627f7eb2Smrg a[ii + isec - 1 + (l + 1) * a_dim1];
926627f7eb2Smrg }
927627f7eb2Smrg }
928627f7eb2Smrg if (ulsec < lsec)
929627f7eb2Smrg {
930627f7eb2Smrg i4 = ii + isec - 1;
931627f7eb2Smrg for (i = ii; i<= i4; ++i)
932627f7eb2Smrg {
933627f7eb2Smrg t1[lsec + ((i - ii + 1) << 8) - 257] =
934627f7eb2Smrg a[i + (ll + lsec - 1) * a_dim1];
935627f7eb2Smrg }
936627f7eb2Smrg }
937627f7eb2Smrg
938627f7eb2Smrg uisec = isec - isec % 4;
939627f7eb2Smrg i4 = jj + ujsec - 1;
940627f7eb2Smrg for (j = jj; j <= i4; j += 4)
941627f7eb2Smrg {
942627f7eb2Smrg i5 = ii + uisec - 1;
943627f7eb2Smrg for (i = ii; i <= i5; i += 4)
944627f7eb2Smrg {
945627f7eb2Smrg f11 = c[i + j * c_dim1];
946627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
947627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
948627f7eb2Smrg f22 = c[i + 1 + (j + 1) * c_dim1];
949627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
950627f7eb2Smrg f23 = c[i + 1 + (j + 2) * c_dim1];
951627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
952627f7eb2Smrg f24 = c[i + 1 + (j + 3) * c_dim1];
953627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
954627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
955627f7eb2Smrg f32 = c[i + 2 + (j + 1) * c_dim1];
956627f7eb2Smrg f42 = c[i + 3 + (j + 1) * c_dim1];
957627f7eb2Smrg f33 = c[i + 2 + (j + 2) * c_dim1];
958627f7eb2Smrg f43 = c[i + 3 + (j + 2) * c_dim1];
959627f7eb2Smrg f34 = c[i + 2 + (j + 3) * c_dim1];
960627f7eb2Smrg f44 = c[i + 3 + (j + 3) * c_dim1];
961627f7eb2Smrg i6 = ll + lsec - 1;
962627f7eb2Smrg for (l = ll; l <= i6; ++l)
963627f7eb2Smrg {
964627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
965627f7eb2Smrg * b[l + j * b_dim1];
966627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
967627f7eb2Smrg * b[l + j * b_dim1];
968627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
969627f7eb2Smrg * b[l + (j + 1) * b_dim1];
970627f7eb2Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
971627f7eb2Smrg * b[l + (j + 1) * b_dim1];
972627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
973627f7eb2Smrg * b[l + (j + 2) * b_dim1];
974627f7eb2Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
975627f7eb2Smrg * b[l + (j + 2) * b_dim1];
976627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
977627f7eb2Smrg * b[l + (j + 3) * b_dim1];
978627f7eb2Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
979627f7eb2Smrg * b[l + (j + 3) * b_dim1];
980627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
981627f7eb2Smrg * b[l + j * b_dim1];
982627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
983627f7eb2Smrg * b[l + j * b_dim1];
984627f7eb2Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
985627f7eb2Smrg * b[l + (j + 1) * b_dim1];
986627f7eb2Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
987627f7eb2Smrg * b[l + (j + 1) * b_dim1];
988627f7eb2Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
989627f7eb2Smrg * b[l + (j + 2) * b_dim1];
990627f7eb2Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
991627f7eb2Smrg * b[l + (j + 2) * b_dim1];
992627f7eb2Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
993627f7eb2Smrg * b[l + (j + 3) * b_dim1];
994627f7eb2Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
995627f7eb2Smrg * b[l + (j + 3) * b_dim1];
996627f7eb2Smrg }
997627f7eb2Smrg c[i + j * c_dim1] = f11;
998627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
999627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
1000627f7eb2Smrg c[i + 1 + (j + 1) * c_dim1] = f22;
1001627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
1002627f7eb2Smrg c[i + 1 + (j + 2) * c_dim1] = f23;
1003627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
1004627f7eb2Smrg c[i + 1 + (j + 3) * c_dim1] = f24;
1005627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
1006627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
1007627f7eb2Smrg c[i + 2 + (j + 1) * c_dim1] = f32;
1008627f7eb2Smrg c[i + 3 + (j + 1) * c_dim1] = f42;
1009627f7eb2Smrg c[i + 2 + (j + 2) * c_dim1] = f33;
1010627f7eb2Smrg c[i + 3 + (j + 2) * c_dim1] = f43;
1011627f7eb2Smrg c[i + 2 + (j + 3) * c_dim1] = f34;
1012627f7eb2Smrg c[i + 3 + (j + 3) * c_dim1] = f44;
1013627f7eb2Smrg }
1014627f7eb2Smrg if (uisec < isec)
1015627f7eb2Smrg {
1016627f7eb2Smrg i5 = ii + isec - 1;
1017627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
1018627f7eb2Smrg {
1019627f7eb2Smrg f11 = c[i + j * c_dim1];
1020627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1];
1021627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1];
1022627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1];
1023627f7eb2Smrg i6 = ll + lsec - 1;
1024627f7eb2Smrg for (l = ll; l <= i6; ++l)
1025627f7eb2Smrg {
1026627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027627f7eb2Smrg 257] * b[l + j * b_dim1];
1028627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029627f7eb2Smrg 257] * b[l + (j + 1) * b_dim1];
1030627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031627f7eb2Smrg 257] * b[l + (j + 2) * b_dim1];
1032627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033627f7eb2Smrg 257] * b[l + (j + 3) * b_dim1];
1034627f7eb2Smrg }
1035627f7eb2Smrg c[i + j * c_dim1] = f11;
1036627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12;
1037627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13;
1038627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14;
1039627f7eb2Smrg }
1040627f7eb2Smrg }
1041627f7eb2Smrg }
1042627f7eb2Smrg if (ujsec < jsec)
1043627f7eb2Smrg {
1044627f7eb2Smrg i4 = jj + jsec - 1;
1045627f7eb2Smrg for (j = jj + ujsec; j <= i4; ++j)
1046627f7eb2Smrg {
1047627f7eb2Smrg i5 = ii + uisec - 1;
1048627f7eb2Smrg for (i = ii; i <= i5; i += 4)
1049627f7eb2Smrg {
1050627f7eb2Smrg f11 = c[i + j * c_dim1];
1051627f7eb2Smrg f21 = c[i + 1 + j * c_dim1];
1052627f7eb2Smrg f31 = c[i + 2 + j * c_dim1];
1053627f7eb2Smrg f41 = c[i + 3 + j * c_dim1];
1054627f7eb2Smrg i6 = ll + lsec - 1;
1055627f7eb2Smrg for (l = ll; l <= i6; ++l)
1056627f7eb2Smrg {
1057627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058627f7eb2Smrg 257] * b[l + j * b_dim1];
1059627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060627f7eb2Smrg 257] * b[l + j * b_dim1];
1061627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062627f7eb2Smrg 257] * b[l + j * b_dim1];
1063627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064627f7eb2Smrg 257] * b[l + j * b_dim1];
1065627f7eb2Smrg }
1066627f7eb2Smrg c[i + j * c_dim1] = f11;
1067627f7eb2Smrg c[i + 1 + j * c_dim1] = f21;
1068627f7eb2Smrg c[i + 2 + j * c_dim1] = f31;
1069627f7eb2Smrg c[i + 3 + j * c_dim1] = f41;
1070627f7eb2Smrg }
1071627f7eb2Smrg i5 = ii + isec - 1;
1072627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i)
1073627f7eb2Smrg {
1074627f7eb2Smrg f11 = c[i + j * c_dim1];
1075627f7eb2Smrg i6 = ll + lsec - 1;
1076627f7eb2Smrg for (l = ll; l <= i6; ++l)
1077627f7eb2Smrg {
1078627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079627f7eb2Smrg 257] * b[l + j * b_dim1];
1080627f7eb2Smrg }
1081627f7eb2Smrg c[i + j * c_dim1] = f11;
1082627f7eb2Smrg }
1083627f7eb2Smrg }
1084627f7eb2Smrg }
1085627f7eb2Smrg }
1086627f7eb2Smrg }
1087627f7eb2Smrg }
1088627f7eb2Smrg free(t1);
1089627f7eb2Smrg return;
1090627f7eb2Smrg }
1091627f7eb2Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092627f7eb2Smrg {
1093627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) != 1)
1094627f7eb2Smrg {
1095627f7eb2Smrg const GFC_REAL_10 *restrict abase_x;
1096627f7eb2Smrg const GFC_REAL_10 *restrict bbase_y;
1097627f7eb2Smrg GFC_REAL_10 *restrict dest_y;
1098627f7eb2Smrg GFC_REAL_10 s;
1099627f7eb2Smrg
1100627f7eb2Smrg for (y = 0; y < ycount; y++)
1101627f7eb2Smrg {
1102627f7eb2Smrg bbase_y = &bbase[y*bystride];
1103627f7eb2Smrg dest_y = &dest[y*rystride];
1104627f7eb2Smrg for (x = 0; x < xcount; x++)
1105627f7eb2Smrg {
1106627f7eb2Smrg abase_x = &abase[x*axstride];
1107627f7eb2Smrg s = (GFC_REAL_10) 0;
1108627f7eb2Smrg for (n = 0; n < count; n++)
1109627f7eb2Smrg s += abase_x[n] * bbase_y[n];
1110627f7eb2Smrg dest_y[x] = s;
1111627f7eb2Smrg }
1112627f7eb2Smrg }
1113627f7eb2Smrg }
1114627f7eb2Smrg else
1115627f7eb2Smrg {
1116627f7eb2Smrg const GFC_REAL_10 *restrict bbase_y;
1117627f7eb2Smrg GFC_REAL_10 s;
1118627f7eb2Smrg
1119627f7eb2Smrg for (y = 0; y < ycount; y++)
1120627f7eb2Smrg {
1121627f7eb2Smrg bbase_y = &bbase[y*bystride];
1122627f7eb2Smrg s = (GFC_REAL_10) 0;
1123627f7eb2Smrg for (n = 0; n < count; n++)
1124627f7eb2Smrg s += abase[n*axstride] * bbase_y[n];
1125627f7eb2Smrg dest[y*rystride] = s;
1126627f7eb2Smrg }
1127627f7eb2Smrg }
1128627f7eb2Smrg }
1129627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1)
1130627f7eb2Smrg {
1131627f7eb2Smrg const GFC_REAL_10 *restrict bbase_y;
1132627f7eb2Smrg GFC_REAL_10 s;
1133627f7eb2Smrg
1134627f7eb2Smrg for (y = 0; y < ycount; y++)
1135627f7eb2Smrg {
1136627f7eb2Smrg bbase_y = &bbase[y*bystride];
1137627f7eb2Smrg s = (GFC_REAL_10) 0;
1138627f7eb2Smrg for (n = 0; n < count; n++)
1139627f7eb2Smrg s += abase[n*axstride] * bbase_y[n*bxstride];
1140627f7eb2Smrg dest[y*rxstride] = s;
1141627f7eb2Smrg }
1142627f7eb2Smrg }
1143*4c3eb207Smrg else if (axstride < aystride)
1144*4c3eb207Smrg {
1145*4c3eb207Smrg for (y = 0; y < ycount; y++)
1146*4c3eb207Smrg for (x = 0; x < xcount; x++)
1147*4c3eb207Smrg dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
1148*4c3eb207Smrg
1149*4c3eb207Smrg for (y = 0; y < ycount; y++)
1150*4c3eb207Smrg for (n = 0; n < count; n++)
1151*4c3eb207Smrg for (x = 0; x < xcount; x++)
1152*4c3eb207Smrg /* dest[x,y] += a[x,n] * b[n,y] */
1153*4c3eb207Smrg dest[x*rxstride + y*rystride] +=
1154*4c3eb207Smrg abase[x*axstride + n*aystride] *
1155*4c3eb207Smrg bbase[n*bxstride + y*bystride];
1156*4c3eb207Smrg }
1157627f7eb2Smrg else
1158627f7eb2Smrg {
1159627f7eb2Smrg const GFC_REAL_10 *restrict abase_x;
1160627f7eb2Smrg const GFC_REAL_10 *restrict bbase_y;
1161627f7eb2Smrg GFC_REAL_10 *restrict dest_y;
1162627f7eb2Smrg GFC_REAL_10 s;
1163627f7eb2Smrg
1164627f7eb2Smrg for (y = 0; y < ycount; y++)
1165627f7eb2Smrg {
1166627f7eb2Smrg bbase_y = &bbase[y*bystride];
1167627f7eb2Smrg dest_y = &dest[y*rystride];
1168627f7eb2Smrg for (x = 0; x < xcount; x++)
1169627f7eb2Smrg {
1170627f7eb2Smrg abase_x = &abase[x*axstride];
1171627f7eb2Smrg s = (GFC_REAL_10) 0;
1172627f7eb2Smrg for (n = 0; n < count; n++)
1173627f7eb2Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174627f7eb2Smrg dest_y[x*rxstride] = s;
1175627f7eb2Smrg }
1176627f7eb2Smrg }
1177627f7eb2Smrg }
1178627f7eb2Smrg }
1179627f7eb2Smrg #undef POW3
1180627f7eb2Smrg #undef min
1181627f7eb2Smrg #undef max
1182627f7eb2Smrg
1183627f7eb2Smrg #endif
1184627f7eb2Smrg
1185627f7eb2Smrg #endif
1186627f7eb2Smrg
1187