/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2020 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <string.h>
#include <assert.h>


#if defined (HAVE_GFC_REAL_4)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
                          const int *, const GFC_REAL_4 *, const int *,
                          const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/
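/* For illustration: in the plain (non-transposed) rank-2 case the
   pseudo-code above is equivalent to the following naive C loop nest,
   assuming contiguous column-major operands and C zeroed beforehand,
   with i, j, k, m, n and count mirroring the names in the pseudo-code.
   This is only a sketch of the reference semantics; the kernels below
   compute the same result, blocked and compiled per instruction set.

     for (j = 0; j < n; j++)
       for (k = 0; k < count; k++)
         for (i = 0; i < m; i++)
           c[i + j*m] += a[i + k*m] * b[k + j*count];
*/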
/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
export_proto(matmul_r4);

/* Put an exhaustive list of possible architectures here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

#ifdef HAVE_AVX
static void
matmul_r4_avx (gfc_array_r4 * const restrict retarray,
	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
matmul_r4_avx (gfc_array_r4 * const restrict retarray,
	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  const GFC_REAL_4 * restrict abase;
  const GFC_REAL_4 * restrict bbase;
  GFC_REAL_4 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
	  || GFC_DESCRIPTOR_RANK (b) == 2);

  /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

     Either A or B (but not both) can be rank 1:

     o One-dimensional argument A is implicitly treated as a row matrix
       dimensioned [1,count], so xcount=1.

     o One-dimensional argument B is implicitly treated as a column matrix
       dimensioned [count, 1], so ycount=1.  */

  if (retarray->base_addr == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
	{
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
			    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
	}
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
	{
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
			    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
	}
      else
	{
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
			    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

	  GFC_DIMENSION_SET(retarray->dim[1], 0,
			    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
			    GFC_DESCRIPTOR_EXTENT(retarray,0));
	}

      retarray->base_addr
	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 1 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);
	}
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 1 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);
	}
      else
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 1 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);

	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 2 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);
	}
    }


  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
	 either as a row or a column matrix.  We want both cases to
	 work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }


  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
		       "in dimension 1: is %ld, should be %ld",
		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1].  */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
	 The value is only used for calculation of the memory
	 needed for the buffer.  */
      bystride = 256;
      ycount = 1;
    }
  else
    {
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    }

  abase = a->base_addr;
  bbase = b->base_addr;
  dest = retarray->base_addr;

  /* Now that everything is set up, we perform the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
	  > POW3(blas_limit)))
    {
      const int m = xcount, n = ycount, k = count, ldc = rystride;
      const GFC_REAL_4 one = 1, zero = 0;
      const int lda = (axstride == 1) ? aystride : axstride,
		ldb = (bxstride == 1) ? bystride : bxstride;

      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
	{
	  assert (gemm != NULL);
	  const char *transa, *transb;
	  if (try_blas & 2)
	    transa = "C";
	  else
	    transa = axstride == 1 ? "N" : "T";

	  if (try_blas & 4)
	    transb = "C";
	  else
	    transb = bxstride == 1 ? "N" : "T";

	  gemm (transa, transb, &m,
		&n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
		&ldc, 1, 1);
	  return;
	}
    }

  if (rxstride == 1 && axstride == 1 && bxstride == 1
      && GFC_DESCRIPTOR_RANK (b) != 1)
    {
      /* This block of code implements a tuned matmul, derived from
	 Superscalar GEMM-based level 3 BLAS, Beta version 0.1

	       Bo Kagstrom and Per Ling
	       Department of Computing Science
	       Umea University
	       S-901 87 Umea, Sweden

	 from netlib.org, translated to C, and modified for matmul.m4.
*/ 291 292 const GFC_REAL_4 *a, *b; 293 GFC_REAL_4 *c; 294 const index_type m = xcount, n = ycount, k = count; 295 296 /* System generated locals */ 297 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 298 i1, i2, i3, i4, i5, i6; 299 300 /* Local variables */ 301 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42, 302 f13, f14, f23, f24, f33, f34, f43, f44; 303 index_type i, j, l, ii, jj, ll; 304 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 305 GFC_REAL_4 *t1; 306 307 a = abase; 308 b = bbase; 309 c = retarray->base_addr; 310 311 /* Parameter adjustments */ 312 c_dim1 = rystride; 313 c_offset = 1 + c_dim1; 314 c -= c_offset; 315 a_dim1 = aystride; 316 a_offset = 1 + a_dim1; 317 a -= a_offset; 318 b_dim1 = bystride; 319 b_offset = 1 + b_dim1; 320 b -= b_offset; 321 322 /* Empty c first. */ 323 for (j=1; j<=n; j++) 324 for (i=1; i<=m; i++) 325 c[i + j * c_dim1] = (GFC_REAL_4)0; 326 327 /* Early exit if possible */ 328 if (m == 0 || n == 0 || k == 0) 329 return; 330 331 /* Adjust size of t1 to what is needed. */ 332 index_type t1_dim, a_sz; 333 if (aystride == 1) 334 a_sz = rystride; 335 else 336 a_sz = a_dim1; 337 338 t1_dim = a_sz * 256 + b_dim1; 339 if (t1_dim > 65536) 340 t1_dim = 65536; 341 342 t1 = malloc (t1_dim * sizeof(GFC_REAL_4)); 343 344 /* Start turning the crank. */ 345 i1 = n; 346 for (jj = 1; jj <= i1; jj += 512) 347 { 348 /* Computing MIN */ 349 i2 = 512; 350 i3 = n - jj + 1; 351 jsec = min(i2,i3); 352 ujsec = jsec - jsec % 4; 353 i2 = k; 354 for (ll = 1; ll <= i2; ll += 256) 355 { 356 /* Computing MIN */ 357 i3 = 256; 358 i4 = k - ll + 1; 359 lsec = min(i3,i4); 360 ulsec = lsec - lsec % 2; 361 362 i3 = m; 363 for (ii = 1; ii <= i3; ii += 256) 364 { 365 /* Computing MIN */ 366 i4 = 256; 367 i5 = m - ii + 1; 368 isec = min(i4,i5); 369 uisec = isec - isec % 2; 370 i4 = ll + ulsec - 1; 371 for (l = ll; l <= i4; l += 2) 372 { 373 i5 = ii + uisec - 1; 374 for (i = ii; i <= i5; i += 2) 375 { 376 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 377 a[i + l * a_dim1]; 378 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 379 a[i + (l + 1) * a_dim1]; 380 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 381 a[i + 1 + l * a_dim1]; 382 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 383 a[i + 1 + (l + 1) * a_dim1]; 384 } 385 if (uisec < isec) 386 { 387 t1[l - ll + 1 + (isec << 8) - 257] = 388 a[ii + isec - 1 + l * a_dim1]; 389 t1[l - ll + 2 + (isec << 8) - 257] = 390 a[ii + isec - 1 + (l + 1) * a_dim1]; 391 } 392 } 393 if (ulsec < lsec) 394 { 395 i4 = ii + isec - 1; 396 for (i = ii; i<= i4; ++i) 397 { 398 t1[lsec + ((i - ii + 1) << 8) - 257] = 399 a[i + (ll + lsec - 1) * a_dim1]; 400 } 401 } 402 403 uisec = isec - isec % 4; 404 i4 = jj + ujsec - 1; 405 for (j = jj; j <= i4; j += 4) 406 { 407 i5 = ii + uisec - 1; 408 for (i = ii; i <= i5; i += 4) 409 { 410 f11 = c[i + j * c_dim1]; 411 f21 = c[i + 1 + j * c_dim1]; 412 f12 = c[i + (j + 1) * c_dim1]; 413 f22 = c[i + 1 + (j + 1) * c_dim1]; 414 f13 = c[i + (j + 2) * c_dim1]; 415 f23 = c[i + 1 + (j + 2) * c_dim1]; 416 f14 = c[i + (j + 3) * c_dim1]; 417 f24 = c[i + 1 + (j + 3) * c_dim1]; 418 f31 = c[i + 2 + j * c_dim1]; 419 f41 = c[i + 3 + j * c_dim1]; 420 f32 = c[i + 2 + (j + 1) * c_dim1]; 421 f42 = c[i + 3 + (j + 1) * c_dim1]; 422 f33 = c[i + 2 + (j + 2) * c_dim1]; 423 f43 = c[i + 3 + (j + 2) * c_dim1]; 424 f34 = c[i + 2 + (j + 3) * c_dim1]; 425 f44 = c[i + 3 + (j + 3) * c_dim1]; 426 i6 = ll + lsec - 1; 427 for (l = ll; l <= i6; ++l) 428 { 429 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 430 * b[l + j * b_dim1]; 431 f21 += t1[l - ll + 
1 + ((i - ii + 2) << 8) - 257] 432 * b[l + j * b_dim1]; 433 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 434 * b[l + (j + 1) * b_dim1]; 435 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 436 * b[l + (j + 1) * b_dim1]; 437 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 438 * b[l + (j + 2) * b_dim1]; 439 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 440 * b[l + (j + 2) * b_dim1]; 441 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 442 * b[l + (j + 3) * b_dim1]; 443 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 444 * b[l + (j + 3) * b_dim1]; 445 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 446 * b[l + j * b_dim1]; 447 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 448 * b[l + j * b_dim1]; 449 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 450 * b[l + (j + 1) * b_dim1]; 451 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 452 * b[l + (j + 1) * b_dim1]; 453 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 454 * b[l + (j + 2) * b_dim1]; 455 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 456 * b[l + (j + 2) * b_dim1]; 457 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 458 * b[l + (j + 3) * b_dim1]; 459 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 460 * b[l + (j + 3) * b_dim1]; 461 } 462 c[i + j * c_dim1] = f11; 463 c[i + 1 + j * c_dim1] = f21; 464 c[i + (j + 1) * c_dim1] = f12; 465 c[i + 1 + (j + 1) * c_dim1] = f22; 466 c[i + (j + 2) * c_dim1] = f13; 467 c[i + 1 + (j + 2) * c_dim1] = f23; 468 c[i + (j + 3) * c_dim1] = f14; 469 c[i + 1 + (j + 3) * c_dim1] = f24; 470 c[i + 2 + j * c_dim1] = f31; 471 c[i + 3 + j * c_dim1] = f41; 472 c[i + 2 + (j + 1) * c_dim1] = f32; 473 c[i + 3 + (j + 1) * c_dim1] = f42; 474 c[i + 2 + (j + 2) * c_dim1] = f33; 475 c[i + 3 + (j + 2) * c_dim1] = f43; 476 c[i + 2 + (j + 3) * c_dim1] = f34; 477 c[i + 3 + (j + 3) * c_dim1] = f44; 478 } 479 if (uisec < isec) 480 { 481 i5 = ii + isec - 1; 482 for (i = ii + uisec; i <= i5; ++i) 483 { 484 f11 = c[i + j * c_dim1]; 485 f12 = c[i + (j + 1) * c_dim1]; 486 f13 = c[i + (j + 2) * c_dim1]; 487 f14 = c[i + (j + 3) * c_dim1]; 488 i6 = ll + lsec - 1; 489 for (l = ll; l <= i6; ++l) 490 { 491 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 492 257] * b[l + j * b_dim1]; 493 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 494 257] * b[l + (j + 1) * b_dim1]; 495 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 496 257] * b[l + (j + 2) * b_dim1]; 497 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 498 257] * b[l + (j + 3) * b_dim1]; 499 } 500 c[i + j * c_dim1] = f11; 501 c[i + (j + 1) * c_dim1] = f12; 502 c[i + (j + 2) * c_dim1] = f13; 503 c[i + (j + 3) * c_dim1] = f14; 504 } 505 } 506 } 507 if (ujsec < jsec) 508 { 509 i4 = jj + jsec - 1; 510 for (j = jj + ujsec; j <= i4; ++j) 511 { 512 i5 = ii + uisec - 1; 513 for (i = ii; i <= i5; i += 4) 514 { 515 f11 = c[i + j * c_dim1]; 516 f21 = c[i + 1 + j * c_dim1]; 517 f31 = c[i + 2 + j * c_dim1]; 518 f41 = c[i + 3 + j * c_dim1]; 519 i6 = ll + lsec - 1; 520 for (l = ll; l <= i6; ++l) 521 { 522 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 523 257] * b[l + j * b_dim1]; 524 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 525 257] * b[l + j * b_dim1]; 526 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 527 257] * b[l + j * b_dim1]; 528 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 529 257] * b[l + j * b_dim1]; 530 } 531 c[i + j * c_dim1] = f11; 532 c[i + 1 + j * c_dim1] = f21; 533 c[i + 2 + j * c_dim1] = f31; 534 c[i + 3 + j * c_dim1] = f41; 535 } 536 i5 = ii + isec - 1; 537 for (i = ii + uisec; i <= i5; ++i) 538 { 539 f11 = c[i + j * c_dim1]; 540 i6 = ll + lsec - 1; 541 for (l 
= ll; l <= i6; ++l) 542 { 543 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 544 257] * b[l + j * b_dim1]; 545 } 546 c[i + j * c_dim1] = f11; 547 } 548 } 549 } 550 } 551 } 552 } 553 free(t1); 554 return; 555 } 556 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 557 { 558 if (GFC_DESCRIPTOR_RANK (a) != 1) 559 { 560 const GFC_REAL_4 *restrict abase_x; 561 const GFC_REAL_4 *restrict bbase_y; 562 GFC_REAL_4 *restrict dest_y; 563 GFC_REAL_4 s; 564 565 for (y = 0; y < ycount; y++) 566 { 567 bbase_y = &bbase[y*bystride]; 568 dest_y = &dest[y*rystride]; 569 for (x = 0; x < xcount; x++) 570 { 571 abase_x = &abase[x*axstride]; 572 s = (GFC_REAL_4) 0; 573 for (n = 0; n < count; n++) 574 s += abase_x[n] * bbase_y[n]; 575 dest_y[x] = s; 576 } 577 } 578 } 579 else 580 { 581 const GFC_REAL_4 *restrict bbase_y; 582 GFC_REAL_4 s; 583 584 for (y = 0; y < ycount; y++) 585 { 586 bbase_y = &bbase[y*bystride]; 587 s = (GFC_REAL_4) 0; 588 for (n = 0; n < count; n++) 589 s += abase[n*axstride] * bbase_y[n]; 590 dest[y*rystride] = s; 591 } 592 } 593 } 594 else if (GFC_DESCRIPTOR_RANK (a) == 1) 595 { 596 const GFC_REAL_4 *restrict bbase_y; 597 GFC_REAL_4 s; 598 599 for (y = 0; y < ycount; y++) 600 { 601 bbase_y = &bbase[y*bystride]; 602 s = (GFC_REAL_4) 0; 603 for (n = 0; n < count; n++) 604 s += abase[n*axstride] * bbase_y[n*bxstride]; 605 dest[y*rxstride] = s; 606 } 607 } 608 else if (axstride < aystride) 609 { 610 for (y = 0; y < ycount; y++) 611 for (x = 0; x < xcount; x++) 612 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0; 613 614 for (y = 0; y < ycount; y++) 615 for (n = 0; n < count; n++) 616 for (x = 0; x < xcount; x++) 617 /* dest[x,y] += a[x,n] * b[n,y] */ 618 dest[x*rxstride + y*rystride] += 619 abase[x*axstride + n*aystride] * 620 bbase[n*bxstride + y*bystride]; 621 } 622 else 623 { 624 const GFC_REAL_4 *restrict abase_x; 625 const GFC_REAL_4 *restrict bbase_y; 626 GFC_REAL_4 *restrict dest_y; 627 GFC_REAL_4 s; 628 629 for (y = 0; y < ycount; y++) 630 { 631 bbase_y = &bbase[y*bystride]; 632 dest_y = &dest[y*rystride]; 633 for (x = 0; x < xcount; x++) 634 { 635 abase_x = &abase[x*axstride]; 636 s = (GFC_REAL_4) 0; 637 for (n = 0; n < count; n++) 638 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 639 dest_y[x*rxstride] = s; 640 } 641 } 642 } 643 } 644 #undef POW3 645 #undef min 646 #undef max 647 648 #endif /* HAVE_AVX */ 649 650 #ifdef HAVE_AVX2 651 static void 652 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, 653 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, 654 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); 655 static void 656 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, 657 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, 658 int blas_limit, blas_call gemm) 659 { 660 const GFC_REAL_4 * restrict abase; 661 const GFC_REAL_4 * restrict bbase; 662 GFC_REAL_4 * restrict dest; 663 664 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 665 index_type x, y, n, count, xcount, ycount; 666 667 assert (GFC_DESCRIPTOR_RANK (a) == 2 668 || GFC_DESCRIPTOR_RANK (b) == 2); 669 670 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 671 672 Either A or B (but not both) can be rank 1: 673 674 o One-dimensional argument A is implicitly treated as a row matrix 675 dimensioned [1,count], so xcount=1. 676 677 o One-dimensional argument B is implicitly treated as a column matrix 678 dimensioned [count, 1], so ycount=1. 
679 */ 680 681 if (retarray->base_addr == NULL) 682 { 683 if (GFC_DESCRIPTOR_RANK (a) == 1) 684 { 685 GFC_DIMENSION_SET(retarray->dim[0], 0, 686 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 687 } 688 else if (GFC_DESCRIPTOR_RANK (b) == 1) 689 { 690 GFC_DIMENSION_SET(retarray->dim[0], 0, 691 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 692 } 693 else 694 { 695 GFC_DIMENSION_SET(retarray->dim[0], 0, 696 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 697 698 GFC_DIMENSION_SET(retarray->dim[1], 0, 699 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 700 GFC_DESCRIPTOR_EXTENT(retarray,0)); 701 } 702 703 retarray->base_addr 704 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4)); 705 retarray->offset = 0; 706 } 707 else if (unlikely (compile_options.bounds_check)) 708 { 709 index_type ret_extent, arg_extent; 710 711 if (GFC_DESCRIPTOR_RANK (a) == 1) 712 { 713 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 714 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 715 if (arg_extent != ret_extent) 716 runtime_error ("Array bound mismatch for dimension 1 of " 717 "array (%ld/%ld) ", 718 (long int) ret_extent, (long int) arg_extent); 719 } 720 else if (GFC_DESCRIPTOR_RANK (b) == 1) 721 { 722 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 723 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 724 if (arg_extent != ret_extent) 725 runtime_error ("Array bound mismatch for dimension 1 of " 726 "array (%ld/%ld) ", 727 (long int) ret_extent, (long int) arg_extent); 728 } 729 else 730 { 731 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 732 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 733 if (arg_extent != ret_extent) 734 runtime_error ("Array bound mismatch for dimension 1 of " 735 "array (%ld/%ld) ", 736 (long int) ret_extent, (long int) arg_extent); 737 738 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 739 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 740 if (arg_extent != ret_extent) 741 runtime_error ("Array bound mismatch for dimension 2 of " 742 "array (%ld/%ld) ", 743 (long int) ret_extent, (long int) arg_extent); 744 } 745 } 746 747 748 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 749 { 750 /* One-dimensional result may be addressed in the code below 751 either as a row or a column matrix. We want both cases to 752 work. */ 753 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 754 } 755 else 756 { 757 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 758 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 759 } 760 761 762 if (GFC_DESCRIPTOR_RANK (a) == 1) 763 { 764 /* Treat it as a a row matrix A[1,count]. */ 765 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 766 aystride = 1; 767 768 xcount = 1; 769 count = GFC_DESCRIPTOR_EXTENT(a,0); 770 } 771 else 772 { 773 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 774 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 775 776 count = GFC_DESCRIPTOR_EXTENT(a,1); 777 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 778 } 779 780 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 781 { 782 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 783 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 784 "in dimension 1: is %ld, should be %ld", 785 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 786 } 787 788 if (GFC_DESCRIPTOR_RANK (b) == 1) 789 { 790 /* Treat it as a column matrix B[count,1] */ 791 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 792 793 /* bystride should never be used for 1-dimensional b. 794 The value is only used for calculation of the 795 memory by the buffer. 
*/ 796 bystride = 256; 797 ycount = 1; 798 } 799 else 800 { 801 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 802 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 803 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 804 } 805 806 abase = a->base_addr; 807 bbase = b->base_addr; 808 dest = retarray->base_addr; 809 810 /* Now that everything is set up, we perform the multiplication 811 itself. */ 812 813 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 814 #define min(a,b) ((a) <= (b) ? (a) : (b)) 815 #define max(a,b) ((a) >= (b) ? (a) : (b)) 816 817 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 818 && (bxstride == 1 || bystride == 1) 819 && (((float) xcount) * ((float) ycount) * ((float) count) 820 > POW3(blas_limit))) 821 { 822 const int m = xcount, n = ycount, k = count, ldc = rystride; 823 const GFC_REAL_4 one = 1, zero = 0; 824 const int lda = (axstride == 1) ? aystride : axstride, 825 ldb = (bxstride == 1) ? bystride : bxstride; 826 827 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 828 { 829 assert (gemm != NULL); 830 const char *transa, *transb; 831 if (try_blas & 2) 832 transa = "C"; 833 else 834 transa = axstride == 1 ? "N" : "T"; 835 836 if (try_blas & 4) 837 transb = "C"; 838 else 839 transb = bxstride == 1 ? "N" : "T"; 840 841 gemm (transa, transb , &m, 842 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 843 &ldc, 1, 1); 844 return; 845 } 846 } 847 848 if (rxstride == 1 && axstride == 1 && bxstride == 1 849 && GFC_DESCRIPTOR_RANK (b) != 1) 850 { 851 /* This block of code implements a tuned matmul, derived from 852 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 853 854 Bo Kagstrom and Per Ling 855 Department of Computing Science 856 Umea University 857 S-901 87 Umea, Sweden 858 859 from netlib.org, translated to C, and modified for matmul.m4. */ 860 861 const GFC_REAL_4 *a, *b; 862 GFC_REAL_4 *c; 863 const index_type m = xcount, n = ycount, k = count; 864 865 /* System generated locals */ 866 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 867 i1, i2, i3, i4, i5, i6; 868 869 /* Local variables */ 870 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42, 871 f13, f14, f23, f24, f33, f34, f43, f44; 872 index_type i, j, l, ii, jj, ll; 873 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 874 GFC_REAL_4 *t1; 875 876 a = abase; 877 b = bbase; 878 c = retarray->base_addr; 879 880 /* Parameter adjustments */ 881 c_dim1 = rystride; 882 c_offset = 1 + c_dim1; 883 c -= c_offset; 884 a_dim1 = aystride; 885 a_offset = 1 + a_dim1; 886 a -= a_offset; 887 b_dim1 = bystride; 888 b_offset = 1 + b_dim1; 889 b -= b_offset; 890 891 /* Empty c first. */ 892 for (j=1; j<=n; j++) 893 for (i=1; i<=m; i++) 894 c[i + j * c_dim1] = (GFC_REAL_4)0; 895 896 /* Early exit if possible */ 897 if (m == 0 || n == 0 || k == 0) 898 return; 899 900 /* Adjust size of t1 to what is needed. */ 901 index_type t1_dim, a_sz; 902 if (aystride == 1) 903 a_sz = rystride; 904 else 905 a_sz = a_dim1; 906 907 t1_dim = a_sz * 256 + b_dim1; 908 if (t1_dim > 65536) 909 t1_dim = 65536; 910 911 t1 = malloc (t1_dim * sizeof(GFC_REAL_4)); 912 913 /* Start turning the crank. 
*/ 914 i1 = n; 915 for (jj = 1; jj <= i1; jj += 512) 916 { 917 /* Computing MIN */ 918 i2 = 512; 919 i3 = n - jj + 1; 920 jsec = min(i2,i3); 921 ujsec = jsec - jsec % 4; 922 i2 = k; 923 for (ll = 1; ll <= i2; ll += 256) 924 { 925 /* Computing MIN */ 926 i3 = 256; 927 i4 = k - ll + 1; 928 lsec = min(i3,i4); 929 ulsec = lsec - lsec % 2; 930 931 i3 = m; 932 for (ii = 1; ii <= i3; ii += 256) 933 { 934 /* Computing MIN */ 935 i4 = 256; 936 i5 = m - ii + 1; 937 isec = min(i4,i5); 938 uisec = isec - isec % 2; 939 i4 = ll + ulsec - 1; 940 for (l = ll; l <= i4; l += 2) 941 { 942 i5 = ii + uisec - 1; 943 for (i = ii; i <= i5; i += 2) 944 { 945 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 946 a[i + l * a_dim1]; 947 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 948 a[i + (l + 1) * a_dim1]; 949 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 950 a[i + 1 + l * a_dim1]; 951 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 952 a[i + 1 + (l + 1) * a_dim1]; 953 } 954 if (uisec < isec) 955 { 956 t1[l - ll + 1 + (isec << 8) - 257] = 957 a[ii + isec - 1 + l * a_dim1]; 958 t1[l - ll + 2 + (isec << 8) - 257] = 959 a[ii + isec - 1 + (l + 1) * a_dim1]; 960 } 961 } 962 if (ulsec < lsec) 963 { 964 i4 = ii + isec - 1; 965 for (i = ii; i<= i4; ++i) 966 { 967 t1[lsec + ((i - ii + 1) << 8) - 257] = 968 a[i + (ll + lsec - 1) * a_dim1]; 969 } 970 } 971 972 uisec = isec - isec % 4; 973 i4 = jj + ujsec - 1; 974 for (j = jj; j <= i4; j += 4) 975 { 976 i5 = ii + uisec - 1; 977 for (i = ii; i <= i5; i += 4) 978 { 979 f11 = c[i + j * c_dim1]; 980 f21 = c[i + 1 + j * c_dim1]; 981 f12 = c[i + (j + 1) * c_dim1]; 982 f22 = c[i + 1 + (j + 1) * c_dim1]; 983 f13 = c[i + (j + 2) * c_dim1]; 984 f23 = c[i + 1 + (j + 2) * c_dim1]; 985 f14 = c[i + (j + 3) * c_dim1]; 986 f24 = c[i + 1 + (j + 3) * c_dim1]; 987 f31 = c[i + 2 + j * c_dim1]; 988 f41 = c[i + 3 + j * c_dim1]; 989 f32 = c[i + 2 + (j + 1) * c_dim1]; 990 f42 = c[i + 3 + (j + 1) * c_dim1]; 991 f33 = c[i + 2 + (j + 2) * c_dim1]; 992 f43 = c[i + 3 + (j + 2) * c_dim1]; 993 f34 = c[i + 2 + (j + 3) * c_dim1]; 994 f44 = c[i + 3 + (j + 3) * c_dim1]; 995 i6 = ll + lsec - 1; 996 for (l = ll; l <= i6; ++l) 997 { 998 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 999 * b[l + j * b_dim1]; 1000 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1001 * b[l + j * b_dim1]; 1002 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1003 * b[l + (j + 1) * b_dim1]; 1004 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1005 * b[l + (j + 1) * b_dim1]; 1006 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1007 * b[l + (j + 2) * b_dim1]; 1008 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1009 * b[l + (j + 2) * b_dim1]; 1010 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1011 * b[l + (j + 3) * b_dim1]; 1012 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1013 * b[l + (j + 3) * b_dim1]; 1014 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1015 * b[l + j * b_dim1]; 1016 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1017 * b[l + j * b_dim1]; 1018 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1019 * b[l + (j + 1) * b_dim1]; 1020 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1021 * b[l + (j + 1) * b_dim1]; 1022 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1023 * b[l + (j + 2) * b_dim1]; 1024 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1025 * b[l + (j + 2) * b_dim1]; 1026 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1027 * b[l + (j + 3) * b_dim1]; 1028 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1029 * b[l + (j + 3) * b_dim1]; 1030 } 1031 c[i + j * c_dim1] = f11; 1032 c[i + 1 
+ j * c_dim1] = f21; 1033 c[i + (j + 1) * c_dim1] = f12; 1034 c[i + 1 + (j + 1) * c_dim1] = f22; 1035 c[i + (j + 2) * c_dim1] = f13; 1036 c[i + 1 + (j + 2) * c_dim1] = f23; 1037 c[i + (j + 3) * c_dim1] = f14; 1038 c[i + 1 + (j + 3) * c_dim1] = f24; 1039 c[i + 2 + j * c_dim1] = f31; 1040 c[i + 3 + j * c_dim1] = f41; 1041 c[i + 2 + (j + 1) * c_dim1] = f32; 1042 c[i + 3 + (j + 1) * c_dim1] = f42; 1043 c[i + 2 + (j + 2) * c_dim1] = f33; 1044 c[i + 3 + (j + 2) * c_dim1] = f43; 1045 c[i + 2 + (j + 3) * c_dim1] = f34; 1046 c[i + 3 + (j + 3) * c_dim1] = f44; 1047 } 1048 if (uisec < isec) 1049 { 1050 i5 = ii + isec - 1; 1051 for (i = ii + uisec; i <= i5; ++i) 1052 { 1053 f11 = c[i + j * c_dim1]; 1054 f12 = c[i + (j + 1) * c_dim1]; 1055 f13 = c[i + (j + 2) * c_dim1]; 1056 f14 = c[i + (j + 3) * c_dim1]; 1057 i6 = ll + lsec - 1; 1058 for (l = ll; l <= i6; ++l) 1059 { 1060 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1061 257] * b[l + j * b_dim1]; 1062 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1063 257] * b[l + (j + 1) * b_dim1]; 1064 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1065 257] * b[l + (j + 2) * b_dim1]; 1066 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1067 257] * b[l + (j + 3) * b_dim1]; 1068 } 1069 c[i + j * c_dim1] = f11; 1070 c[i + (j + 1) * c_dim1] = f12; 1071 c[i + (j + 2) * c_dim1] = f13; 1072 c[i + (j + 3) * c_dim1] = f14; 1073 } 1074 } 1075 } 1076 if (ujsec < jsec) 1077 { 1078 i4 = jj + jsec - 1; 1079 for (j = jj + ujsec; j <= i4; ++j) 1080 { 1081 i5 = ii + uisec - 1; 1082 for (i = ii; i <= i5; i += 4) 1083 { 1084 f11 = c[i + j * c_dim1]; 1085 f21 = c[i + 1 + j * c_dim1]; 1086 f31 = c[i + 2 + j * c_dim1]; 1087 f41 = c[i + 3 + j * c_dim1]; 1088 i6 = ll + lsec - 1; 1089 for (l = ll; l <= i6; ++l) 1090 { 1091 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1092 257] * b[l + j * b_dim1]; 1093 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1094 257] * b[l + j * b_dim1]; 1095 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1096 257] * b[l + j * b_dim1]; 1097 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1098 257] * b[l + j * b_dim1]; 1099 } 1100 c[i + j * c_dim1] = f11; 1101 c[i + 1 + j * c_dim1] = f21; 1102 c[i + 2 + j * c_dim1] = f31; 1103 c[i + 3 + j * c_dim1] = f41; 1104 } 1105 i5 = ii + isec - 1; 1106 for (i = ii + uisec; i <= i5; ++i) 1107 { 1108 f11 = c[i + j * c_dim1]; 1109 i6 = ll + lsec - 1; 1110 for (l = ll; l <= i6; ++l) 1111 { 1112 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1113 257] * b[l + j * b_dim1]; 1114 } 1115 c[i + j * c_dim1] = f11; 1116 } 1117 } 1118 } 1119 } 1120 } 1121 } 1122 free(t1); 1123 return; 1124 } 1125 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1126 { 1127 if (GFC_DESCRIPTOR_RANK (a) != 1) 1128 { 1129 const GFC_REAL_4 *restrict abase_x; 1130 const GFC_REAL_4 *restrict bbase_y; 1131 GFC_REAL_4 *restrict dest_y; 1132 GFC_REAL_4 s; 1133 1134 for (y = 0; y < ycount; y++) 1135 { 1136 bbase_y = &bbase[y*bystride]; 1137 dest_y = &dest[y*rystride]; 1138 for (x = 0; x < xcount; x++) 1139 { 1140 abase_x = &abase[x*axstride]; 1141 s = (GFC_REAL_4) 0; 1142 for (n = 0; n < count; n++) 1143 s += abase_x[n] * bbase_y[n]; 1144 dest_y[x] = s; 1145 } 1146 } 1147 } 1148 else 1149 { 1150 const GFC_REAL_4 *restrict bbase_y; 1151 GFC_REAL_4 s; 1152 1153 for (y = 0; y < ycount; y++) 1154 { 1155 bbase_y = &bbase[y*bystride]; 1156 s = (GFC_REAL_4) 0; 1157 for (n = 0; n < count; n++) 1158 s += abase[n*axstride] * bbase_y[n]; 1159 dest[y*rystride] = s; 1160 } 1161 } 1162 } 1163 else if (GFC_DESCRIPTOR_RANK (a) == 1) 1164 { 1165 const GFC_REAL_4 *restrict bbase_y; 1166 
GFC_REAL_4 s; 1167 1168 for (y = 0; y < ycount; y++) 1169 { 1170 bbase_y = &bbase[y*bystride]; 1171 s = (GFC_REAL_4) 0; 1172 for (n = 0; n < count; n++) 1173 s += abase[n*axstride] * bbase_y[n*bxstride]; 1174 dest[y*rxstride] = s; 1175 } 1176 } 1177 else if (axstride < aystride) 1178 { 1179 for (y = 0; y < ycount; y++) 1180 for (x = 0; x < xcount; x++) 1181 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0; 1182 1183 for (y = 0; y < ycount; y++) 1184 for (n = 0; n < count; n++) 1185 for (x = 0; x < xcount; x++) 1186 /* dest[x,y] += a[x,n] * b[n,y] */ 1187 dest[x*rxstride + y*rystride] += 1188 abase[x*axstride + n*aystride] * 1189 bbase[n*bxstride + y*bystride]; 1190 } 1191 else 1192 { 1193 const GFC_REAL_4 *restrict abase_x; 1194 const GFC_REAL_4 *restrict bbase_y; 1195 GFC_REAL_4 *restrict dest_y; 1196 GFC_REAL_4 s; 1197 1198 for (y = 0; y < ycount; y++) 1199 { 1200 bbase_y = &bbase[y*bystride]; 1201 dest_y = &dest[y*rystride]; 1202 for (x = 0; x < xcount; x++) 1203 { 1204 abase_x = &abase[x*axstride]; 1205 s = (GFC_REAL_4) 0; 1206 for (n = 0; n < count; n++) 1207 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 1208 dest_y[x*rxstride] = s; 1209 } 1210 } 1211 } 1212 } 1213 #undef POW3 1214 #undef min 1215 #undef max 1216 1217 #endif /* HAVE_AVX2 */ 1218 1219 #ifdef HAVE_AVX512F 1220 static void 1221 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray, 1222 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, 1223 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); 1224 static void 1225 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray, 1226 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, 1227 int blas_limit, blas_call gemm) 1228 { 1229 const GFC_REAL_4 * restrict abase; 1230 const GFC_REAL_4 * restrict bbase; 1231 GFC_REAL_4 * restrict dest; 1232 1233 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 1234 index_type x, y, n, count, xcount, ycount; 1235 1236 assert (GFC_DESCRIPTOR_RANK (a) == 2 1237 || GFC_DESCRIPTOR_RANK (b) == 2); 1238 1239 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 1240 1241 Either A or B (but not both) can be rank 1: 1242 1243 o One-dimensional argument A is implicitly treated as a row matrix 1244 dimensioned [1,count], so xcount=1. 1245 1246 o One-dimensional argument B is implicitly treated as a column matrix 1247 dimensioned [count, 1], so ycount=1. 
1248 */ 1249 1250 if (retarray->base_addr == NULL) 1251 { 1252 if (GFC_DESCRIPTOR_RANK (a) == 1) 1253 { 1254 GFC_DIMENSION_SET(retarray->dim[0], 0, 1255 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1256 } 1257 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1258 { 1259 GFC_DIMENSION_SET(retarray->dim[0], 0, 1260 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1261 } 1262 else 1263 { 1264 GFC_DIMENSION_SET(retarray->dim[0], 0, 1265 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1266 1267 GFC_DIMENSION_SET(retarray->dim[1], 0, 1268 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1269 GFC_DESCRIPTOR_EXTENT(retarray,0)); 1270 } 1271 1272 retarray->base_addr 1273 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4)); 1274 retarray->offset = 0; 1275 } 1276 else if (unlikely (compile_options.bounds_check)) 1277 { 1278 index_type ret_extent, arg_extent; 1279 1280 if (GFC_DESCRIPTOR_RANK (a) == 1) 1281 { 1282 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1283 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1284 if (arg_extent != ret_extent) 1285 runtime_error ("Array bound mismatch for dimension 1 of " 1286 "array (%ld/%ld) ", 1287 (long int) ret_extent, (long int) arg_extent); 1288 } 1289 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1290 { 1291 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1292 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1293 if (arg_extent != ret_extent) 1294 runtime_error ("Array bound mismatch for dimension 1 of " 1295 "array (%ld/%ld) ", 1296 (long int) ret_extent, (long int) arg_extent); 1297 } 1298 else 1299 { 1300 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1301 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1302 if (arg_extent != ret_extent) 1303 runtime_error ("Array bound mismatch for dimension 1 of " 1304 "array (%ld/%ld) ", 1305 (long int) ret_extent, (long int) arg_extent); 1306 1307 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1308 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1309 if (arg_extent != ret_extent) 1310 runtime_error ("Array bound mismatch for dimension 2 of " 1311 "array (%ld/%ld) ", 1312 (long int) ret_extent, (long int) arg_extent); 1313 } 1314 } 1315 1316 1317 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1318 { 1319 /* One-dimensional result may be addressed in the code below 1320 either as a row or a column matrix. We want both cases to 1321 work. */ 1322 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1323 } 1324 else 1325 { 1326 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1327 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1328 } 1329 1330 1331 if (GFC_DESCRIPTOR_RANK (a) == 1) 1332 { 1333 /* Treat it as a a row matrix A[1,count]. */ 1334 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1335 aystride = 1; 1336 1337 xcount = 1; 1338 count = GFC_DESCRIPTOR_EXTENT(a,0); 1339 } 1340 else 1341 { 1342 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1343 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1344 1345 count = GFC_DESCRIPTOR_EXTENT(a,1); 1346 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1347 } 1348 1349 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1350 { 1351 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1352 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1353 "in dimension 1: is %ld, should be %ld", 1354 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1355 } 1356 1357 if (GFC_DESCRIPTOR_RANK (b) == 1) 1358 { 1359 /* Treat it as a column matrix B[count,1] */ 1360 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1361 1362 /* bystride should never be used for 1-dimensional b. 1363 The value is only used for calculation of the 1364 memory by the buffer. 
*/ 1365 bystride = 256; 1366 ycount = 1; 1367 } 1368 else 1369 { 1370 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1371 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1372 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1373 } 1374 1375 abase = a->base_addr; 1376 bbase = b->base_addr; 1377 dest = retarray->base_addr; 1378 1379 /* Now that everything is set up, we perform the multiplication 1380 itself. */ 1381 1382 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1383 #define min(a,b) ((a) <= (b) ? (a) : (b)) 1384 #define max(a,b) ((a) >= (b) ? (a) : (b)) 1385 1386 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1387 && (bxstride == 1 || bystride == 1) 1388 && (((float) xcount) * ((float) ycount) * ((float) count) 1389 > POW3(blas_limit))) 1390 { 1391 const int m = xcount, n = ycount, k = count, ldc = rystride; 1392 const GFC_REAL_4 one = 1, zero = 0; 1393 const int lda = (axstride == 1) ? aystride : axstride, 1394 ldb = (bxstride == 1) ? bystride : bxstride; 1395 1396 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1397 { 1398 assert (gemm != NULL); 1399 const char *transa, *transb; 1400 if (try_blas & 2) 1401 transa = "C"; 1402 else 1403 transa = axstride == 1 ? "N" : "T"; 1404 1405 if (try_blas & 4) 1406 transb = "C"; 1407 else 1408 transb = bxstride == 1 ? "N" : "T"; 1409 1410 gemm (transa, transb , &m, 1411 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1412 &ldc, 1, 1); 1413 return; 1414 } 1415 } 1416 1417 if (rxstride == 1 && axstride == 1 && bxstride == 1 1418 && GFC_DESCRIPTOR_RANK (b) != 1) 1419 { 1420 /* This block of code implements a tuned matmul, derived from 1421 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 1422 1423 Bo Kagstrom and Per Ling 1424 Department of Computing Science 1425 Umea University 1426 S-901 87 Umea, Sweden 1427 1428 from netlib.org, translated to C, and modified for matmul.m4. */ 1429 1430 const GFC_REAL_4 *a, *b; 1431 GFC_REAL_4 *c; 1432 const index_type m = xcount, n = ycount, k = count; 1433 1434 /* System generated locals */ 1435 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 1436 i1, i2, i3, i4, i5, i6; 1437 1438 /* Local variables */ 1439 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42, 1440 f13, f14, f23, f24, f33, f34, f43, f44; 1441 index_type i, j, l, ii, jj, ll; 1442 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 1443 GFC_REAL_4 *t1; 1444 1445 a = abase; 1446 b = bbase; 1447 c = retarray->base_addr; 1448 1449 /* Parameter adjustments */ 1450 c_dim1 = rystride; 1451 c_offset = 1 + c_dim1; 1452 c -= c_offset; 1453 a_dim1 = aystride; 1454 a_offset = 1 + a_dim1; 1455 a -= a_offset; 1456 b_dim1 = bystride; 1457 b_offset = 1 + b_dim1; 1458 b -= b_offset; 1459 1460 /* Empty c first. */ 1461 for (j=1; j<=n; j++) 1462 for (i=1; i<=m; i++) 1463 c[i + j * c_dim1] = (GFC_REAL_4)0; 1464 1465 /* Early exit if possible */ 1466 if (m == 0 || n == 0 || k == 0) 1467 return; 1468 1469 /* Adjust size of t1 to what is needed. */ 1470 index_type t1_dim, a_sz; 1471 if (aystride == 1) 1472 a_sz = rystride; 1473 else 1474 a_sz = a_dim1; 1475 1476 t1_dim = a_sz * 256 + b_dim1; 1477 if (t1_dim > 65536) 1478 t1_dim = 65536; 1479 1480 t1 = malloc (t1_dim * sizeof(GFC_REAL_4)); 1481 1482 /* Start turning the crank. 
*/ 1483 i1 = n; 1484 for (jj = 1; jj <= i1; jj += 512) 1485 { 1486 /* Computing MIN */ 1487 i2 = 512; 1488 i3 = n - jj + 1; 1489 jsec = min(i2,i3); 1490 ujsec = jsec - jsec % 4; 1491 i2 = k; 1492 for (ll = 1; ll <= i2; ll += 256) 1493 { 1494 /* Computing MIN */ 1495 i3 = 256; 1496 i4 = k - ll + 1; 1497 lsec = min(i3,i4); 1498 ulsec = lsec - lsec % 2; 1499 1500 i3 = m; 1501 for (ii = 1; ii <= i3; ii += 256) 1502 { 1503 /* Computing MIN */ 1504 i4 = 256; 1505 i5 = m - ii + 1; 1506 isec = min(i4,i5); 1507 uisec = isec - isec % 2; 1508 i4 = ll + ulsec - 1; 1509 for (l = ll; l <= i4; l += 2) 1510 { 1511 i5 = ii + uisec - 1; 1512 for (i = ii; i <= i5; i += 2) 1513 { 1514 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 1515 a[i + l * a_dim1]; 1516 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 1517 a[i + (l + 1) * a_dim1]; 1518 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 1519 a[i + 1 + l * a_dim1]; 1520 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 1521 a[i + 1 + (l + 1) * a_dim1]; 1522 } 1523 if (uisec < isec) 1524 { 1525 t1[l - ll + 1 + (isec << 8) - 257] = 1526 a[ii + isec - 1 + l * a_dim1]; 1527 t1[l - ll + 2 + (isec << 8) - 257] = 1528 a[ii + isec - 1 + (l + 1) * a_dim1]; 1529 } 1530 } 1531 if (ulsec < lsec) 1532 { 1533 i4 = ii + isec - 1; 1534 for (i = ii; i<= i4; ++i) 1535 { 1536 t1[lsec + ((i - ii + 1) << 8) - 257] = 1537 a[i + (ll + lsec - 1) * a_dim1]; 1538 } 1539 } 1540 1541 uisec = isec - isec % 4; 1542 i4 = jj + ujsec - 1; 1543 for (j = jj; j <= i4; j += 4) 1544 { 1545 i5 = ii + uisec - 1; 1546 for (i = ii; i <= i5; i += 4) 1547 { 1548 f11 = c[i + j * c_dim1]; 1549 f21 = c[i + 1 + j * c_dim1]; 1550 f12 = c[i + (j + 1) * c_dim1]; 1551 f22 = c[i + 1 + (j + 1) * c_dim1]; 1552 f13 = c[i + (j + 2) * c_dim1]; 1553 f23 = c[i + 1 + (j + 2) * c_dim1]; 1554 f14 = c[i + (j + 3) * c_dim1]; 1555 f24 = c[i + 1 + (j + 3) * c_dim1]; 1556 f31 = c[i + 2 + j * c_dim1]; 1557 f41 = c[i + 3 + j * c_dim1]; 1558 f32 = c[i + 2 + (j + 1) * c_dim1]; 1559 f42 = c[i + 3 + (j + 1) * c_dim1]; 1560 f33 = c[i + 2 + (j + 2) * c_dim1]; 1561 f43 = c[i + 3 + (j + 2) * c_dim1]; 1562 f34 = c[i + 2 + (j + 3) * c_dim1]; 1563 f44 = c[i + 3 + (j + 3) * c_dim1]; 1564 i6 = ll + lsec - 1; 1565 for (l = ll; l <= i6; ++l) 1566 { 1567 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1568 * b[l + j * b_dim1]; 1569 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1570 * b[l + j * b_dim1]; 1571 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1572 * b[l + (j + 1) * b_dim1]; 1573 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1574 * b[l + (j + 1) * b_dim1]; 1575 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1576 * b[l + (j + 2) * b_dim1]; 1577 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1578 * b[l + (j + 2) * b_dim1]; 1579 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1580 * b[l + (j + 3) * b_dim1]; 1581 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1582 * b[l + (j + 3) * b_dim1]; 1583 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1584 * b[l + j * b_dim1]; 1585 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1586 * b[l + j * b_dim1]; 1587 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1588 * b[l + (j + 1) * b_dim1]; 1589 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1590 * b[l + (j + 1) * b_dim1]; 1591 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1592 * b[l + (j + 2) * b_dim1]; 1593 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1594 * b[l + (j + 2) * b_dim1]; 1595 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1596 * b[l + (j + 3) * b_dim1]; 1597 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 
257] 1598 * b[l + (j + 3) * b_dim1]; 1599 } 1600 c[i + j * c_dim1] = f11; 1601 c[i + 1 + j * c_dim1] = f21; 1602 c[i + (j + 1) * c_dim1] = f12; 1603 c[i + 1 + (j + 1) * c_dim1] = f22; 1604 c[i + (j + 2) * c_dim1] = f13; 1605 c[i + 1 + (j + 2) * c_dim1] = f23; 1606 c[i + (j + 3) * c_dim1] = f14; 1607 c[i + 1 + (j + 3) * c_dim1] = f24; 1608 c[i + 2 + j * c_dim1] = f31; 1609 c[i + 3 + j * c_dim1] = f41; 1610 c[i + 2 + (j + 1) * c_dim1] = f32; 1611 c[i + 3 + (j + 1) * c_dim1] = f42; 1612 c[i + 2 + (j + 2) * c_dim1] = f33; 1613 c[i + 3 + (j + 2) * c_dim1] = f43; 1614 c[i + 2 + (j + 3) * c_dim1] = f34; 1615 c[i + 3 + (j + 3) * c_dim1] = f44; 1616 } 1617 if (uisec < isec) 1618 { 1619 i5 = ii + isec - 1; 1620 for (i = ii + uisec; i <= i5; ++i) 1621 { 1622 f11 = c[i + j * c_dim1]; 1623 f12 = c[i + (j + 1) * c_dim1]; 1624 f13 = c[i + (j + 2) * c_dim1]; 1625 f14 = c[i + (j + 3) * c_dim1]; 1626 i6 = ll + lsec - 1; 1627 for (l = ll; l <= i6; ++l) 1628 { 1629 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1630 257] * b[l + j * b_dim1]; 1631 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1632 257] * b[l + (j + 1) * b_dim1]; 1633 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1634 257] * b[l + (j + 2) * b_dim1]; 1635 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1636 257] * b[l + (j + 3) * b_dim1]; 1637 } 1638 c[i + j * c_dim1] = f11; 1639 c[i + (j + 1) * c_dim1] = f12; 1640 c[i + (j + 2) * c_dim1] = f13; 1641 c[i + (j + 3) * c_dim1] = f14; 1642 } 1643 } 1644 } 1645 if (ujsec < jsec) 1646 { 1647 i4 = jj + jsec - 1; 1648 for (j = jj + ujsec; j <= i4; ++j) 1649 { 1650 i5 = ii + uisec - 1; 1651 for (i = ii; i <= i5; i += 4) 1652 { 1653 f11 = c[i + j * c_dim1]; 1654 f21 = c[i + 1 + j * c_dim1]; 1655 f31 = c[i + 2 + j * c_dim1]; 1656 f41 = c[i + 3 + j * c_dim1]; 1657 i6 = ll + lsec - 1; 1658 for (l = ll; l <= i6; ++l) 1659 { 1660 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1661 257] * b[l + j * b_dim1]; 1662 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1663 257] * b[l + j * b_dim1]; 1664 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1665 257] * b[l + j * b_dim1]; 1666 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1667 257] * b[l + j * b_dim1]; 1668 } 1669 c[i + j * c_dim1] = f11; 1670 c[i + 1 + j * c_dim1] = f21; 1671 c[i + 2 + j * c_dim1] = f31; 1672 c[i + 3 + j * c_dim1] = f41; 1673 } 1674 i5 = ii + isec - 1; 1675 for (i = ii + uisec; i <= i5; ++i) 1676 { 1677 f11 = c[i + j * c_dim1]; 1678 i6 = ll + lsec - 1; 1679 for (l = ll; l <= i6; ++l) 1680 { 1681 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1682 257] * b[l + j * b_dim1]; 1683 } 1684 c[i + j * c_dim1] = f11; 1685 } 1686 } 1687 } 1688 } 1689 } 1690 } 1691 free(t1); 1692 return; 1693 } 1694 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1695 { 1696 if (GFC_DESCRIPTOR_RANK (a) != 1) 1697 { 1698 const GFC_REAL_4 *restrict abase_x; 1699 const GFC_REAL_4 *restrict bbase_y; 1700 GFC_REAL_4 *restrict dest_y; 1701 GFC_REAL_4 s; 1702 1703 for (y = 0; y < ycount; y++) 1704 { 1705 bbase_y = &bbase[y*bystride]; 1706 dest_y = &dest[y*rystride]; 1707 for (x = 0; x < xcount; x++) 1708 { 1709 abase_x = &abase[x*axstride]; 1710 s = (GFC_REAL_4) 0; 1711 for (n = 0; n < count; n++) 1712 s += abase_x[n] * bbase_y[n]; 1713 dest_y[x] = s; 1714 } 1715 } 1716 } 1717 else 1718 { 1719 const GFC_REAL_4 *restrict bbase_y; 1720 GFC_REAL_4 s; 1721 1722 for (y = 0; y < ycount; y++) 1723 { 1724 bbase_y = &bbase[y*bystride]; 1725 s = (GFC_REAL_4) 0; 1726 for (n = 0; n < count; n++) 1727 s += abase[n*axstride] * bbase_y[n]; 1728 dest[y*rystride] = s; 1729 } 1730 } 1731 } 1732 else if 
 (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  s = (GFC_REAL_4) 0;
	  for (n = 0; n < count; n++)
	    s += abase[n*axstride] * bbase_y[n*bxstride];
	  dest[y*rxstride] = s;
	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
	for (x = 0; x < xcount; x++)
	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;

      for (y = 0; y < ycount; y++)
	for (n = 0; n < count; n++)
	  for (x = 0; x < xcount; x++)
	    /* dest[x,y] += a[x,n] * b[n,y] */
	    dest[x*rxstride + y*rystride] +=
		abase[x*axstride + n*aystride] *
		bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_REAL_4 *restrict abase_x;
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 *restrict dest_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  dest_y = &dest[y*rystride];
	  for (x = 0; x < xcount; x++)
	    {
	      abase_x = &abase[x*axstride];
	      s = (GFC_REAL_4) 0;
	      for (n = 0; n < count; n++)
		s += abase_x[n*aystride] * bbase_y[n*bxstride];
	      dest_y[x*rxstride] = s;
	    }
	}
    }
}
#undef POW3
#undef min
#undef max

#endif /* HAVE_AVX512F */

/* AMD-specific functions with AVX128 and FMA3/FMA4.  */

#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r4_avx128_fma3);
#endif

#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r4_avx128_fma4);
#endif

/* Function to fall back to if there is no special processor-specific version.  */
static void
matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  const GFC_REAL_4 * restrict abase;
  const GFC_REAL_4 * restrict bbase;
  GFC_REAL_4 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
	  || GFC_DESCRIPTOR_RANK (b) == 2);

  /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

     Either A or B (but not both) can be rank 1:

     o One-dimensional argument A is implicitly treated as a row matrix
       dimensioned [1,count], so xcount=1.

     o One-dimensional argument B is implicitly treated as a column matrix
       dimensioned [count, 1], so ycount=1.
1831 */ 1832 1833 if (retarray->base_addr == NULL) 1834 { 1835 if (GFC_DESCRIPTOR_RANK (a) == 1) 1836 { 1837 GFC_DIMENSION_SET(retarray->dim[0], 0, 1838 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1839 } 1840 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1841 { 1842 GFC_DIMENSION_SET(retarray->dim[0], 0, 1843 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1844 } 1845 else 1846 { 1847 GFC_DIMENSION_SET(retarray->dim[0], 0, 1848 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1849 1850 GFC_DIMENSION_SET(retarray->dim[1], 0, 1851 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1852 GFC_DESCRIPTOR_EXTENT(retarray,0)); 1853 } 1854 1855 retarray->base_addr 1856 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4)); 1857 retarray->offset = 0; 1858 } 1859 else if (unlikely (compile_options.bounds_check)) 1860 { 1861 index_type ret_extent, arg_extent; 1862 1863 if (GFC_DESCRIPTOR_RANK (a) == 1) 1864 { 1865 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1866 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1867 if (arg_extent != ret_extent) 1868 runtime_error ("Array bound mismatch for dimension 1 of " 1869 "array (%ld/%ld) ", 1870 (long int) ret_extent, (long int) arg_extent); 1871 } 1872 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1873 { 1874 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1875 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1876 if (arg_extent != ret_extent) 1877 runtime_error ("Array bound mismatch for dimension 1 of " 1878 "array (%ld/%ld) ", 1879 (long int) ret_extent, (long int) arg_extent); 1880 } 1881 else 1882 { 1883 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1884 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1885 if (arg_extent != ret_extent) 1886 runtime_error ("Array bound mismatch for dimension 1 of " 1887 "array (%ld/%ld) ", 1888 (long int) ret_extent, (long int) arg_extent); 1889 1890 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1891 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1892 if (arg_extent != ret_extent) 1893 runtime_error ("Array bound mismatch for dimension 2 of " 1894 "array (%ld/%ld) ", 1895 (long int) ret_extent, (long int) arg_extent); 1896 } 1897 } 1898 1899 1900 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1901 { 1902 /* One-dimensional result may be addressed in the code below 1903 either as a row or a column matrix. We want both cases to 1904 work. */ 1905 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1906 } 1907 else 1908 { 1909 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1910 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1911 } 1912 1913 1914 if (GFC_DESCRIPTOR_RANK (a) == 1) 1915 { 1916 /* Treat it as a a row matrix A[1,count]. */ 1917 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1918 aystride = 1; 1919 1920 xcount = 1; 1921 count = GFC_DESCRIPTOR_EXTENT(a,0); 1922 } 1923 else 1924 { 1925 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1926 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1927 1928 count = GFC_DESCRIPTOR_EXTENT(a,1); 1929 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1930 } 1931 1932 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1933 { 1934 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1935 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1936 "in dimension 1: is %ld, should be %ld", 1937 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1938 } 1939 1940 if (GFC_DESCRIPTOR_RANK (b) == 1) 1941 { 1942 /* Treat it as a column matrix B[count,1] */ 1943 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1944 1945 /* bystride should never be used for 1-dimensional b. 1946 The value is only used for calculation of the 1947 memory by the buffer. 
*/ 1948 bystride = 256; 1949 ycount = 1; 1950 } 1951 else 1952 { 1953 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1954 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1955 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1956 } 1957 1958 abase = a->base_addr; 1959 bbase = b->base_addr; 1960 dest = retarray->base_addr; 1961 1962 /* Now that everything is set up, we perform the multiplication 1963 itself. */ 1964 1965 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1966 #define min(a,b) ((a) <= (b) ? (a) : (b)) 1967 #define max(a,b) ((a) >= (b) ? (a) : (b)) 1968 1969 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1970 && (bxstride == 1 || bystride == 1) 1971 && (((float) xcount) * ((float) ycount) * ((float) count) 1972 > POW3(blas_limit))) 1973 { 1974 const int m = xcount, n = ycount, k = count, ldc = rystride; 1975 const GFC_REAL_4 one = 1, zero = 0; 1976 const int lda = (axstride == 1) ? aystride : axstride, 1977 ldb = (bxstride == 1) ? bystride : bxstride; 1978 1979 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1980 { 1981 assert (gemm != NULL); 1982 const char *transa, *transb; 1983 if (try_blas & 2) 1984 transa = "C"; 1985 else 1986 transa = axstride == 1 ? "N" : "T"; 1987 1988 if (try_blas & 4) 1989 transb = "C"; 1990 else 1991 transb = bxstride == 1 ? "N" : "T"; 1992 1993 gemm (transa, transb , &m, 1994 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1995 &ldc, 1, 1); 1996 return; 1997 } 1998 } 1999 2000 if (rxstride == 1 && axstride == 1 && bxstride == 1 2001 && GFC_DESCRIPTOR_RANK (b) != 1) 2002 { 2003 /* This block of code implements a tuned matmul, derived from 2004 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2005 2006 Bo Kagstrom and Per Ling 2007 Department of Computing Science 2008 Umea University 2009 S-901 87 Umea, Sweden 2010 2011 from netlib.org, translated to C, and modified for matmul.m4. */ 2012 2013 const GFC_REAL_4 *a, *b; 2014 GFC_REAL_4 *c; 2015 const index_type m = xcount, n = ycount, k = count; 2016 2017 /* System generated locals */ 2018 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2019 i1, i2, i3, i4, i5, i6; 2020 2021 /* Local variables */ 2022 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42, 2023 f13, f14, f23, f24, f33, f34, f43, f44; 2024 index_type i, j, l, ii, jj, ll; 2025 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2026 GFC_REAL_4 *t1; 2027 2028 a = abase; 2029 b = bbase; 2030 c = retarray->base_addr; 2031 2032 /* Parameter adjustments */ 2033 c_dim1 = rystride; 2034 c_offset = 1 + c_dim1; 2035 c -= c_offset; 2036 a_dim1 = aystride; 2037 a_offset = 1 + a_dim1; 2038 a -= a_offset; 2039 b_dim1 = bystride; 2040 b_offset = 1 + b_dim1; 2041 b -= b_offset; 2042 2043 /* Empty c first. */ 2044 for (j=1; j<=n; j++) 2045 for (i=1; i<=m; i++) 2046 c[i + j * c_dim1] = (GFC_REAL_4)0; 2047 2048 /* Early exit if possible */ 2049 if (m == 0 || n == 0 || k == 0) 2050 return; 2051 2052 /* Adjust size of t1 to what is needed. */ 2053 index_type t1_dim, a_sz; 2054 if (aystride == 1) 2055 a_sz = rystride; 2056 else 2057 a_sz = a_dim1; 2058 2059 t1_dim = a_sz * 256 + b_dim1; 2060 if (t1_dim > 65536) 2061 t1_dim = 65536; 2062 2063 t1 = malloc (t1_dim * sizeof(GFC_REAL_4)); 2064 2065 /* Start turning the crank. 
*/ 2066 i1 = n; 2067 for (jj = 1; jj <= i1; jj += 512) 2068 { 2069 /* Computing MIN */ 2070 i2 = 512; 2071 i3 = n - jj + 1; 2072 jsec = min(i2,i3); 2073 ujsec = jsec - jsec % 4; 2074 i2 = k; 2075 for (ll = 1; ll <= i2; ll += 256) 2076 { 2077 /* Computing MIN */ 2078 i3 = 256; 2079 i4 = k - ll + 1; 2080 lsec = min(i3,i4); 2081 ulsec = lsec - lsec % 2; 2082 2083 i3 = m; 2084 for (ii = 1; ii <= i3; ii += 256) 2085 { 2086 /* Computing MIN */ 2087 i4 = 256; 2088 i5 = m - ii + 1; 2089 isec = min(i4,i5); 2090 uisec = isec - isec % 2; 2091 i4 = ll + ulsec - 1; 2092 for (l = ll; l <= i4; l += 2) 2093 { 2094 i5 = ii + uisec - 1; 2095 for (i = ii; i <= i5; i += 2) 2096 { 2097 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2098 a[i + l * a_dim1]; 2099 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2100 a[i + (l + 1) * a_dim1]; 2101 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2102 a[i + 1 + l * a_dim1]; 2103 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2104 a[i + 1 + (l + 1) * a_dim1]; 2105 } 2106 if (uisec < isec) 2107 { 2108 t1[l - ll + 1 + (isec << 8) - 257] = 2109 a[ii + isec - 1 + l * a_dim1]; 2110 t1[l - ll + 2 + (isec << 8) - 257] = 2111 a[ii + isec - 1 + (l + 1) * a_dim1]; 2112 } 2113 } 2114 if (ulsec < lsec) 2115 { 2116 i4 = ii + isec - 1; 2117 for (i = ii; i<= i4; ++i) 2118 { 2119 t1[lsec + ((i - ii + 1) << 8) - 257] = 2120 a[i + (ll + lsec - 1) * a_dim1]; 2121 } 2122 } 2123 2124 uisec = isec - isec % 4; 2125 i4 = jj + ujsec - 1; 2126 for (j = jj; j <= i4; j += 4) 2127 { 2128 i5 = ii + uisec - 1; 2129 for (i = ii; i <= i5; i += 4) 2130 { 2131 f11 = c[i + j * c_dim1]; 2132 f21 = c[i + 1 + j * c_dim1]; 2133 f12 = c[i + (j + 1) * c_dim1]; 2134 f22 = c[i + 1 + (j + 1) * c_dim1]; 2135 f13 = c[i + (j + 2) * c_dim1]; 2136 f23 = c[i + 1 + (j + 2) * c_dim1]; 2137 f14 = c[i + (j + 3) * c_dim1]; 2138 f24 = c[i + 1 + (j + 3) * c_dim1]; 2139 f31 = c[i + 2 + j * c_dim1]; 2140 f41 = c[i + 3 + j * c_dim1]; 2141 f32 = c[i + 2 + (j + 1) * c_dim1]; 2142 f42 = c[i + 3 + (j + 1) * c_dim1]; 2143 f33 = c[i + 2 + (j + 2) * c_dim1]; 2144 f43 = c[i + 3 + (j + 2) * c_dim1]; 2145 f34 = c[i + 2 + (j + 3) * c_dim1]; 2146 f44 = c[i + 3 + (j + 3) * c_dim1]; 2147 i6 = ll + lsec - 1; 2148 for (l = ll; l <= i6; ++l) 2149 { 2150 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2151 * b[l + j * b_dim1]; 2152 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2153 * b[l + j * b_dim1]; 2154 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2155 * b[l + (j + 1) * b_dim1]; 2156 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2157 * b[l + (j + 1) * b_dim1]; 2158 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2159 * b[l + (j + 2) * b_dim1]; 2160 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2161 * b[l + (j + 2) * b_dim1]; 2162 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2163 * b[l + (j + 3) * b_dim1]; 2164 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2165 * b[l + (j + 3) * b_dim1]; 2166 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2167 * b[l + j * b_dim1]; 2168 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2169 * b[l + j * b_dim1]; 2170 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2171 * b[l + (j + 1) * b_dim1]; 2172 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2173 * b[l + (j + 1) * b_dim1]; 2174 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2175 * b[l + (j + 2) * b_dim1]; 2176 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2177 * b[l + (j + 2) * b_dim1]; 2178 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2179 * b[l + (j + 3) * b_dim1]; 2180 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 
257] 2181 * b[l + (j + 3) * b_dim1]; 2182 } 2183 c[i + j * c_dim1] = f11; 2184 c[i + 1 + j * c_dim1] = f21; 2185 c[i + (j + 1) * c_dim1] = f12; 2186 c[i + 1 + (j + 1) * c_dim1] = f22; 2187 c[i + (j + 2) * c_dim1] = f13; 2188 c[i + 1 + (j + 2) * c_dim1] = f23; 2189 c[i + (j + 3) * c_dim1] = f14; 2190 c[i + 1 + (j + 3) * c_dim1] = f24; 2191 c[i + 2 + j * c_dim1] = f31; 2192 c[i + 3 + j * c_dim1] = f41; 2193 c[i + 2 + (j + 1) * c_dim1] = f32; 2194 c[i + 3 + (j + 1) * c_dim1] = f42; 2195 c[i + 2 + (j + 2) * c_dim1] = f33; 2196 c[i + 3 + (j + 2) * c_dim1] = f43; 2197 c[i + 2 + (j + 3) * c_dim1] = f34; 2198 c[i + 3 + (j + 3) * c_dim1] = f44; 2199 } 2200 if (uisec < isec) 2201 { 2202 i5 = ii + isec - 1; 2203 for (i = ii + uisec; i <= i5; ++i) 2204 { 2205 f11 = c[i + j * c_dim1]; 2206 f12 = c[i + (j + 1) * c_dim1]; 2207 f13 = c[i + (j + 2) * c_dim1]; 2208 f14 = c[i + (j + 3) * c_dim1]; 2209 i6 = ll + lsec - 1; 2210 for (l = ll; l <= i6; ++l) 2211 { 2212 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2213 257] * b[l + j * b_dim1]; 2214 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2215 257] * b[l + (j + 1) * b_dim1]; 2216 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2217 257] * b[l + (j + 2) * b_dim1]; 2218 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2219 257] * b[l + (j + 3) * b_dim1]; 2220 } 2221 c[i + j * c_dim1] = f11; 2222 c[i + (j + 1) * c_dim1] = f12; 2223 c[i + (j + 2) * c_dim1] = f13; 2224 c[i + (j + 3) * c_dim1] = f14; 2225 } 2226 } 2227 } 2228 if (ujsec < jsec) 2229 { 2230 i4 = jj + jsec - 1; 2231 for (j = jj + ujsec; j <= i4; ++j) 2232 { 2233 i5 = ii + uisec - 1; 2234 for (i = ii; i <= i5; i += 4) 2235 { 2236 f11 = c[i + j * c_dim1]; 2237 f21 = c[i + 1 + j * c_dim1]; 2238 f31 = c[i + 2 + j * c_dim1]; 2239 f41 = c[i + 3 + j * c_dim1]; 2240 i6 = ll + lsec - 1; 2241 for (l = ll; l <= i6; ++l) 2242 { 2243 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2244 257] * b[l + j * b_dim1]; 2245 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2246 257] * b[l + j * b_dim1]; 2247 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2248 257] * b[l + j * b_dim1]; 2249 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2250 257] * b[l + j * b_dim1]; 2251 } 2252 c[i + j * c_dim1] = f11; 2253 c[i + 1 + j * c_dim1] = f21; 2254 c[i + 2 + j * c_dim1] = f31; 2255 c[i + 3 + j * c_dim1] = f41; 2256 } 2257 i5 = ii + isec - 1; 2258 for (i = ii + uisec; i <= i5; ++i) 2259 { 2260 f11 = c[i + j * c_dim1]; 2261 i6 = ll + lsec - 1; 2262 for (l = ll; l <= i6; ++l) 2263 { 2264 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2265 257] * b[l + j * b_dim1]; 2266 } 2267 c[i + j * c_dim1] = f11; 2268 } 2269 } 2270 } 2271 } 2272 } 2273 } 2274 free(t1); 2275 return; 2276 } 2277 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2278 { 2279 if (GFC_DESCRIPTOR_RANK (a) != 1) 2280 { 2281 const GFC_REAL_4 *restrict abase_x; 2282 const GFC_REAL_4 *restrict bbase_y; 2283 GFC_REAL_4 *restrict dest_y; 2284 GFC_REAL_4 s; 2285 2286 for (y = 0; y < ycount; y++) 2287 { 2288 bbase_y = &bbase[y*bystride]; 2289 dest_y = &dest[y*rystride]; 2290 for (x = 0; x < xcount; x++) 2291 { 2292 abase_x = &abase[x*axstride]; 2293 s = (GFC_REAL_4) 0; 2294 for (n = 0; n < count; n++) 2295 s += abase_x[n] * bbase_y[n]; 2296 dest_y[x] = s; 2297 } 2298 } 2299 } 2300 else 2301 { 2302 const GFC_REAL_4 *restrict bbase_y; 2303 GFC_REAL_4 s; 2304 2305 for (y = 0; y < ycount; y++) 2306 { 2307 bbase_y = &bbase[y*bystride]; 2308 s = (GFC_REAL_4) 0; 2309 for (n = 0; n < count; n++) 2310 s += abase[n*axstride] * bbase_y[n]; 2311 dest[y*rystride] = s; 2312 } 2313 } 2314 } 2315 else if 
    (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          s = (GFC_REAL_4) 0;
          for (n = 0; n < count; n++)
            s += abase[n*axstride] * bbase_y[n*bxstride];
          dest[y*rxstride] = s;
        }
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
        for (x = 0; x < xcount; x++)
          dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;

      for (y = 0; y < ycount; y++)
        for (n = 0; n < count; n++)
          for (x = 0; x < xcount; x++)
            /* dest[x,y] += a[x,n] * b[n,y] */
            dest[x*rxstride + y*rystride] +=
              abase[x*axstride + n*aystride] *
              bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_REAL_4 *restrict abase_x;
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 *restrict dest_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          dest_y = &dest[y*rystride];
          for (x = 0; x < xcount; x++)
            {
              abase_x = &abase[x*axstride];
              s = (GFC_REAL_4) 0;
              for (n = 0; n < count; n++)
                s += abase_x[n*aystride] * bbase_y[n*bxstride];
              dest_y[x*rxstride] = s;
            }
        }
    }
}
#undef POW3
#undef min
#undef max


/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.  */

#include <config/i386/cpuinfo.h>
void matmul_r4 (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm)
{
  static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm);

  void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm);

  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      matmul_fn = matmul_r4_vanilla;
      if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
        {
          /* Run down the available processors in order of preference.  */
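          /* The first call reaches this point with matmul_p still NULL:
             matmul_fn starts out as the plain matmul_r4_vanilla kernel
             and is upgraded to the best variant the CPU reports support
             for (AVX-512F, then AVX2+FMA, then AVX).  The choice is
             cached in matmul_p below, so later calls take the fast path
             at the top of the function.  Relaxed ordering appears
             sufficient here, since a thread racing on the first call
             merely repeats the same selection.  */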
#ifdef HAVE_AVX512F
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
            {
              matmul_fn = matmul_r4_avx512f;
              goto store;
            }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
          if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
            {
              matmul_fn = matmul_r4_avx2;
              goto store;
            }

#endif

#ifdef HAVE_AVX
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
            {
              matmul_fn = matmul_r4_avx;
              goto store;
            }
#endif  /* HAVE_AVX */
        }
      else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
        {
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
          if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
            {
              matmul_fn = matmul_r4_avx128_fma3;
              goto store;
            }
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
          if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
            {
              matmul_fn = matmul_r4_avx128_fma4;
              goto store;
            }
#endif
        }
    store:
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
    }

  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
}

#else  /* Just the vanilla function.  */

void
matmul_r4 (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm)
{
  const GFC_REAL_4 * restrict abase;
  const GFC_REAL_4 * restrict bbase;
  GFC_REAL_4 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);

  /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

     Either A or B (but not both) can be rank 1:

     o One-dimensional argument A is implicitly treated as a row matrix
       dimensioned [1,count], so xcount=1.

     o One-dimensional argument B is implicitly treated as a column matrix
       dimensioned [count, 1], so ycount=1.
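     For example, C = MATMUL(A, B) with A(3,5) and B(5,2) gives
     xcount=3, count=5, ycount=2, while MATMUL(A, V) with a rank-1
     V(5) gives ycount=1 and a rank-1 result of extent 3.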
  */

  if (retarray->base_addr == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
        }
      else
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

          GFC_DIMENSION_SET(retarray->dim[1], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1,
                            GFC_DESCRIPTOR_EXTENT(retarray,0));
        }

      retarray->base_addr
        = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 1 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 1 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 1 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);

          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 2 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);
        }
    }

  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* A one-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }

  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
        runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
                       "in dimension 1: is %ld, should be %ld",
                       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1].  */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
         The value is only used for calculation of the
         memory needed for the buffer.
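         (ycount is set to 1 just below, so the y loops run only once
         and any sufficiently large constant works here.)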
*/ 2591 bystride = 256; 2592 ycount = 1; 2593 } 2594 else 2595 { 2596 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 2597 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 2598 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 2599 } 2600 2601 abase = a->base_addr; 2602 bbase = b->base_addr; 2603 dest = retarray->base_addr; 2604 2605 /* Now that everything is set up, we perform the multiplication 2606 itself. */ 2607 2608 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 2609 #define min(a,b) ((a) <= (b) ? (a) : (b)) 2610 #define max(a,b) ((a) >= (b) ? (a) : (b)) 2611 2612 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 2613 && (bxstride == 1 || bystride == 1) 2614 && (((float) xcount) * ((float) ycount) * ((float) count) 2615 > POW3(blas_limit))) 2616 { 2617 const int m = xcount, n = ycount, k = count, ldc = rystride; 2618 const GFC_REAL_4 one = 1, zero = 0; 2619 const int lda = (axstride == 1) ? aystride : axstride, 2620 ldb = (bxstride == 1) ? bystride : bxstride; 2621 2622 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 2623 { 2624 assert (gemm != NULL); 2625 const char *transa, *transb; 2626 if (try_blas & 2) 2627 transa = "C"; 2628 else 2629 transa = axstride == 1 ? "N" : "T"; 2630 2631 if (try_blas & 4) 2632 transb = "C"; 2633 else 2634 transb = bxstride == 1 ? "N" : "T"; 2635 2636 gemm (transa, transb , &m, 2637 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 2638 &ldc, 1, 1); 2639 return; 2640 } 2641 } 2642 2643 if (rxstride == 1 && axstride == 1 && bxstride == 1 2644 && GFC_DESCRIPTOR_RANK (b) != 1) 2645 { 2646 /* This block of code implements a tuned matmul, derived from 2647 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2648 2649 Bo Kagstrom and Per Ling 2650 Department of Computing Science 2651 Umea University 2652 S-901 87 Umea, Sweden 2653 2654 from netlib.org, translated to C, and modified for matmul.m4. */ 2655 2656 const GFC_REAL_4 *a, *b; 2657 GFC_REAL_4 *c; 2658 const index_type m = xcount, n = ycount, k = count; 2659 2660 /* System generated locals */ 2661 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2662 i1, i2, i3, i4, i5, i6; 2663 2664 /* Local variables */ 2665 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42, 2666 f13, f14, f23, f24, f33, f34, f43, f44; 2667 index_type i, j, l, ii, jj, ll; 2668 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2669 GFC_REAL_4 *t1; 2670 2671 a = abase; 2672 b = bbase; 2673 c = retarray->base_addr; 2674 2675 /* Parameter adjustments */ 2676 c_dim1 = rystride; 2677 c_offset = 1 + c_dim1; 2678 c -= c_offset; 2679 a_dim1 = aystride; 2680 a_offset = 1 + a_dim1; 2681 a -= a_offset; 2682 b_dim1 = bystride; 2683 b_offset = 1 + b_dim1; 2684 b -= b_offset; 2685 2686 /* Empty c first. */ 2687 for (j=1; j<=n; j++) 2688 for (i=1; i<=m; i++) 2689 c[i + j * c_dim1] = (GFC_REAL_4)0; 2690 2691 /* Early exit if possible */ 2692 if (m == 0 || n == 0 || k == 0) 2693 return; 2694 2695 /* Adjust size of t1 to what is needed. */ 2696 index_type t1_dim, a_sz; 2697 if (aystride == 1) 2698 a_sz = rystride; 2699 else 2700 a_sz = a_dim1; 2701 2702 t1_dim = a_sz * 256 + b_dim1; 2703 if (t1_dim > 65536) 2704 t1_dim = 65536; 2705 2706 t1 = malloc (t1_dim * sizeof(GFC_REAL_4)); 2707 2708 /* Start turning the crank. 
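         The code below is the same blocked algorithm used by the
         processor-specific variants earlier in this file: 512x256x256
         blocking, with the current block of A packed into t1 and a
         4x4-unrolled inner kernel.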
*/ 2709 i1 = n; 2710 for (jj = 1; jj <= i1; jj += 512) 2711 { 2712 /* Computing MIN */ 2713 i2 = 512; 2714 i3 = n - jj + 1; 2715 jsec = min(i2,i3); 2716 ujsec = jsec - jsec % 4; 2717 i2 = k; 2718 for (ll = 1; ll <= i2; ll += 256) 2719 { 2720 /* Computing MIN */ 2721 i3 = 256; 2722 i4 = k - ll + 1; 2723 lsec = min(i3,i4); 2724 ulsec = lsec - lsec % 2; 2725 2726 i3 = m; 2727 for (ii = 1; ii <= i3; ii += 256) 2728 { 2729 /* Computing MIN */ 2730 i4 = 256; 2731 i5 = m - ii + 1; 2732 isec = min(i4,i5); 2733 uisec = isec - isec % 2; 2734 i4 = ll + ulsec - 1; 2735 for (l = ll; l <= i4; l += 2) 2736 { 2737 i5 = ii + uisec - 1; 2738 for (i = ii; i <= i5; i += 2) 2739 { 2740 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2741 a[i + l * a_dim1]; 2742 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2743 a[i + (l + 1) * a_dim1]; 2744 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2745 a[i + 1 + l * a_dim1]; 2746 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2747 a[i + 1 + (l + 1) * a_dim1]; 2748 } 2749 if (uisec < isec) 2750 { 2751 t1[l - ll + 1 + (isec << 8) - 257] = 2752 a[ii + isec - 1 + l * a_dim1]; 2753 t1[l - ll + 2 + (isec << 8) - 257] = 2754 a[ii + isec - 1 + (l + 1) * a_dim1]; 2755 } 2756 } 2757 if (ulsec < lsec) 2758 { 2759 i4 = ii + isec - 1; 2760 for (i = ii; i<= i4; ++i) 2761 { 2762 t1[lsec + ((i - ii + 1) << 8) - 257] = 2763 a[i + (ll + lsec - 1) * a_dim1]; 2764 } 2765 } 2766 2767 uisec = isec - isec % 4; 2768 i4 = jj + ujsec - 1; 2769 for (j = jj; j <= i4; j += 4) 2770 { 2771 i5 = ii + uisec - 1; 2772 for (i = ii; i <= i5; i += 4) 2773 { 2774 f11 = c[i + j * c_dim1]; 2775 f21 = c[i + 1 + j * c_dim1]; 2776 f12 = c[i + (j + 1) * c_dim1]; 2777 f22 = c[i + 1 + (j + 1) * c_dim1]; 2778 f13 = c[i + (j + 2) * c_dim1]; 2779 f23 = c[i + 1 + (j + 2) * c_dim1]; 2780 f14 = c[i + (j + 3) * c_dim1]; 2781 f24 = c[i + 1 + (j + 3) * c_dim1]; 2782 f31 = c[i + 2 + j * c_dim1]; 2783 f41 = c[i + 3 + j * c_dim1]; 2784 f32 = c[i + 2 + (j + 1) * c_dim1]; 2785 f42 = c[i + 3 + (j + 1) * c_dim1]; 2786 f33 = c[i + 2 + (j + 2) * c_dim1]; 2787 f43 = c[i + 3 + (j + 2) * c_dim1]; 2788 f34 = c[i + 2 + (j + 3) * c_dim1]; 2789 f44 = c[i + 3 + (j + 3) * c_dim1]; 2790 i6 = ll + lsec - 1; 2791 for (l = ll; l <= i6; ++l) 2792 { 2793 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2794 * b[l + j * b_dim1]; 2795 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2796 * b[l + j * b_dim1]; 2797 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2798 * b[l + (j + 1) * b_dim1]; 2799 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2800 * b[l + (j + 1) * b_dim1]; 2801 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2802 * b[l + (j + 2) * b_dim1]; 2803 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2804 * b[l + (j + 2) * b_dim1]; 2805 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2806 * b[l + (j + 3) * b_dim1]; 2807 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2808 * b[l + (j + 3) * b_dim1]; 2809 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2810 * b[l + j * b_dim1]; 2811 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2812 * b[l + j * b_dim1]; 2813 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2814 * b[l + (j + 1) * b_dim1]; 2815 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2816 * b[l + (j + 1) * b_dim1]; 2817 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2818 * b[l + (j + 2) * b_dim1]; 2819 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2820 * b[l + (j + 2) * b_dim1]; 2821 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2822 * b[l + (j + 3) * b_dim1]; 2823 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 
257] 2824 * b[l + (j + 3) * b_dim1]; 2825 } 2826 c[i + j * c_dim1] = f11; 2827 c[i + 1 + j * c_dim1] = f21; 2828 c[i + (j + 1) * c_dim1] = f12; 2829 c[i + 1 + (j + 1) * c_dim1] = f22; 2830 c[i + (j + 2) * c_dim1] = f13; 2831 c[i + 1 + (j + 2) * c_dim1] = f23; 2832 c[i + (j + 3) * c_dim1] = f14; 2833 c[i + 1 + (j + 3) * c_dim1] = f24; 2834 c[i + 2 + j * c_dim1] = f31; 2835 c[i + 3 + j * c_dim1] = f41; 2836 c[i + 2 + (j + 1) * c_dim1] = f32; 2837 c[i + 3 + (j + 1) * c_dim1] = f42; 2838 c[i + 2 + (j + 2) * c_dim1] = f33; 2839 c[i + 3 + (j + 2) * c_dim1] = f43; 2840 c[i + 2 + (j + 3) * c_dim1] = f34; 2841 c[i + 3 + (j + 3) * c_dim1] = f44; 2842 } 2843 if (uisec < isec) 2844 { 2845 i5 = ii + isec - 1; 2846 for (i = ii + uisec; i <= i5; ++i) 2847 { 2848 f11 = c[i + j * c_dim1]; 2849 f12 = c[i + (j + 1) * c_dim1]; 2850 f13 = c[i + (j + 2) * c_dim1]; 2851 f14 = c[i + (j + 3) * c_dim1]; 2852 i6 = ll + lsec - 1; 2853 for (l = ll; l <= i6; ++l) 2854 { 2855 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2856 257] * b[l + j * b_dim1]; 2857 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2858 257] * b[l + (j + 1) * b_dim1]; 2859 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2860 257] * b[l + (j + 2) * b_dim1]; 2861 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2862 257] * b[l + (j + 3) * b_dim1]; 2863 } 2864 c[i + j * c_dim1] = f11; 2865 c[i + (j + 1) * c_dim1] = f12; 2866 c[i + (j + 2) * c_dim1] = f13; 2867 c[i + (j + 3) * c_dim1] = f14; 2868 } 2869 } 2870 } 2871 if (ujsec < jsec) 2872 { 2873 i4 = jj + jsec - 1; 2874 for (j = jj + ujsec; j <= i4; ++j) 2875 { 2876 i5 = ii + uisec - 1; 2877 for (i = ii; i <= i5; i += 4) 2878 { 2879 f11 = c[i + j * c_dim1]; 2880 f21 = c[i + 1 + j * c_dim1]; 2881 f31 = c[i + 2 + j * c_dim1]; 2882 f41 = c[i + 3 + j * c_dim1]; 2883 i6 = ll + lsec - 1; 2884 for (l = ll; l <= i6; ++l) 2885 { 2886 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2887 257] * b[l + j * b_dim1]; 2888 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2889 257] * b[l + j * b_dim1]; 2890 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2891 257] * b[l + j * b_dim1]; 2892 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2893 257] * b[l + j * b_dim1]; 2894 } 2895 c[i + j * c_dim1] = f11; 2896 c[i + 1 + j * c_dim1] = f21; 2897 c[i + 2 + j * c_dim1] = f31; 2898 c[i + 3 + j * c_dim1] = f41; 2899 } 2900 i5 = ii + isec - 1; 2901 for (i = ii + uisec; i <= i5; ++i) 2902 { 2903 f11 = c[i + j * c_dim1]; 2904 i6 = ll + lsec - 1; 2905 for (l = ll; l <= i6; ++l) 2906 { 2907 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2908 257] * b[l + j * b_dim1]; 2909 } 2910 c[i + j * c_dim1] = f11; 2911 } 2912 } 2913 } 2914 } 2915 } 2916 } 2917 free(t1); 2918 return; 2919 } 2920 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2921 { 2922 if (GFC_DESCRIPTOR_RANK (a) != 1) 2923 { 2924 const GFC_REAL_4 *restrict abase_x; 2925 const GFC_REAL_4 *restrict bbase_y; 2926 GFC_REAL_4 *restrict dest_y; 2927 GFC_REAL_4 s; 2928 2929 for (y = 0; y < ycount; y++) 2930 { 2931 bbase_y = &bbase[y*bystride]; 2932 dest_y = &dest[y*rystride]; 2933 for (x = 0; x < xcount; x++) 2934 { 2935 abase_x = &abase[x*axstride]; 2936 s = (GFC_REAL_4) 0; 2937 for (n = 0; n < count; n++) 2938 s += abase_x[n] * bbase_y[n]; 2939 dest_y[x] = s; 2940 } 2941 } 2942 } 2943 else 2944 { 2945 const GFC_REAL_4 *restrict bbase_y; 2946 GFC_REAL_4 s; 2947 2948 for (y = 0; y < ycount; y++) 2949 { 2950 bbase_y = &bbase[y*bystride]; 2951 s = (GFC_REAL_4) 0; 2952 for (n = 0; n < count; n++) 2953 s += abase[n*axstride] * bbase_y[n]; 2954 dest[y*rystride] = s; 2955 } 2956 } 2957 } 2958 else if 
(GFC_DESCRIPTOR_RANK (a) == 1) 2959 { 2960 const GFC_REAL_4 *restrict bbase_y; 2961 GFC_REAL_4 s; 2962 2963 for (y = 0; y < ycount; y++) 2964 { 2965 bbase_y = &bbase[y*bystride]; 2966 s = (GFC_REAL_4) 0; 2967 for (n = 0; n < count; n++) 2968 s += abase[n*axstride] * bbase_y[n*bxstride]; 2969 dest[y*rxstride] = s; 2970 } 2971 } 2972 else if (axstride < aystride) 2973 { 2974 for (y = 0; y < ycount; y++) 2975 for (x = 0; x < xcount; x++) 2976 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0; 2977 2978 for (y = 0; y < ycount; y++) 2979 for (n = 0; n < count; n++) 2980 for (x = 0; x < xcount; x++) 2981 /* dest[x,y] += a[x,n] * b[n,y] */ 2982 dest[x*rxstride + y*rystride] += 2983 abase[x*axstride + n*aystride] * 2984 bbase[n*bxstride + y*bystride]; 2985 } 2986 else 2987 { 2988 const GFC_REAL_4 *restrict abase_x; 2989 const GFC_REAL_4 *restrict bbase_y; 2990 GFC_REAL_4 *restrict dest_y; 2991 GFC_REAL_4 s; 2992 2993 for (y = 0; y < ycount; y++) 2994 { 2995 bbase_y = &bbase[y*bystride]; 2996 dest_y = &dest[y*rystride]; 2997 for (x = 0; x < xcount; x++) 2998 { 2999 abase_x = &abase[x*axstride]; 3000 s = (GFC_REAL_4) 0; 3001 for (n = 0; n < count; n++) 3002 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 3003 dest_y[x*rxstride] = s; 3004 } 3005 } 3006 } 3007 } 3008 #undef POW3 3009 #undef min 3010 #undef max 3011 3012 #endif 3013 #endif 3014 3015