/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2019 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <string.h>
#include <assert.h>


#if defined (HAVE_GFC_INTEGER_2)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
                          const int *, const GFC_INTEGER_2 *, const int *,
                          const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_i2 (gfc_array_i2 * const restrict retarray,
        gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b,
        int try_blas, int blas_limit, blas_call gemm);
export_proto(matmul_i2);

/* Put an exhaustive list of possible architectures here, ORed together.  */
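/* The matmul_i2_avx, matmul_i2_avx2 and matmul_i2_avx512f variants defined
   below all share the same body; only the __attribute__((__target__(...)))
   differs, so each copy can be vectorized for its instruction set and the
   most capable version available can be chosen at run time.  */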
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

#ifdef HAVE_AVX
static void
matmul_i2_avx (gfc_array_i2 * const restrict retarray,
        gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b,
        int try_blas, int blas_limit, blas_call gemm)
        __attribute__((__target__("avx")));
static void
matmul_i2_avx (gfc_array_i2 * const restrict retarray,
        gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b,
        int try_blas, int blas_limit, blas_call gemm)
{
  const GFC_INTEGER_2 * restrict abase;
  const GFC_INTEGER_2 * restrict bbase;
  GFC_INTEGER_2 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);

  /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

     Either A or B (but not both) can be rank 1:

     o One-dimensional argument A is implicitly treated as a row matrix
       dimensioned [1,count], so xcount=1.

     o One-dimensional argument B is implicitly treated as a column matrix
       dimensioned [count, 1], so ycount=1.
  */

  if (retarray->base_addr == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
        }
      else
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

          GFC_DIMENSION_SET(retarray->dim[1], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1,
                            GFC_DESCRIPTOR_EXTENT(retarray,0));
        }

      retarray->base_addr
        = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 1 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 1 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 1 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);

          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 2 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);
        }
    }
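  /* Worked example (for illustration only): for C = MATMUL(A, B) with A
     dimensioned (3,5) and B dimensioned (5,4), the code below ends up with
     xcount = 3, count = 5 and ycount = 4, and an unallocated result was set
     up above as a 3x4 array.  If A is instead rank 1 with 5 elements, it is
     treated as a 1x5 row vector, so xcount = 1 and the result has extent 4.  */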

  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }


  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
        runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
                       "in dimension 1: is %ld, should be %ld",
                       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1].  */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
         The value is only used for calculation of the
         memory needed by the buffer.  */
      bystride = 256;
      ycount = 1;
    }
  else
    {
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    }

  abase = a->base_addr;
  bbase = b->base_addr;
  dest = retarray->base_addr;

  /* Now that everything is set up, we perform the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
    {
      const int m = xcount, n = ycount, k = count, ldc = rystride;
      const GFC_INTEGER_2 one = 1, zero = 0;
      const int lda = (axstride == 1) ? aystride : axstride,
                ldb = (bxstride == 1) ? bystride : bxstride;

      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
        {
          assert (gemm != NULL);
          const char *transa, *transb;
          if (try_blas & 2)
            transa = "C";
          else
            transa = axstride == 1 ? "N" : "T";

          if (try_blas & 4)
            transb = "C";
          else
            transb = bxstride == 1 ? "N" : "T";

          gemm (transa, transb, &m,
                &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
                &ldc, 1, 1);
          return;
        }
    }

  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      /* This block of code implements a tuned matmul, derived from
         Superscalar GEMM-based level 3 BLAS, Beta version 0.1

         Bo Kagstrom and Per Ling
         Department of Computing Science
         Umea University
         S-901 87 Umea, Sweden

         from netlib.org, translated to C, and modified for matmul.m4.  */
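      /* A note on the packing buffer used below: t1 holds a copy of (up to)
         a 256x256 block of A, laid out so that the elements of one row of
         the block (fixed i, varying l) are contiguous.  The index expression
         l - ll + 1 + ((i - ii + 1) << 8) - 257 simplifies to
         (l - ll) + (i - ii) * 256, i.e. element A(i,l) of the current block.  */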

      const GFC_INTEGER_2 *a, *b;
      GFC_INTEGER_2 *c;
      const index_type m = xcount, n = ycount, k = count;

      /* System generated locals */
      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
                 i1, i2, i3, i4, i5, i6;

      /* Local variables */
      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
                 f13, f14, f23, f24, f33, f34, f43, f44;
      index_type i, j, l, ii, jj, ll;
      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
      GFC_INTEGER_2 *t1;

      a = abase;
      b = bbase;
      c = retarray->base_addr;

      /* Parameter adjustments */
      c_dim1 = rystride;
      c_offset = 1 + c_dim1;
      c -= c_offset;
      a_dim1 = aystride;
      a_offset = 1 + a_dim1;
      a -= a_offset;
      b_dim1 = bystride;
      b_offset = 1 + b_dim1;
      b -= b_offset;

      /* Empty c first.  */
      for (j=1; j<=n; j++)
        for (i=1; i<=m; i++)
          c[i + j * c_dim1] = (GFC_INTEGER_2)0;

      /* Early exit if possible */
      if (m == 0 || n == 0 || k == 0)
        return;

      /* Adjust size of t1 to what is needed.  */
      index_type t1_dim, a_sz;
      if (aystride == 1)
        a_sz = rystride;
      else
        a_sz = a_dim1;

      t1_dim = a_sz * 256 + b_dim1;
      if (t1_dim > 65536)
        t1_dim = 65536;

      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
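      /* The loop nest below works on blocks of 512 columns of C (jj), 256
         values of the summation index (ll) and 256 rows (ii).  Within a
         block, a 4x4 tile of C is kept in the scalars f11..f44; the
         remainder loops further down handle edges whose size is not a
         multiple of 4 (or of 2 during the packing of t1).  */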

      /* Start turning the crank.  */
      i1 = n;
      for (jj = 1; jj <= i1; jj += 512)
        {
          /* Computing MIN */
          i2 = 512;
          i3 = n - jj + 1;
          jsec = min(i2,i3);
          ujsec = jsec - jsec % 4;
          i2 = k;
          for (ll = 1; ll <= i2; ll += 256)
            {
              /* Computing MIN */
              i3 = 256;
              i4 = k - ll + 1;
              lsec = min(i3,i4);
              ulsec = lsec - lsec % 2;

              i3 = m;
              for (ii = 1; ii <= i3; ii += 256)
                {
                  /* Computing MIN */
                  i4 = 256;
                  i5 = m - ii + 1;
                  isec = min(i4,i5);
                  uisec = isec - isec % 2;
                  i4 = ll + ulsec - 1;
                  for (l = ll; l <= i4; l += 2)
                    {
                      i5 = ii + uisec - 1;
                      for (i = ii; i <= i5; i += 2)
                        {
                          t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
                                        a[i + l * a_dim1];
                          t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
                                        a[i + (l + 1) * a_dim1];
                          t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
                                        a[i + 1 + l * a_dim1];
                          t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
                                        a[i + 1 + (l + 1) * a_dim1];
                        }
                      if (uisec < isec)
                        {
                          t1[l - ll + 1 + (isec << 8) - 257] =
                                    a[ii + isec - 1 + l * a_dim1];
                          t1[l - ll + 2 + (isec << 8) - 257] =
                                    a[ii + isec - 1 + (l + 1) * a_dim1];
                        }
                    }
                  if (ulsec < lsec)
                    {
                      i4 = ii + isec - 1;
                      for (i = ii; i <= i4; ++i)
                        {
                          t1[lsec + ((i - ii + 1) << 8) - 257] =
                                    a[i + (ll + lsec - 1) * a_dim1];
                        }
                    }

                  uisec = isec - isec % 4;
                  i4 = jj + ujsec - 1;
                  for (j = jj; j <= i4; j += 4)
                    {
                      i5 = ii + uisec - 1;
                      for (i = ii; i <= i5; i += 4)
                        {
                          f11 = c[i + j * c_dim1];
                          f21 = c[i + 1 + j * c_dim1];
                          f12 = c[i + (j + 1) * c_dim1];
                          f22 = c[i + 1 + (j + 1) * c_dim1];
                          f13 = c[i + (j + 2) * c_dim1];
                          f23 = c[i + 1 + (j + 2) * c_dim1];
                          f14 = c[i + (j + 3) * c_dim1];
                          f24 = c[i + 1 + (j + 3) * c_dim1];
                          f31 = c[i + 2 + j * c_dim1];
                          f41 = c[i + 3 + j * c_dim1];
                          f32 = c[i + 2 + (j + 1) * c_dim1];
                          f42 = c[i + 3 + (j + 1) * c_dim1];
                          f33 = c[i + 2 + (j + 2) * c_dim1];
                          f43 = c[i + 3 + (j + 2) * c_dim1];
                          f34 = c[i + 2 + (j + 3) * c_dim1];
                          f44 = c[i + 3 + (j + 3) * c_dim1];
                          i6 = ll + lsec - 1;
                          for (l = ll; l <= i6; ++l)
                            {
                              f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                     * b[l + j * b_dim1];
                              f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                     * b[l + j * b_dim1];
                              f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                     * b[l + (j + 1) * b_dim1];
                              f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                     * b[l + (j + 1) * b_dim1];
                              f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                     * b[l + (j + 2) * b_dim1];
                              f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                     * b[l + (j + 2) * b_dim1];
                              f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                     * b[l + (j + 3) * b_dim1];
                              f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                     * b[l + (j + 3) * b_dim1];
                              f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                     * b[l + j * b_dim1];
                              f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                     * b[l + j * b_dim1];
                              f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                     * b[l + (j + 1) * b_dim1];
                              f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                     * b[l + (j + 1) * b_dim1];
                              f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                     * b[l + (j + 2) * b_dim1];
                              f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                     * b[l + (j + 2) * b_dim1];
                              f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                     * b[l + (j + 3) * b_dim1];
                              f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                     * b[l + (j + 3) * b_dim1];
                            }
                          c[i + j * c_dim1] = f11;
                          c[i + 1 + j * c_dim1] = f21;
                          c[i + (j + 1) * c_dim1] = f12;
                          c[i + 1 + (j + 1) * c_dim1] = f22;
                          c[i + (j + 2) * c_dim1] = f13;
                          c[i + 1 + (j + 2) * c_dim1] = f23;
                          c[i + (j + 3) * c_dim1] = f14;
                          c[i + 1 + (j + 3) * c_dim1] = f24;
                          c[i + 2 + j * c_dim1] = f31;
                          c[i + 3 + j * c_dim1] = f41;
                          c[i + 2 + (j + 1) * c_dim1] = f32;
                          c[i + 3 + (j + 1) * c_dim1] = f42;
                          c[i + 2 + (j + 2) * c_dim1] = f33;
                          c[i + 3 + (j + 2) * c_dim1] = f43;
                          c[i + 2 + (j + 3) * c_dim1] = f34;
                          c[i + 3 + (j + 3) * c_dim1] = f44;
                        }
                      if (uisec < isec)
                        {
                          i5 = ii + isec - 1;
                          for (i = ii + uisec; i <= i5; ++i)
                            {
                              f11 = c[i + j * c_dim1];
                              f12 = c[i + (j + 1) * c_dim1];
                              f13 = c[i + (j + 2) * c_dim1];
                              f14 = c[i + (j + 3) * c_dim1];
                              i6 = ll + lsec - 1;
                              for (l = ll; l <= i6; ++l)
                                {
                                  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                            257] * b[l + j * b_dim1];
                                  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                            257] * b[l + (j + 1) * b_dim1];
                                  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                            257] * b[l + (j + 2) * b_dim1];
                                  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                            257] * b[l + (j + 3) * b_dim1];
                                }
                              c[i + j * c_dim1] = f11;
                              c[i + (j + 1) * c_dim1] = f12;
                              c[i + (j + 2) * c_dim1] = f13;
                              c[i + (j + 3) * c_dim1] = f14;
                            }
                        }
                    }
                  if (ujsec < jsec)
                    {
                      i4 = jj + jsec - 1;
                      for (j = jj + ujsec; j <= i4; ++j)
                        {
                          i5 = ii + uisec - 1;
                          for (i = ii; i <= i5; i += 4)
                            {
                              f11 = c[i + j * c_dim1];
                              f21 = c[i + 1 + j * c_dim1];
                              f31 = c[i + 2 + j * c_dim1];
                              f41 = c[i + 3 + j * c_dim1];
                              i6 = ll + lsec - 1;
                              for (l = ll; l <= i6; ++l)
                                {
                                  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                            257] * b[l + j * b_dim1];
                                  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
                                            257] * b[l + j * b_dim1];
                                  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
                                            257] * b[l + j * b_dim1];
                                  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
                                            257] * b[l + j * b_dim1];
                                }
                              c[i + j * c_dim1] = f11;
                              c[i + 1 + j * c_dim1] = f21;
                              c[i + 2 + j * c_dim1] = f31;
                              c[i + 3 + j * c_dim1] = f41;
                            }
                          i5 = ii + isec - 1;
                          for (i = ii + uisec; i <= i5; ++i)
                            {
                              f11 = c[i + j * c_dim1];
                              i6 = ll + lsec - 1;
                              for (l = ll; l <= i6; ++l)
                                {
                                  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                            257] * b[l + j * b_dim1];
                                }
                              c[i + j * c_dim1] = f11;
                            }
                        }
                    }
                }
            }
        }
      free(t1);
      return;
    }
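  /* The remaining branches are the straightforward loops described in the
     comment at the top of this file: when A appears to be a transposed
     temporary (unit stride along its second dimension, aystride == 1) a
     dot-product loop order is used, and when axstride < aystride the
     accumulate-into-C loop order is used instead.  */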
  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      if (GFC_DESCRIPTOR_RANK (a) != 1)
        {
          const GFC_INTEGER_2 *restrict abase_x;
          const GFC_INTEGER_2 *restrict bbase_y;
          GFC_INTEGER_2 *restrict dest_y;
          GFC_INTEGER_2 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              dest_y = &dest[y*rystride];
              for (x = 0; x < xcount; x++)
                {
                  abase_x = &abase[x*axstride];
                  s = (GFC_INTEGER_2) 0;
                  for (n = 0; n < count; n++)
                    s += abase_x[n] * bbase_y[n];
                  dest_y[x] = s;
                }
            }
        }
      else
        {
          const GFC_INTEGER_2 *restrict bbase_y;
          GFC_INTEGER_2 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              s = (GFC_INTEGER_2) 0;
              for (n = 0; n < count; n++)
                s += abase[n*axstride] * bbase_y[n];
              dest[y*rystride] = s;
            }
        }
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
        for (x = 0; x < xcount; x++)
          dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;

      for (y = 0; y < ycount; y++)
        for (n = 0; n < count; n++)
          for (x = 0; x < xcount; x++)
            /* dest[x,y] += a[x,n] * b[n,y] */
            dest[x*rxstride + y*rystride] +=
                                abase[x*axstride + n*aystride] *
                                bbase[n*bxstride + y*bystride];
    }
  else if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      const GFC_INTEGER_2 *restrict bbase_y;
      GFC_INTEGER_2 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          s = (GFC_INTEGER_2) 0;
          for (n = 0; n < count; n++)
            s += abase[n*axstride] * bbase_y[n*bxstride];
          dest[y*rxstride] = s;
        }
    }
  else
    {
      const GFC_INTEGER_2 *restrict abase_x;
      const GFC_INTEGER_2 *restrict bbase_y;
      GFC_INTEGER_2 *restrict dest_y;
      GFC_INTEGER_2 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          dest_y = &dest[y*rystride];
          for (x = 0; x < xcount; x++)
            {
              abase_x = &abase[x*axstride];
              s = (GFC_INTEGER_2) 0;
              for (n = 0; n < count; n++)
                s += abase_x[n*aystride] * bbase_y[n*bxstride];
              dest_y[x*rxstride] = s;
            }
        }
    }
}
#undef POW3
#undef min
#undef max

#endif /* HAVE_AVX */

#ifdef HAVE_AVX2
static void
matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
        gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b,
        int try_blas, int blas_limit, blas_call gemm)
        __attribute__((__target__("avx2,fma")));
static void
matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
        gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b,
        int try_blas, int blas_limit, blas_call gemm)
{
  const GFC_INTEGER_2 * restrict abase;
  const GFC_INTEGER_2 * restrict bbase;
  GFC_INTEGER_2 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);

  /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

     Either A or B (but not both) can be rank 1:

     o One-dimensional argument A is implicitly treated as a row matrix
       dimensioned [1,count], so xcount=1.
675 676 o One-dimensional argument B is implicitly treated as a column matrix 677 dimensioned [count, 1], so ycount=1. 678 */ 679 680 if (retarray->base_addr == NULL) 681 { 682 if (GFC_DESCRIPTOR_RANK (a) == 1) 683 { 684 GFC_DIMENSION_SET(retarray->dim[0], 0, 685 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 686 } 687 else if (GFC_DESCRIPTOR_RANK (b) == 1) 688 { 689 GFC_DIMENSION_SET(retarray->dim[0], 0, 690 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 691 } 692 else 693 { 694 GFC_DIMENSION_SET(retarray->dim[0], 0, 695 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 696 697 GFC_DIMENSION_SET(retarray->dim[1], 0, 698 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 699 GFC_DESCRIPTOR_EXTENT(retarray,0)); 700 } 701 702 retarray->base_addr 703 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); 704 retarray->offset = 0; 705 } 706 else if (unlikely (compile_options.bounds_check)) 707 { 708 index_type ret_extent, arg_extent; 709 710 if (GFC_DESCRIPTOR_RANK (a) == 1) 711 { 712 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 713 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 714 if (arg_extent != ret_extent) 715 runtime_error ("Array bound mismatch for dimension 1 of " 716 "array (%ld/%ld) ", 717 (long int) ret_extent, (long int) arg_extent); 718 } 719 else if (GFC_DESCRIPTOR_RANK (b) == 1) 720 { 721 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 722 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 723 if (arg_extent != ret_extent) 724 runtime_error ("Array bound mismatch for dimension 1 of " 725 "array (%ld/%ld) ", 726 (long int) ret_extent, (long int) arg_extent); 727 } 728 else 729 { 730 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 731 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 732 if (arg_extent != ret_extent) 733 runtime_error ("Array bound mismatch for dimension 1 of " 734 "array (%ld/%ld) ", 735 (long int) ret_extent, (long int) arg_extent); 736 737 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 738 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 739 if (arg_extent != ret_extent) 740 runtime_error ("Array bound mismatch for dimension 2 of " 741 "array (%ld/%ld) ", 742 (long int) ret_extent, (long int) arg_extent); 743 } 744 } 745 746 747 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 748 { 749 /* One-dimensional result may be addressed in the code below 750 either as a row or a column matrix. We want both cases to 751 work. */ 752 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 753 } 754 else 755 { 756 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 757 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 758 } 759 760 761 if (GFC_DESCRIPTOR_RANK (a) == 1) 762 { 763 /* Treat it as a a row matrix A[1,count]. */ 764 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 765 aystride = 1; 766 767 xcount = 1; 768 count = GFC_DESCRIPTOR_EXTENT(a,0); 769 } 770 else 771 { 772 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 773 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 774 775 count = GFC_DESCRIPTOR_EXTENT(a,1); 776 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 777 } 778 779 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 780 { 781 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 782 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 783 "in dimension 1: is %ld, should be %ld", 784 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 785 } 786 787 if (GFC_DESCRIPTOR_RANK (b) == 1) 788 { 789 /* Treat it as a column matrix B[count,1] */ 790 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 791 792 /* bystride should never be used for 1-dimensional b. 793 The value is only used for calculation of the 794 memory by the buffer. 
*/ 795 bystride = 256; 796 ycount = 1; 797 } 798 else 799 { 800 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 801 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 802 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 803 } 804 805 abase = a->base_addr; 806 bbase = b->base_addr; 807 dest = retarray->base_addr; 808 809 /* Now that everything is set up, we perform the multiplication 810 itself. */ 811 812 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 813 #define min(a,b) ((a) <= (b) ? (a) : (b)) 814 #define max(a,b) ((a) >= (b) ? (a) : (b)) 815 816 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 817 && (bxstride == 1 || bystride == 1) 818 && (((float) xcount) * ((float) ycount) * ((float) count) 819 > POW3(blas_limit))) 820 { 821 const int m = xcount, n = ycount, k = count, ldc = rystride; 822 const GFC_INTEGER_2 one = 1, zero = 0; 823 const int lda = (axstride == 1) ? aystride : axstride, 824 ldb = (bxstride == 1) ? bystride : bxstride; 825 826 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 827 { 828 assert (gemm != NULL); 829 const char *transa, *transb; 830 if (try_blas & 2) 831 transa = "C"; 832 else 833 transa = axstride == 1 ? "N" : "T"; 834 835 if (try_blas & 4) 836 transb = "C"; 837 else 838 transb = bxstride == 1 ? "N" : "T"; 839 840 gemm (transa, transb , &m, 841 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 842 &ldc, 1, 1); 843 return; 844 } 845 } 846 847 if (rxstride == 1 && axstride == 1 && bxstride == 1) 848 { 849 /* This block of code implements a tuned matmul, derived from 850 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 851 852 Bo Kagstrom and Per Ling 853 Department of Computing Science 854 Umea University 855 S-901 87 Umea, Sweden 856 857 from netlib.org, translated to C, and modified for matmul.m4. */ 858 859 const GFC_INTEGER_2 *a, *b; 860 GFC_INTEGER_2 *c; 861 const index_type m = xcount, n = ycount, k = count; 862 863 /* System generated locals */ 864 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 865 i1, i2, i3, i4, i5, i6; 866 867 /* Local variables */ 868 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, 869 f13, f14, f23, f24, f33, f34, f43, f44; 870 index_type i, j, l, ii, jj, ll; 871 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 872 GFC_INTEGER_2 *t1; 873 874 a = abase; 875 b = bbase; 876 c = retarray->base_addr; 877 878 /* Parameter adjustments */ 879 c_dim1 = rystride; 880 c_offset = 1 + c_dim1; 881 c -= c_offset; 882 a_dim1 = aystride; 883 a_offset = 1 + a_dim1; 884 a -= a_offset; 885 b_dim1 = bystride; 886 b_offset = 1 + b_dim1; 887 b -= b_offset; 888 889 /* Empty c first. */ 890 for (j=1; j<=n; j++) 891 for (i=1; i<=m; i++) 892 c[i + j * c_dim1] = (GFC_INTEGER_2)0; 893 894 /* Early exit if possible */ 895 if (m == 0 || n == 0 || k == 0) 896 return; 897 898 /* Adjust size of t1 to what is needed. */ 899 index_type t1_dim, a_sz; 900 if (aystride == 1) 901 a_sz = rystride; 902 else 903 a_sz = a_dim1; 904 905 t1_dim = a_sz * 256 + b_dim1; 906 if (t1_dim > 65536) 907 t1_dim = 65536; 908 909 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); 910 911 /* Start turning the crank. 
*/ 912 i1 = n; 913 for (jj = 1; jj <= i1; jj += 512) 914 { 915 /* Computing MIN */ 916 i2 = 512; 917 i3 = n - jj + 1; 918 jsec = min(i2,i3); 919 ujsec = jsec - jsec % 4; 920 i2 = k; 921 for (ll = 1; ll <= i2; ll += 256) 922 { 923 /* Computing MIN */ 924 i3 = 256; 925 i4 = k - ll + 1; 926 lsec = min(i3,i4); 927 ulsec = lsec - lsec % 2; 928 929 i3 = m; 930 for (ii = 1; ii <= i3; ii += 256) 931 { 932 /* Computing MIN */ 933 i4 = 256; 934 i5 = m - ii + 1; 935 isec = min(i4,i5); 936 uisec = isec - isec % 2; 937 i4 = ll + ulsec - 1; 938 for (l = ll; l <= i4; l += 2) 939 { 940 i5 = ii + uisec - 1; 941 for (i = ii; i <= i5; i += 2) 942 { 943 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 944 a[i + l * a_dim1]; 945 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 946 a[i + (l + 1) * a_dim1]; 947 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 948 a[i + 1 + l * a_dim1]; 949 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 950 a[i + 1 + (l + 1) * a_dim1]; 951 } 952 if (uisec < isec) 953 { 954 t1[l - ll + 1 + (isec << 8) - 257] = 955 a[ii + isec - 1 + l * a_dim1]; 956 t1[l - ll + 2 + (isec << 8) - 257] = 957 a[ii + isec - 1 + (l + 1) * a_dim1]; 958 } 959 } 960 if (ulsec < lsec) 961 { 962 i4 = ii + isec - 1; 963 for (i = ii; i<= i4; ++i) 964 { 965 t1[lsec + ((i - ii + 1) << 8) - 257] = 966 a[i + (ll + lsec - 1) * a_dim1]; 967 } 968 } 969 970 uisec = isec - isec % 4; 971 i4 = jj + ujsec - 1; 972 for (j = jj; j <= i4; j += 4) 973 { 974 i5 = ii + uisec - 1; 975 for (i = ii; i <= i5; i += 4) 976 { 977 f11 = c[i + j * c_dim1]; 978 f21 = c[i + 1 + j * c_dim1]; 979 f12 = c[i + (j + 1) * c_dim1]; 980 f22 = c[i + 1 + (j + 1) * c_dim1]; 981 f13 = c[i + (j + 2) * c_dim1]; 982 f23 = c[i + 1 + (j + 2) * c_dim1]; 983 f14 = c[i + (j + 3) * c_dim1]; 984 f24 = c[i + 1 + (j + 3) * c_dim1]; 985 f31 = c[i + 2 + j * c_dim1]; 986 f41 = c[i + 3 + j * c_dim1]; 987 f32 = c[i + 2 + (j + 1) * c_dim1]; 988 f42 = c[i + 3 + (j + 1) * c_dim1]; 989 f33 = c[i + 2 + (j + 2) * c_dim1]; 990 f43 = c[i + 3 + (j + 2) * c_dim1]; 991 f34 = c[i + 2 + (j + 3) * c_dim1]; 992 f44 = c[i + 3 + (j + 3) * c_dim1]; 993 i6 = ll + lsec - 1; 994 for (l = ll; l <= i6; ++l) 995 { 996 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 997 * b[l + j * b_dim1]; 998 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 999 * b[l + j * b_dim1]; 1000 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1001 * b[l + (j + 1) * b_dim1]; 1002 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1003 * b[l + (j + 1) * b_dim1]; 1004 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1005 * b[l + (j + 2) * b_dim1]; 1006 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1007 * b[l + (j + 2) * b_dim1]; 1008 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1009 * b[l + (j + 3) * b_dim1]; 1010 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1011 * b[l + (j + 3) * b_dim1]; 1012 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1013 * b[l + j * b_dim1]; 1014 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1015 * b[l + j * b_dim1]; 1016 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1017 * b[l + (j + 1) * b_dim1]; 1018 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1019 * b[l + (j + 1) * b_dim1]; 1020 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1021 * b[l + (j + 2) * b_dim1]; 1022 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1023 * b[l + (j + 2) * b_dim1]; 1024 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1025 * b[l + (j + 3) * b_dim1]; 1026 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1027 * b[l + (j + 3) * b_dim1]; 1028 } 1029 c[i + j * c_dim1] = f11; 1030 c[i + 1 + 
j * c_dim1] = f21; 1031 c[i + (j + 1) * c_dim1] = f12; 1032 c[i + 1 + (j + 1) * c_dim1] = f22; 1033 c[i + (j + 2) * c_dim1] = f13; 1034 c[i + 1 + (j + 2) * c_dim1] = f23; 1035 c[i + (j + 3) * c_dim1] = f14; 1036 c[i + 1 + (j + 3) * c_dim1] = f24; 1037 c[i + 2 + j * c_dim1] = f31; 1038 c[i + 3 + j * c_dim1] = f41; 1039 c[i + 2 + (j + 1) * c_dim1] = f32; 1040 c[i + 3 + (j + 1) * c_dim1] = f42; 1041 c[i + 2 + (j + 2) * c_dim1] = f33; 1042 c[i + 3 + (j + 2) * c_dim1] = f43; 1043 c[i + 2 + (j + 3) * c_dim1] = f34; 1044 c[i + 3 + (j + 3) * c_dim1] = f44; 1045 } 1046 if (uisec < isec) 1047 { 1048 i5 = ii + isec - 1; 1049 for (i = ii + uisec; i <= i5; ++i) 1050 { 1051 f11 = c[i + j * c_dim1]; 1052 f12 = c[i + (j + 1) * c_dim1]; 1053 f13 = c[i + (j + 2) * c_dim1]; 1054 f14 = c[i + (j + 3) * c_dim1]; 1055 i6 = ll + lsec - 1; 1056 for (l = ll; l <= i6; ++l) 1057 { 1058 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1059 257] * b[l + j * b_dim1]; 1060 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1061 257] * b[l + (j + 1) * b_dim1]; 1062 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1063 257] * b[l + (j + 2) * b_dim1]; 1064 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1065 257] * b[l + (j + 3) * b_dim1]; 1066 } 1067 c[i + j * c_dim1] = f11; 1068 c[i + (j + 1) * c_dim1] = f12; 1069 c[i + (j + 2) * c_dim1] = f13; 1070 c[i + (j + 3) * c_dim1] = f14; 1071 } 1072 } 1073 } 1074 if (ujsec < jsec) 1075 { 1076 i4 = jj + jsec - 1; 1077 for (j = jj + ujsec; j <= i4; ++j) 1078 { 1079 i5 = ii + uisec - 1; 1080 for (i = ii; i <= i5; i += 4) 1081 { 1082 f11 = c[i + j * c_dim1]; 1083 f21 = c[i + 1 + j * c_dim1]; 1084 f31 = c[i + 2 + j * c_dim1]; 1085 f41 = c[i + 3 + j * c_dim1]; 1086 i6 = ll + lsec - 1; 1087 for (l = ll; l <= i6; ++l) 1088 { 1089 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1090 257] * b[l + j * b_dim1]; 1091 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1092 257] * b[l + j * b_dim1]; 1093 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1094 257] * b[l + j * b_dim1]; 1095 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1096 257] * b[l + j * b_dim1]; 1097 } 1098 c[i + j * c_dim1] = f11; 1099 c[i + 1 + j * c_dim1] = f21; 1100 c[i + 2 + j * c_dim1] = f31; 1101 c[i + 3 + j * c_dim1] = f41; 1102 } 1103 i5 = ii + isec - 1; 1104 for (i = ii + uisec; i <= i5; ++i) 1105 { 1106 f11 = c[i + j * c_dim1]; 1107 i6 = ll + lsec - 1; 1108 for (l = ll; l <= i6; ++l) 1109 { 1110 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1111 257] * b[l + j * b_dim1]; 1112 } 1113 c[i + j * c_dim1] = f11; 1114 } 1115 } 1116 } 1117 } 1118 } 1119 } 1120 free(t1); 1121 return; 1122 } 1123 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1124 { 1125 if (GFC_DESCRIPTOR_RANK (a) != 1) 1126 { 1127 const GFC_INTEGER_2 *restrict abase_x; 1128 const GFC_INTEGER_2 *restrict bbase_y; 1129 GFC_INTEGER_2 *restrict dest_y; 1130 GFC_INTEGER_2 s; 1131 1132 for (y = 0; y < ycount; y++) 1133 { 1134 bbase_y = &bbase[y*bystride]; 1135 dest_y = &dest[y*rystride]; 1136 for (x = 0; x < xcount; x++) 1137 { 1138 abase_x = &abase[x*axstride]; 1139 s = (GFC_INTEGER_2) 0; 1140 for (n = 0; n < count; n++) 1141 s += abase_x[n] * bbase_y[n]; 1142 dest_y[x] = s; 1143 } 1144 } 1145 } 1146 else 1147 { 1148 const GFC_INTEGER_2 *restrict bbase_y; 1149 GFC_INTEGER_2 s; 1150 1151 for (y = 0; y < ycount; y++) 1152 { 1153 bbase_y = &bbase[y*bystride]; 1154 s = (GFC_INTEGER_2) 0; 1155 for (n = 0; n < count; n++) 1156 s += abase[n*axstride] * bbase_y[n]; 1157 dest[y*rystride] = s; 1158 } 1159 } 1160 } 1161 else if (axstride < aystride) 1162 { 1163 for (y = 0; y < ycount; y++) 
1164 for (x = 0; x < xcount; x++) 1165 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; 1166 1167 for (y = 0; y < ycount; y++) 1168 for (n = 0; n < count; n++) 1169 for (x = 0; x < xcount; x++) 1170 /* dest[x,y] += a[x,n] * b[n,y] */ 1171 dest[x*rxstride + y*rystride] += 1172 abase[x*axstride + n*aystride] * 1173 bbase[n*bxstride + y*bystride]; 1174 } 1175 else if (GFC_DESCRIPTOR_RANK (a) == 1) 1176 { 1177 const GFC_INTEGER_2 *restrict bbase_y; 1178 GFC_INTEGER_2 s; 1179 1180 for (y = 0; y < ycount; y++) 1181 { 1182 bbase_y = &bbase[y*bystride]; 1183 s = (GFC_INTEGER_2) 0; 1184 for (n = 0; n < count; n++) 1185 s += abase[n*axstride] * bbase_y[n*bxstride]; 1186 dest[y*rxstride] = s; 1187 } 1188 } 1189 else 1190 { 1191 const GFC_INTEGER_2 *restrict abase_x; 1192 const GFC_INTEGER_2 *restrict bbase_y; 1193 GFC_INTEGER_2 *restrict dest_y; 1194 GFC_INTEGER_2 s; 1195 1196 for (y = 0; y < ycount; y++) 1197 { 1198 bbase_y = &bbase[y*bystride]; 1199 dest_y = &dest[y*rystride]; 1200 for (x = 0; x < xcount; x++) 1201 { 1202 abase_x = &abase[x*axstride]; 1203 s = (GFC_INTEGER_2) 0; 1204 for (n = 0; n < count; n++) 1205 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 1206 dest_y[x*rxstride] = s; 1207 } 1208 } 1209 } 1210 } 1211 #undef POW3 1212 #undef min 1213 #undef max 1214 1215 #endif /* HAVE_AVX2 */ 1216 1217 #ifdef HAVE_AVX512F 1218 static void 1219 matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, 1220 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 1221 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); 1222 static void 1223 matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, 1224 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 1225 int blas_limit, blas_call gemm) 1226 { 1227 const GFC_INTEGER_2 * restrict abase; 1228 const GFC_INTEGER_2 * restrict bbase; 1229 GFC_INTEGER_2 * restrict dest; 1230 1231 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 1232 index_type x, y, n, count, xcount, ycount; 1233 1234 assert (GFC_DESCRIPTOR_RANK (a) == 2 1235 || GFC_DESCRIPTOR_RANK (b) == 2); 1236 1237 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 1238 1239 Either A or B (but not both) can be rank 1: 1240 1241 o One-dimensional argument A is implicitly treated as a row matrix 1242 dimensioned [1,count], so xcount=1. 1243 1244 o One-dimensional argument B is implicitly treated as a column matrix 1245 dimensioned [count, 1], so ycount=1. 
1246 */ 1247 1248 if (retarray->base_addr == NULL) 1249 { 1250 if (GFC_DESCRIPTOR_RANK (a) == 1) 1251 { 1252 GFC_DIMENSION_SET(retarray->dim[0], 0, 1253 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1254 } 1255 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1256 { 1257 GFC_DIMENSION_SET(retarray->dim[0], 0, 1258 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1259 } 1260 else 1261 { 1262 GFC_DIMENSION_SET(retarray->dim[0], 0, 1263 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1264 1265 GFC_DIMENSION_SET(retarray->dim[1], 0, 1266 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1267 GFC_DESCRIPTOR_EXTENT(retarray,0)); 1268 } 1269 1270 retarray->base_addr 1271 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); 1272 retarray->offset = 0; 1273 } 1274 else if (unlikely (compile_options.bounds_check)) 1275 { 1276 index_type ret_extent, arg_extent; 1277 1278 if (GFC_DESCRIPTOR_RANK (a) == 1) 1279 { 1280 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1281 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1282 if (arg_extent != ret_extent) 1283 runtime_error ("Array bound mismatch for dimension 1 of " 1284 "array (%ld/%ld) ", 1285 (long int) ret_extent, (long int) arg_extent); 1286 } 1287 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1288 { 1289 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1290 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1291 if (arg_extent != ret_extent) 1292 runtime_error ("Array bound mismatch for dimension 1 of " 1293 "array (%ld/%ld) ", 1294 (long int) ret_extent, (long int) arg_extent); 1295 } 1296 else 1297 { 1298 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1299 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1300 if (arg_extent != ret_extent) 1301 runtime_error ("Array bound mismatch for dimension 1 of " 1302 "array (%ld/%ld) ", 1303 (long int) ret_extent, (long int) arg_extent); 1304 1305 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1306 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1307 if (arg_extent != ret_extent) 1308 runtime_error ("Array bound mismatch for dimension 2 of " 1309 "array (%ld/%ld) ", 1310 (long int) ret_extent, (long int) arg_extent); 1311 } 1312 } 1313 1314 1315 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1316 { 1317 /* One-dimensional result may be addressed in the code below 1318 either as a row or a column matrix. We want both cases to 1319 work. */ 1320 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1321 } 1322 else 1323 { 1324 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1325 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1326 } 1327 1328 1329 if (GFC_DESCRIPTOR_RANK (a) == 1) 1330 { 1331 /* Treat it as a a row matrix A[1,count]. */ 1332 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1333 aystride = 1; 1334 1335 xcount = 1; 1336 count = GFC_DESCRIPTOR_EXTENT(a,0); 1337 } 1338 else 1339 { 1340 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1341 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1342 1343 count = GFC_DESCRIPTOR_EXTENT(a,1); 1344 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1345 } 1346 1347 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1348 { 1349 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1350 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1351 "in dimension 1: is %ld, should be %ld", 1352 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1353 } 1354 1355 if (GFC_DESCRIPTOR_RANK (b) == 1) 1356 { 1357 /* Treat it as a column matrix B[count,1] */ 1358 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1359 1360 /* bystride should never be used for 1-dimensional b. 1361 The value is only used for calculation of the 1362 memory by the buffer. 
*/ 1363 bystride = 256; 1364 ycount = 1; 1365 } 1366 else 1367 { 1368 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1369 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1370 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1371 } 1372 1373 abase = a->base_addr; 1374 bbase = b->base_addr; 1375 dest = retarray->base_addr; 1376 1377 /* Now that everything is set up, we perform the multiplication 1378 itself. */ 1379 1380 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1381 #define min(a,b) ((a) <= (b) ? (a) : (b)) 1382 #define max(a,b) ((a) >= (b) ? (a) : (b)) 1383 1384 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1385 && (bxstride == 1 || bystride == 1) 1386 && (((float) xcount) * ((float) ycount) * ((float) count) 1387 > POW3(blas_limit))) 1388 { 1389 const int m = xcount, n = ycount, k = count, ldc = rystride; 1390 const GFC_INTEGER_2 one = 1, zero = 0; 1391 const int lda = (axstride == 1) ? aystride : axstride, 1392 ldb = (bxstride == 1) ? bystride : bxstride; 1393 1394 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1395 { 1396 assert (gemm != NULL); 1397 const char *transa, *transb; 1398 if (try_blas & 2) 1399 transa = "C"; 1400 else 1401 transa = axstride == 1 ? "N" : "T"; 1402 1403 if (try_blas & 4) 1404 transb = "C"; 1405 else 1406 transb = bxstride == 1 ? "N" : "T"; 1407 1408 gemm (transa, transb , &m, 1409 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1410 &ldc, 1, 1); 1411 return; 1412 } 1413 } 1414 1415 if (rxstride == 1 && axstride == 1 && bxstride == 1) 1416 { 1417 /* This block of code implements a tuned matmul, derived from 1418 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 1419 1420 Bo Kagstrom and Per Ling 1421 Department of Computing Science 1422 Umea University 1423 S-901 87 Umea, Sweden 1424 1425 from netlib.org, translated to C, and modified for matmul.m4. */ 1426 1427 const GFC_INTEGER_2 *a, *b; 1428 GFC_INTEGER_2 *c; 1429 const index_type m = xcount, n = ycount, k = count; 1430 1431 /* System generated locals */ 1432 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 1433 i1, i2, i3, i4, i5, i6; 1434 1435 /* Local variables */ 1436 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, 1437 f13, f14, f23, f24, f33, f34, f43, f44; 1438 index_type i, j, l, ii, jj, ll; 1439 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 1440 GFC_INTEGER_2 *t1; 1441 1442 a = abase; 1443 b = bbase; 1444 c = retarray->base_addr; 1445 1446 /* Parameter adjustments */ 1447 c_dim1 = rystride; 1448 c_offset = 1 + c_dim1; 1449 c -= c_offset; 1450 a_dim1 = aystride; 1451 a_offset = 1 + a_dim1; 1452 a -= a_offset; 1453 b_dim1 = bystride; 1454 b_offset = 1 + b_dim1; 1455 b -= b_offset; 1456 1457 /* Empty c first. */ 1458 for (j=1; j<=n; j++) 1459 for (i=1; i<=m; i++) 1460 c[i + j * c_dim1] = (GFC_INTEGER_2)0; 1461 1462 /* Early exit if possible */ 1463 if (m == 0 || n == 0 || k == 0) 1464 return; 1465 1466 /* Adjust size of t1 to what is needed. */ 1467 index_type t1_dim, a_sz; 1468 if (aystride == 1) 1469 a_sz = rystride; 1470 else 1471 a_sz = a_dim1; 1472 1473 t1_dim = a_sz * 256 + b_dim1; 1474 if (t1_dim > 65536) 1475 t1_dim = 65536; 1476 1477 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); 1478 1479 /* Start turning the crank. 
*/ 1480 i1 = n; 1481 for (jj = 1; jj <= i1; jj += 512) 1482 { 1483 /* Computing MIN */ 1484 i2 = 512; 1485 i3 = n - jj + 1; 1486 jsec = min(i2,i3); 1487 ujsec = jsec - jsec % 4; 1488 i2 = k; 1489 for (ll = 1; ll <= i2; ll += 256) 1490 { 1491 /* Computing MIN */ 1492 i3 = 256; 1493 i4 = k - ll + 1; 1494 lsec = min(i3,i4); 1495 ulsec = lsec - lsec % 2; 1496 1497 i3 = m; 1498 for (ii = 1; ii <= i3; ii += 256) 1499 { 1500 /* Computing MIN */ 1501 i4 = 256; 1502 i5 = m - ii + 1; 1503 isec = min(i4,i5); 1504 uisec = isec - isec % 2; 1505 i4 = ll + ulsec - 1; 1506 for (l = ll; l <= i4; l += 2) 1507 { 1508 i5 = ii + uisec - 1; 1509 for (i = ii; i <= i5; i += 2) 1510 { 1511 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 1512 a[i + l * a_dim1]; 1513 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 1514 a[i + (l + 1) * a_dim1]; 1515 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 1516 a[i + 1 + l * a_dim1]; 1517 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 1518 a[i + 1 + (l + 1) * a_dim1]; 1519 } 1520 if (uisec < isec) 1521 { 1522 t1[l - ll + 1 + (isec << 8) - 257] = 1523 a[ii + isec - 1 + l * a_dim1]; 1524 t1[l - ll + 2 + (isec << 8) - 257] = 1525 a[ii + isec - 1 + (l + 1) * a_dim1]; 1526 } 1527 } 1528 if (ulsec < lsec) 1529 { 1530 i4 = ii + isec - 1; 1531 for (i = ii; i<= i4; ++i) 1532 { 1533 t1[lsec + ((i - ii + 1) << 8) - 257] = 1534 a[i + (ll + lsec - 1) * a_dim1]; 1535 } 1536 } 1537 1538 uisec = isec - isec % 4; 1539 i4 = jj + ujsec - 1; 1540 for (j = jj; j <= i4; j += 4) 1541 { 1542 i5 = ii + uisec - 1; 1543 for (i = ii; i <= i5; i += 4) 1544 { 1545 f11 = c[i + j * c_dim1]; 1546 f21 = c[i + 1 + j * c_dim1]; 1547 f12 = c[i + (j + 1) * c_dim1]; 1548 f22 = c[i + 1 + (j + 1) * c_dim1]; 1549 f13 = c[i + (j + 2) * c_dim1]; 1550 f23 = c[i + 1 + (j + 2) * c_dim1]; 1551 f14 = c[i + (j + 3) * c_dim1]; 1552 f24 = c[i + 1 + (j + 3) * c_dim1]; 1553 f31 = c[i + 2 + j * c_dim1]; 1554 f41 = c[i + 3 + j * c_dim1]; 1555 f32 = c[i + 2 + (j + 1) * c_dim1]; 1556 f42 = c[i + 3 + (j + 1) * c_dim1]; 1557 f33 = c[i + 2 + (j + 2) * c_dim1]; 1558 f43 = c[i + 3 + (j + 2) * c_dim1]; 1559 f34 = c[i + 2 + (j + 3) * c_dim1]; 1560 f44 = c[i + 3 + (j + 3) * c_dim1]; 1561 i6 = ll + lsec - 1; 1562 for (l = ll; l <= i6; ++l) 1563 { 1564 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1565 * b[l + j * b_dim1]; 1566 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1567 * b[l + j * b_dim1]; 1568 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1569 * b[l + (j + 1) * b_dim1]; 1570 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1571 * b[l + (j + 1) * b_dim1]; 1572 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1573 * b[l + (j + 2) * b_dim1]; 1574 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1575 * b[l + (j + 2) * b_dim1]; 1576 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1577 * b[l + (j + 3) * b_dim1]; 1578 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1579 * b[l + (j + 3) * b_dim1]; 1580 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1581 * b[l + j * b_dim1]; 1582 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1583 * b[l + j * b_dim1]; 1584 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1585 * b[l + (j + 1) * b_dim1]; 1586 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1587 * b[l + (j + 1) * b_dim1]; 1588 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1589 * b[l + (j + 2) * b_dim1]; 1590 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1591 * b[l + (j + 2) * b_dim1]; 1592 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1593 * b[l + (j + 3) * b_dim1]; 1594 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 
257] 1595 * b[l + (j + 3) * b_dim1]; 1596 } 1597 c[i + j * c_dim1] = f11; 1598 c[i + 1 + j * c_dim1] = f21; 1599 c[i + (j + 1) * c_dim1] = f12; 1600 c[i + 1 + (j + 1) * c_dim1] = f22; 1601 c[i + (j + 2) * c_dim1] = f13; 1602 c[i + 1 + (j + 2) * c_dim1] = f23; 1603 c[i + (j + 3) * c_dim1] = f14; 1604 c[i + 1 + (j + 3) * c_dim1] = f24; 1605 c[i + 2 + j * c_dim1] = f31; 1606 c[i + 3 + j * c_dim1] = f41; 1607 c[i + 2 + (j + 1) * c_dim1] = f32; 1608 c[i + 3 + (j + 1) * c_dim1] = f42; 1609 c[i + 2 + (j + 2) * c_dim1] = f33; 1610 c[i + 3 + (j + 2) * c_dim1] = f43; 1611 c[i + 2 + (j + 3) * c_dim1] = f34; 1612 c[i + 3 + (j + 3) * c_dim1] = f44; 1613 } 1614 if (uisec < isec) 1615 { 1616 i5 = ii + isec - 1; 1617 for (i = ii + uisec; i <= i5; ++i) 1618 { 1619 f11 = c[i + j * c_dim1]; 1620 f12 = c[i + (j + 1) * c_dim1]; 1621 f13 = c[i + (j + 2) * c_dim1]; 1622 f14 = c[i + (j + 3) * c_dim1]; 1623 i6 = ll + lsec - 1; 1624 for (l = ll; l <= i6; ++l) 1625 { 1626 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1627 257] * b[l + j * b_dim1]; 1628 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1629 257] * b[l + (j + 1) * b_dim1]; 1630 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1631 257] * b[l + (j + 2) * b_dim1]; 1632 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1633 257] * b[l + (j + 3) * b_dim1]; 1634 } 1635 c[i + j * c_dim1] = f11; 1636 c[i + (j + 1) * c_dim1] = f12; 1637 c[i + (j + 2) * c_dim1] = f13; 1638 c[i + (j + 3) * c_dim1] = f14; 1639 } 1640 } 1641 } 1642 if (ujsec < jsec) 1643 { 1644 i4 = jj + jsec - 1; 1645 for (j = jj + ujsec; j <= i4; ++j) 1646 { 1647 i5 = ii + uisec - 1; 1648 for (i = ii; i <= i5; i += 4) 1649 { 1650 f11 = c[i + j * c_dim1]; 1651 f21 = c[i + 1 + j * c_dim1]; 1652 f31 = c[i + 2 + j * c_dim1]; 1653 f41 = c[i + 3 + j * c_dim1]; 1654 i6 = ll + lsec - 1; 1655 for (l = ll; l <= i6; ++l) 1656 { 1657 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1658 257] * b[l + j * b_dim1]; 1659 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1660 257] * b[l + j * b_dim1]; 1661 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1662 257] * b[l + j * b_dim1]; 1663 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1664 257] * b[l + j * b_dim1]; 1665 } 1666 c[i + j * c_dim1] = f11; 1667 c[i + 1 + j * c_dim1] = f21; 1668 c[i + 2 + j * c_dim1] = f31; 1669 c[i + 3 + j * c_dim1] = f41; 1670 } 1671 i5 = ii + isec - 1; 1672 for (i = ii + uisec; i <= i5; ++i) 1673 { 1674 f11 = c[i + j * c_dim1]; 1675 i6 = ll + lsec - 1; 1676 for (l = ll; l <= i6; ++l) 1677 { 1678 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1679 257] * b[l + j * b_dim1]; 1680 } 1681 c[i + j * c_dim1] = f11; 1682 } 1683 } 1684 } 1685 } 1686 } 1687 } 1688 free(t1); 1689 return; 1690 } 1691 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1692 { 1693 if (GFC_DESCRIPTOR_RANK (a) != 1) 1694 { 1695 const GFC_INTEGER_2 *restrict abase_x; 1696 const GFC_INTEGER_2 *restrict bbase_y; 1697 GFC_INTEGER_2 *restrict dest_y; 1698 GFC_INTEGER_2 s; 1699 1700 for (y = 0; y < ycount; y++) 1701 { 1702 bbase_y = &bbase[y*bystride]; 1703 dest_y = &dest[y*rystride]; 1704 for (x = 0; x < xcount; x++) 1705 { 1706 abase_x = &abase[x*axstride]; 1707 s = (GFC_INTEGER_2) 0; 1708 for (n = 0; n < count; n++) 1709 s += abase_x[n] * bbase_y[n]; 1710 dest_y[x] = s; 1711 } 1712 } 1713 } 1714 else 1715 { 1716 const GFC_INTEGER_2 *restrict bbase_y; 1717 GFC_INTEGER_2 s; 1718 1719 for (y = 0; y < ycount; y++) 1720 { 1721 bbase_y = &bbase[y*bystride]; 1722 s = (GFC_INTEGER_2) 0; 1723 for (n = 0; n < count; n++) 1724 s += abase[n*axstride] * bbase_y[n]; 1725 dest[y*rystride] = s; 1726 } 
1727 } 1728 } 1729 else if (axstride < aystride) 1730 { 1731 for (y = 0; y < ycount; y++) 1732 for (x = 0; x < xcount; x++) 1733 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; 1734 1735 for (y = 0; y < ycount; y++) 1736 for (n = 0; n < count; n++) 1737 for (x = 0; x < xcount; x++) 1738 /* dest[x,y] += a[x,n] * b[n,y] */ 1739 dest[x*rxstride + y*rystride] += 1740 abase[x*axstride + n*aystride] * 1741 bbase[n*bxstride + y*bystride]; 1742 } 1743 else if (GFC_DESCRIPTOR_RANK (a) == 1) 1744 { 1745 const GFC_INTEGER_2 *restrict bbase_y; 1746 GFC_INTEGER_2 s; 1747 1748 for (y = 0; y < ycount; y++) 1749 { 1750 bbase_y = &bbase[y*bystride]; 1751 s = (GFC_INTEGER_2) 0; 1752 for (n = 0; n < count; n++) 1753 s += abase[n*axstride] * bbase_y[n*bxstride]; 1754 dest[y*rxstride] = s; 1755 } 1756 } 1757 else 1758 { 1759 const GFC_INTEGER_2 *restrict abase_x; 1760 const GFC_INTEGER_2 *restrict bbase_y; 1761 GFC_INTEGER_2 *restrict dest_y; 1762 GFC_INTEGER_2 s; 1763 1764 for (y = 0; y < ycount; y++) 1765 { 1766 bbase_y = &bbase[y*bystride]; 1767 dest_y = &dest[y*rystride]; 1768 for (x = 0; x < xcount; x++) 1769 { 1770 abase_x = &abase[x*axstride]; 1771 s = (GFC_INTEGER_2) 0; 1772 for (n = 0; n < count; n++) 1773 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 1774 dest_y[x*rxstride] = s; 1775 } 1776 } 1777 } 1778 } 1779 #undef POW3 1780 #undef min 1781 #undef max 1782 1783 #endif /* HAVE_AVX512F */ 1784 1785 /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */ 1786 1787 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) 1788 void 1789 matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray, 1790 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 1791 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); 1792 internal_proto(matmul_i2_avx128_fma3); 1793 #endif 1794 1795 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) 1796 void 1797 matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray, 1798 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 1799 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); 1800 internal_proto(matmul_i2_avx128_fma4); 1801 #endif 1802 1803 /* Function to fall back to if there is no special processor-specific version. */ 1804 static void 1805 matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, 1806 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 1807 int blas_limit, blas_call gemm) 1808 { 1809 const GFC_INTEGER_2 * restrict abase; 1810 const GFC_INTEGER_2 * restrict bbase; 1811 GFC_INTEGER_2 * restrict dest; 1812 1813 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 1814 index_type x, y, n, count, xcount, ycount; 1815 1816 assert (GFC_DESCRIPTOR_RANK (a) == 2 1817 || GFC_DESCRIPTOR_RANK (b) == 2); 1818 1819 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 1820 1821 Either A or B (but not both) can be rank 1: 1822 1823 o One-dimensional argument A is implicitly treated as a row matrix 1824 dimensioned [1,count], so xcount=1. 1825 1826 o One-dimensional argument B is implicitly treated as a column matrix 1827 dimensioned [count, 1], so ycount=1. 
1828 */ 1829 1830 if (retarray->base_addr == NULL) 1831 { 1832 if (GFC_DESCRIPTOR_RANK (a) == 1) 1833 { 1834 GFC_DIMENSION_SET(retarray->dim[0], 0, 1835 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1836 } 1837 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1838 { 1839 GFC_DIMENSION_SET(retarray->dim[0], 0, 1840 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1841 } 1842 else 1843 { 1844 GFC_DIMENSION_SET(retarray->dim[0], 0, 1845 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1846 1847 GFC_DIMENSION_SET(retarray->dim[1], 0, 1848 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1849 GFC_DESCRIPTOR_EXTENT(retarray,0)); 1850 } 1851 1852 retarray->base_addr 1853 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); 1854 retarray->offset = 0; 1855 } 1856 else if (unlikely (compile_options.bounds_check)) 1857 { 1858 index_type ret_extent, arg_extent; 1859 1860 if (GFC_DESCRIPTOR_RANK (a) == 1) 1861 { 1862 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1863 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1864 if (arg_extent != ret_extent) 1865 runtime_error ("Array bound mismatch for dimension 1 of " 1866 "array (%ld/%ld) ", 1867 (long int) ret_extent, (long int) arg_extent); 1868 } 1869 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1870 { 1871 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1872 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1873 if (arg_extent != ret_extent) 1874 runtime_error ("Array bound mismatch for dimension 1 of " 1875 "array (%ld/%ld) ", 1876 (long int) ret_extent, (long int) arg_extent); 1877 } 1878 else 1879 { 1880 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1881 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1882 if (arg_extent != ret_extent) 1883 runtime_error ("Array bound mismatch for dimension 1 of " 1884 "array (%ld/%ld) ", 1885 (long int) ret_extent, (long int) arg_extent); 1886 1887 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1888 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1889 if (arg_extent != ret_extent) 1890 runtime_error ("Array bound mismatch for dimension 2 of " 1891 "array (%ld/%ld) ", 1892 (long int) ret_extent, (long int) arg_extent); 1893 } 1894 } 1895 1896 1897 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1898 { 1899 /* One-dimensional result may be addressed in the code below 1900 either as a row or a column matrix. We want both cases to 1901 work. */ 1902 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1903 } 1904 else 1905 { 1906 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1907 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1908 } 1909 1910 1911 if (GFC_DESCRIPTOR_RANK (a) == 1) 1912 { 1913 /* Treat it as a a row matrix A[1,count]. */ 1914 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1915 aystride = 1; 1916 1917 xcount = 1; 1918 count = GFC_DESCRIPTOR_EXTENT(a,0); 1919 } 1920 else 1921 { 1922 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1923 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1924 1925 count = GFC_DESCRIPTOR_EXTENT(a,1); 1926 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1927 } 1928 1929 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1930 { 1931 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1932 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1933 "in dimension 1: is %ld, should be %ld", 1934 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1935 } 1936 1937 if (GFC_DESCRIPTOR_RANK (b) == 1) 1938 { 1939 /* Treat it as a column matrix B[count,1] */ 1940 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1941 1942 /* bystride should never be used for 1-dimensional b. 1943 The value is only used for calculation of the 1944 memory by the buffer. 
*/ 1945 bystride = 256; 1946 ycount = 1; 1947 } 1948 else 1949 { 1950 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1951 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1952 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1953 } 1954 1955 abase = a->base_addr; 1956 bbase = b->base_addr; 1957 dest = retarray->base_addr; 1958 1959 /* Now that everything is set up, we perform the multiplication 1960 itself. */ 1961 1962 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1963 #define min(a,b) ((a) <= (b) ? (a) : (b)) 1964 #define max(a,b) ((a) >= (b) ? (a) : (b)) 1965 1966 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1967 && (bxstride == 1 || bystride == 1) 1968 && (((float) xcount) * ((float) ycount) * ((float) count) 1969 > POW3(blas_limit))) 1970 { 1971 const int m = xcount, n = ycount, k = count, ldc = rystride; 1972 const GFC_INTEGER_2 one = 1, zero = 0; 1973 const int lda = (axstride == 1) ? aystride : axstride, 1974 ldb = (bxstride == 1) ? bystride : bxstride; 1975 1976 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1977 { 1978 assert (gemm != NULL); 1979 const char *transa, *transb; 1980 if (try_blas & 2) 1981 transa = "C"; 1982 else 1983 transa = axstride == 1 ? "N" : "T"; 1984 1985 if (try_blas & 4) 1986 transb = "C"; 1987 else 1988 transb = bxstride == 1 ? "N" : "T"; 1989 1990 gemm (transa, transb , &m, 1991 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1992 &ldc, 1, 1); 1993 return; 1994 } 1995 } 1996 1997 if (rxstride == 1 && axstride == 1 && bxstride == 1) 1998 { 1999 /* This block of code implements a tuned matmul, derived from 2000 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2001 2002 Bo Kagstrom and Per Ling 2003 Department of Computing Science 2004 Umea University 2005 S-901 87 Umea, Sweden 2006 2007 from netlib.org, translated to C, and modified for matmul.m4. */ 2008 2009 const GFC_INTEGER_2 *a, *b; 2010 GFC_INTEGER_2 *c; 2011 const index_type m = xcount, n = ycount, k = count; 2012 2013 /* System generated locals */ 2014 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2015 i1, i2, i3, i4, i5, i6; 2016 2017 /* Local variables */ 2018 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, 2019 f13, f14, f23, f24, f33, f34, f43, f44; 2020 index_type i, j, l, ii, jj, ll; 2021 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2022 GFC_INTEGER_2 *t1; 2023 2024 a = abase; 2025 b = bbase; 2026 c = retarray->base_addr; 2027 2028 /* Parameter adjustments */ 2029 c_dim1 = rystride; 2030 c_offset = 1 + c_dim1; 2031 c -= c_offset; 2032 a_dim1 = aystride; 2033 a_offset = 1 + a_dim1; 2034 a -= a_offset; 2035 b_dim1 = bystride; 2036 b_offset = 1 + b_dim1; 2037 b -= b_offset; 2038 2039 /* Empty c first. */ 2040 for (j=1; j<=n; j++) 2041 for (i=1; i<=m; i++) 2042 c[i + j * c_dim1] = (GFC_INTEGER_2)0; 2043 2044 /* Early exit if possible */ 2045 if (m == 0 || n == 0 || k == 0) 2046 return; 2047 2048 /* Adjust size of t1 to what is needed. */ 2049 index_type t1_dim, a_sz; 2050 if (aystride == 1) 2051 a_sz = rystride; 2052 else 2053 a_sz = a_dim1; 2054 2055 t1_dim = a_sz * 256 + b_dim1; 2056 if (t1_dim > 65536) 2057 t1_dim = 65536; 2058 2059 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); 2060 2061 /* Start turning the crank. 
*/ 2062 i1 = n; 2063 for (jj = 1; jj <= i1; jj += 512) 2064 { 2065 /* Computing MIN */ 2066 i2 = 512; 2067 i3 = n - jj + 1; 2068 jsec = min(i2,i3); 2069 ujsec = jsec - jsec % 4; 2070 i2 = k; 2071 for (ll = 1; ll <= i2; ll += 256) 2072 { 2073 /* Computing MIN */ 2074 i3 = 256; 2075 i4 = k - ll + 1; 2076 lsec = min(i3,i4); 2077 ulsec = lsec - lsec % 2; 2078 2079 i3 = m; 2080 for (ii = 1; ii <= i3; ii += 256) 2081 { 2082 /* Computing MIN */ 2083 i4 = 256; 2084 i5 = m - ii + 1; 2085 isec = min(i4,i5); 2086 uisec = isec - isec % 2; 2087 i4 = ll + ulsec - 1; 2088 for (l = ll; l <= i4; l += 2) 2089 { 2090 i5 = ii + uisec - 1; 2091 for (i = ii; i <= i5; i += 2) 2092 { 2093 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2094 a[i + l * a_dim1]; 2095 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2096 a[i + (l + 1) * a_dim1]; 2097 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2098 a[i + 1 + l * a_dim1]; 2099 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2100 a[i + 1 + (l + 1) * a_dim1]; 2101 } 2102 if (uisec < isec) 2103 { 2104 t1[l - ll + 1 + (isec << 8) - 257] = 2105 a[ii + isec - 1 + l * a_dim1]; 2106 t1[l - ll + 2 + (isec << 8) - 257] = 2107 a[ii + isec - 1 + (l + 1) * a_dim1]; 2108 } 2109 } 2110 if (ulsec < lsec) 2111 { 2112 i4 = ii + isec - 1; 2113 for (i = ii; i<= i4; ++i) 2114 { 2115 t1[lsec + ((i - ii + 1) << 8) - 257] = 2116 a[i + (ll + lsec - 1) * a_dim1]; 2117 } 2118 } 2119 2120 uisec = isec - isec % 4; 2121 i4 = jj + ujsec - 1; 2122 for (j = jj; j <= i4; j += 4) 2123 { 2124 i5 = ii + uisec - 1; 2125 for (i = ii; i <= i5; i += 4) 2126 { 2127 f11 = c[i + j * c_dim1]; 2128 f21 = c[i + 1 + j * c_dim1]; 2129 f12 = c[i + (j + 1) * c_dim1]; 2130 f22 = c[i + 1 + (j + 1) * c_dim1]; 2131 f13 = c[i + (j + 2) * c_dim1]; 2132 f23 = c[i + 1 + (j + 2) * c_dim1]; 2133 f14 = c[i + (j + 3) * c_dim1]; 2134 f24 = c[i + 1 + (j + 3) * c_dim1]; 2135 f31 = c[i + 2 + j * c_dim1]; 2136 f41 = c[i + 3 + j * c_dim1]; 2137 f32 = c[i + 2 + (j + 1) * c_dim1]; 2138 f42 = c[i + 3 + (j + 1) * c_dim1]; 2139 f33 = c[i + 2 + (j + 2) * c_dim1]; 2140 f43 = c[i + 3 + (j + 2) * c_dim1]; 2141 f34 = c[i + 2 + (j + 3) * c_dim1]; 2142 f44 = c[i + 3 + (j + 3) * c_dim1]; 2143 i6 = ll + lsec - 1; 2144 for (l = ll; l <= i6; ++l) 2145 { 2146 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2147 * b[l + j * b_dim1]; 2148 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2149 * b[l + j * b_dim1]; 2150 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2151 * b[l + (j + 1) * b_dim1]; 2152 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2153 * b[l + (j + 1) * b_dim1]; 2154 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2155 * b[l + (j + 2) * b_dim1]; 2156 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2157 * b[l + (j + 2) * b_dim1]; 2158 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2159 * b[l + (j + 3) * b_dim1]; 2160 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2161 * b[l + (j + 3) * b_dim1]; 2162 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2163 * b[l + j * b_dim1]; 2164 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2165 * b[l + j * b_dim1]; 2166 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2167 * b[l + (j + 1) * b_dim1]; 2168 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2169 * b[l + (j + 1) * b_dim1]; 2170 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2171 * b[l + (j + 2) * b_dim1]; 2172 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2173 * b[l + (j + 2) * b_dim1]; 2174 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2175 * b[l + (j + 3) * b_dim1]; 2176 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 
257] 2177 * b[l + (j + 3) * b_dim1]; 2178 } 2179 c[i + j * c_dim1] = f11; 2180 c[i + 1 + j * c_dim1] = f21; 2181 c[i + (j + 1) * c_dim1] = f12; 2182 c[i + 1 + (j + 1) * c_dim1] = f22; 2183 c[i + (j + 2) * c_dim1] = f13; 2184 c[i + 1 + (j + 2) * c_dim1] = f23; 2185 c[i + (j + 3) * c_dim1] = f14; 2186 c[i + 1 + (j + 3) * c_dim1] = f24; 2187 c[i + 2 + j * c_dim1] = f31; 2188 c[i + 3 + j * c_dim1] = f41; 2189 c[i + 2 + (j + 1) * c_dim1] = f32; 2190 c[i + 3 + (j + 1) * c_dim1] = f42; 2191 c[i + 2 + (j + 2) * c_dim1] = f33; 2192 c[i + 3 + (j + 2) * c_dim1] = f43; 2193 c[i + 2 + (j + 3) * c_dim1] = f34; 2194 c[i + 3 + (j + 3) * c_dim1] = f44; 2195 } 2196 if (uisec < isec) 2197 { 2198 i5 = ii + isec - 1; 2199 for (i = ii + uisec; i <= i5; ++i) 2200 { 2201 f11 = c[i + j * c_dim1]; 2202 f12 = c[i + (j + 1) * c_dim1]; 2203 f13 = c[i + (j + 2) * c_dim1]; 2204 f14 = c[i + (j + 3) * c_dim1]; 2205 i6 = ll + lsec - 1; 2206 for (l = ll; l <= i6; ++l) 2207 { 2208 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2209 257] * b[l + j * b_dim1]; 2210 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2211 257] * b[l + (j + 1) * b_dim1]; 2212 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2213 257] * b[l + (j + 2) * b_dim1]; 2214 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2215 257] * b[l + (j + 3) * b_dim1]; 2216 } 2217 c[i + j * c_dim1] = f11; 2218 c[i + (j + 1) * c_dim1] = f12; 2219 c[i + (j + 2) * c_dim1] = f13; 2220 c[i + (j + 3) * c_dim1] = f14; 2221 } 2222 } 2223 } 2224 if (ujsec < jsec) 2225 { 2226 i4 = jj + jsec - 1; 2227 for (j = jj + ujsec; j <= i4; ++j) 2228 { 2229 i5 = ii + uisec - 1; 2230 for (i = ii; i <= i5; i += 4) 2231 { 2232 f11 = c[i + j * c_dim1]; 2233 f21 = c[i + 1 + j * c_dim1]; 2234 f31 = c[i + 2 + j * c_dim1]; 2235 f41 = c[i + 3 + j * c_dim1]; 2236 i6 = ll + lsec - 1; 2237 for (l = ll; l <= i6; ++l) 2238 { 2239 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2240 257] * b[l + j * b_dim1]; 2241 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2242 257] * b[l + j * b_dim1]; 2243 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2244 257] * b[l + j * b_dim1]; 2245 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2246 257] * b[l + j * b_dim1]; 2247 } 2248 c[i + j * c_dim1] = f11; 2249 c[i + 1 + j * c_dim1] = f21; 2250 c[i + 2 + j * c_dim1] = f31; 2251 c[i + 3 + j * c_dim1] = f41; 2252 } 2253 i5 = ii + isec - 1; 2254 for (i = ii + uisec; i <= i5; ++i) 2255 { 2256 f11 = c[i + j * c_dim1]; 2257 i6 = ll + lsec - 1; 2258 for (l = ll; l <= i6; ++l) 2259 { 2260 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2261 257] * b[l + j * b_dim1]; 2262 } 2263 c[i + j * c_dim1] = f11; 2264 } 2265 } 2266 } 2267 } 2268 } 2269 } 2270 free(t1); 2271 return; 2272 } 2273 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2274 { 2275 if (GFC_DESCRIPTOR_RANK (a) != 1) 2276 { 2277 const GFC_INTEGER_2 *restrict abase_x; 2278 const GFC_INTEGER_2 *restrict bbase_y; 2279 GFC_INTEGER_2 *restrict dest_y; 2280 GFC_INTEGER_2 s; 2281 2282 for (y = 0; y < ycount; y++) 2283 { 2284 bbase_y = &bbase[y*bystride]; 2285 dest_y = &dest[y*rystride]; 2286 for (x = 0; x < xcount; x++) 2287 { 2288 abase_x = &abase[x*axstride]; 2289 s = (GFC_INTEGER_2) 0; 2290 for (n = 0; n < count; n++) 2291 s += abase_x[n] * bbase_y[n]; 2292 dest_y[x] = s; 2293 } 2294 } 2295 } 2296 else 2297 { 2298 const GFC_INTEGER_2 *restrict bbase_y; 2299 GFC_INTEGER_2 s; 2300 2301 for (y = 0; y < ycount; y++) 2302 { 2303 bbase_y = &bbase[y*bystride]; 2304 s = (GFC_INTEGER_2) 0; 2305 for (n = 0; n < count; n++) 2306 s += abase[n*axstride] * bbase_y[n]; 2307 dest[y*rystride] = s; 2308 } 
2309 } 2310 } 2311 else if (axstride < aystride) 2312 { 2313 for (y = 0; y < ycount; y++) 2314 for (x = 0; x < xcount; x++) 2315 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; 2316 2317 for (y = 0; y < ycount; y++) 2318 for (n = 0; n < count; n++) 2319 for (x = 0; x < xcount; x++) 2320 /* dest[x,y] += a[x,n] * b[n,y] */ 2321 dest[x*rxstride + y*rystride] += 2322 abase[x*axstride + n*aystride] * 2323 bbase[n*bxstride + y*bystride]; 2324 } 2325 else if (GFC_DESCRIPTOR_RANK (a) == 1) 2326 { 2327 const GFC_INTEGER_2 *restrict bbase_y; 2328 GFC_INTEGER_2 s; 2329 2330 for (y = 0; y < ycount; y++) 2331 { 2332 bbase_y = &bbase[y*bystride]; 2333 s = (GFC_INTEGER_2) 0; 2334 for (n = 0; n < count; n++) 2335 s += abase[n*axstride] * bbase_y[n*bxstride]; 2336 dest[y*rxstride] = s; 2337 } 2338 } 2339 else 2340 { 2341 const GFC_INTEGER_2 *restrict abase_x; 2342 const GFC_INTEGER_2 *restrict bbase_y; 2343 GFC_INTEGER_2 *restrict dest_y; 2344 GFC_INTEGER_2 s; 2345 2346 for (y = 0; y < ycount; y++) 2347 { 2348 bbase_y = &bbase[y*bystride]; 2349 dest_y = &dest[y*rystride]; 2350 for (x = 0; x < xcount; x++) 2351 { 2352 abase_x = &abase[x*axstride]; 2353 s = (GFC_INTEGER_2) 0; 2354 for (n = 0; n < count; n++) 2355 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 2356 dest_y[x*rxstride] = s; 2357 } 2358 } 2359 } 2360 } 2361 #undef POW3 2362 #undef min 2363 #undef max 2364 2365 2366 /* Compiling main function, with selection code for the processor. */ 2367 2368 /* Currently, this is i386 only. Adjust for other architectures. */ 2369 2370 #include <config/i386/cpuinfo.h> 2371 void matmul_i2 (gfc_array_i2 * const restrict retarray, 2372 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 2373 int blas_limit, blas_call gemm) 2374 { 2375 static void (*matmul_p) (gfc_array_i2 * const restrict retarray, 2376 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 2377 int blas_limit, blas_call gemm); 2378 2379 void (*matmul_fn) (gfc_array_i2 * const restrict retarray, 2380 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 2381 int blas_limit, blas_call gemm); 2382 2383 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED); 2384 if (matmul_fn == NULL) 2385 { 2386 matmul_fn = matmul_i2_vanilla; 2387 if (__cpu_model.__cpu_vendor == VENDOR_INTEL) 2388 { 2389 /* Run down the available processors in order of preference. 
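             The result of this selection is cached: the chosen pointer
             is published to the static matmul_p with a relaxed atomic
             store at the store label below, and later calls pick it up
             through the relaxed atomic load above, so the feature
             tests normally run only on the first call.  If no listed
             feature is present, or the vendor is neither Intel nor
             AMD, matmul_fn keeps the matmul_i2_vanilla value assigned
             above.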
*/ 2390 #ifdef HAVE_AVX512F 2391 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) 2392 { 2393 matmul_fn = matmul_i2_avx512f; 2394 goto store; 2395 } 2396 2397 #endif /* HAVE_AVX512F */ 2398 2399 #ifdef HAVE_AVX2 2400 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) 2401 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) 2402 { 2403 matmul_fn = matmul_i2_avx2; 2404 goto store; 2405 } 2406 2407 #endif 2408 2409 #ifdef HAVE_AVX 2410 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) 2411 { 2412 matmul_fn = matmul_i2_avx; 2413 goto store; 2414 } 2415 #endif /* HAVE_AVX */ 2416 } 2417 else if (__cpu_model.__cpu_vendor == VENDOR_AMD) 2418 { 2419 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) 2420 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) 2421 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) 2422 { 2423 matmul_fn = matmul_i2_avx128_fma3; 2424 goto store; 2425 } 2426 #endif 2427 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) 2428 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) 2429 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4))) 2430 { 2431 matmul_fn = matmul_i2_avx128_fma4; 2432 goto store; 2433 } 2434 #endif 2435 2436 } 2437 store: 2438 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED); 2439 } 2440 2441 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm); 2442 } 2443 2444 #else /* Just the vanilla function. */ 2445 2446 void 2447 matmul_i2 (gfc_array_i2 * const restrict retarray, 2448 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 2449 int blas_limit, blas_call gemm) 2450 { 2451 const GFC_INTEGER_2 * restrict abase; 2452 const GFC_INTEGER_2 * restrict bbase; 2453 GFC_INTEGER_2 * restrict dest; 2454 2455 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 2456 index_type x, y, n, count, xcount, ycount; 2457 2458 assert (GFC_DESCRIPTOR_RANK (a) == 2 2459 || GFC_DESCRIPTOR_RANK (b) == 2); 2460 2461 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 2462 2463 Either A or B (but not both) can be rank 1: 2464 2465 o One-dimensional argument A is implicitly treated as a row matrix 2466 dimensioned [1,count], so xcount=1. 2467 2468 o One-dimensional argument B is implicitly treated as a column matrix 2469 dimensioned [count, 1], so ycount=1. 
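     As an illustration only (these declarations are not part of this
     file), given

       INTEGER(KIND=2) :: A(3,4), B(4,5), V(4)

     MATMUL(V, B) is the rank-1 A case (xcount=1, count=4, result
     extent 5), and MATMUL(A, V) is the rank-1 B case (ycount=1,
     count=4, result extent 3).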
  */

  if (retarray->base_addr == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
        }
      else
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

          GFC_DIMENSION_SET(retarray->dim[1], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1,
                            GFC_DESCRIPTOR_EXTENT(retarray,0));
        }

      retarray->base_addr
        = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 1 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 1 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 1 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);

          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
          if (arg_extent != ret_extent)
            runtime_error ("Array bound mismatch for dimension 2 of "
                           "array (%ld/%ld) ",
                           (long int) ret_extent, (long int) arg_extent);
        }
    }


  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }


  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
        runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
                       "in dimension 1: is %ld, should be %ld",
                       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1].  */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
         The value is only used for calculation of the
         memory needed for the buffer.
*/ 2587 bystride = 256; 2588 ycount = 1; 2589 } 2590 else 2591 { 2592 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 2593 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 2594 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 2595 } 2596 2597 abase = a->base_addr; 2598 bbase = b->base_addr; 2599 dest = retarray->base_addr; 2600 2601 /* Now that everything is set up, we perform the multiplication 2602 itself. */ 2603 2604 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 2605 #define min(a,b) ((a) <= (b) ? (a) : (b)) 2606 #define max(a,b) ((a) >= (b) ? (a) : (b)) 2607 2608 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 2609 && (bxstride == 1 || bystride == 1) 2610 && (((float) xcount) * ((float) ycount) * ((float) count) 2611 > POW3(blas_limit))) 2612 { 2613 const int m = xcount, n = ycount, k = count, ldc = rystride; 2614 const GFC_INTEGER_2 one = 1, zero = 0; 2615 const int lda = (axstride == 1) ? aystride : axstride, 2616 ldb = (bxstride == 1) ? bystride : bxstride; 2617 2618 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 2619 { 2620 assert (gemm != NULL); 2621 const char *transa, *transb; 2622 if (try_blas & 2) 2623 transa = "C"; 2624 else 2625 transa = axstride == 1 ? "N" : "T"; 2626 2627 if (try_blas & 4) 2628 transb = "C"; 2629 else 2630 transb = bxstride == 1 ? "N" : "T"; 2631 2632 gemm (transa, transb , &m, 2633 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 2634 &ldc, 1, 1); 2635 return; 2636 } 2637 } 2638 2639 if (rxstride == 1 && axstride == 1 && bxstride == 1) 2640 { 2641 /* This block of code implements a tuned matmul, derived from 2642 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2643 2644 Bo Kagstrom and Per Ling 2645 Department of Computing Science 2646 Umea University 2647 S-901 87 Umea, Sweden 2648 2649 from netlib.org, translated to C, and modified for matmul.m4. */ 2650 2651 const GFC_INTEGER_2 *a, *b; 2652 GFC_INTEGER_2 *c; 2653 const index_type m = xcount, n = ycount, k = count; 2654 2655 /* System generated locals */ 2656 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2657 i1, i2, i3, i4, i5, i6; 2658 2659 /* Local variables */ 2660 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, 2661 f13, f14, f23, f24, f33, f34, f43, f44; 2662 index_type i, j, l, ii, jj, ll; 2663 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2664 GFC_INTEGER_2 *t1; 2665 2666 a = abase; 2667 b = bbase; 2668 c = retarray->base_addr; 2669 2670 /* Parameter adjustments */ 2671 c_dim1 = rystride; 2672 c_offset = 1 + c_dim1; 2673 c -= c_offset; 2674 a_dim1 = aystride; 2675 a_offset = 1 + a_dim1; 2676 a -= a_offset; 2677 b_dim1 = bystride; 2678 b_offset = 1 + b_dim1; 2679 b -= b_offset; 2680 2681 /* Empty c first. */ 2682 for (j=1; j<=n; j++) 2683 for (i=1; i<=m; i++) 2684 c[i + j * c_dim1] = (GFC_INTEGER_2)0; 2685 2686 /* Early exit if possible */ 2687 if (m == 0 || n == 0 || k == 0) 2688 return; 2689 2690 /* Adjust size of t1 to what is needed. */ 2691 index_type t1_dim, a_sz; 2692 if (aystride == 1) 2693 a_sz = rystride; 2694 else 2695 a_sz = a_dim1; 2696 2697 t1_dim = a_sz * 256 + b_dim1; 2698 if (t1_dim > 65536) 2699 t1_dim = 65536; 2700 2701 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); 2702 2703 /* Start turning the crank. 
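         The f11..f44 temporaries below hold a 4x4 sub-block of C, so a
         single pass of the l loop over the current K section updates
         sixteen accumulators at once.  The outer jj/ll/ii loops walk C
         and A in panels of at most 512 columns, 256 inner-dimension
         entries and 256 rows, and the shorter loops that follow the
         main kernel handle the rows and columns left over when a
         section size is not a multiple of 4.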
*/ 2704 i1 = n; 2705 for (jj = 1; jj <= i1; jj += 512) 2706 { 2707 /* Computing MIN */ 2708 i2 = 512; 2709 i3 = n - jj + 1; 2710 jsec = min(i2,i3); 2711 ujsec = jsec - jsec % 4; 2712 i2 = k; 2713 for (ll = 1; ll <= i2; ll += 256) 2714 { 2715 /* Computing MIN */ 2716 i3 = 256; 2717 i4 = k - ll + 1; 2718 lsec = min(i3,i4); 2719 ulsec = lsec - lsec % 2; 2720 2721 i3 = m; 2722 for (ii = 1; ii <= i3; ii += 256) 2723 { 2724 /* Computing MIN */ 2725 i4 = 256; 2726 i5 = m - ii + 1; 2727 isec = min(i4,i5); 2728 uisec = isec - isec % 2; 2729 i4 = ll + ulsec - 1; 2730 for (l = ll; l <= i4; l += 2) 2731 { 2732 i5 = ii + uisec - 1; 2733 for (i = ii; i <= i5; i += 2) 2734 { 2735 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2736 a[i + l * a_dim1]; 2737 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2738 a[i + (l + 1) * a_dim1]; 2739 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2740 a[i + 1 + l * a_dim1]; 2741 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2742 a[i + 1 + (l + 1) * a_dim1]; 2743 } 2744 if (uisec < isec) 2745 { 2746 t1[l - ll + 1 + (isec << 8) - 257] = 2747 a[ii + isec - 1 + l * a_dim1]; 2748 t1[l - ll + 2 + (isec << 8) - 257] = 2749 a[ii + isec - 1 + (l + 1) * a_dim1]; 2750 } 2751 } 2752 if (ulsec < lsec) 2753 { 2754 i4 = ii + isec - 1; 2755 for (i = ii; i<= i4; ++i) 2756 { 2757 t1[lsec + ((i - ii + 1) << 8) - 257] = 2758 a[i + (ll + lsec - 1) * a_dim1]; 2759 } 2760 } 2761 2762 uisec = isec - isec % 4; 2763 i4 = jj + ujsec - 1; 2764 for (j = jj; j <= i4; j += 4) 2765 { 2766 i5 = ii + uisec - 1; 2767 for (i = ii; i <= i5; i += 4) 2768 { 2769 f11 = c[i + j * c_dim1]; 2770 f21 = c[i + 1 + j * c_dim1]; 2771 f12 = c[i + (j + 1) * c_dim1]; 2772 f22 = c[i + 1 + (j + 1) * c_dim1]; 2773 f13 = c[i + (j + 2) * c_dim1]; 2774 f23 = c[i + 1 + (j + 2) * c_dim1]; 2775 f14 = c[i + (j + 3) * c_dim1]; 2776 f24 = c[i + 1 + (j + 3) * c_dim1]; 2777 f31 = c[i + 2 + j * c_dim1]; 2778 f41 = c[i + 3 + j * c_dim1]; 2779 f32 = c[i + 2 + (j + 1) * c_dim1]; 2780 f42 = c[i + 3 + (j + 1) * c_dim1]; 2781 f33 = c[i + 2 + (j + 2) * c_dim1]; 2782 f43 = c[i + 3 + (j + 2) * c_dim1]; 2783 f34 = c[i + 2 + (j + 3) * c_dim1]; 2784 f44 = c[i + 3 + (j + 3) * c_dim1]; 2785 i6 = ll + lsec - 1; 2786 for (l = ll; l <= i6; ++l) 2787 { 2788 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2789 * b[l + j * b_dim1]; 2790 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2791 * b[l + j * b_dim1]; 2792 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2793 * b[l + (j + 1) * b_dim1]; 2794 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2795 * b[l + (j + 1) * b_dim1]; 2796 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2797 * b[l + (j + 2) * b_dim1]; 2798 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2799 * b[l + (j + 2) * b_dim1]; 2800 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2801 * b[l + (j + 3) * b_dim1]; 2802 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2803 * b[l + (j + 3) * b_dim1]; 2804 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2805 * b[l + j * b_dim1]; 2806 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2807 * b[l + j * b_dim1]; 2808 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2809 * b[l + (j + 1) * b_dim1]; 2810 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2811 * b[l + (j + 1) * b_dim1]; 2812 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2813 * b[l + (j + 2) * b_dim1]; 2814 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2815 * b[l + (j + 2) * b_dim1]; 2816 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2817 * b[l + (j + 3) * b_dim1]; 2818 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 
257] 2819 * b[l + (j + 3) * b_dim1]; 2820 } 2821 c[i + j * c_dim1] = f11; 2822 c[i + 1 + j * c_dim1] = f21; 2823 c[i + (j + 1) * c_dim1] = f12; 2824 c[i + 1 + (j + 1) * c_dim1] = f22; 2825 c[i + (j + 2) * c_dim1] = f13; 2826 c[i + 1 + (j + 2) * c_dim1] = f23; 2827 c[i + (j + 3) * c_dim1] = f14; 2828 c[i + 1 + (j + 3) * c_dim1] = f24; 2829 c[i + 2 + j * c_dim1] = f31; 2830 c[i + 3 + j * c_dim1] = f41; 2831 c[i + 2 + (j + 1) * c_dim1] = f32; 2832 c[i + 3 + (j + 1) * c_dim1] = f42; 2833 c[i + 2 + (j + 2) * c_dim1] = f33; 2834 c[i + 3 + (j + 2) * c_dim1] = f43; 2835 c[i + 2 + (j + 3) * c_dim1] = f34; 2836 c[i + 3 + (j + 3) * c_dim1] = f44; 2837 } 2838 if (uisec < isec) 2839 { 2840 i5 = ii + isec - 1; 2841 for (i = ii + uisec; i <= i5; ++i) 2842 { 2843 f11 = c[i + j * c_dim1]; 2844 f12 = c[i + (j + 1) * c_dim1]; 2845 f13 = c[i + (j + 2) * c_dim1]; 2846 f14 = c[i + (j + 3) * c_dim1]; 2847 i6 = ll + lsec - 1; 2848 for (l = ll; l <= i6; ++l) 2849 { 2850 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2851 257] * b[l + j * b_dim1]; 2852 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2853 257] * b[l + (j + 1) * b_dim1]; 2854 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2855 257] * b[l + (j + 2) * b_dim1]; 2856 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2857 257] * b[l + (j + 3) * b_dim1]; 2858 } 2859 c[i + j * c_dim1] = f11; 2860 c[i + (j + 1) * c_dim1] = f12; 2861 c[i + (j + 2) * c_dim1] = f13; 2862 c[i + (j + 3) * c_dim1] = f14; 2863 } 2864 } 2865 } 2866 if (ujsec < jsec) 2867 { 2868 i4 = jj + jsec - 1; 2869 for (j = jj + ujsec; j <= i4; ++j) 2870 { 2871 i5 = ii + uisec - 1; 2872 for (i = ii; i <= i5; i += 4) 2873 { 2874 f11 = c[i + j * c_dim1]; 2875 f21 = c[i + 1 + j * c_dim1]; 2876 f31 = c[i + 2 + j * c_dim1]; 2877 f41 = c[i + 3 + j * c_dim1]; 2878 i6 = ll + lsec - 1; 2879 for (l = ll; l <= i6; ++l) 2880 { 2881 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2882 257] * b[l + j * b_dim1]; 2883 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2884 257] * b[l + j * b_dim1]; 2885 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2886 257] * b[l + j * b_dim1]; 2887 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2888 257] * b[l + j * b_dim1]; 2889 } 2890 c[i + j * c_dim1] = f11; 2891 c[i + 1 + j * c_dim1] = f21; 2892 c[i + 2 + j * c_dim1] = f31; 2893 c[i + 3 + j * c_dim1] = f41; 2894 } 2895 i5 = ii + isec - 1; 2896 for (i = ii + uisec; i <= i5; ++i) 2897 { 2898 f11 = c[i + j * c_dim1]; 2899 i6 = ll + lsec - 1; 2900 for (l = ll; l <= i6; ++l) 2901 { 2902 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2903 257] * b[l + j * b_dim1]; 2904 } 2905 c[i + j * c_dim1] = f11; 2906 } 2907 } 2908 } 2909 } 2910 } 2911 } 2912 free(t1); 2913 return; 2914 } 2915 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2916 { 2917 if (GFC_DESCRIPTOR_RANK (a) != 1) 2918 { 2919 const GFC_INTEGER_2 *restrict abase_x; 2920 const GFC_INTEGER_2 *restrict bbase_y; 2921 GFC_INTEGER_2 *restrict dest_y; 2922 GFC_INTEGER_2 s; 2923 2924 for (y = 0; y < ycount; y++) 2925 { 2926 bbase_y = &bbase[y*bystride]; 2927 dest_y = &dest[y*rystride]; 2928 for (x = 0; x < xcount; x++) 2929 { 2930 abase_x = &abase[x*axstride]; 2931 s = (GFC_INTEGER_2) 0; 2932 for (n = 0; n < count; n++) 2933 s += abase_x[n] * bbase_y[n]; 2934 dest_y[x] = s; 2935 } 2936 } 2937 } 2938 else 2939 { 2940 const GFC_INTEGER_2 *restrict bbase_y; 2941 GFC_INTEGER_2 s; 2942 2943 for (y = 0; y < ycount; y++) 2944 { 2945 bbase_y = &bbase[y*bystride]; 2946 s = (GFC_INTEGER_2) 0; 2947 for (n = 0; n < count; n++) 2948 s += abase[n*axstride] * bbase_y[n]; 2949 dest[y*rystride] = s; 2950 } 
2951 } 2952 } 2953 else if (axstride < aystride) 2954 { 2955 for (y = 0; y < ycount; y++) 2956 for (x = 0; x < xcount; x++) 2957 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; 2958 2959 for (y = 0; y < ycount; y++) 2960 for (n = 0; n < count; n++) 2961 for (x = 0; x < xcount; x++) 2962 /* dest[x,y] += a[x,n] * b[n,y] */ 2963 dest[x*rxstride + y*rystride] += 2964 abase[x*axstride + n*aystride] * 2965 bbase[n*bxstride + y*bystride]; 2966 } 2967 else if (GFC_DESCRIPTOR_RANK (a) == 1) 2968 { 2969 const GFC_INTEGER_2 *restrict bbase_y; 2970 GFC_INTEGER_2 s; 2971 2972 for (y = 0; y < ycount; y++) 2973 { 2974 bbase_y = &bbase[y*bystride]; 2975 s = (GFC_INTEGER_2) 0; 2976 for (n = 0; n < count; n++) 2977 s += abase[n*axstride] * bbase_y[n*bxstride]; 2978 dest[y*rxstride] = s; 2979 } 2980 } 2981 else 2982 { 2983 const GFC_INTEGER_2 *restrict abase_x; 2984 const GFC_INTEGER_2 *restrict bbase_y; 2985 GFC_INTEGER_2 *restrict dest_y; 2986 GFC_INTEGER_2 s; 2987 2988 for (y = 0; y < ycount; y++) 2989 { 2990 bbase_y = &bbase[y*bystride]; 2991 dest_y = &dest[y*rystride]; 2992 for (x = 0; x < xcount; x++) 2993 { 2994 abase_x = &abase[x*axstride]; 2995 s = (GFC_INTEGER_2) 0; 2996 for (n = 0; n < count; n++) 2997 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 2998 dest_y[x*rxstride] = s; 2999 } 3000 } 3001 } 3002 } 3003 #undef POW3 3004 #undef min 3005 #undef max 3006 3007 #endif 3008 #endif 3009 3010