1627f7eb2Smrg`void 2627f7eb2Smrg'matmul_name` ('rtype` * const restrict retarray, 3627f7eb2Smrg 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, 4627f7eb2Smrg int blas_limit, blas_call gemm) 5627f7eb2Smrg{ 6627f7eb2Smrg const 'rtype_name` * restrict abase; 7627f7eb2Smrg const 'rtype_name` * restrict bbase; 8627f7eb2Smrg 'rtype_name` * restrict dest; 9627f7eb2Smrg 10627f7eb2Smrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 11627f7eb2Smrg index_type x, y, n, count, xcount, ycount; 12627f7eb2Smrg 13627f7eb2Smrg assert (GFC_DESCRIPTOR_RANK (a) == 2 14627f7eb2Smrg || GFC_DESCRIPTOR_RANK (b) == 2); 15627f7eb2Smrg 16627f7eb2Smrg/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 17627f7eb2Smrg 18627f7eb2Smrg Either A or B (but not both) can be rank 1: 19627f7eb2Smrg 20627f7eb2Smrg o One-dimensional argument A is implicitly treated as a row matrix 21627f7eb2Smrg dimensioned [1,count], so xcount=1. 22627f7eb2Smrg 23627f7eb2Smrg o One-dimensional argument B is implicitly treated as a column matrix 24627f7eb2Smrg dimensioned [count, 1], so ycount=1. 25627f7eb2Smrg*/ 26627f7eb2Smrg 27627f7eb2Smrg if (retarray->base_addr == NULL) 28627f7eb2Smrg { 29627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1) 30627f7eb2Smrg { 31627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0, 32627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 33627f7eb2Smrg } 34627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 35627f7eb2Smrg { 36627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0, 37627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 38627f7eb2Smrg } 39627f7eb2Smrg else 40627f7eb2Smrg { 41627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[0], 0, 42627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 43627f7eb2Smrg 44627f7eb2Smrg GFC_DIMENSION_SET(retarray->dim[1], 0, 45627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 46627f7eb2Smrg GFC_DESCRIPTOR_EXTENT(retarray,0)); 47627f7eb2Smrg } 48627f7eb2Smrg 49627f7eb2Smrg retarray->base_addr 50627f7eb2Smrg = xmallocarray (size0 ((array_t *) retarray), sizeof ('rtype_name`)); 51627f7eb2Smrg retarray->offset = 0; 52627f7eb2Smrg } 53627f7eb2Smrg else if (unlikely (compile_options.bounds_check)) 54627f7eb2Smrg { 55627f7eb2Smrg index_type ret_extent, arg_extent; 56627f7eb2Smrg 57627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1) 58627f7eb2Smrg { 59627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 60627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 61627f7eb2Smrg if (arg_extent != ret_extent) 62627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of " 63627f7eb2Smrg "array (%ld/%ld) ", 64627f7eb2Smrg (long int) ret_extent, (long int) arg_extent); 65627f7eb2Smrg } 66627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 67627f7eb2Smrg { 68627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 69627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 70627f7eb2Smrg if (arg_extent != ret_extent) 71627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of " 72627f7eb2Smrg "array (%ld/%ld) ", 73627f7eb2Smrg (long int) ret_extent, (long int) arg_extent); 74627f7eb2Smrg } 75627f7eb2Smrg else 76627f7eb2Smrg { 77627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 78627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 79627f7eb2Smrg if (arg_extent != ret_extent) 80627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 1 of " 81627f7eb2Smrg "array (%ld/%ld) ", 82627f7eb2Smrg (long int) ret_extent, (long int) arg_extent); 83627f7eb2Smrg 84627f7eb2Smrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 85627f7eb2Smrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 86627f7eb2Smrg if (arg_extent != ret_extent) 87627f7eb2Smrg runtime_error ("Array bound mismatch for dimension 2 of " 88627f7eb2Smrg "array (%ld/%ld) ", 89627f7eb2Smrg (long int) ret_extent, (long int) arg_extent); 90627f7eb2Smrg } 91627f7eb2Smrg } 92627f7eb2Smrg' 93627f7eb2Smrgsinclude(`matmul_asm_'rtype_code`.m4')dnl 94627f7eb2Smrg` 95627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (retarray) == 1) 96627f7eb2Smrg { 97627f7eb2Smrg /* One-dimensional result may be addressed in the code below 98627f7eb2Smrg either as a row or a column matrix. We want both cases to 99627f7eb2Smrg work. */ 100627f7eb2Smrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 101627f7eb2Smrg } 102627f7eb2Smrg else 103627f7eb2Smrg { 104627f7eb2Smrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 105627f7eb2Smrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 106627f7eb2Smrg } 107627f7eb2Smrg 108627f7eb2Smrg 109627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) == 1) 110627f7eb2Smrg { 111627f7eb2Smrg /* Treat it as a a row matrix A[1,count]. */ 112627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 113627f7eb2Smrg aystride = 1; 114627f7eb2Smrg 115627f7eb2Smrg xcount = 1; 116627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,0); 117627f7eb2Smrg } 118627f7eb2Smrg else 119627f7eb2Smrg { 120627f7eb2Smrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 121627f7eb2Smrg aystride = GFC_DESCRIPTOR_STRIDE(a,1); 122627f7eb2Smrg 123627f7eb2Smrg count = GFC_DESCRIPTOR_EXTENT(a,1); 124627f7eb2Smrg xcount = GFC_DESCRIPTOR_EXTENT(a,0); 125627f7eb2Smrg } 126627f7eb2Smrg 127627f7eb2Smrg if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 128627f7eb2Smrg { 129627f7eb2Smrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 130627f7eb2Smrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 131627f7eb2Smrg "in dimension 1: is %ld, should be %ld", 132627f7eb2Smrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 133627f7eb2Smrg } 134627f7eb2Smrg 135627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (b) == 1) 136627f7eb2Smrg { 137627f7eb2Smrg /* Treat it as a column matrix B[count,1] */ 138627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 139627f7eb2Smrg 140627f7eb2Smrg /* bystride should never be used for 1-dimensional b. 141627f7eb2Smrg The value is only used for calculation of the 142627f7eb2Smrg memory by the buffer. */ 143627f7eb2Smrg bystride = 256; 144627f7eb2Smrg ycount = 1; 145627f7eb2Smrg } 146627f7eb2Smrg else 147627f7eb2Smrg { 148627f7eb2Smrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 149627f7eb2Smrg bystride = GFC_DESCRIPTOR_STRIDE(b,1); 150627f7eb2Smrg ycount = GFC_DESCRIPTOR_EXTENT(b,1); 151627f7eb2Smrg } 152627f7eb2Smrg 153627f7eb2Smrg abase = a->base_addr; 154627f7eb2Smrg bbase = b->base_addr; 155627f7eb2Smrg dest = retarray->base_addr; 156627f7eb2Smrg 157627f7eb2Smrg /* Now that everything is set up, we perform the multiplication 158627f7eb2Smrg itself. */ 159627f7eb2Smrg 160627f7eb2Smrg#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 161627f7eb2Smrg#define min(a,b) ((a) <= (b) ? (a) : (b)) 162627f7eb2Smrg#define max(a,b) ((a) >= (b) ? (a) : (b)) 163627f7eb2Smrg 164627f7eb2Smrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 165627f7eb2Smrg && (bxstride == 1 || bystride == 1) 166627f7eb2Smrg && (((float) xcount) * ((float) ycount) * ((float) count) 167627f7eb2Smrg > POW3(blas_limit))) 168627f7eb2Smrg { 169627f7eb2Smrg const int m = xcount, n = ycount, k = count, ldc = rystride; 170627f7eb2Smrg const 'rtype_name` one = 1, zero = 0; 171627f7eb2Smrg const int lda = (axstride == 1) ? aystride : axstride, 172627f7eb2Smrg ldb = (bxstride == 1) ? bystride : bxstride; 173627f7eb2Smrg 174627f7eb2Smrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 175627f7eb2Smrg { 176627f7eb2Smrg assert (gemm != NULL); 177627f7eb2Smrg const char *transa, *transb; 178627f7eb2Smrg if (try_blas & 2) 179627f7eb2Smrg transa = "C"; 180627f7eb2Smrg else 181627f7eb2Smrg transa = axstride == 1 ? "N" : "T"; 182627f7eb2Smrg 183627f7eb2Smrg if (try_blas & 4) 184627f7eb2Smrg transb = "C"; 185627f7eb2Smrg else 186627f7eb2Smrg transb = bxstride == 1 ? "N" : "T"; 187627f7eb2Smrg 188627f7eb2Smrg gemm (transa, transb , &m, 189627f7eb2Smrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 190627f7eb2Smrg &ldc, 1, 1); 191627f7eb2Smrg return; 192627f7eb2Smrg } 193627f7eb2Smrg } 194627f7eb2Smrg 195*4c3eb207Smrg if (rxstride == 1 && axstride == 1 && bxstride == 1 196*4c3eb207Smrg && GFC_DESCRIPTOR_RANK (b) != 1) 197627f7eb2Smrg { 198627f7eb2Smrg /* This block of code implements a tuned matmul, derived from 199627f7eb2Smrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1 200627f7eb2Smrg 201627f7eb2Smrg Bo Kagstrom and Per Ling 202627f7eb2Smrg Department of Computing Science 203627f7eb2Smrg Umea University 204627f7eb2Smrg S-901 87 Umea, Sweden 205627f7eb2Smrg 206627f7eb2Smrg from netlib.org, translated to C, and modified for matmul.m4. */ 207627f7eb2Smrg 208627f7eb2Smrg const 'rtype_name` *a, *b; 209627f7eb2Smrg 'rtype_name` *c; 210627f7eb2Smrg const index_type m = xcount, n = ycount, k = count; 211627f7eb2Smrg 212627f7eb2Smrg /* System generated locals */ 213627f7eb2Smrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 214627f7eb2Smrg i1, i2, i3, i4, i5, i6; 215627f7eb2Smrg 216627f7eb2Smrg /* Local variables */ 217627f7eb2Smrg 'rtype_name` f11, f12, f21, f22, f31, f32, f41, f42, 218627f7eb2Smrg f13, f14, f23, f24, f33, f34, f43, f44; 219627f7eb2Smrg index_type i, j, l, ii, jj, ll; 220627f7eb2Smrg index_type isec, jsec, lsec, uisec, ujsec, ulsec; 221627f7eb2Smrg 'rtype_name` *t1; 222627f7eb2Smrg 223627f7eb2Smrg a = abase; 224627f7eb2Smrg b = bbase; 225627f7eb2Smrg c = retarray->base_addr; 226627f7eb2Smrg 227627f7eb2Smrg /* Parameter adjustments */ 228627f7eb2Smrg c_dim1 = rystride; 229627f7eb2Smrg c_offset = 1 + c_dim1; 230627f7eb2Smrg c -= c_offset; 231627f7eb2Smrg a_dim1 = aystride; 232627f7eb2Smrg a_offset = 1 + a_dim1; 233627f7eb2Smrg a -= a_offset; 234627f7eb2Smrg b_dim1 = bystride; 235627f7eb2Smrg b_offset = 1 + b_dim1; 236627f7eb2Smrg b -= b_offset; 237627f7eb2Smrg 238627f7eb2Smrg /* Empty c first. */ 239627f7eb2Smrg for (j=1; j<=n; j++) 240627f7eb2Smrg for (i=1; i<=m; i++) 241627f7eb2Smrg c[i + j * c_dim1] = ('rtype_name`)0; 242627f7eb2Smrg 243627f7eb2Smrg /* Early exit if possible */ 244627f7eb2Smrg if (m == 0 || n == 0 || k == 0) 245627f7eb2Smrg return; 246627f7eb2Smrg 247627f7eb2Smrg /* Adjust size of t1 to what is needed. */ 248627f7eb2Smrg index_type t1_dim, a_sz; 249627f7eb2Smrg if (aystride == 1) 250627f7eb2Smrg a_sz = rystride; 251627f7eb2Smrg else 252627f7eb2Smrg a_sz = a_dim1; 253627f7eb2Smrg 254627f7eb2Smrg t1_dim = a_sz * 256 + b_dim1; 255627f7eb2Smrg if (t1_dim > 65536) 256627f7eb2Smrg t1_dim = 65536; 257627f7eb2Smrg 258627f7eb2Smrg t1 = malloc (t1_dim * sizeof('rtype_name`)); 259627f7eb2Smrg 260627f7eb2Smrg /* Start turning the crank. */ 261627f7eb2Smrg i1 = n; 262627f7eb2Smrg for (jj = 1; jj <= i1; jj += 512) 263627f7eb2Smrg { 264627f7eb2Smrg /* Computing MIN */ 265627f7eb2Smrg i2 = 512; 266627f7eb2Smrg i3 = n - jj + 1; 267627f7eb2Smrg jsec = min(i2,i3); 268627f7eb2Smrg ujsec = jsec - jsec % 4; 269627f7eb2Smrg i2 = k; 270627f7eb2Smrg for (ll = 1; ll <= i2; ll += 256) 271627f7eb2Smrg { 272627f7eb2Smrg /* Computing MIN */ 273627f7eb2Smrg i3 = 256; 274627f7eb2Smrg i4 = k - ll + 1; 275627f7eb2Smrg lsec = min(i3,i4); 276627f7eb2Smrg ulsec = lsec - lsec % 2; 277627f7eb2Smrg 278627f7eb2Smrg i3 = m; 279627f7eb2Smrg for (ii = 1; ii <= i3; ii += 256) 280627f7eb2Smrg { 281627f7eb2Smrg /* Computing MIN */ 282627f7eb2Smrg i4 = 256; 283627f7eb2Smrg i5 = m - ii + 1; 284627f7eb2Smrg isec = min(i4,i5); 285627f7eb2Smrg uisec = isec - isec % 2; 286627f7eb2Smrg i4 = ll + ulsec - 1; 287627f7eb2Smrg for (l = ll; l <= i4; l += 2) 288627f7eb2Smrg { 289627f7eb2Smrg i5 = ii + uisec - 1; 290627f7eb2Smrg for (i = ii; i <= i5; i += 2) 291627f7eb2Smrg { 292627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 293627f7eb2Smrg a[i + l * a_dim1]; 294627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 295627f7eb2Smrg a[i + (l + 1) * a_dim1]; 296627f7eb2Smrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 297627f7eb2Smrg a[i + 1 + l * a_dim1]; 298627f7eb2Smrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 299627f7eb2Smrg a[i + 1 + (l + 1) * a_dim1]; 300627f7eb2Smrg } 301627f7eb2Smrg if (uisec < isec) 302627f7eb2Smrg { 303627f7eb2Smrg t1[l - ll + 1 + (isec << 8) - 257] = 304627f7eb2Smrg a[ii + isec - 1 + l * a_dim1]; 305627f7eb2Smrg t1[l - ll + 2 + (isec << 8) - 257] = 306627f7eb2Smrg a[ii + isec - 1 + (l + 1) * a_dim1]; 307627f7eb2Smrg } 308627f7eb2Smrg } 309627f7eb2Smrg if (ulsec < lsec) 310627f7eb2Smrg { 311627f7eb2Smrg i4 = ii + isec - 1; 312627f7eb2Smrg for (i = ii; i<= i4; ++i) 313627f7eb2Smrg { 314627f7eb2Smrg t1[lsec + ((i - ii + 1) << 8) - 257] = 315627f7eb2Smrg a[i + (ll + lsec - 1) * a_dim1]; 316627f7eb2Smrg } 317627f7eb2Smrg } 318627f7eb2Smrg 319627f7eb2Smrg uisec = isec - isec % 4; 320627f7eb2Smrg i4 = jj + ujsec - 1; 321627f7eb2Smrg for (j = jj; j <= i4; j += 4) 322627f7eb2Smrg { 323627f7eb2Smrg i5 = ii + uisec - 1; 324627f7eb2Smrg for (i = ii; i <= i5; i += 4) 325627f7eb2Smrg { 326627f7eb2Smrg f11 = c[i + j * c_dim1]; 327627f7eb2Smrg f21 = c[i + 1 + j * c_dim1]; 328627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1]; 329627f7eb2Smrg f22 = c[i + 1 + (j + 1) * c_dim1]; 330627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1]; 331627f7eb2Smrg f23 = c[i + 1 + (j + 2) * c_dim1]; 332627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1]; 333627f7eb2Smrg f24 = c[i + 1 + (j + 3) * c_dim1]; 334627f7eb2Smrg f31 = c[i + 2 + j * c_dim1]; 335627f7eb2Smrg f41 = c[i + 3 + j * c_dim1]; 336627f7eb2Smrg f32 = c[i + 2 + (j + 1) * c_dim1]; 337627f7eb2Smrg f42 = c[i + 3 + (j + 1) * c_dim1]; 338627f7eb2Smrg f33 = c[i + 2 + (j + 2) * c_dim1]; 339627f7eb2Smrg f43 = c[i + 3 + (j + 2) * c_dim1]; 340627f7eb2Smrg f34 = c[i + 2 + (j + 3) * c_dim1]; 341627f7eb2Smrg f44 = c[i + 3 + (j + 3) * c_dim1]; 342627f7eb2Smrg i6 = ll + lsec - 1; 343627f7eb2Smrg for (l = ll; l <= i6; ++l) 344627f7eb2Smrg { 345627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 346627f7eb2Smrg * b[l + j * b_dim1]; 347627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 348627f7eb2Smrg * b[l + j * b_dim1]; 349627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 350627f7eb2Smrg * b[l + (j + 1) * b_dim1]; 351627f7eb2Smrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 352627f7eb2Smrg * b[l + (j + 1) * b_dim1]; 353627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 354627f7eb2Smrg * b[l + (j + 2) * b_dim1]; 355627f7eb2Smrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 356627f7eb2Smrg * b[l + (j + 2) * b_dim1]; 357627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 358627f7eb2Smrg * b[l + (j + 3) * b_dim1]; 359627f7eb2Smrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 360627f7eb2Smrg * b[l + (j + 3) * b_dim1]; 361627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 362627f7eb2Smrg * b[l + j * b_dim1]; 363627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 364627f7eb2Smrg * b[l + j * b_dim1]; 365627f7eb2Smrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 366627f7eb2Smrg * b[l + (j + 1) * b_dim1]; 367627f7eb2Smrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 368627f7eb2Smrg * b[l + (j + 1) * b_dim1]; 369627f7eb2Smrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 370627f7eb2Smrg * b[l + (j + 2) * b_dim1]; 371627f7eb2Smrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 372627f7eb2Smrg * b[l + (j + 2) * b_dim1]; 373627f7eb2Smrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 374627f7eb2Smrg * b[l + (j + 3) * b_dim1]; 375627f7eb2Smrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 376627f7eb2Smrg * b[l + (j + 3) * b_dim1]; 377627f7eb2Smrg } 378627f7eb2Smrg c[i + j * c_dim1] = f11; 379627f7eb2Smrg c[i + 1 + j * c_dim1] = f21; 380627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12; 381627f7eb2Smrg c[i + 1 + (j + 1) * c_dim1] = f22; 382627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13; 383627f7eb2Smrg c[i + 1 + (j + 2) * c_dim1] = f23; 384627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14; 385627f7eb2Smrg c[i + 1 + (j + 3) * c_dim1] = f24; 386627f7eb2Smrg c[i + 2 + j * c_dim1] = f31; 387627f7eb2Smrg c[i + 3 + j * c_dim1] = f41; 388627f7eb2Smrg c[i + 2 + (j + 1) * c_dim1] = f32; 389627f7eb2Smrg c[i + 3 + (j + 1) * c_dim1] = f42; 390627f7eb2Smrg c[i + 2 + (j + 2) * c_dim1] = f33; 391627f7eb2Smrg c[i + 3 + (j + 2) * c_dim1] = f43; 392627f7eb2Smrg c[i + 2 + (j + 3) * c_dim1] = f34; 393627f7eb2Smrg c[i + 3 + (j + 3) * c_dim1] = f44; 394627f7eb2Smrg } 395627f7eb2Smrg if (uisec < isec) 396627f7eb2Smrg { 397627f7eb2Smrg i5 = ii + isec - 1; 398627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i) 399627f7eb2Smrg { 400627f7eb2Smrg f11 = c[i + j * c_dim1]; 401627f7eb2Smrg f12 = c[i + (j + 1) * c_dim1]; 402627f7eb2Smrg f13 = c[i + (j + 2) * c_dim1]; 403627f7eb2Smrg f14 = c[i + (j + 3) * c_dim1]; 404627f7eb2Smrg i6 = ll + lsec - 1; 405627f7eb2Smrg for (l = ll; l <= i6; ++l) 406627f7eb2Smrg { 407627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 408627f7eb2Smrg 257] * b[l + j * b_dim1]; 409627f7eb2Smrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 410627f7eb2Smrg 257] * b[l + (j + 1) * b_dim1]; 411627f7eb2Smrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 412627f7eb2Smrg 257] * b[l + (j + 2) * b_dim1]; 413627f7eb2Smrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 414627f7eb2Smrg 257] * b[l + (j + 3) * b_dim1]; 415627f7eb2Smrg } 416627f7eb2Smrg c[i + j * c_dim1] = f11; 417627f7eb2Smrg c[i + (j + 1) * c_dim1] = f12; 418627f7eb2Smrg c[i + (j + 2) * c_dim1] = f13; 419627f7eb2Smrg c[i + (j + 3) * c_dim1] = f14; 420627f7eb2Smrg } 421627f7eb2Smrg } 422627f7eb2Smrg } 423627f7eb2Smrg if (ujsec < jsec) 424627f7eb2Smrg { 425627f7eb2Smrg i4 = jj + jsec - 1; 426627f7eb2Smrg for (j = jj + ujsec; j <= i4; ++j) 427627f7eb2Smrg { 428627f7eb2Smrg i5 = ii + uisec - 1; 429627f7eb2Smrg for (i = ii; i <= i5; i += 4) 430627f7eb2Smrg { 431627f7eb2Smrg f11 = c[i + j * c_dim1]; 432627f7eb2Smrg f21 = c[i + 1 + j * c_dim1]; 433627f7eb2Smrg f31 = c[i + 2 + j * c_dim1]; 434627f7eb2Smrg f41 = c[i + 3 + j * c_dim1]; 435627f7eb2Smrg i6 = ll + lsec - 1; 436627f7eb2Smrg for (l = ll; l <= i6; ++l) 437627f7eb2Smrg { 438627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 439627f7eb2Smrg 257] * b[l + j * b_dim1]; 440627f7eb2Smrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 441627f7eb2Smrg 257] * b[l + j * b_dim1]; 442627f7eb2Smrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 443627f7eb2Smrg 257] * b[l + j * b_dim1]; 444627f7eb2Smrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 445627f7eb2Smrg 257] * b[l + j * b_dim1]; 446627f7eb2Smrg } 447627f7eb2Smrg c[i + j * c_dim1] = f11; 448627f7eb2Smrg c[i + 1 + j * c_dim1] = f21; 449627f7eb2Smrg c[i + 2 + j * c_dim1] = f31; 450627f7eb2Smrg c[i + 3 + j * c_dim1] = f41; 451627f7eb2Smrg } 452627f7eb2Smrg i5 = ii + isec - 1; 453627f7eb2Smrg for (i = ii + uisec; i <= i5; ++i) 454627f7eb2Smrg { 455627f7eb2Smrg f11 = c[i + j * c_dim1]; 456627f7eb2Smrg i6 = ll + lsec - 1; 457627f7eb2Smrg for (l = ll; l <= i6; ++l) 458627f7eb2Smrg { 459627f7eb2Smrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 460627f7eb2Smrg 257] * b[l + j * b_dim1]; 461627f7eb2Smrg } 462627f7eb2Smrg c[i + j * c_dim1] = f11; 463627f7eb2Smrg } 464627f7eb2Smrg } 465627f7eb2Smrg } 466627f7eb2Smrg } 467627f7eb2Smrg } 468627f7eb2Smrg } 469627f7eb2Smrg free(t1); 470627f7eb2Smrg return; 471627f7eb2Smrg } 472627f7eb2Smrg else if (rxstride == 1 && aystride == 1 && bxstride == 1) 473627f7eb2Smrg { 474627f7eb2Smrg if (GFC_DESCRIPTOR_RANK (a) != 1) 475627f7eb2Smrg { 476627f7eb2Smrg const 'rtype_name` *restrict abase_x; 477627f7eb2Smrg const 'rtype_name` *restrict bbase_y; 478627f7eb2Smrg 'rtype_name` *restrict dest_y; 479627f7eb2Smrg 'rtype_name` s; 480627f7eb2Smrg 481627f7eb2Smrg for (y = 0; y < ycount; y++) 482627f7eb2Smrg { 483627f7eb2Smrg bbase_y = &bbase[y*bystride]; 484627f7eb2Smrg dest_y = &dest[y*rystride]; 485627f7eb2Smrg for (x = 0; x < xcount; x++) 486627f7eb2Smrg { 487627f7eb2Smrg abase_x = &abase[x*axstride]; 488627f7eb2Smrg s = ('rtype_name`) 0; 489627f7eb2Smrg for (n = 0; n < count; n++) 490627f7eb2Smrg s += abase_x[n] * bbase_y[n]; 491627f7eb2Smrg dest_y[x] = s; 492627f7eb2Smrg } 493627f7eb2Smrg } 494627f7eb2Smrg } 495627f7eb2Smrg else 496627f7eb2Smrg { 497627f7eb2Smrg const 'rtype_name` *restrict bbase_y; 498627f7eb2Smrg 'rtype_name` s; 499627f7eb2Smrg 500627f7eb2Smrg for (y = 0; y < ycount; y++) 501627f7eb2Smrg { 502627f7eb2Smrg bbase_y = &bbase[y*bystride]; 503627f7eb2Smrg s = ('rtype_name`) 0; 504627f7eb2Smrg for (n = 0; n < count; n++) 505627f7eb2Smrg s += abase[n*axstride] * bbase_y[n]; 506627f7eb2Smrg dest[y*rystride] = s; 507627f7eb2Smrg } 508627f7eb2Smrg } 509627f7eb2Smrg } 510627f7eb2Smrg else if (GFC_DESCRIPTOR_RANK (a) == 1) 511627f7eb2Smrg { 512627f7eb2Smrg const 'rtype_name` *restrict bbase_y; 513627f7eb2Smrg 'rtype_name` s; 514627f7eb2Smrg 515627f7eb2Smrg for (y = 0; y < ycount; y++) 516627f7eb2Smrg { 517627f7eb2Smrg bbase_y = &bbase[y*bystride]; 518627f7eb2Smrg s = ('rtype_name`) 0; 519627f7eb2Smrg for (n = 0; n < count; n++) 520627f7eb2Smrg s += abase[n*axstride] * bbase_y[n*bxstride]; 521627f7eb2Smrg dest[y*rxstride] = s; 522627f7eb2Smrg } 523627f7eb2Smrg } 524*4c3eb207Smrg else if (axstride < aystride) 525*4c3eb207Smrg { 526*4c3eb207Smrg for (y = 0; y < ycount; y++) 527*4c3eb207Smrg for (x = 0; x < xcount; x++) 528*4c3eb207Smrg dest[x*rxstride + y*rystride] = ('rtype_name`)0; 529*4c3eb207Smrg 530*4c3eb207Smrg for (y = 0; y < ycount; y++) 531*4c3eb207Smrg for (n = 0; n < count; n++) 532*4c3eb207Smrg for (x = 0; x < xcount; x++) 533*4c3eb207Smrg /* dest[x,y] += a[x,n] * b[n,y] */ 534*4c3eb207Smrg dest[x*rxstride + y*rystride] += 535*4c3eb207Smrg abase[x*axstride + n*aystride] * 536*4c3eb207Smrg bbase[n*bxstride + y*bystride]; 537*4c3eb207Smrg } 538627f7eb2Smrg else 539627f7eb2Smrg { 540627f7eb2Smrg const 'rtype_name` *restrict abase_x; 541627f7eb2Smrg const 'rtype_name` *restrict bbase_y; 542627f7eb2Smrg 'rtype_name` *restrict dest_y; 543627f7eb2Smrg 'rtype_name` s; 544627f7eb2Smrg 545627f7eb2Smrg for (y = 0; y < ycount; y++) 546627f7eb2Smrg { 547627f7eb2Smrg bbase_y = &bbase[y*bystride]; 548627f7eb2Smrg dest_y = &dest[y*rystride]; 549627f7eb2Smrg for (x = 0; x < xcount; x++) 550627f7eb2Smrg { 551627f7eb2Smrg abase_x = &abase[x*axstride]; 552627f7eb2Smrg s = ('rtype_name`) 0; 553627f7eb2Smrg for (n = 0; n < count; n++) 554627f7eb2Smrg s += abase_x[n*aystride] * bbase_y[n*bxstride]; 555627f7eb2Smrg dest_y[x*rxstride] = s; 556627f7eb2Smrg } 557627f7eb2Smrg } 558627f7eb2Smrg } 559627f7eb2Smrg} 560627f7eb2Smrg#undef POW3 561627f7eb2Smrg#undef min 562627f7eb2Smrg#undef max 563627f7eb2Smrg' 564