`/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2020 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <string.h>
#include <assert.h>'

include(iparm.m4)dnl

`#if defined (HAVE_'rtype_name`)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const 'rtype_name` *, const 'rtype_name` *,
                          const int *, const 'rtype_name` *, const int *,
                          const 'rtype_name` *, 'rtype_name` *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_'rtype_code` ('rtype` * const restrict retarray, 
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);

/* Put exhaustive list of possible architectures here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

#ifdef HAVE_AVX
'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray, 
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX */

#ifdef HAVE_AVX2
'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray, 
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX2 */

#ifdef HAVE_AVX512F
'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray, 
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */

/* AMD-specific functions with AVX128 and FMA3/FMA4.  */

#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray, 
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif

#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray, 
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif

/* Function to fall back to if there is no special processor-specific version.  */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl

`/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.  */

#include <config/i386/cpuinfo.h>
void matmul_'rtype_code` ('rtype` * const restrict retarray, 
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  /* Cached pointer to the best implementation for this CPU; written once
     (racing writers all store the same value, hence relaxed atomics).  */
  static void (*matmul_p) ('rtype` * const restrict retarray, 
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  void (*matmul_fn) ('rtype` * const restrict retarray, 
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      matmul_fn = matmul_'rtype_code`_vanilla;
      if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
	{
          /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
	    {
	      matmul_fn = matmul_'rtype_code`_avx512f;
	      goto store;
	    }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
	      && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
	    {
	      matmul_fn = matmul_'rtype_code`_avx2;
	      goto store;
	    }

#endif

#ifdef HAVE_AVX
	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
	    {
	      matmul_fn = matmul_'rtype_code`_avx;
	      goto store;
	    }
#endif  /* HAVE_AVX */
	}
      else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
	{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
	      && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma3;
	      goto store;
	    }
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
	      && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma4;
	      goto store;
	    }
#endif

	}
    store:
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
    }

  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
}

#else  /* Just the vanilla function.  */

'define(`matmul_name',`matmul_'rtype_code)dnl
define(`target_attribute',`')dnl
include(matmul_internal.m4)dnl
`#endif
#endif
'