`/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2022 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <string.h>
#include <assert.h>'

include(iparm.m4)dnl

`#if defined (HAVE_'rtype_name`)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.
   NOTE(review): the two trailing bare ints look like the hidden Fortran
   string lengths of the two leading character arguments (transa/transb)
   - confirm against the gemm call site.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const 'rtype_name` *, const 'rtype_name` *,
                          const int *, const 'rtype_name` *, const int *,
                          const 'rtype_name` *, 'rtype_name` *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).
Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);

/* Put exhaustive list of possible architectures here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

/* AVX variant: the target attribute lets this one function be compiled
   with AVX code generation even though the library as a whole is not.  */
#ifdef HAVE_AVX
'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX */

/* AVX2 variant; the target attribute also enables FMA code generation,
   so the dispatcher must check both CPU features before selecting it.  */
#ifdef HAVE_AVX2
'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX2 */

/* AVX512F variant.  */
#ifdef HAVE_AVX512F
'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static'
include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */

/* AMD-specific functions with AVX128 and FMA3/FMA4.  These are
   prototypes only (no definition in this file); internal_proto gives
   them library-internal linkage.  */

#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif

#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif

/* Function to fall back to if there is no special processor-specific
   version.  */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl

`/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.
*/

/* Public entry point.  On the first call it picks the best available
   implementation for the host CPU, caches that choice in a static
   function pointer, and forwards every call (including this one)
   through the cached pointer.  */

void matmul_'rtype_code` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  /* Cached selection shared by all callers; NULL until first use.  */
  static void (*matmul_p) ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  void (*matmul_fn) ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  /* Relaxed ordering is sufficient: any threads racing on the
     initialization all compute the same pointer value, so it does not
     matter which store wins.  */
  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      /* Default when no processor-specific version applies.  */
      matmul_fn = matmul_'rtype_code`_vanilla;
      if (__builtin_cpu_is ("intel"))
	{
	  /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
	  if (__builtin_cpu_supports ("avx512f"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx512f;
	      goto store;
	    }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
	  /* The AVX2 kernel is compiled with FMA as well, so require
	     both features at runtime.  */
	  if (__builtin_cpu_supports ("avx2")
	      && __builtin_cpu_supports ("fma"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx2;
	      goto store;
	    }

#endif

#ifdef HAVE_AVX
	  if (__builtin_cpu_supports ("avx"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx;
	      goto store;
	    }
#endif  /* HAVE_AVX */
	}
      else if (__builtin_cpu_is ("amd"))
	{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
	  if (__builtin_cpu_supports ("avx")
	      && __builtin_cpu_supports ("fma"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma3;
	      goto store;
	    }
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
	  if (__builtin_cpu_supports ("avx")
	      && __builtin_cpu_supports ("fma4"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma4;
	      goto store;
	    }
#endif

	}
   store:
      /* Publish the choice; later callers skip the selection above.  */
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
   }

   (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
}
#else  /* Just the vanilla function.  */

'define(`matmul_name',`matmul_'rtype_code)dnl
define(`target_attribute',`')dnl
include(matmul_internal.m4)dnl
`#endif  /* HAVE_AVX || HAVE_AVX2 || HAVE_AVX512F */
#endif  /* HAVE_ test for this type */
'