`/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2020 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <string.h>
#include <assert.h>'

include(iparm.m4)dnl

`#if defined (HAVE_'rtype_name`)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const 'rtype_name` *, const 'rtype_name` *,
                          const int *, const 'rtype_name` *, const int *,
                          const 'rtype_name` *, 'rtype_name` *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
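
/* In gfortran, try_blas is nonzero when the program was compiled with
   -fexternal-blas, and blas_limit carries the value of
   -fblas-matmul-limit: only matrices larger than this limit are worth
   handing to the external gemm routine.  */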

extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);

/* Put the exhaustive list of possible architectures here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

#ifdef HAVE_AVX
'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX */

#ifdef HAVE_AVX2
'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX2 */

#ifdef HAVE_AVX512F
'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */

/* AMD-specific functions with AVX128 and FMA3/FMA4.  */

#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif

#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif

/* Function to fall back to if there is no special processor-specific version.  */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl

`/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.  */
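
/* The pointer to the chosen implementation is cached in the static
   variable matmul_p below, so processor detection runs only on the
   first call.  Relaxed memory ordering is sufficient: any thread that
   races here computes the same function pointer, so it does not
   matter which store is seen.  */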

#include <config/i386/cpuinfo.h>
void matmul_'rtype_code` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  static void (*matmul_p) ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  void (*matmul_fn) ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      matmul_fn = matmul_'rtype_code`_vanilla;
      if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
	{
	  /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
	    {
	      matmul_fn = matmul_'rtype_code`_avx512f;
	      goto store;
	    }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
	      && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
	    {
	      matmul_fn = matmul_'rtype_code`_avx2;
	      goto store;
	    }

#endif

#ifdef HAVE_AVX
	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
	    {
	      matmul_fn = matmul_'rtype_code`_avx;
	      goto store;
	    }
#endif  /* HAVE_AVX */
	}
      else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
	{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
	      && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma3;
	      goto store;
	    }
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
	      && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma4;
	      goto store;
	    }
#endif
	}
    store:
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
    }

  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
}

#else  /* Just the vanilla function.  */

'define(`matmul_name',`matmul_'rtype_code)dnl
define(`target_attribute',`')dnl
include(matmul_internal.m4)dnl
`#endif
#endif
'
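dnl
dnl For illustration only, assuming the usual iparm.m4 definitions for
dnl single precision (rtype_code `r4', rtype_name `GFC_REAL_4', rtype
dnl `gfc_array_r4'): the exported declaration above then expands to
dnl
dnl   extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
dnl       gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b,
dnl       int try_blas, int blas_limit, blas_call gemm);
dnl
dnl and the dispatcher selects one of matmul_r4_avx512f, matmul_r4_avx2,
dnl matmul_r4_avx, matmul_r4_avx128_fma3, matmul_r4_avx128_fma4 or
dnl matmul_r4_vanilla on the first call.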