/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_AVX512INTRIN_H
#define __AMX_AVX512INTRIN_H
#if defined(__x86_64__) && defined(__SSE2__)

/* Attributes shared by every inline wrapper function defined in this header:
 * always inline, no debug info, and the "amx-avx512,avx10.2-512" target
 * features enabled for the function body. */
#define __DEFAULT_FN_ATTRS_AVX512                                              \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("amx-avx512,avx10.2-512")))

/// Moves a row from a tile register to a zmm destination register, converting
/// the int32 source elements to fp32. The row of the tile is selected by a
/// 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512 _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 give the row index of the source
///    tile and bits 31:16 give the 64-byte chunk within that row.
#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to bf16.
/// It places the resulting bf16 elements
/// in the high 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512bh _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.word[2*i+0] := 0
///         dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 give the row index of the source
///    tile and bits 31:16 give the 64-byte chunk within that row.
#define _tile_cvtrowps2bf16h(tsrc, row)                                        \
  __builtin_ia32_tcvtrowps2bf16h(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to bf16. It places the resulting bf16 elements
/// in the low 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512bh _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.word[2*i+1] := 0
///         dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 give the row index of the source
///    tile and bits 31:16 give the 64-byte chunk within that row.
#define _tile_cvtrowps2bf16l(tsrc, row)                                        \
  __builtin_ia32_tcvtrowps2bf16l(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to fp16. It places the resulting fp16 elements
/// in the high 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512h _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.word[2*i+0] := 0
///         dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 give the row index of the source
///    tile and bits 31:16 give the 64-byte chunk within that row.
#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to fp16. It places the resulting fp16 elements
/// in the low 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512h _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///     IF i + row_chunk / 4 >= tsrc.colsb / 4
///         dst.dword[i] := 0
///     ELSE
///         dst.word[2*i+1] := 0
///         dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///     FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 give the row index of the source
///    tile and bits 31:16 give the 64-byte chunk within that row.
#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)

/// Move one row of a tile data to a v16f32 data.
/// The row of the tile is selected by a 32b GPR.
///
/// NOTE(review): the __tile_movrow wrapper in this header returns __m512i and
/// documents its result as v16i32, while this comment says __m512 / v16f32.
/// Confirm which element type is intended for the macro form.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512 _tile_movrow(__tile a, unsigned b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
///
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v16f32 data. Size is 64 Bytes.
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL>>3
/// row_index := b&0xffff
/// row_chunk := ((b>>16)&0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes-1)
///     IF (row_chunk + i >= a.colsb)
///         dst.byte[i] := 0
///     ELSE
///         dst.byte[i] := a.row[row_index].byte[row_chunk+i]
///     FI
/// ENDFOR
/// \endcode
#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b)

/// The following are internal intrinsics. C/C++ users should avoid calling
/// them directly.
///
/// Each one forwards its four arguments unchanged to the matching
/// __builtin_ia32_*_internal builtin. The __tile_* wrappers in this header
/// call them with the tile's \c row and \c col fields as \a m and \a n, the
/// tile's \c tile field as \a src, and the 32-bit row selector as \a u.

/// Internal form of TCVTROWD2PS; returns the builtin's result as __m512.
static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
}

/// Internal form of TCVTROWPS2BF16H; returns the builtin's result as __m512bh.
static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
_tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n,
                              _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u);
}

/// Internal form of TCVTROWPS2BF16L; returns the builtin's result as __m512bh.
static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
_tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
                              _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u);
}

/// Internal form of TCVTROWPS2PHH; returns the builtin's result as __m512h.
static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
}

/// Internal form of TCVTROWPS2PHL; returns the builtin's result as __m512h.
static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
}

/// Internal form of TILEMOVROW; the builtin's result is cast to __m512i.
static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
}

/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
/// elements to
fp32. No SIMD exceptions are generated. Rounding is done as if 269 /// MXCSR.RC=RNE. Embedded rounding is not supported. 270 /// The row and chunk elements of tile is fetched from 32bit src1. 271 /// 272 /// \headerfile <immintrin.h> 273 /// 274 /// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction. 275 /// 276 /// \param src0 277 /// The 1st source tile. Max size is 1024 Bytes. 278 /// \param src1 279 /// The 2nd source r32. Size is 4 Bytes. 280 /// \returns 281 /// The destination v16f32 data. Size is 64 Bytes. 282 __DEFAULT_FN_ATTRS_AVX512 283 static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) { 284 return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1); 285 } 286 287 /// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source 288 /// elements to bf16 at high 16-bits of each dword. 289 /// The row and chunk elements of tile is fetched from 32bit src1. 290 /// 291 /// \headerfile <immintrin.h> 292 /// 293 /// This intrinsic corresponds to the <c> TCVTROWPS2BF16H </c> instruction. 294 /// 295 /// \param src0 296 /// The 1st source tile. Max size is 1024 Bytes. 297 /// \param src1 298 /// The 2nd source r32. Size is 4 Bytes. 299 /// \returns 300 /// The destination v32bf16 data. Size is 64 Bytes. 301 __DEFAULT_FN_ATTRS_AVX512 302 static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) { 303 return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1); 304 } 305 306 /// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source 307 /// elements to bf16 at low 16-bits of each dword. 308 /// The row and chunk elements of tile is fetched from 32bit src1. 309 /// 310 /// \headerfile <immintrin.h> 311 /// 312 /// This intrinsic corresponds to the <c> TCVTROWPS2BF16L </c> instruction. 313 /// 314 /// \param src0 315 /// The 1st source tile. Max size is 1024 Bytes. 316 /// \param src1 317 /// The 2nd source r32. Size is 4 Bytes. 
318 /// \returns 319 /// The destination v32bf16 data. Size is 64 Bytes. 320 __DEFAULT_FN_ATTRS_AVX512 321 static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) { 322 return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1); 323 } 324 325 /// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source 326 /// elements to fp16 at high 16-bits of each dword. 327 /// The row and chunk elements of tile is fetched from 32bit src1. 328 /// 329 /// \headerfile <immintrin.h> 330 /// 331 /// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction. 332 /// 333 /// \param src0 334 /// The 1st source tile. Max size is 1024 Bytes. 335 /// \param src1 336 /// The 2nd source r32. Size is 4 Bytes. 337 /// \returns 338 /// The destination v32fp16 data. Size is 64 Bytes. 339 __DEFAULT_FN_ATTRS_AVX512 340 static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) { 341 return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1); 342 } 343 344 /// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source 345 /// elements to fp16 at low 16-bits of each dword. 346 /// The row and chunk elements of tile is fetched from 32bit src1. 347 /// 348 /// \headerfile <immintrin.h> 349 /// 350 /// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction. 351 /// 352 /// \param src0 353 /// The 1st source tile. Max size is 1024 Bytes. 354 /// \param src1 355 /// The 2nd source r32. Size is 4 Bytes. 356 /// \returns 357 /// The destination v32fp16 data. Size is 64 Bytes. 358 __DEFAULT_FN_ATTRS_AVX512 359 static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) { 360 return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1); 361 } 362 363 /// Move one row of a tile data to a v16f32 data. 364 /// The row of the tile is selected by a 32b GPR. 365 /// 366 /// \headerfile <immintrin.h> 367 /// 368 /// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction. 
369 /// 370 /// \param src0 371 /// The 1st source tile. Max size is 1024 Bytes. 372 /// \param src1 373 /// The 2nd source r32. Size is 4 Bytes. 374 /// \returns 375 /// The destination v16i32 data. Size is 64 Bytes. 376 __DEFAULT_FN_ATTRS_AVX512 377 static __m512i __tile_movrow(__tile1024i src0, unsigned src1) { 378 return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1); 379 } 380 381 #endif // __x86_64__ && __SSE2__ 382 #endif // __AMX_AVX512INTRIN_H 383