//===-- AMDGPU.td - AMDGPU dialect definitions *- tablegen -*------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef AMDGPU
#define AMDGPU

include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/EnumAttr.td"
include "mlir/IR/OpBase.td"

def AMDGPU_Dialect : Dialect {
  let name = "amdgpu";
  let cppNamespace = "::mlir::amdgpu";
  let description = [{
    The `AMDGPU` dialect provides wrappers around AMD-specific functionality
    and LLVM intrinsics. These wrappers should be used in conjunction with
    more generic dialects, such as `gpu` and `vector`, when generating LLVM IR
    that will eventually be executed on AMD hardware.
  }];

  let dependentDialects = [
    "ROCDL::ROCDLDialect",
    "arith::ArithDialect",
    "gpu::GPUDialect"
  ];
  let useDefaultAttributePrinterParser = 1;
}

//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//

class AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
  Op<AMDGPU_Dialect, mnemonic, traits> {}

def AMDGPU_ExtPackedFp8Op :
    AMDGPU_Op<"ext_packed_fp8", [Pure]>,
    Arguments<(ins AnyTypeOf<[F8E5M2FNUZ, F8E4M3FNUZ,
        VectorOfLengthAndType<[1, 2, 3, 4], [F8E5M2FNUZ, F8E4M3FNUZ]>]>:$source,
      ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$index)>,
    Results<(outs F32:$res)> {
  let summary = "Extend one of a vector of packed fp8 values to a float";
  let description = [{
    Extend the value `source[index]` to a 32-bit float and return it.

    This rather unusual signature arises from the fact that AMD GPUs cannot
    easily work with sub-32-bit quantities, so the compiler intrinsics for
    extending 8-bit floats (which are, currently, the only way to work with
    this operation) take packed vectors of 4 such floats.

    If the passed-in vector has fewer than four elements, or the input is scalar,
    the remaining values in the <4 x i8> will be filled with
    undefined values as needed.
  }];
  let assemblyFormat = [{
    attr-dict $source `[` $index `]` `:` type($source) `to` type($res)
  }];
}

def AMDGPU_PackedTrunc2xFp8Op :
    AMDGPU_Op<"packed_trunc_2xfp8", [Pure, AttrSizedOperandSegments]>,
    Arguments<(ins F32:$sourceA,
      Optional<F32>:$sourceB,
      ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<1>]>:$wordIndex,
      Optional<FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>>:$existing)>,
    Results<(outs FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>:$res)> {
  let summary = "Round two floats into a packed vector of 8-bit floats";
  let description = [{
    Round the inputs `sourceA` and `sourceB` (which is undefined if not
    specified) into the low or high word (bottom two or top two) elements
    of the returned vector, keeping the other two elements of `existing`
    unchanged if present (or undefined if it was not passed in).

    The reason for this odd signature is that AMD GPUs cannot easily work with
    sub-registers, and so the conversion intrinsics (which are currently the
    only way to work with 8-bit float types) take packed vectors of 4 8-bit
    values.
  }];
  let assemblyFormat = [{
    attr-dict $sourceA `,` ($sourceB^):(`undef`)?
    `into` ($existing^):(`undef`)? `[` `word` $wordIndex `]`
    `:` type($sourceA) `to` type($res) (`into` type($existing)^)?
  }];
  let hasVerifier = 1;
}
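
// Illustrative MLIR assembly for the two ops above (a hedged sketch: SSA
// names are hypothetical, and the fp8 element type shown is one of the
// accepted choices):
//
//   %ext = amdgpu.ext_packed_fp8 %packed[0] : vector<4xf8E4M3FNUZ> to f32
//   %lo = amdgpu.packed_trunc_2xfp8 %a, %b into undef[word 0]
//           : f32 to vector<4xf8E4M3FNUZ>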

def AMDGPU_PackedStochRoundFp8Op :
    AMDGPU_Op<"packed_stoch_round_fp8", [Pure]>,
    Arguments<(ins F32:$source,
      I32:$stochiasticParam,
      ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$storeIndex,
      Optional<FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>>:$existing)>,
    Results<(outs FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>:$res)> {
  let summary = "Round float stochastically into a packed vector of 8-bit floats";
  let description = [{
    Round the input `source`, adding in `stochiasticParam`, and place it into
    the `storeIndex`th element of `res`.

    If `existing` is passed in, elements of `res` other than the one at `storeIndex`
    are copied from `existing`.

    The reason for this odd signature is that AMD GPUs cannot easily work with
    sub-registers, and so the conversion intrinsics (which are currently the
    only way to work with 8-bit float types) take packed vectors of 4 8-bit
    values.
  }];
  let assemblyFormat = [{
    attr-dict $source `+` $stochiasticParam
    `into` ($existing^):(`undef`)? `[` $storeIndex `]`
    `:` type($source) `to` type($res) (`into` type($existing)^)?
  }];
  let hasVerifier = 1;
}

/// Raw buffer load
def AMDGPU_RawBufferLoadOp :
    AMDGPU_Op<"raw_buffer_load", [AllElementTypesMatch<["value", "memref"]>,
                                  AttrSizedOperandSegments]>,
    Arguments<(ins Arg<AnyMemRef, "buffer to load from", [MemRead]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)>,
    Results<(outs AnyType:$value)> {

  let summary = "Raw Buffer load, exposing GCN features";
  let description = [{
    The `amdgpu.raw_buffer_load` op is a wrapper around the buffer load intrinsics
    available on AMD GPUs, including extensions in newer GPUs.

    The index into the buffer is computed as for `memref.load` with the addition
    of `indexOffset` and `sgprOffset` (which **may or may not** be considered
    in bounds checks and includes any offset present on the memref type if it's
    non-zero).

    All indices and offsets are in units of the memref's data type and are
    converted to bytes during lowering.

    When a load is out of bounds, the instruction returns zero.
    Partially out-of-bounds loads have chipset-dependent behavior: whether reading
    2 elements starting at index 7 of a `memref<8xf32>` returns the last element
    in the first vector component depends on the architecture.

    The memref struct is converted into a buffer resource (a V#) and the arguments
    are translated to intrinsic arguments as follows:
    - The base address of the buffer is the base address of the memref
    - The stride is 0 to enable raw mode
    - The number of records is the size of the memref, in bytes
      In the case of dynamically-shaped memrefs, this is computed at runtime
      as max_d (size(d) * stride(d)) * sizeof(elementType(memref))
    - The offset enable bit is 1, the index enable bit is 0
    - The thread ID addition bit is off
    - If `boundsCheck` is false and the target chipset is RDNA, OOB_SELECT is set
      to 2 to disable bounds checks, otherwise it is 3
    - The cache coherency bits are off
  }];
  let assemblyFormat = [{
    attr-dict $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($memref) (`,` type($indices)^)? `->` type($value)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}
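
// Hypothetical examples of the two ops above (operand names, shapes, and
// attribute values are illustrative, not taken from this file):
//
//   %r = amdgpu.packed_stoch_round_fp8 %f + %seed into undef[0]
//          : f32 to vector<4xf8E5M2FNUZ>
//   %v = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%i, %j]
//          : memref<128x64xf32>, i32, i32 -> f32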

/// Raw buffer store
def AMDGPU_RawBufferStoreOp :
    AMDGPU_Op<"raw_buffer_store", [AllElementTypesMatch<["value", "memref"]>,
                                   AttrSizedOperandSegments]>,
    Arguments<(ins AnyType:$value,
                   Arg<AnyMemRef, "buffer to store to", [MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Store, exposing GCN features";
  let description = [{
    The `amdgpu.raw_buffer_store` op is a wrapper around the buffer store
    intrinsics available on AMD GPUs, including extensions in newer GPUs.

    The store index is computed as in `memref.store` with the addition of
    `indexOffset` (which is included for uniformity with atomics and may be useful
    when writing vectorized code) and `sgprOffset` (which is added after bounds
    checks and implicitly includes the offset of the memref type if non-zero).
    All index components are in terms of the elements of the memref, not bytes,
    and are scaled up appropriately.

    Out of bounds stores are ignored in hardware.
    Whether a vector write that includes some in-bounds and some out-of-bounds
    components is partially completed is chipset-dependent.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) (`,` type($indices)^)?
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

// Raw buffer atomic compare-and-swap
def AMDGPU_RawBufferAtomicCmpswapOp :
    AMDGPU_Op<"raw_buffer_atomic_cmpswap", [
      AttrSizedOperandSegments,
      AllTypesMatch<["src", "cmp", "value"]>,
      AllElementTypesMatch<["value", "memref"]>]>,
    Arguments<(ins AnyType:$src,
                   AnyType:$cmp,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)>,
    Results<(outs AnyType:$value)> {

  let summary = "Raw Buffer Atomic compare-and-swap";
  let description = [{
    The `amdgpu.raw_buffer_atomic_cmpswap` op is a wrapper around the
    buffer-based atomic compare-and-swap available on AMD GPUs.

    The index into the buffer is computed as for `memref.store` with the addition
    of `indexOffset` (which is used to aid in emitting vectorized code) and,
    if present, `sgprOffset` (which is added after bounds checks and includes
    any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size, not
    the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
  }];
  let assemblyFormat = [{
    attr-dict $src `,` $cmp `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}
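
// A sketch of the store and compare-and-swap forms above (illustrative
// operands and types):
//
//   amdgpu.raw_buffer_store %val -> %buf[%i] : f32 -> memref<64xf32>, i32
//   %old = amdgpu.raw_buffer_atomic_cmpswap %new, %expected -> %buf[%i]
//            : i32 -> memref<64xi32>, i32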

// Raw buffer atomic floating point add
def AMDGPU_RawBufferAtomicFaddOp :
    AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
                                         AttrSizedOperandSegments]>,
    Arguments<(ins AnyTypeOf<[F32, VectorOfLengthAndType<[2], [F16, BF16]>]>:$value,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Floating-point Atomic Add (MI-* only)";
  let description = [{
    The `amdgpu.raw_buffer_atomic_fadd` op is a wrapper around the
    buffer-based atomic floating point addition available on the MI-* series
    of AMD GPUs.

    The index into the buffer is computed as for `memref.store` with the addition
    of `indexOffset` (which is used to aid in emitting vectorized code) and,
    if present, `sgprOffset` (which is added after bounds checks and includes
    any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size, not
    the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

// Raw buffer atomic floating point max
def AMDGPU_RawBufferAtomicFmaxOp :
    AMDGPU_Op<"raw_buffer_atomic_fmax", [AllElementTypesMatch<["value", "memref"]>,
                                         AttrSizedOperandSegments]>,
    Arguments<(ins AnyTypeOf<[F32, F64]>:$value,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Floating-point Atomic Max (non-GFX9)";
  let description = [{
    The `amdgpu.raw_buffer_atomic_fmax` op is a wrapper around the
    buffer-based atomic floating point max available on AMD GPUs (except GFX9).

    The index into the buffer is computed as for `memref.store` with the addition
    of `indexOffset` (which is used to aid in emitting vectorized code) and,
    if present, `sgprOffset` (which is added after bounds checks and includes
    any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size, not
    the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}
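
// Hypothetical uses of the floating-point buffer atomics above (f32 is one
// of the accepted value types; names are illustrative):
//
//   amdgpu.raw_buffer_atomic_fadd %v -> %buf[%i] : f32 -> memref<64xf32>, i32
//   amdgpu.raw_buffer_atomic_fmax %v -> %buf[%i] : f32 -> memref<64xf32>, i32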

// Raw buffer atomic signed integer max
def AMDGPU_RawBufferAtomicSmaxOp :
    AMDGPU_Op<"raw_buffer_atomic_smax", [
      AttrSizedOperandSegments]>,
    Arguments<(ins I32:$value,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Signed Integer Atomic Max";
  let description = [{
    The `amdgpu.raw_buffer_atomic_smax` op is a wrapper around the
    buffer-based atomic signed integer max available on AMD GPUs.

    The index into the buffer is computed as for `memref.store` with the addition
    of `indexOffset` (which is used to aid in emitting vectorized code) and,
    if present, `sgprOffset` (which is added after bounds checks and includes
    any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size, not
    the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}

// Raw buffer atomic unsigned integer min
def AMDGPU_RawBufferAtomicUminOp :
    AMDGPU_Op<"raw_buffer_atomic_umin", [
      AttrSizedOperandSegments]>,
    Arguments<(ins I32:$value,
                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                   Variadic<I32>:$indices,
                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                   OptionalAttr<I32Attr>:$indexOffset,
                   Optional<I32>:$sgprOffset)> {

  let summary = "Raw Buffer Unsigned Integer Atomic Min";
  let description = [{
    The `amdgpu.raw_buffer_atomic_umin` op is a wrapper around the
    buffer-based atomic unsigned integer min available on AMD GPUs.

    The index into the buffer is computed as for `memref.store` with the addition
    of `indexOffset` (which is used to aid in emitting vectorized code) and,
    if present, `sgprOffset` (which is added after bounds checks and includes
    any non-zero offset on the memref type).

    All indexing components are given in terms of the memref's element size, not
    the byte lengths required by the intrinsic.

    Out of bounds atomic operations are ignored in hardware.

    See `amdgpu.raw_buffer_load` for a description of how the underlying
    instruction is constructed.
  }];
  let assemblyFormat = [{
    attr-dict $value `->` $memref `[` $indices `]`
      (`sgprOffset` $sgprOffset^)? `:`
      type($value) `->` type($memref) `,` type($indices)
  }];
  let hasCanonicalizer = 1;
  let hasVerifier = 1;
}
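
// Illustrative uses of the integer buffer atomics above (operand names are
// hypothetical):
//
//   amdgpu.raw_buffer_atomic_smax %v -> %buf[%i] : i32 -> memref<64xi32>, i32
//   amdgpu.raw_buffer_atomic_umin %v -> %buf[%i] : i32 -> memref<64xi32>, i32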

def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
    "The possible permutations for a DPP operation",
    [
      I32EnumAttrCase<"quad_perm", 0>,
      I32EnumAttrCase<"row_shl", 1>,
      I32EnumAttrCase<"row_shr", 2>,
      I32EnumAttrCase<"row_ror", 3>,
      I32EnumAttrCase<"wave_shl", 4>,
      I32EnumAttrCase<"wave_shr", 5>,
      I32EnumAttrCase<"wave_ror", 6>,
      I32EnumAttrCase<"wave_rol", 7>,
      I32EnumAttrCase<"row_mirror", 8>,
      I32EnumAttrCase<"row_half_mirror", 9>,
      I32EnumAttrCase<"row_bcast_15", 10>,
      I32EnumAttrCase<"row_bcast_31", 11>
    ]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::amdgpu";
}

def AMDGPU_DPPPermAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_DPPPerm,
                                  "dpp_perm">;

def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result", "old", "src"]>]>,
    Arguments<(ins AnyType:$old,
                   AnyType:$src,
                   AMDGPU_DPPPermAttr:$kind,
                   OptionalAttr<AnyAttrOf<[I32Attr, ArrayAttr, UnitAttr]>>:$permArgument,
                   DefaultValuedAttr<I32Attr, "0xf">:$row_mask,
                   DefaultValuedAttr<I32Attr, "0xf">:$bank_mask,
                   DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl)> {
  let summary = "AMDGPU DPP operation";
  let description = [{
    This operation represents DPP functionality in a GPU program.
    DPP provides the following operations:
    - Full crossbar in a group of four (`quad_perm`)
    - Wavefront shift left by one lane (`wave_shl`)
    - Wavefront shift right by one lane (`wave_shr`)
    - Wavefront rotate right by one lane (`wave_ror`)
    - Wavefront rotate left by one lane (`wave_rol`)
    - Row shift left by 1–15 lanes (`row_shl`)
    - Row shift right by 1–15 lanes (`row_shr`)
    - Row rotate right by 1–15 lanes (`row_ror`)
    - Reverse within a row (`row_mirror`)
    - Reverse within a half-row (`row_half_mirror`)
    - Broadcast the 15th lane of each row to the next row (`row_bcast_15`)
    - Broadcast lane 31 to rows 2 and 3 (`row_bcast_31`)
  }];
  let results = (outs AnyType:$result);
  let assemblyFormat = [{
    $old $src $kind (`(` $permArgument^ `)`)? attr-dict `:` type($result)
  }];
  let hasVerifier = 1;
}

def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
  let summary = "Barrier that includes a wait for LDS memory operations.";
  let description = [{
    `amdgpu.lds_barrier` is both a barrier (all workitems in a workgroup must reach
    the barrier before any of them may proceed past it) and a wait for all
    operations that affect the Local Data Store (LDS) issued from that workgroup
    to complete before the workgroup may continue. Since the LDS is per-workgroup
    memory, this barrier may be used, for example, to ensure all workitems have
    written data to LDS before any workitem attempts to read from it.

    Note that `lds_barrier` does **not** force reads to or from global memory
    to complete before execution continues. Therefore, it should be used when
    operations on global memory can be issued far in advance of when their results
    are used (for example, by writing them to LDS).

    WARNING: On architectures that do not support the BackOffBarrier feature
    (those which implement this barrier by emitting inline assembly),
    use of this operation will impede the usability of memory watches (including
    breakpoints set on variables) when debugging.
  }];
  let assemblyFormat = "attr-dict";
}
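
// A sketch of the two ops above in MLIR assembly; the DPP kind, its
// argument, and the mask attribute show one plausible combination among
// many (values are illustrative, not from this file):
//
//   %r = amdgpu.dpp %old %src row_shl(1 : i32) {row_mask = 0xa : i32} : f32
//   amdgpu.lds_barrier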

def AMDGPU_SchedBarrierOpOpt : I32BitEnumAttr<"sched_barrier_opt_enum",
    "The possible options for scheduling barriers",
    [
      I32BitEnumAttrCaseNone<"none">,
      I32BitEnumAttrCaseBit<"non_mem_non_sideffect", 0>,
      I32BitEnumAttrCaseBit<"valu", 1>,
      I32BitEnumAttrCaseBit<"salu", 2>,
      I32BitEnumAttrCaseBit<"mfma_wmma", 3>,
      I32BitEnumAttrCaseBit<"all_vmem", 4>,
      I32BitEnumAttrCaseBit<"vmem_read", 5>,
      I32BitEnumAttrCaseBit<"vmem_write", 6>,
      I32BitEnumAttrCaseBit<"all_ds", 7>,
      I32BitEnumAttrCaseBit<"ds_read", 8>,
      I32BitEnumAttrCaseBit<"ds_write", 9>,
      I32BitEnumAttrCaseBit<"transcendental", 10>
    ]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::amdgpu";
}

def AMDGPU_SchedBarrierOpOptAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_SchedBarrierOpOpt,
                                            "sched_barrier_opt"> {
  let assemblyFormat = "`<` $value `>`";
}

def AMDGPU_SchedBarrierOp :
    AMDGPU_Op<"sched_barrier">,
    Arguments<(ins AMDGPU_SchedBarrierOpOptAttr:$opts)>
{
  let summary = "Barrier that limits the backend scheduler's movement of instructions";
  let description = [{
    `amdgpu.sched_barrier` acts as a barrier that can be configured to
    restrict the movement of instructions across it by the backend scheduler,
    as specified by its `opts` attribute.
  }];
  let assemblyFormat = [{
    `allow` `=` $opts attr-dict
  }];
}
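
// Example of the scheduling barrier defined above, allowing only VALU and
// SALU instructions to move across it (the enum cases come from
// `sched_barrier_opt_enum`; a hedged sketch, not taken from this file):
//
//   amdgpu.sched_barrier allow = <valu|salu>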

def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
    "The possible permutations of the lanes storing B available in an MFMA",
    [
      I32EnumAttrCase<"none", 0>,
      I32EnumAttrCase<"bcast_first_32", 1>,
      I32EnumAttrCase<"bcast_second_32", 2>,
      I32EnumAttrCase<"rotate_16_right", 3>,
      I32EnumAttrCase<"bcast_first_16", 4>,
      I32EnumAttrCase<"bcast_second_16", 5>,
      I32EnumAttrCase<"bcast_third_16", 6>,
      I32EnumAttrCase<"bcast_fourth_16", 7>
    ]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::amdgpu";
}

def AMDGPU_MFMAPermBAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_MFMAPermB,
                                    "mfma_perm_b">;

// mfma
def MFMAInTypes : AnyTypeOf<[F32, F64, I32, I64,
                             VectorOfLengthAndType<[2], [F32]>,
                             VectorOfLengthAndType<[4], [F16]>,
                             VectorOfLengthAndType<[2, 4], [BF16]>,
                             VectorOfLengthAndType<[4, 8], [I8]>,
                             VectorOfLengthAndType<[8], [F8E5M2FNUZ, F8E4M3FNUZ]>]>;
def MFMAOutTypes : AnyTypeOf<[F64,
                              VectorOfLengthAndType<[4, 16, 32], [F32]>,
                              VectorOfLengthAndType<[4, 16, 32], [I32]>,
                              VectorOfLengthAndType<[4], [F64]>]>;
// wmma
def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[8, 16], [F16, BF16, I8, SI8, UI8, F8E4M3FN, F8E5M2]>]>;
def WMMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8], [F32, I32]>,
                              VectorOfLengthAndType<[8, 16], [F16, BF16]>]>;

def AMDGPU_MFMAOp :
    AMDGPU_Op<"mfma", [AllTypesMatch<["destC", "destD"]>,
                       Pure]>,
    Arguments<(ins
                   I32Attr:$m,
                   I32Attr:$n,
                   I32Attr:$k,
                   I32Attr:$blocks,
                   MFMAInTypes:$sourceA,
                   MFMAInTypes:$sourceB,
                   MFMAOutTypes:$destC,
                   DefaultValuedAttr<I32Attr, "0">:$cbsz,
                   DefaultValuedAttr<I32Attr, "0">:$abid,
                   DefaultValuedAttr<AMDGPU_MFMAPermBAttr,
                     "::mlir::amdgpu::MFMAPermB::none">:$blgp,
                   UnitAttr:$reducePrecision,
                   UnitAttr:$negateA,
                   UnitAttr:$negateB,
                   UnitAttr:$negateC)>,
    Results<(outs MFMAOutTypes: $destD)> {
  let summary = "MLIR wrapper for CDNA mfma instructions";
  let description = [{
    The `amdgpu.mfma` op is an MLIR wrapper around intrinsics
    for various `mfma` instructions in the CDNA architecture, which perform
    multiple outer products in order to allow fast matrix multiplication.

    The wrapper will select an appropriate `mfma` instruction, if one is available,
    based on the provided `m`, `k`, `n`, and `blocks` attributes, along with the
    types of the source and destination arguments.

    For information on the layouts of the input and output matrices (which are stored
    in `sourceA`, `sourceB`, `destC`, and `destD`), see the CDNA ISA documentation.

    The `cbsz`, `abid`, and `blgp` parameters control how the lanes of the wave
    are permuted when matrix data is being loaded: `blgp` can be one of a number
    of fixed permutations, `cbsz` specifies the log_2 of the number of chunks the
    lanes holding sourceA are split into, and `abid` selects one of those chunks.

    Note that this wrapper allows specifying `vector<4Kxi8>` arguments to MFMA
    intrinsics that take an integer type of width `4K`. For example,
    one can provide a vector<4xi8> as an argument to an MFMA instruction that
    logically takes 4 i8s but whose intrinsics are specified to take an i32.
    In these cases, the bytes in the vector will be concatenated in little-endian
    order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).

    The `negateA`, `negateB`, and `negateC` flags are only supported for
    double-precision operations on gfx940+.
  }];
  let assemblyFormat = [{
    $sourceA `*` $sourceB `+` $destC
    attr-dict
    `blgp` `=` $blgp
    `:` type($sourceA) `,` type($sourceB) `,` type($destC)
  }];
  let hasVerifier = 1;
}

def AMDGPU_WMMAOp :
    AMDGPU_Op<"wmma", [AllTypesMatch<["destC", "destD"]>,
                       AllTypesMatch<["sourceA", "sourceB"]>,
                       Pure]>,
    Arguments<(ins
                   WMMAInTypes:$sourceA,
                   WMMAInTypes:$sourceB,
                   WMMAOutTypes:$destC,
                   DefaultValuedAttr<ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>, "0">:$subwordOffset,
                   UnitAttr:$unsignedA,
                   UnitAttr:$unsignedB,
                   UnitAttr:$clamp)>,
    Results<(outs WMMAOutTypes: $destD)> {
  let summary = "MLIR wrapper for RDNA3 wmma instructions";
  let description = [{
    The `amdgpu.wmma` op is an MLIR wrapper around intrinsics
    for various `wmma` instructions in the RDNA3 architecture, which perform
    a 16x16 matrix multiplication for different data types.

    When emitting f16->f16 (or bf16->bf16) wmma, the output is a 16xf16 (or 16xbf16) vector
    containing only 8 valid values:
    - If `subwordOffset` is 0, then the output is stored at indices 0, 2, 4, ..., 14.
    - If `subwordOffset` is 1, then the output is stored at indices 1, 3, 5, ..., 15.

    `unsignedA` and `unsignedB` indicate that the `int8` LLVM inputs are unsigned.

    The `clamp` flag is used to saturate the output of type `T` to
    `numeric_limits<T>::max()` in case of overflow.
  }];
  let assemblyFormat = [{
    $sourceA `*` $sourceB `+` $destC
    attr-dict
    `:` type($sourceA) `,` type($sourceB) `,` type($destC)
  }];
  let hasVerifier = 1;
}

#endif // AMDGPU
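
// Hypothetical matrix-multiply examples for `amdgpu.mfma` and `amdgpu.wmma`
// (the shapes and attribute values reflect one supported configuration each,
// chosen for illustration; SSA names are made up):
//
//   %d = amdgpu.mfma %a * %b + %c {m = 32 : i32, n = 32 : i32, k = 1 : i32,
//          blocks = 2 : i32} blgp = none : f32, f32, vector<32xf32>
//   %e = amdgpu.wmma %x * %y + %z
//          : vector<16xf16>, vector<16xf16>, vector<8xf32>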