//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VOP3P Classes
//===----------------------------------------------------------------------===//

// Common profile for packed (VOP3P-encoded) instructions. Setting HasDPP
// additionally enables the VOP3-encoded DPP variant on targets that have it.
class VOP3P_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
                    bit HasDPP = 0> : VOP3_Profile<P, Features> {
  let IsVOP3P = 1;
  let HasExtVOP3DPP = HasDPP;
  // We do not want to print src modifiers for vop3p because the bits are
  // overloaded in meaning and the logic in printOperandAndFPInputMods is
  // wrong for vop3p
  let AsmVOP3Base = AsmVOP3P;
  // Overridden to true by VOPProfileSMFMAC for sparse-MFMA instructions;
  // plain VOP3P instructions are not SMFMAC.
  bit IsSMFMAC = false;
}

// Profile for the MFMA load-scale helper: untyped dst/src2 with two i32 sources.
def VOP_MFMA_LD_SCALE : VOP3P_Profile<VOPProfile<[untyped, i32, i32, untyped]>, VOP3P_LD_SCALE> {
  let HasModifiers = 1;
  let HasNeg = 0;
}

// Used for FMA_MIX* and MAD_MIX* insts
// Their operands are only sort of f16 operands. Depending on
// op_sel_hi, these may be interpreted as f32. The inline immediate
// values are really f16 converted to f32, so we treat these as f16
// operands.
// Profile for the mixed-precision MAD/FMA instructions (see the comment above):
// three f16-ish sources with FP modifiers, op_sel/op_sel_hi selection, and an
// optionally tied destination (for the *_mixhi/*_mixlo f16 variants).
class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
                        bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
  bit UseTiedOutput = useTiedOutput;

  // Source operands for the regular (non-DPP) encoding.
  dag srcs =
        (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
             FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
             FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
  // DPP requires VGPR sources for src0/src1; src2 keeps the regular class.
  dag dpp_srcs =
        (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
             FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
             FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);

  // FIXME: Clamp0 misbehaves with the non-default vdst_in
  // following it. For now workaround this by requiring clamp
  // in tied patterns. This should use undef_tied_input, but it
  // seems underdeveloped and doesn't apply the right register
  // class constraints.
  dag mods = !con(!if(UseTiedOutput, (ins Clamp:$clamp, VGPR_32:$vdst_in),
                                     (ins Clamp0:$clamp)),
                  (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi));
  // We use Ins64 because that is the one which populates InOperandList
  // due to the logic in class VOP3_Pseudo
  let Ins64 = !con(srcs, mods);
  let InsVOP3Base = !con(dpp_srcs, mods);
  let AsmVOP3Base =
    "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
}

// Defines a packed VOP3P pseudo plus (on GFX11+) its VOP3-encoded DPP variant.
// The selection pattern comes from node; IsDOT additionally enables the
// DOT-style neg_lo/neg_hi modifier handling in the generated pattern.
multiclass VOP3PInst<string OpName, VOPProfile P,
                     SDPatternOperator node = null_frag, bit IsDOT = 0> {
  def NAME : VOP3P_Pseudo<OpName, P,
                          !if (P.HasModifiers,
                               getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
                               getVOP3Pat<P, node>.ret)>;
  let SubtargetPredicate = isGFX11Plus in {
    if P.HasExtVOP3DPP then
      def _dpp : VOP3_DPP_Pseudo<OpName, P> {
        let VOP3P = 1;
        let PseudoInstr = OpName #"_dpp";
      }
  } // end SubtargetPredicate = isGFX11Plus
}

// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
  def NAME : VOP3P_Pseudo<OpName, P> {
    // Tie the high-half destination input ($vdst_in) to $vdst for the
    // mixlo/mixhi variants that only write one half of the register.
    let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
    let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
  }
  let SubtargetPredicate = isGFX11Plus in {
    if P.HasExtVOP3DPP then
      def _dpp : VOP3_DPP_Pseudo<OpName, P> {
        let VOP3P = 1;
        let PseudoInstr = OpName#"_dpp";
        let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
        let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
      }
  } // end SubtargetPredicate = isGFX11Plus
}

// Packed 16-bit ALU operations.
let isReMaterializable = 1 in {
let isCommutable = 1 in {
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>, imad>;

let FPDPRounding = 1 in {
defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
} // End FPDPRounding = 1
defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;

defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, add>;
defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>;
defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, mul>;

defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umax>;

let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_PK_MAXIMUM_F16 : VOP3PInst<"v_pk_maximum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16, VOP3_PACKED>, fmaximum>;
defm V_PK_MINIMUM_F16 : VOP3PInst<"v_pk_minimum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16, VOP3_PACKED>, fminimum>;
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
}

// Subtraction is not commutative, so these stay outside the isCommutable scope.
defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>;
defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, sub>;

defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;
} // End isReMaterializable = 1

let SubtargetPredicate = HasVOP3PInsts in {

// Integer operations with clamp bit set.
class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat<
  (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)),
       (v2i16 (VOP3PMods v2i16:$src1, i32:$src1_modifiers))),
  (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE)
>;

// Saturating adds/subs map onto the non-saturating instruction with clamp set.
def : VOP3PSatPat<uaddsat, V_PK_ADD_U16>;
def : VOP3PSatPat<saddsat, V_PK_ADD_I16>;
def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts

let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in {
defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
}

// TODO: Make sure we're doing the right thing with denormals. Note
// that FMA and MAD will differ.
// Selection patterns shared by the MAD_MIX* and FMA_MIX* instruction families;
// fma_like is the fma/fmad node, and the three instructions are the f32, lo-f16
// and hi-f16 destination variants.
multiclass MadFmaMixPats<SDPatternOperator fma_like,
                         Instruction mix_inst,
                         Instruction mixlo_inst,
                         Instruction mixhi_inst> {
  // At least one of the operands needs to be an fpextend of an f16
  // for this to be worthwhile, so we need three patterns here.
  // TODO: Could we use a predicate to inspect src1/2/3 instead?
  def : GCNPat <
    (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
                   (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
                   (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))),
    (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
              DSTCLAMP.NONE)>;
  def : GCNPat <
    (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
                   (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)),
                   (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))),
    (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
              DSTCLAMP.NONE)>;
  def : GCNPat <
    (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
                   (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)),
                   (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))),
    (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
              DSTCLAMP.NONE)>;

  // f16 result written to the low half: mixlo with an undef high half.
  def : GCNPat <
    (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                            (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                            (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
    (mixlo_inst $src0_modifiers, $src0,
                $src1_modifiers, $src1,
                $src2_modifiers, $src2,
                DSTCLAMP.NONE,
                (i32 (IMPLICIT_DEF)))
  >;

  // FIXME: Special case handling for maxhi (especially for clamp)
  // because dealing with the write to high half of the register is
  // difficult.
  def : GCNPat <
    (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                                                     (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                                                     (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
    (v2f16 (mixhi_inst $src0_modifiers, $src0,
                       $src1_modifiers, $src1,
                       $src2_modifiers, $src2,
                       DSTCLAMP.NONE,
                       VGPR_32:$elt0))
  >;

  def : GCNPat <
    (build_vector
      f16:$elt0,
      (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                                           (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                                           (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
    (v2f16 (mixhi_inst $src0_modifiers, $src0,
                       $src1_modifiers, $src1,
                       $src2_modifiers, $src2,
                       DSTCLAMP.ENABLE,
                       VGPR_32:$elt0))
  >;

  // Clamped v2f16 result built from two independent mixes: mixlo feeds its
  // result into mixhi's tied destination input.
  def : GCNPat <
    (AMDGPUclamp (build_vector
      (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
                              (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
                              (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
      (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
                              (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
                              (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
    (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
                       $hi_src1_modifiers, $hi_src1,
                       $hi_src2_modifiers, $hi_src2,
                       DSTCLAMP.ENABLE,
                       (mixlo_inst $lo_src0_modifiers, $lo_src0,
                                   $lo_src1_modifiers, $lo_src1,
                                   $lo_src2_modifiers, $lo_src2,
                                   DSTCLAMP.ENABLE,
                                   (i32 (IMPLICIT_DEF)))))
  >;

  // A plain f32 multiply rounded to f16 can use mixlo/mixhi with src2 zeroed.
  def : GCNPat <
    (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
                        (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
    (mixlo_inst $src0_modifiers, $src0,
                $src1_modifiers, $src1,
                (i32 0), (i32 0),
                DSTCLAMP.NONE,
                (i32 (IMPLICIT_DEF)))
  >;

  def : GCNPat <
    (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
                                                 (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
    (v2f16 (mixhi_inst $src0_modifiers, $src0,
                       $src1_modifiers, $src1,
                       (i32 0), (i32 0),
                       DSTCLAMP.NONE,
                       VGPR_32:$elt0))
  >;
}

// Select a two-operand minimum/maximum to the three-operand minimum3/maximum3
// instruction by repeating src1 as the third operand.
class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
                                            Instruction inst> : GCNPat<
  (v2f16 (node (VOP3PMods v2f16:$src0, i32:$src0_mods), (VOP3PMods v2f16:$src1, i32:$src1_mods))),
  (inst $src0_mods, $src0, $src1_mods, $src1, $src1_mods, $src1)
>;

let SubtargetPredicate = HasMinimum3Maximum3PKF16 in {
def : MinimumMaximumByMinimum3Maximum3VOP3P<fminimum, V_PK_MINIMUM3_F16>;
def : MinimumMaximumByMinimum3Maximum3VOP3P<fmaximum, V_PK_MAXIMUM3_F16>;
}

let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in {

// These are VOP3a-like opcodes which accept no omod.
// Size of src arguments (16/32) is controlled by op_sel.
// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
let isCommutable = 1, mayRaiseFPException = 0 in {
let isReMaterializable = 1 in
defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;

let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
defm V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;

let ClampLo = 0, ClampHi = 1 in {
defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
}
} // End FPDPRounding = 1
}

defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
} // End SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals]


// Essentially the same as the mad_mix versions
let SubtargetPredicate = HasFmaMixInsts in {
let isCommutable = 1 in {

let isReMaterializable = 1 in
defm V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;

let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;

let ClampLo = 0, ClampHi = 1 in {
defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
}
} // End FPDPRounding = 1
}

defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}

// Defines patterns that extract signed 4bit from each Idx[0].
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
  def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
                                          (sra (shl node:$src, (i32 Idx[1])), (i32 28))>;

// Defines code pattern that extracts U(unsigned/signed) 4/8bit from FromBitIndex.
// Selects the cheapest extraction form per element position: the last element
// needs only a shift, the first element needs only a mask (unsigned) or
// sign-extension, and middle elements need both.
class Extract<int FromBitIndex, int BitMask, bit U>: PatFrag<
  (ops node:$src),
  !if (!or (!and (!eq (BitMask, 255), !eq (FromBitIndex, 24)), !eq (FromBitIndex, 28)), // last element
       !if (U, (srl node:$src, (i32 FromBitIndex)), (sra node:$src, (i32 FromBitIndex))),
       !if (!eq (FromBitIndex, 0), // first element
            !if (U, (and node:$src, (i32 BitMask)),
                 !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
                                         (sext_inreg node:$src, i8))),
            !if (U, (and (srl node:$src, (i32 FromBitIndex)), (i32 BitMask)),
                 !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
                                         (sext_inreg (srl node:$src, (i32 FromBitIndex)), i8)))))>;


foreach Type = ["I", "U"] in
  foreach Index = 0-3 in {
    // Defines patterns that extract each Index'ed 8bit from an unsigned
    // 32bit scalar value;
    def Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !eq (Type, "U")>;

    // Defines multiplication patterns where the multiplication is happening on each
    // Index'ed 8bit of a 32bit scalar value.

    def Mul#Type#_Elt#Index : PatFrag<
      (ops node:$src0, node:$src1),
      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse))
                             (!cast<Extract>(Type#Index#"_8bit") node:$src0),
                             (!cast<Extract>(Type#Index#"_8bit") node:$src1))>;
  }

// Different variants of dot8 patterns cause a huge increase in the compile time.
// Define non-associative/commutative add/mul to prevent permutation in the dot8
// pattern.
def NonACAdd : SDNode<"ISD::ADD" , SDTIntBinOp>;
def NonACAdd_oneuse : HasOneUseBinOp<NonACAdd>;

def NonACAMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24" , SDTIntBinOp>;
def NonACAMDGPUmul_u24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_u24>;

def NonACAMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24" , SDTIntBinOp>;
def NonACAMDGPUmul_i24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_i24>;

foreach Type = ["I", "U"] in
  foreach Index = 0-7 in {
    // Defines patterns that extract each Index'ed 4bit from an unsigned
    // 32bit scalar value;
    def Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !eq (Type, "U")>;

    // Defines multiplication patterns where the multiplication is happening on each
    // Index'ed 4bit of a 32bit scalar value.
    def Mul#Type#Index#"_4bit" : PatFrag<
      (ops node:$src0, node:$src1),
      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse))
                             (!cast<Extract>(Type#Index#"_4bit") node:$src0),
                             (!cast<Extract>(Type#Index#"_4bit") node:$src1))>;
  }

// Matches the expanded form of an unsigned 2-element dot product plus
// accumulator and selects it to the dot2 instruction (op_sel/op_sel_hi = 8).
class UDot2Pat<VOP_Pseudo Inst> : GCNPat <
  (add (add_oneuse (AMDGPUmul_u24_oneuse (srl i32:$src0, (i32 16)),
                                         (srl i32:$src1, (i32 16))), i32:$src2),
       (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
                             (and i32:$src1, (i32 65535)))
   ),
  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
  let Predicates = Inst.Predicates;
}

// Signed counterpart of UDot2Pat (arithmetic shifts / sign extension).
class SDot2Pat<VOP_Pseudo Inst> : GCNPat <
  (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
                                         (sra i32:$src1, (i32 16))), i32:$src2),
       (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
                             (sext_inreg i32:$src1, i16))),
  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
  let Predicates = Inst.Predicates;
}

let IsDOT = 1 in {
let OtherPredicates = [HasDot2Insts] in {
defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
  VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
  VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
} // End OtherPredicates = [HasDot2Insts]

let OtherPredicates = [HasDot10Insts] in
defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
  VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
  AMDGPUfdot2, 1/*ExplicitClamp*/>;

let OtherPredicates = [HasDot7Insts] in {
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot4, 1>;
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot8, 1>;
} // End OtherPredicates = [HasDot7Insts]

let OtherPredicates = [HasDot1Insts] in {
defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot4, 1>;
defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot8, 1>;
} // End OtherPredicates = [HasDot1Insts]

// bf16 dot2 additionally allows source modifiers on src1.
def DOT2_BF16_Profile
  : VOP3P_Profile<VOP_F32_V2BF16_V2BF16_F32, VOP3_REGULAR, /*HasDPP*/ 1> {
  let HasSrc1Mods = 1;
}

let SubtargetPredicate = HasDot12Insts in {

defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile,
  int_amdgcn_fdot2_f32_bf16, 1>;

} // End SubtargetPredicate = HasDot12Insts

} // End let IsDOT = 1

// Mixed-signedness (iu) dot instructions: the pseudo is defined with no
// pattern and matched explicitly below.
multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
  let IsDOT = 1 in
  defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>,
                        null_frag, 1>;
  // Dot-iu instructions consider input as signed if imod neg bits are set. Thus
  // Dot-iu Intrinsics have extra operands and require separate codegen pattern.
  def : GCNPat < (intrinsic_node (VOP3PModsNeg i32:$src0_mods), i32:$src0,
                                 (VOP3PModsNeg i32:$src1_mods), i32:$src1,
                                 i32:$src2, (i1 timm:$clamp)),
                 (!cast<Instruction>(NAME) $src0_mods, i32:$src0,
                                           $src1_mods, i32:$src1,
                                           (i32 8), i32:$src2, i1:$clamp)
  >;
}

let SubtargetPredicate = HasDot8Insts in {
defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;
defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;

// Signed dot intrinsics are implemented with the iu instructions by setting
// the "treat as signed" modifier (9) on both sources.
def : GCNPat < (int_amdgcn_sdot8 i32:$src0,
                                 i32:$src1,
                                 i32:$src2, (i1 timm:$clamp)),
               (V_DOT8_I32_IU4 (i32 9), i32:$src0,
                               (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp)
>;

def : GCNPat < (int_amdgcn_sdot4 i32:$src0,
                                 i32:$src1,
                                 i32:$src2, (i1 timm:$clamp)),
               (V_DOT4_I32_IU8 (i32 9), i32:$src0,
                               (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp)
>;
} // End SubtargetPredicate = HasDot8Insts

// Does not use opsel, no src_modifiers on src0 and src1.
// src_modifiers on src2(f32) are f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).
def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>,
                                        VOP3_PACKED, 1> {
  let HasClamp = 0;
  let HasOpSel = 0;
  let HasOMod = 0;
  let IsDOT = 1;
  let HasSrc0Mods = 0;
  let HasSrc1Mods = 0;
  let HasSrc2Mods = 1;

  let InsVOP3P = (ins VSrc_b32:$src0, VSrc_b32:$src1,
                      PackedF16InputMods:$src2_modifiers, VSrc_f32:$src2,
                      neg_lo0:$neg_lo, neg_hi0:$neg_hi);

  let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
                         PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
                         neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, Dpp8FI:$fi);

  let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
                          PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
                          neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl,
                          DppRowMask:$row_mask, DppBankMask:$bank_mask,
                          DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
}

// fp8/bf8 dot4 instructions: pseudo with no pattern, matched explicitly with
// src2-only modifiers.
multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> {
  defm NAME : VOP3PInst<OpName, VOP3P_DOTF8_Profile, null_frag, 1>;

  let SubtargetPredicate = isGFX12Plus in
  def : GCNPat <(intrinsic_node i32:$src0, i32:$src1,
                                (VOP3Mods f32:$src2, i32:$src2_modifiers)),
                (!cast<Instruction>(NAME) i32:$src0, i32:$src1,
                                          i32:$src2_modifiers, f32:$src2)>;
}

let OtherPredicates = [HasDot11Insts] in {
defm V_DOT4_F32_FP8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_bf8", int_amdgcn_dot4_f32_fp8_bf8>;
defm V_DOT4_F32_BF8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_fp8", int_amdgcn_dot4_f32_bf8_fp8>;
defm V_DOT4_F32_FP8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_fp8", int_amdgcn_dot4_f32_fp8_fp8>;
defm V_DOT4_F32_BF8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_bf8", int_amdgcn_dot4_f32_bf8_bf8>;
}

def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;

// Match the expanded sum-of-products form of dot4 built with !foldl.
foreach Type = ["U", "I"] in
  let Predicates = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).Predicates in
  def : GCNPat <
    !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
                      (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
    (!cast<VOP3P_Pseudo>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// Match the expanded sum-of-products form of dot8 (non-AC nodes are used so
// TableGen does not explode generating commuted variants).
foreach Type = ["U", "I"] in
  let Predicates = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).Predicates in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [1, 2, 3, 4, 5, 6, 7], lhs, y,
                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
    (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
// in the compile time. Directly handle the pattern generated by the FE here.
foreach Type = ["U", "I"] in
  let Predicates = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).Predicates in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [7, 1, 2, 3, 4, 5, 6], lhs, y,
                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
    (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// Destination operands for MAI instructions: AGPR (ADst_*) and VGPR (VDst_*)
// tuples of various sizes.
def ADst_32   : VOPDstOperand<AGPR_32>;
def ADst_64   : VOPDstOperand<AReg_64>;
def ADst_128  : VOPDstOperand<AReg_128>;
def ADst_256  : VOPDstOperand<AReg_256>;
def ADst_512  : VOPDstOperand<AReg_512>;
def ADst_1024 : VOPDstOperand<AReg_1024>;
def VDst_64   : VOPDstOperand<VReg_64>;
def VDst_128  : VOPDstOperand<VReg_128>;
def VDst_256  : VOPDstOperand<VReg_256>;
def VDst_512  : VOPDstOperand<VReg_512>;
def VDst_1024 : VOPDstOperand<VReg_1024>;

def VOPProfileAccRead : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
  let Src0RC64 = ARegSrc_32;
}

def VOPProfileAccWrite : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
  let DstRC = ADst_32;
  let Src0RC64 = VCSrc_b32;
}

// Profile for MFMA (matrix fused multiply-add) instructions: _SrcRC/_DstRC are
// the accumulator source/destination classes, SrcARC/SrcBRC the A/B operand
// classes. MAI instructions carry cbsz/abid/blgp fields instead of the usual
// VOP3 modifiers.
class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
                    RegisterOperand SrcARC = AVSrc_32, RegisterOperand SrcBRC = SrcARC>
  : VOP3P_Profile<P, VOP3_MAI> {
  bit HasAbid = true;
  let DstRC = _DstRC;
  let Src0RC64 = SrcARC;
  let Src1RC64 = SrcBRC;
  let Src2RC64 = _SrcRC;
  let HasOpSel = 0;
  let HasClamp = 0;
  let HasIntClamp = 0;
  let HasOMod = 0;
  let HasModifiers = 0;
  let AsmVOP3Base = "$vdst, $src0, $src1, $src2$cbsz"#!if(HasAbid,"$abid","")#"$blgp";
  let Ins64 = !con(
    (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, CBSZ:$cbsz),
    !if(HasAbid, (ins ABID:$abid), (ins)),
    (ins blgp:$blgp));
  let InsVOP3Base = Ins64;
  // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs.
  // We then create two versions of the instruction: with tied dst and src2
  // and with the earlyclobber flag on the dst. This is stricter than the
  // actual HW restriction. In particular earlyclobber also affects src0 and
  // src1 allocation which is not required.
  bit NoDstOverlap = !gt(DstVT.Size, 128);
}

// Profile for sparse MFMA (SMFMAC): src2 is the accumulator (tied to dst type)
// and an extra $idx VGPR operand carries the sparsity index data.
class VOPProfileSMFMAC<VOPProfile P, RegisterOperand _DstRC,
                       RegisterOperand _SrcARC, RegisterOperand _SrcBRC>
  : VOPProfileMAI<P, _DstRC, _DstRC, _SrcARC> {
  let Src1RC64 = _SrcBRC;
  let Src2VT = DstVT;
  let Asm64 = " $vdst, $src0, $src1, $idx$cbsz$abid";
  let Outs64 = (outs DstRC:$vdst);
  let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, VRegSrc_32:$idx, CBSZ:$cbsz, ABID:$abid, Src2RC64:$src2);
  let IsSMFMAC = true;
}

// MAI profiles with AGPR accumulators. X4/X16/X32 match the accumulator
// vector length visible in the VOPProfile and register classes.
def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>;
def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, AISrc_512_f32, ADst_512>;
def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, AISrc_1024_f32, ADst_1024>;
def VOPProfileMAI_I32_I32_X4 : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, AISrc_128_b32, ADst_128>;
def VOPProfileMAI_I32_I32_X16 : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, AISrc_512_b32, ADst_512>;
def VOPProfileMAI_I32_I32_X32 : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, AISrc_1024_b32, ADst_1024>;
def VOPProfileMAI_F32_V2I16_X4 : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, AISrc_128_b32, ADst_128>;
def VOPProfileMAI_F32_V2I16_X16 : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, AISrc_512_b32, ADst_512>;
def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, AISrc_1024_b32, ADst_1024>;
def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X4 : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, AISrc_256_f64, ADst_256, AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI<VOP_F64_F64_F64_F64, AISrc_64_f64, ADst_64, AVSrc_64>;
def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_I64_X32 : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_I64_X16 : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;

// _VCD variants use VGPR (VISrc/VDst) accumulator/destination operands instead
// of AGPRs.
def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>;
def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>;
def VOPProfileMAI_F32_F32_X32_VCD : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, VISrc_1024_f32, VDst_1024>;
def VOPProfileMAI_I32_I32_X4_VCD : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, VISrc_128_b32, VDst_128>;
def VOPProfileMAI_I32_I32_X16_VCD : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, VISrc_512_b32, VDst_512>;
def VOPProfileMAI_I32_I32_X32_VCD : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, VISrc_1024_b32, VDst_1024>;
def VOPProfileMAI_F32_V2I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, VISrc_128_b32, VDst_128>;
def VOPProfileMAI_F32_V2I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, VISrc_512_b32, VDst_512>;
def VOPProfileMAI_F32_V2I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, VISrc_1024_b32, VDst_1024>;
def VOPProfileMAI_F32_V4F16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>;
def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, VISrc_128_b32, VDst_128, AVSrc_64>;
def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;

// Sparse MFMA (SMFMAC) profiles; names encode result type and MxNxK shape.
def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_I32_16X16X128_I8 : VOPProfileSMFMAC<VOP_V4I32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_I32_32X32X64_I8 : VOPProfileSMFMAC<VOP_V16I32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>;

def VOPProfileSMFMAC_F32_16X16X128_F8 : VOPProfileSMFMAC<VOP_V4F32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X64_F8 : VOPProfileSMFMAC<VOP_V16F32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>;

def VOPProfileMAI_F32_V8F16_X32 : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>;
def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>;
def VOPProfileMAI_F32_V8F16_X16 : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;

def VOPProfileMAI_F32_V8BF16_X16 : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;

def VOPProfileMAI_F32_V8BF16_X4 : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>;
def VOPProfileMAI_F32_V8BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>;


let HasAbid = false in {
// For f32_16x16x128_f8f6f4 - f8 x f8 case
def VOPProfileMAI_F32_V8I32_V8I32_X128 : VOPProfileMAI<VOP_V4F32_V8I32_V8I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_256>;
def VOPProfileMAI_F32_V8I32_V8I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V8I32_V8I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_256>;

// For f32_16x16x128_f8f6f4 - f8 x f6 case
def VOPProfileMAI_F32_V8I32_V6I32_X128 : VOPProfileMAI<VOP_V4F32_V8I32_V6I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_256, AVSrc_192>;
def VOPProfileMAI_F32_V8I32_V6I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V8I32_V6I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_256, AVSrc_192>;

// For f32_16x16x128_f8f6f4 - f6 x f8 case
def VOPProfileMAI_F32_V6I32_V8I32_X128 : VOPProfileMAI<VOP_V4F32_V6I32_V8I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_192, AVSrc_256>;
def VOPProfileMAI_F32_V6I32_V8I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V6I32_V8I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_192, AVSrc_256>;

// For f32_16x16x128_f8f6f4 - f6 x f6 case
def VOPProfileMAI_F32_V6I32_V6I32_X128 : VOPProfileMAI<VOP_V4F32_V6I32_V6I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_192, AVSrc_192>;
def VOPProfileMAI_F32_V6I32_V6I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V6I32_V6I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_192, AVSrc_192>;

// For f32_16x16x128_f8f6f4 - f6 x f4 case
695def VOPProfileMAI_F32_V6I32_V4I32_X128 : VOPProfileMAI<VOP_V4F32_V6I32_V4I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_192, AVSrc_128>; 696def VOPProfileMAI_F32_V6I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V6I32_V4I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_192, AVSrc_128>; 697 698// For f32_16x16x128_f8f6f4 - f4 x f6 case 699def VOPProfileMAI_F32_V4I32_V6I32_X128 : VOPProfileMAI<VOP_V4F32_V4I32_V6I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_128, AVSrc_192>; 700def VOPProfileMAI_F32_V4I32_V6I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V4I32_V6I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_128, AVSrc_192>; 701 702// For f32_16x16x128_f8f6f4 - f8 x f4 case 703def VOPProfileMAI_F32_V8I32_V4I32_X128 : VOPProfileMAI<VOP_V4F32_V8I32_V4I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_256, AVSrc_128>; 704def VOPProfileMAI_F32_V8I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V8I32_V4I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_256, AVSrc_128>; 705 706// For f32_16x16x128_f8f6f4 - f4 x f8 case 707def VOPProfileMAI_F32_V4I32_V8I32_X128 : VOPProfileMAI<VOP_V4F32_V4I32_V8I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_128, AVSrc_256>; 708def VOPProfileMAI_F32_V4I32_V8I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V4I32_V8I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_128, AVSrc_256>; 709 710// For f32_16x16x128_f8f6f4 - f4 x f4 case 711def VOPProfileMAI_F32_V4I32_V4I32_X128 : VOPProfileMAI<VOP_V4F32_V4I32_V4I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_128, AVSrc_128>; 712def VOPProfileMAI_F32_V4I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V4I32_V4I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_128, AVSrc_128>; 713 714// For f32_32x32x64_f8f6f4 - f8 x f8 case 715def VOPProfileMAI_F32_V8I32_V8I32_X512 : VOPProfileMAI<VOP_V16F32_V8I32_V8I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_256>; 716def VOPProfileMAI_F32_V8I32_V8I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V8I32_V8I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_256>; 717 718// For f32_32x32x64_f8f6f4 - f8 x f6 case 719def VOPProfileMAI_F32_V8I32_V6I32_X512 : 
VOPProfileMAI<VOP_V16F32_V8I32_V6I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_256, AVSrc_192>; 720def VOPProfileMAI_F32_V8I32_V6I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V8I32_V6I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_256, AVSrc_192>; 721 722// For f32_32x32x64_f8f6f4 - f8 x f4 case 723def VOPProfileMAI_F32_V8I32_V4I32_X512 : VOPProfileMAI<VOP_V16F32_V8I32_V4I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_256, AVSrc_128>; 724def VOPProfileMAI_F32_V8I32_V4I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V8I32_V4I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_256, AVSrc_128>; 725 726// For f32_32x32x64_f8f6f4 - f4 x f8 case 727def VOPProfileMAI_F32_V4I32_V8I32_X512 : VOPProfileMAI<VOP_V16F32_V4I32_V8I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_128, AVSrc_256>; 728def VOPProfileMAI_F32_V4I32_V8I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V4I32_V8I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_128, AVSrc_256>; 729 730// For f32_32x32x64_f8f6f4 - f6 x f8 case 731def VOPProfileMAI_F32_V6I32_V8I32_X512 : VOPProfileMAI<VOP_V16F32_V6I32_V8I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_192, AVSrc_256>; 732def VOPProfileMAI_F32_V6I32_V8I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V6I32_V8I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_192, AVSrc_256>; 733 734// For f32_32x32x64_f8f6f4 - f6 x f6 case 735def VOPProfileMAI_F32_V6I32_V6I32_X512 : VOPProfileMAI<VOP_V16F32_V6I32_V6I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_192, AVSrc_192>; 736def VOPProfileMAI_F32_V6I32_V6I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V6I32_V6I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_192, AVSrc_192>; 737 738// For f32_32x32x64_f8f6f4 - f6 x f4 case 739def VOPProfileMAI_F32_V6I32_V4I32_X512 : VOPProfileMAI<VOP_V16F32_V6I32_V4I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_192, AVSrc_128>; 740def VOPProfileMAI_F32_V6I32_V4I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V6I32_V4I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_192, AVSrc_128>; 741 742// For f32_32x32x64_f8f6f4 - f4 x f6 case 743def VOPProfileMAI_F32_V4I32_V6I32_X512 : 
VOPProfileMAI<VOP_V16F32_V4I32_V6I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_128, AVSrc_192>; 744def VOPProfileMAI_F32_V4I32_V6I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V4I32_V6I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_128, AVSrc_192>; 745 746// For f32_32x32x64_f8f6f4 - f4 x f4 case 747def VOPProfileMAI_F32_V4I32_V4I32_X512 : VOPProfileMAI<VOP_V16F32_V4I32_V4I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_128, AVSrc_128>; 748def VOPProfileMAI_F32_V4I32_V4I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V4I32_V4I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_128, AVSrc_128>; 749} 750 751 752// For i32_16x16x64_i8 753def VOPProfileMAI_I32_V4I32_X128 : VOPProfileMAI<VOP_V4I32_V4I32_V4I32_V4I32, AISrc_128_f32, ADst_128, AVSrc_128>; 754def VOPProfileMAI_I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4I32_V4I32_V4I32_V4I32, VISrc_128_f32, VDst_128, AVSrc_128>; 755 756// For i32_32x32x32_i8 757def VOPProfileMAI_I32_V4I32_X16 : VOPProfileMAI<VOP_V16I32_V4I32_V4I32_V16I32, AISrc_512_b32, ADst_512, AVSrc_128>; 758def VOPProfileMAI_I32_V4I32_X16_VCD : VOPProfileMAI<VOP_V16I32_V4I32_V4I32_V16I32, VISrc_512_b32, VDst_512, AVSrc_128>; 759 760 761class MFMATable <bit is_mac, string Kind, string Name, 762 string AGPROpName = NAME> { 763 bit IsMac = is_mac; 764 string FMAOp = Name; 765 string AGPROp = AGPROpName; 766 767 // Does this MFMA use "AGPR" or "VGPR" for srcC/vdst 768 string MFMAKind = Kind; 769} 770 771class MFMA_F8F6F4_WithSizeTable<int A, int B, Instruction ThisVariant, Instruction F8F8Variant> { 772 Instruction F8F8Opcode = F8F8Variant; 773 Instruction Opcode = ThisVariant; 774 bits<8> NumRegsSrcA = A; 775 bits<8> NumRegsSrcB = B; 776} 777 778class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> : 779 MFMA_F8F6F4_WithSizeTable<!srl(ps.Pfl.Src0VT.Size, 5), 780 !srl(ps.Pfl.Src1VT.Size, 5), 781 !cast<Instruction>(NAME), 782 !cast<Instruction>(F8F8Op)> { 783} 784 785// Currently assumes scaled instructions never have abid 786class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid 
= true, bit Scaled = false> : PatFrag <
  // Operand list: scaled ops append the two scale sources and their op_sel
  // immediates; unscaled ops optionally carry abid between cbsz and blgp.
  !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp,
                   node:$scale_src0_opsel, node:$scale_src0,
                   node:$scale_src1_opsel, node:$scale_src1),
      !con((ops node:$src0, node:$src1, node:$src2, node:$cbsz),
           !if(HasAbid, (ops node:$abid), (ops)),
           (ops node:$blgp))),
  !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $scale_src0_opsel, $scale_src0, $scale_src1_opsel, $scale_src1),
      !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
          (Op $src0, $src1, $src2, $cbsz, $blgp))),
  pred
>;

// Selection predicates choosing AGPR vs. VGPR srcC/vdst forms. The SDag
// variants dereference MF with '->', the GlobalISel variants use '.'.
defvar MayNeedAGPRs = [{
  return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
}];

defvar MayNeedAGPRs_gisel = [{
  return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
}];

defvar MayNotNeedAGPRs = [{
  return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
}];

defvar MayNotNeedAGPRs_gisel = [{
  return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
}];

// Fragment matched when the function may need AGPRs (AGPR srcC/vdst form).
class AgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
                  bit Scaled = false> :
  MAIFrag<Op, MayNeedAGPRs, HasAbid, Scaled> {
  let GISelPredicateCode = MayNeedAGPRs_gisel;
}

// Fragment matched when AGPRs are not needed (VGPR srcC/vdst form).
class VgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
                  bit Scaled = false> :
  MAIFrag<Op, MayNotNeedAGPRs, HasAbid, Scaled> {
  let GISelPredicateCode = MayNotNeedAGPRs_gisel;
}

let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
  defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
  let isMoveImm = 1 in {
    defm V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite>;
  } // End isMoveImm = 1
} // End isAsCheapAsAMove = 1, isReMaterializable = 1

// Base pseudo for one MFMA; its fields also form a row of MAIInstInfoTable.
class MAIInst<string OpName, VOPProfile P, SDPatternOperator node, bit Scaled = false>
  : VOP3InstBase<OpName, P, node, /*IsVOP2=*/0, Scaled> {
  let SubtargetPredicate = HasMAIInsts;
  Instruction Opcode = !cast<Instruction>(NAME);
  bit is_dgemm = 0;
  bit is_gfx940_xdl = 0;
  let PseudoInstr = NAME; // FIXME: Why is this not the default
}

// FIXME: Intrinsic should probably not have op_sel operands, we can
// pattern match byte select patterns into op_sel.
// FIXME: Missing neg and clamp modifiers
//
// FIXME: Usual syntax for op_sel is quite hostile here.
class ScaledMAIInst<string OpName, MAIInst BaseInst, SDPatternOperator node> :
  MAIInst<OpName, BaseInst.Pfl, node, /*Scaled=*/true> {
  // Append operands from V_MFMA_LD_SCALE_B32, but we need to rename them.
  let InOperandList = !con(BaseInst.InOperandList,
                           (ins VSrc_b32:$scale_src0,
                                VSrc_b32:$scale_src1,
                                op_sel0:$scale_src0_opsel,
                                op_sel_hi0:$scale_src1_opsel));
  let AsmOperands =
    "$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1"
    "$scale_src0_opsel$scale_src1_opsel$cbsz$blgp";

  let FixedSize = 1;
  let Size = 16;
}

// Emits the AGPR ("_e64") and VGPR-srcC/vdst ("_vgprcd_e64") forms of one
// MFMA, plus tied "_mac" forms when the profile forbids dst/src overlap.
multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
                   bit HasAbid = true,
                   bit Scaled = false> {
  defvar NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap;

  let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
    // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
    let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
      def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
                         !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
                 MFMATable<0, "AGPR", NAME # "_e64">;

      let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in
      def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
                                !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
                        MFMATable<0, "VGPR", NAME # "_vgprcd_e64", NAME # "_e64">;
    }

    if NoDstOverlap then {
      // Tied "_mac" forms: selected initially ($vdst = $src2) and expanded
      // by twoaddressinstruction when src2 is live across the instruction.
      let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
          isConvertibleToThreeAddress = NoDstOverlap,
          Mnemonic = OpName in {
        def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
                                 !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
                         MFMATable<1, "AGPR", NAME # "_e64", NAME # "_mac_e64">;

        let OtherPredicates = [isGFX90APlus] in
        def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
                                      !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
                              MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">;
      }
    }
  } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
}

// Provide a wrapper around MAIInst that provides the appended operands from V_MFMA_LD_SCALE_B32
multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOperator node> {
  defvar VariantSuffix = !subst(!toupper(OpName), "", NAME); // Drop the main opcode name prefix to get the "_fN_fM" suffix.
902 defvar UnscaledOpName = UnscaledOpName_#VariantSuffix; 903 904 defvar HasAbid = false; 905 906 defvar NoDstOverlap = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl).NoDstOverlap; 907 908 def _e64 : ScaledMAIInst<OpName, 909 !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, HasAbid, true>)>, 910 MFMATable<0, "AGPR", NAME # "_e64">; 911 912 def _vgprcd_e64 : ScaledMAIInst<OpName # "_vgprcd", 913 !cast<MAIInst>(UnscaledOpName#"_vgprcd_e64"), !if(NoDstOverlap, null_frag, VgprMAIFrag<node, HasAbid, true>)>, 914 MFMATable<0, "VGPR", NAME # "_vgprcd_e64", NAME # "_e64">; 915 916 if NoDstOverlap then { 917 let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), 918 isConvertibleToThreeAddress = NoDstOverlap, 919 Mnemonic = UnscaledOpName_ in { 920 def _mac_e64 : ScaledMAIInst<OpName # "_mac", 921 !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, HasAbid, true>>, 922 MFMATable<1, "AGPR", NAME # "_e64">; 923 924 def _mac_vgprcd_e64 : ScaledMAIInst<OpName # " _mac_vgprcd", 925 !cast<MAIInst>(UnscaledOpName # "_mac_vgprcd_e64"), VgprMAIFrag<node, HasAbid, true>>, 926 MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">; 927 } 928 } 929} 930 931// Each of SrcA and SrcB can be encoded using 3 different sizes, so 932// define 9 permutations of register classes. 
933multiclass MAIInst_SrcFormats_mc<string OpName, string ProfileSuffix, SDPatternOperator node> { 934 defvar HasAbid = false; 935 defm _f8_f8 : MAIInst<OpName, "F32_V8I32_V8I32"#ProfileSuffix, node, HasAbid>; 936 defm _f8_f6 : MAIInst<OpName, "F32_V8I32_V6I32"#ProfileSuffix, node, HasAbid>; 937 defm _f6_f8 : MAIInst<OpName, "F32_V6I32_V8I32"#ProfileSuffix, node, HasAbid>; 938 defm _f6_f6 : MAIInst<OpName, "F32_V6I32_V6I32"#ProfileSuffix, node, HasAbid>; 939 defm _f8_f4 : MAIInst<OpName, "F32_V8I32_V4I32"#ProfileSuffix, node, HasAbid>; 940 defm _f4_f8 : MAIInst<OpName, "F32_V4I32_V8I32"#ProfileSuffix, node, HasAbid>; 941 defm _f6_f4 : MAIInst<OpName, "F32_V6I32_V4I32"#ProfileSuffix, node, HasAbid>; 942 defm _f4_f6 : MAIInst<OpName, "F32_V4I32_V6I32"#ProfileSuffix, node, HasAbid>; 943 defm _f4_f4 : MAIInst<OpName, "F32_V4I32_V4I32"#ProfileSuffix, node, HasAbid>; 944} 945 946multiclass MAIInst_SrcFormats_Scaled_mc<string OpName, string UnscaledOpName, SDPatternOperator node> { 947 defm _f8_f8 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>; 948 defm _f8_f6 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>; 949 defm _f6_f8 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>; 950 defm _f6_f6 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>; 951 defm _f8_f4 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>; 952 defm _f4_f8 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>; 953 defm _f6_f4 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>; 954 defm _f4_f6 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>; 955 defm _f4_f4 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>; 956} 957 958defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>; 959defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>; 960defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>; 961defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", 
"F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>; 962defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>; 963 964let is_gfx940_xdl = 1 in { 965defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>; 966defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>; 967defm V_MFMA_F32_16X16X4F16 : MAIInst<"v_mfma_f32_16x16x4f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>; 968defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_16x16x16f16>; 969defm V_MFMA_I32_16X16X4I8 : MAIInst<"v_mfma_i32_16x16x4i8", "I32_I32_X16", int_amdgcn_mfma_i32_16x16x4i8>; 970defm V_MFMA_F32_32X32X4F16 : MAIInst<"v_mfma_f32_32x32x4f16", "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>; 971defm V_MFMA_F32_32X32X8F16 : MAIInst<"v_mfma_f32_32x32x8f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>; 972defm V_MFMA_I32_32X32X4I8 : MAIInst<"v_mfma_i32_32x32x4i8", "I32_I32_X32", int_amdgcn_mfma_i32_32x32x4i8>; 973} 974 975let SubtargetPredicate = isGFX908orGFX90A in { 976defm V_MFMA_I32_16X16X16I8 : MAIInst<"v_mfma_i32_16x16x16i8", "I32_I32_X4", int_amdgcn_mfma_i32_16x16x16i8>; 977defm V_MFMA_I32_32X32X8I8 : MAIInst<"v_mfma_i32_32x32x8i8", "I32_I32_X16", int_amdgcn_mfma_i32_32x32x8i8>; 978defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_4x4x2bf16>; 979defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_16x16x2bf16>; 980defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_16x16x8bf16>; 981defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>; 982defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>; 983} 984 985let SubtargetPredicate = 
HasGFX950Insts, is_gfx940_xdl = 1 in {
defm V_MFMA_F32_16X16X32_F16  : MAIInst<"v_mfma_f32_16x16x32f16",  "F32_V8F16_X32",  int_amdgcn_mfma_f32_16x16x32_f16>;
defm V_MFMA_F32_32X32X16_F16  : MAIInst<"v_mfma_f32_32x32x16f16",  "F32_V8F16_X16",  int_amdgcn_mfma_f32_32x32x16_f16>;
defm V_MFMA_F32_16X16X32_BF16 : MAIInst<"v_mfma_f32_16x16x32bf16", "F32_V8BF16_X4",  int_amdgcn_mfma_f32_16x16x32_bf16>;
defm V_MFMA_I32_16X16X64_I8   : MAIInst<"v_mfma_i32_16x16x64i8",   "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
defm V_MFMA_I32_32X32X32_I8   : MAIInst<"v_mfma_i32_32x32x32i8",   "I32_V4I32_X16",  int_amdgcn_mfma_i32_32x32x32_i8>;

// Mixed f8/f6/f4 MFMAs: 9 srcA/srcB size permutations per shape.
defm V_MFMA_F32_16X16X128_F8F6F4 : MAIInst_SrcFormats_mc<"v_mfma_f32_16x16x128f8f6f4",
                                                         "_X128", mfma_f32_16x16x128_f8f6f4>;
defm V_MFMA_F32_32X32X64_F8F6F4 : MAIInst_SrcFormats_mc<"v_mfma_f32_32x32x64f8f6f4",
                                                        "_X512", mfma_f32_32x32x64_f8f6f4>;

// Scaled variants carrying the V_MFMA_LD_SCALE_B32 operands inline.
defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : MAIInst_SrcFormats_Scaled_mc<
  "v_mfma_scale_f32_16x16x128_f8f6f4", "V_MFMA_F32_16X16X128_F8F6F4",
  int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4>;

defm V_MFMA_SCALE_F32_32X32X64_F8F6F4 : MAIInst_SrcFormats_Scaled_mc<
  "v_mfma_scale_f32_32x32x64_f8f6f4",
  "V_MFMA_F32_32X32X64_F8F6F4",
  int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4>;
}

let SubtargetPredicate = HasGFX950Insts in {
defm V_MFMA_LD_SCALE_B32 : VOP3PInst<"v_mfma_ld_scale_b32", VOP_MFMA_LD_SCALE>;
}

let SubtargetPredicate = isGFX90APlus in {
  let is_gfx940_xdl = 1 in {
  defm V_MFMA_F32_32X32X4BF16_1K  : MAIInst<"v_mfma_f32_32x32x4bf16_1k",  "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>;
  defm V_MFMA_F32_16X16X4BF16_1K  : MAIInst<"v_mfma_f32_16x16x4bf16_1k",  "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>;
  defm V_MFMA_F32_4X4X4BF16_1K    : MAIInst<"v_mfma_f32_4x4x4bf16_1k",    "F32_V4I16_X4",  int_amdgcn_mfma_f32_4x4x4bf16_1k>;
  defm V_MFMA_F32_32X32X8BF16_1K  : MAIInst<"v_mfma_f32_32x32x8bf16_1k",  "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>;
  defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4",  int_amdgcn_mfma_f32_16x16x16bf16_1k>;
  }

  let is_dgemm = 1 in {
  defm V_MFMA_F64_16X16X4F64 : MAIInst<"v_mfma_f64_16x16x4f64", "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>;
  defm V_MFMA_F64_4X4X4F64   : MAIInst<"v_mfma_f64_4x4x4f64",   "F64_4X4X4F64",   int_amdgcn_mfma_f64_4x4x4f64>;
  }
} // End SubtargetPredicate = isGFX90APlus

let SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1 in {
  defm V_MFMA_I32_32X32X16I8 : MAIInst<"v_mfma_i32_32x32x16i8", "I32_I64_X32", int_amdgcn_mfma_i32_32x32x16_i8>;
  defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>;
} // End SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1

let SubtargetPredicate = HasXF32Insts, is_gfx940_xdl = 1 in {
  defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>;
  defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>;
} // End SubtargetPredicate = HasXF32Insts, is_gfx940_xdl = 1

let SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 in {
  defm V_MFMA_F32_16X16X32_BF8_BF8 : MAIInst<"v_mfma_f32_16x16x32_bf8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
  defm V_MFMA_F32_16X16X32_BF8_FP8 : MAIInst<"v_mfma_f32_16x16x32_bf8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
  defm V_MFMA_F32_16X16X32_FP8_BF8 : MAIInst<"v_mfma_f32_16x16x32_fp8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
  defm V_MFMA_F32_16X16X32_FP8_FP8 : MAIInst<"v_mfma_f32_16x16x32_fp8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
  defm V_MFMA_F32_32X32X16_BF8_BF8 : MAIInst<"v_mfma_f32_32x32x16_bf8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
  defm V_MFMA_F32_32X32X16_BF8_FP8 : MAIInst<"v_mfma_f32_32x32x16_bf8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
  defm V_MFMA_F32_32X32X16_FP8_BF8 : MAIInst<"v_mfma_f32_32x32x16_fp8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
  defm V_MFMA_F32_32X32X16_FP8_FP8 : MAIInst<"v_mfma_f32_32x32x16_fp8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
} // End SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1

// Sparse MFMA pseudo: vdst is always tied to src2 and src2 is not encoded.
multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
  let Constraints = "$vdst = $src2", DisableEncoding = "$src2",
      isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1, is_gfx940_xdl = 1 in {
    def _e64 : MAIInst<OpName, !cast<VOPProfileSMFMAC>("VOPProfileSMFMAC_" # P), node>;
  }
}

let SubtargetPredicate = isGFX940Plus in {
defm V_SMFMAC_F32_16X16X32_F16  : SMFMACInst<"v_smfmac_f32_16x16x32_f16",  "F32_16X16X32_F16", int_amdgcn_smfmac_f32_16x16x32_f16>;
defm V_SMFMAC_F32_32X32X16_F16  : SMFMACInst<"v_smfmac_f32_32x32x16_f16",  "F32_32X32X16_F16", int_amdgcn_smfmac_f32_32x32x16_f16>;
defm V_SMFMAC_F32_16X16X32_BF16 : SMFMACInst<"v_smfmac_f32_16x16x32_bf16", "F32_16X16X32_I16", int_amdgcn_smfmac_f32_16x16x32_bf16>;
defm V_SMFMAC_F32_32X32X16_BF16 : SMFMACInst<"v_smfmac_f32_32x32x16_bf16", "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>;
defm V_SMFMAC_I32_16X16X64_I8   : SMFMACInst<"v_smfmac_i32_16x16x64_i8",   "I32_16X16X64_I8",  int_amdgcn_smfmac_i32_16x16x64_i8>;
defm V_SMFMAC_I32_32X32X32_I8   : SMFMACInst<"v_smfmac_i32_32x32x32_i8",   "I32_32X32X32_I8",  int_amdgcn_smfmac_i32_32x32x32_i8>;
}

let SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 in {
defm V_SMFMAC_F32_16X16X64_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
defm V_SMFMAC_F32_16X16X64_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
defm V_SMFMAC_F32_16X16X64_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
defm V_SMFMAC_F32_16X16X64_FP8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
defm V_SMFMAC_F32_32X32X32_BF8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
defm V_SMFMAC_F32_32X32X32_BF8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
defm V_SMFMAC_F32_32X32X32_FP8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
defm V_SMFMAC_F32_32X32X32_FP8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
} // End SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1

let SubtargetPredicate = HasGFX950Insts in {
defm V_SMFMAC_F32_16X16X64_F16     : SMFMACInst<"v_smfmac_f32_16x16x64_f16",      "F32_16X16X64_F16",  int_amdgcn_smfmac_f32_16x16x64_f16>;
defm V_SMFMAC_F32_32X32X32_F16     : SMFMACInst<"v_smfmac_f32_32x32x32_f16",      "F32_32X32X32_F16",  int_amdgcn_smfmac_f32_32x32x32_f16>;
defm V_SMFMAC_F32_16X16X64_BF16    : SMFMACInst<"v_smfmac_f32_16x16x64_bf16",     "F32_16X16X64_BF16", int_amdgcn_smfmac_f32_16x16x64_bf16>;
defm V_SMFMAC_F32_32X32X32_BF16    : SMFMACInst<"v_smfmac_f32_32x32x32_bf16",     "F32_32X32X32_BF16", int_amdgcn_smfmac_f32_32x32x32_bf16>;
defm V_SMFMAC_I32_16X16X128_I8     : SMFMACInst<"v_smfmac_i32_16x16x128_i8",      "I32_16X16X128_I8",  int_amdgcn_smfmac_i32_16x16x128_i8>;
defm V_SMFMAC_I32_32X32X64_I8      : SMFMACInst<"v_smfmac_i32_32x32x64_i8",       "I32_32X32X64_I8",   int_amdgcn_smfmac_i32_32x32x64_i8>;
defm V_SMFMAC_F32_16X16X128_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_bf8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_bf8>;
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_fp8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_fp8>;
defm V_SMFMAC_F32_16X16X128_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x128_fp8_bf8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_fp8_bf8>;
defm V_SMFMAC_F32_16X16X128_FP8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x128_fp8_fp8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_fp8_fp8>;
defm V_SMFMAC_F32_32X32X64_BF8_BF8  : SMFMACInst<"v_smfmac_f32_32x32x64_bf8_bf8",  "F32_32X32X64_F8",  int_amdgcn_smfmac_f32_32x32x64_bf8_bf8>;
defm V_SMFMAC_F32_32X32X64_BF8_FP8  : SMFMACInst<"v_smfmac_f32_32x32x64_bf8_fp8",  "F32_32X32X64_F8",  int_amdgcn_smfmac_f32_32x32x64_bf8_fp8>;
defm V_SMFMAC_F32_32X32X64_FP8_BF8  : SMFMACInst<"v_smfmac_f32_32x32x64_fp8_bf8",  "F32_32X32X64_F8",  int_amdgcn_smfmac_f32_32x32x64_fp8_bf8>;
defm V_SMFMAC_F32_32X32X64_FP8_FP8  : SMFMACInst<"v_smfmac_f32_32x32x64_fp8_fp8",  "F32_32X32X64_F8",  int_amdgcn_smfmac_f32_32x32x64_fp8_fp8>;
}

// Searchable table over all MAIInst records, keyed by opcode.
def MAIInstInfoTable : GenericTable {
  let FilterClass = "MAIInst";
  let CppTypeName = "MAIInstInfo";
  let Fields = [
    "Opcode", "is_dgemm", "is_gfx940_xdl"
  ];

  let PrimaryKey = ["Opcode"];
  let PrimaryKeyName = "getMAIInstInfoHelper";
}

let isCommutable = 1, isReMaterializable = 1 in {
  let SubtargetPredicate = HasPackedFP32Ops in {
    defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
    defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
    defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
  } // End SubtargetPredicate = HasPackedFP32Ops

  let SubtargetPredicate =
HasPkMovB32 in
  defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
} // End isCommutable = 1, isReMaterializable = 1

def : AMDGPUMnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
def : AMDGPUMnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;

// WMMA profile: wave32 ("_w32") uses 256-bit dst/srcC register classes,
// wave64 uses 128-bit.
class VOPProfileWMMA<VOPProfile P, string Suffix, RegisterOperand _Src01RC64, bit _HasClamp, bit _HasOpSel> : VOP3P_Profile<P> {
  let DstRC = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128);
  let Src0RC64 = _Src01RC64;
  let Src1RC64 = _Src01RC64;
  let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32);
  let HasClamp = _HasClamp;
  let HasOpSel = _HasOpSel;
  let IsPacked = 1;
  let IsWMMA = 1;
}

// Value-type profiles for the wave32 WMMA variants.
def VOP_V8F32_V16F16_V16F16_V8F32   : VOPProfile <[v8f32, v16f16, v16f16, v8f32]>;
def VOP_V8F32_V16I16_V16I16_V8F32   : VOPProfile <[v8f32, v16i16, v16i16, v8f32]>;
def VOP_V16F16_V16F16_V16F16_V16F16 : VOPProfile <[v16f16, v16f16, v16f16, v16f16]>;
def VOP_V16I16_V16I16_V16I16_V16I16 : VOPProfile <[v16i16, v16i16, v16i16, v16i16]>;
def VOP_V8I32_V4I32_V4I32_V8I32     : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>;
def VOP_V8I32_V2I32_V2I32_V8I32     : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>;

// Value-type profiles for the wave64 WMMA variants.
def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>;
def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>;
def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>;
def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>;
def VOP_V4I32_V2I32_V2I32_V4I32   : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>;


// Encodes which modifier operands a WMMA flavor carries (clamp, opsel).
class WMMAType <bits<2> val> {
  bit hasClamp = val{0};
  bit hasOpsel = val{1};
}

def WMMARegular : WMMAType<0b00>;
def WMMAUIClamp : WMMAType<0b01>;
def WMMAOpSel   : WMMAType<0b10>;

// Pattern for WMMA ops whose sources take regular VOP3P modifiers.
class WMMARegularPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
  GCNPat < (P.DstVT (node
    (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)),
    (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
    (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))
  )),
  (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, $src2_modifiers, P.Src2VT:$src2))
>;

// Pattern for opsel WMMA ops: src0/src1 modifiers are fixed to 8 (OP_SEL_1);
// only the src2 op_sel immediate comes from the intrinsic.
class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
  GCNPat < (P.DstVT (node
    (P.Src0VT P.Src0VT:$src0),
    (P.Src1VT P.Src1VT:$src1),
    (P.Src2VT P.Src2VT:$src2), (WMMAOpSelVOP3PMods i32:$src2_modifiers)
  )),
  (P.DstVT (Inst (i32 8), P.Src0VT:$src0, (i32 8), P.Src1VT:$src1, i32:$src2_modifiers, P.Src2VT:$src2))
>;

// Pattern for integer WMMA ops: neg modifiers on src0/src1 plus a clamp bit;
// src2 modifiers are fixed to 8.
class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
  GCNPat < (P.DstVT (node
    (VOP3PModsNeg i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0),
    (VOP3PModsNeg i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1),
    (P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp)
  )),
  (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp))
>;

// Relates the tied two-address WMMA pseudo to its three-address form.
class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
  Instruction Opcode2Addr = TwoAddr;
  Instruction Opcode3Addr = ThreeAddr;
  Predicate WaveSizePredicate;
}

def WMMAOpcode : GenericEnum {
  let FilterClass = "VOP3P_Pseudo";
}

class WMMAMappingTable : GenericTable {
  let FilterClass = "WMMAOpcodeMapping";
  let CppTypeName = "WMMAOpcodeMappingInfo";
  let Fields = ["Opcode2Addr", "Opcode3Addr"];
  string TypeOf_Opcode2Addr = "WMMAOpcode";
  string TypeOf_Opcode3Addr = "WMMAOpcode";
}

def WMMAOpcode2AddrMappingTable : WMMAMappingTable {
  let PrimaryKey = ["Opcode2Addr"];
  let PrimaryKeyName = "getWMMAMappingInfoFrom2AddrOpcode";
}

def WMMAOpcode3AddrMappingTable : WMMAMappingTable {
  let PrimaryKey = ["Opcode3Addr"];
  let PrimaryKeyName = "getWMMAMappingInfoFrom3AddrOpcode";
}

// The WMMA instruction has extra constraints:
// Matrices A and B cannot overlap with D. C cannot partially overlap with D,
// but it is OK for them to be the same (which is a typical case).
//
// We implement it as follows:
// 1) Map the intrinsic to the pseudo where D is tied to C ($vdst = $src2).
// 2) The pass twoaddressinstruction checks if src2 is live and if that is the case
//    it converts the default pseudo to the pseudo where src2 is not the same as vdst.
// 3) @earlyclobber on the destination satisfies the constraint during RA.

multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type, bit convertibleTo3Addr> {

  defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2";
  defvar WMMAConstraints3Addr = "@earlyclobber $vdst";

  defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
  let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
    let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
      def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
    }
  }
  if convertibleTo3Addr then {
    let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
      let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
        def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
      }
    }
    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
                            !cast<Instruction>(NAME # _threeaddr # Suffix)>;
  }

  // Intrinsic selection pattern, chosen by the modifier flavor.
  let SubtargetPredicate = isGFX11Only in {
    if !eq(Type, WMMAOpSel) then {
      def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
    } else if !eq(Type, WMMAUIClamp) then {
      def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
    } else {
      def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
    }
  }
}



let WaveSizePredicate = isWave32 in {
  defm V_WMMA_F32_16X16X16_F16       : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16",   VOP_V8F32_V16F16_V16F16_V8F32,   int_amdgcn_wmma_f32_16x16x16_f16,       VRegSrc_256, WMMARegular, 1>;
  defm V_WMMA_F32_16X16X16_BF16      : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16",  VOP_V8F32_V16I16_V16I16_V8F32,   int_amdgcn_wmma_f32_16x16x16_bf16,      VRegSrc_256, WMMARegular, 1>;
  defm V_WMMA_F16_16X16X16_F16       : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16",   VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16,       VRegSrc_256, WMMAOpSel, 1>;
  defm V_WMMA_BF16_16X16X16_BF16     : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16,     VRegSrc_256, WMMAOpSel, 1>;
  defm V_WMMA_F16_16X16X16_F16_TIED  : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16",   VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16_tied,  VRegSrc_256, WMMAOpSel, 0>;
  defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>;
  defm V_WMMA_I32_16X16X16_IU8       : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8",   VOP_V8I32_V4I32_V4I32_V8I32,     int_amdgcn_wmma_i32_16x16x16_iu8,       VRegSrc_128, WMMAUIClamp, 1>;
  defm V_WMMA_I32_16X16X16_IU4       : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4",   VOP_V8I32_V2I32_V2I32_V8I32,     int_amdgcn_wmma_i32_16x16x16_iu4,       VRegSrc_64,  WMMAUIClamp, 1>;
}

let WaveSizePredicate = isWave64 in {
  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
  defm
V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>; 1264 defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>; 1265 defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>; 1266 defm V_WMMA_F16_16X16X16_F16_TIED : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16_tied, VRegSrc_256, WMMAOpSel, 0>; 1267 defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>; 1268 defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>; 1269 defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>; 1270 1271} 1272 1273class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, 1274 bit _IsIU, bit _IsFP8BF8> 1275 : VOP3P_Profile<VOPProfile<ArgTy>> { 1276 bit IsIU = _IsIU; 1277 bit IsFP8BF8 = _IsFP8BF8; 1278 bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8)); 1279 1280 int IndexType = _IndexType; 1281 1282 let IsPacked = 1; 1283 let IsWMMA = !not(_IsSWMMAC); 1284 let IsSWMMAC = _IsSWMMAC; 1285 1286 bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP); 1287 bit IsAB_BF16 = !and(IsF16BF16, isIntType<ArgTy[1]>.ret); 1288 bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32)); 1289 bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16)); 1290 bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); 1291 1292 bit NegLo01 = 
!or(IsF16BF16, IsIU); 1293 bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); 1294 bit NegHi01 = IsF16BF16; 1295 bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); 1296 bit NegLoAny = !or(NegLo01, NegLo2); 1297 bit NegHiAny = !or(NegHi01, NegHi2); 1298 1299 let DstRC = !cast<RegisterOperand>("VDst_"#ArgTy[0].Size); 1300 let Src0RC64 = !cast<RegisterOperand>("VRegSrc_"#ArgTy[1].Size); 1301 let Src1RC64 = !cast<RegisterOperand>("VRegSrc_"#ArgTy[2].Size); 1302 let Src2RC64 = !if(IsSWMMAC, DstRC, 1303 !cast<RegisterOperand>("VISrc_"#ArgTy[3].Size# 1304 !cond(IsC_F32: "_f32", 1305 IsC_F16: "_f16", 1306 IsC_BF16: "_bf16", 1307 1: "_b32"))); 1308 1309 // For f16 and bf16 matrices A and B, each element can be modified by 1310 // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is 1311 // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext) 1312 // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each 1313 // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). 
1314 1315 // Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index 1316 // --------------------------------------------------------------------------- 1317 // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) 1318 // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) 1319 // --------------------------------------------------------------------------- 1320 // wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16) 1321 // wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16) 1322 // --------------------------------------------------------------------------- 1323 // wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for 1324 // | neg_lo = 1 i4/i8(sext) | i32 matrices 1325 // --------------------------------------------------------------------------- 1326 // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32) 1327 // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32) 1328 // --------------------------------------------------------------------------- 1329 // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix 1330 // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst 1331 // --------------------------------------------------------------------------- 1332 // swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix 1333 // swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst 1334 // --------------------------------------------------------------------------- 1335 // swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix 1336 // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst 1337 // --------------------------------------------------------------------------- 1338 // swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix 1339 // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst 1340 1341 // pseudo 1342 1343 // fp8bf8 wmmas don't use src (0 and 1) modifiers, 
iu use neg_lo, f16 and bf16 1344 // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, 1345 // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 1346 // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. 1347 1348 dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers)); 1349 dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers)); 1350 dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers)); 1351 dag IndexKey = !cond(!eq(IndexType, 0) : (ins), 1352 !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), 1353 !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit)); 1354 dag Clamp = !if(IsIU, (ins Clamp0:$clamp), (ins)); 1355 dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), 1356 !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), 1357 !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); 1358 1359 let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1), 1360 !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)), 1361 IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)), 1362 Clamp, Neg); 1363 1364 // asm 1365 1366 string IndexKeyAsm = !cond(!eq(IndexType, 0) : "", 1367 !eq(IndexType, 8) : "$index_key_8bit", 1368 !eq(IndexType, 16) : "$index_key_16bit"); 1369 string ClampAsm = !if(IsIU, "$clamp", ""); 1370 string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", 1371 !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", 1372 !and(!not(NegLoAny), !not(NegHiAny)) : ""); 1373 1374 let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm; 1375 1376 // isel patterns 1377 1378 dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), 1379 IsAB_BF16 : (ins Src0VT:$src0), 1380 IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), 1381 IsFP8BF8 : (ins Src0VT:$src0)); 1382 dag Src0OutPat = !cond(IsAB_F16 : (ins 
i32:$src0_modifiers, Src0VT:$src0), 1383 IsAB_BF16 : (ins (i32 8), Src0VT:$src0), 1384 IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), 1385 IsFP8BF8 : (ins Src0VT:$src0)); 1386 dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), 1387 IsAB_BF16 : (ins Src1VT:$src1), 1388 IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), 1389 IsFP8BF8 : (ins Src1VT:$src1)); 1390 dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1), 1391 IsAB_BF16 : (ins (i32 8), Src1VT:$src1), 1392 IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), 1393 IsFP8BF8 : (ins Src1VT:$src1)); 1394 dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), 1395 IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), 1396 IsC_BF16 : (ins Src2VT:$src2), 1397 IsIU : (ins Src2VT:$src2), 1398 IsSWMMAC : (ins)); 1399 dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2), 1400 IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2), 1401 IsC_BF16 : (ins (i32 8), Src2VT:$src2), 1402 IsIU : (ins Src2VT:$src2), 1403 IsSWMMAC : (ins)); 1404 dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins)); 1405 dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), 1406 !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), 1407 !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit)))); 1408 dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), 1409 !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), 1410 !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit)); 1411 dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2))); 1412 dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); 1413 1414 1415 dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat); 1416 dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat); 1417 1418 dag SwmmacInPat = 
!con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat); 1419 dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat); 1420 1421 // wmma pattern where src2 is inline imm uses _threeaddr pseudo, 1422 // can't use _twoaddr since it would violate src2 tied to vdst constraint. 1423 dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat); 1424 dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat); 1425} 1426 1427multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> { 1428 let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { 1429 let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in 1430 def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{ 1431 let PseudoInstr = Instr#PseudoInstrSuffix; 1432 } 1433 1434 let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in 1435 def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>{ 1436 let PseudoInstr = Instr#PseudoInstrSuffix; 1437 } 1438 1439 } 1440 def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr), 1441 !cast<Instruction>(NAME # _threeaddr)>; 1442} 1443 1444multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> { 1445 def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{ 1446 let Mnemonic = Instr; 1447 let PseudoInstr = Instr#PseudoInstrSuffix; 1448 let mayRaiseFPException = 0; 1449 let ReadsModeReg = 0; 1450 let AsmMatchConverter = "cvtSWMMAC"; 1451 1452 let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; 1453 } 1454} 1455 1456// First argument in Profile is types for matrices D, A, B and C (D = A * B + C) 1457// as used by llvm ir, types are vectors(with matrix elements) 1458// wave32: 1459// For 16x16 matrices, lanes 0 to 31 will have 8 matrix elts, 1460// for 16 x 32 16 elts and for 16 x 64 lanes have 32 elts. 
// wave64:
// lanes will have half the size of elements in lanes compared to wave32 with
// exception of 16x16_iu4: lanes 0-31 will have 8xi4, remaining lanes are ignored

// general idea on element distribution differences:
// wave32: lane n has 8 matrix elements
// wave64: lane n has first 4, lane n+32 has other 4 elements

// index size: for each 2 elements in a lane you need 4 bits in the index

// Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s.
// Original type for them is in comment on the right and refers to A and B.

// VOP3PWMMA_Profile template arguments (see the class definition above):
//   ArgTy     - llvm ir vector types for matrices [D, A, B, C] (D = A * B + C)
//   IsSWMMAC  - 1 selects the sparse (swmmac) variant, 0 the dense wmma form
//   IndexType - swmmac index_key width in bits; 0 means a plain i32 index with
//               no index_key operand
//   IsIU      - integer (iu8/iu4) A/B matrices
//   IsFP8BF8  - fp8/bf8 A/B matrices

// Dense WMMA profiles, wave32.
def F32_F16_WMMA_w32    : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>;
def F32_BF16_WMMA_w32   : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>;
def F16_F16_WMMA_w32    : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>;
def BF16_BF16_WMMA_w32  : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>;
def I32_IU8_WMMA_w32    : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8
def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32],     0, 0, 1, 0>; // 8xi4
def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8
def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4

// Dense WMMA profiles, wave64.
def F32_F16_WMMA_w64    : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>;
def F32_BF16_WMMA_w64   : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>;
def F16_F16_WMMA_w64    : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>;
def BF16_BF16_WMMA_w64  : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>;
def I32_IU8_WMMA_w64    : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32],     0, 0, 1, 0>; // 4xi8
def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32],     0, 0, 1, 0>; // 8xi4 *
def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32],     0, 0, 0, 1>; // 4xf8
def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32],     0, 0, 1, 0>; // 8xi4

// Sparse SWMMAC profiles, wave32. Matrix B holds twice the elements of A;
// comments on the right give the original element types of A, B.
def F32_F16_SWMMAC_w32    : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>;
def F32_BF16_SWMMAC_w32   : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>;
def F16_F16_SWMMAC_w32    : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>;
def BF16_BF16_SWMMAC_w32  : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>;
def I32_IU8_SWMMAC_w32    : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32],  1, 16, 1, 0>; // 8xi8, 16xi8
def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32],    1, 16, 1, 0>; // 8xi4, 16xi4
def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32],  1, 0,  1, 0>; // 16xi4, 32xi4 **
def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32],  1, 16, 0, 1>; // 8xf8, 16xf8

// Sparse SWMMAC profiles, wave64.
def F32_F16_SWMMAC_w64    : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8,  0, 0>;
def F32_BF16_SWMMAC_w64   : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8,  0, 0>;
def F16_F16_SWMMAC_w64    : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8,  0, 0>;
def BF16_BF16_SWMMAC_w64  : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8,  0, 0>;
def I32_IU8_SWMMAC_w64    : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32],   1, 8,  1, 0>; // 4xi8, 8xi8
def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32],     1, 16, 1, 0>; // 8xi4, 8xi4 ***
def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32],   1, 16, 1, 0>; // 8xi4, 16xi4
def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32],   1, 8,  0, 1>; // 4xf8, 8xf8

// *   IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored
// **  IU4X64_SWMMAC_w32 index is i32, index_key is not used
// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored
//     for matrix A, index is i16; Matrix B uses all lanes

// GFX12 pseudo instructions. Each WMMAInstGFX12 defines both the _twoaddr
// (D tied to C) and _threeaddr pseudos plus their opcode-mapping entry;
// SWMMACInstGFX12 defines only a _twoaddr pseudo (D tied to $srcTiedDef).
// Wave32 variants:
let WaveSizePredicate = isWave32 in {
defm V_WMMA_F32_16X16X16_F16_w32     : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16",     F32_F16_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_BF16_w32    : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16",    F32_BF16_WMMA_w32, "_w32">;
defm V_WMMA_F16_16X16X16_F16_w32     : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16",     F16_F16_WMMA_w32, "_w32">;
defm V_WMMA_BF16_16X16X16_BF16_w32   : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16",   BF16_BF16_WMMA_w32, "_w32">;
defm V_WMMA_I32_16X16X16_IU8_w32     : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8",     I32_IU8_WMMA_w32, "_w32">;
defm V_WMMA_I32_16X16X16_IU4_w32     : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4",     I32_IU4X16_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
defm V_WMMA_I32_16X16X32_IU4_w32     : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4",     I32_IU4X32_WMMA_w32, "_w32">;

defm V_SWMMAC_F32_16X16X32_F16_w32     : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16",     F32_F16_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_BF16_w32    : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16",    F32_BF16_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F16_16X16X32_F16_w32     : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16",     F16_F16_SWMMAC_w32, "_w32">;
defm V_SWMMAC_BF16_16X16X32_BF16_w32   : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16",   BF16_BF16_SWMMAC_w32, "_w32">;
defm V_SWMMAC_I32_16X16X32_IU8_w32     : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8",     I32_IU8_SWMMAC_w32, "_w32">;
defm V_SWMMAC_I32_16X16X32_IU4_w32     : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4",     I32_IU4X32_SWMMAC_w32, "_w32">;
defm V_SWMMAC_I32_16X16X64_IU4_w32     : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4",     I32_IU4X64_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
}

// Wave64 variants:
let WaveSizePredicate = isWave64 in {
defm V_WMMA_F32_16X16X16_F16_w64     : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16",     F32_F16_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_BF16_w64    : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16",    F32_BF16_WMMA_w64, "_w64">;
defm V_WMMA_F16_16X16X16_F16_w64     : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16",     F16_F16_WMMA_w64, "_w64">;
defm V_WMMA_BF16_16X16X16_BF16_w64   : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16",   BF16_BF16_WMMA_w64, "_w64">;
defm V_WMMA_I32_16X16X16_IU8_w64     : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8",     I32_IU8_WMMA_w64, "_w64">;
defm V_WMMA_I32_16X16X16_IU4_w64     : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4",     I32_IU4X16_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
defm V_WMMA_I32_16X16X32_IU4_w64     : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4",     I32_IU4X32_WMMA_w64, "_w64">;

defm V_SWMMAC_F32_16X16X32_F16_w64     : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16",     F32_F16_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_BF16_w64    : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16",    F32_BF16_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F16_16X16X32_F16_w64     : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16",     F16_F16_SWMMAC_w64, "_w64">;
defm V_SWMMAC_BF16_16X16X32_BF16_w64   : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16",   BF16_BF16_SWMMAC_w64, "_w64">;
defm V_SWMMAC_I32_16X16X32_IU8_w64     : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8",     I32_IU8_SWMMAC_w64, "_w64">;
defm V_SWMMAC_I32_16X16X32_IU4_w64     : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4",     I32_IU4X32_SWMMAC_w64, "_w64">;
defm V_SWMMAC_I32_16X16X64_IU4_w64     : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4",     I32_IU4X64_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
}

// IsGFX11OpselIntrinsic: f16_f16 and bf16_bf16 Intrinsics have imm operand that
// controls opsel. Used by gfx11, removed in gfx12 (operand must be 0).
1569multiclass WMMAPat<string Inst, SDPatternOperator node, VOP3PWMMA_Profile P, bit IsGFX11OpselIntrinsic = 0> { 1570 def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), 1571 (P.DstVT !setdagop(P.WmmaOutPat, !cast<Instruction>(Inst#"_twoaddr")))>; 1572 let AddedComplexity = 4 in 1573 def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), 1574 (P.DstVT !setdagop(P.WmmaInlineOutPat, !cast<Instruction>(Inst#"_threeaddr")))>; 1575} 1576 1577class SWMMACPat<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> : 1578 GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), 1579 (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>; 1580 1581class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> : 1582 GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), 1583 (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{ 1584 let WaveSizePredicate = isWave64; 1585 } 1586 1587let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { 1588 defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>; 1589 defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; 1590 defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; 1591 defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>; 1592 defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>; 1593 defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>; 1594 defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>; 1595 defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>; 1596 defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", 
int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>; 1597 defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>; 1598 defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>; 1599 1600 def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w32>; 1601 def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w32>; 1602 def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w32>; 1603 def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w32>; 1604 def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w32>; 1605 def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w32>; 1606 def : GCNPat <(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacInPat, int_amdgcn_swmmac_i32_16x16x64_iu4)), 1607 (I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacOutPat, V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr))>; 1608 def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w32>; 1609 def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w32>; 1610 def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w32>; 1611 def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>; 1612} 1613 1614let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { 1615 defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; 1616 defm : 
WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; 1617 defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; 1618 defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>; 1619 defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>; 1620 defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>; 1621 defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>; 1622 defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>; 1623 defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>; 1624 defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>; 1625 defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>; 1626 1627 def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w64>; 1628 def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w64>; 1629 def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w64>; 1630 def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w64>; 1631 def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w64>; 1632 def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w64>; 1633 def : SWMMACPat<V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x64_iu4, I32_IU4X64_SWMMAC_w64>; 1634 def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr, 
int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w64>; 1635 def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w64>; 1636 def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w64>; 1637 def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>; 1638} 1639 1640 1641//===----------------------------------------------------------------------===// 1642// Begin Real Encodings 1643//===----------------------------------------------------------------------===// 1644 1645class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget, 1646 string opName = ps.OpName> 1647 : VOP3P_DPP<op, opName, ps.Pfl, 1>, SIMCInstr<ps.PseudoInstr, subtarget> { 1648 let hasSideEffects = ps.hasSideEffects; 1649 let Defs = ps.Defs; 1650 let SchedRW = ps.SchedRW; 1651 let Uses = ps.Uses; 1652 let AssemblerPredicate = HasDPP16; 1653 let SubtargetPredicate = ps.SubtargetPredicate; 1654 let OtherPredicates = ps.OtherPredicates; 1655 let IsPacked = ps.IsPacked; 1656} 1657 1658class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName> 1659 : VOP3P_DPP8<op, opName, ps.Pfl> { 1660 let hasSideEffects = ps.hasSideEffects; 1661 let Defs = ps.Defs; 1662 let SchedRW = ps.SchedRW; 1663 let Uses = ps.Uses; 1664 let SubtargetPredicate = ps.SubtargetPredicate; 1665 let OtherPredicates = ps.OtherPredicates; 1666 let IsPacked = ps.IsPacked; 1667} 1668 1669//===----------------------------------------------------------------------===// 1670// GFX11, GFX12 1671//===----------------------------------------------------------------------===// 1672 1673multiclass VOP3P_Real_Base<GFXGen Gen, bits<7> op, string backing_ps_name = NAME, 1674 string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { 1675 def Gen.Suffix : 1676 VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>, 1677 
VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
}

// WMMA/SWMMAC flavor of the GFX11/GFX12 VOP3P encoding. The opsel,
// opsel_hi, neg_lo/neg_hi and clamp fields of the instruction word are
// repurposed according to the WMMA profile's flags.
class VOP3PeWmma<bits<7> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
    : VOP3Pe_gfx11_gfx12<op, P>{
  // opsel: carries the sparse index_key operand (when the profile has one).
  let Inst{11} = !cond(!eq(WMMAP.IndexType, 0)  : 0,
                       !eq(WMMAP.IndexType, 8)  : index_key_8bit{0},
                       !eq(WMMAP.IndexType, 16) : index_key_16bit{0});
  let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
  let Inst{13} = 0;
  // opsel_hi: hard-wired to 1 for these instructions.
  let Inst{59} = 1;
  let Inst{60} = 1;
  let Inst{14} = 1;
  // neg_lo: only encoded for sources the profile allows to be negated.
  let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
  let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
  let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0);
  // neg_hi
  let Inst{8}  = !if(WMMAP.NegHi01, src0_modifiers{1}, 0);
  let Inst{9}  = !if(WMMAP.NegHi01, src1_modifiers{1}, 0);
  let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0);
  // clamp: only meaningful for the integer (IU) variants.
  let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0);
}

// Real WMMA/SWMMAC encoding for generation 'Gen' using the encoding
// overrides from VOP3PeWmma above.
multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<7> op, VOP3PWMMA_Profile WMMAP,
                                string backing_ps_name = NAME,
                                string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
  def Gen.Suffix :
    VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
    VOP3PeWmma<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl, WMMAP>;
}

// GFX12 wave32 WMMA real encodings; decoded in the "GFX12" namespace.
multiclass VOP3P_Real_WMMA_gfx12 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
  let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
    defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
  }
}

// GFX12 wave64 WMMA real encodings; separate "GFX12W64" decoder namespace
// keeps the two wave sizes from clashing during disassembly.
multiclass VOP3P_Real_WMMA_gfx12w64 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
  let WaveSizePredicate = isWave64, DecoderNamespace = "GFX12W64" in {
    defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
  }
}

defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>;
defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>;
defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;

defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>;
defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>;
defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;


defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>;
defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;

defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;

// Real encoding for an instruction that is printed under a new mnemonic
// 'asmName'. Also registers a mnemonic alias so the pseudo's original
// name is still accepted by the assembler for this generation.
multiclass VOP3P_Real_with_name<GFXGen Gen, bits<7> op,
                                string backing_ps_name = NAME,
                                string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
  defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
  let AsmString = asmName # ps.AsmOperands in
  def Gen.Suffix :
    VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
    VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;

  def : AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
    let AssemblerPredicate = Gen.AssemblerPredicate;
  }
}

// DPP16 real encoding built from the pseudo's "_dpp" companion.
multiclass VOP3P_Real_dpp<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
                          string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
  defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
  def _dpp#Gen.Suffix
      : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"),
                    Gen.Subtarget> {
    let AsmString = asmName #ps.Pfl.AsmVOP3DPP16;
    let DecoderNamespace = Gen.DecoderNamespace;
    let AssemblerPredicate = Gen.AssemblerPredicate;
  }
}

// DPP8 real encoding built directly from the base pseudo.
multiclass VOP3P_Real_dpp8<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
                           string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
  defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
  def _dpp8#Gen.Suffix : VOP3P_DPP8_Base<op, ps> {
    let AsmString = asmName #ps.Pfl.AsmVOP3DPP8;
    let DecoderNamespace = Gen.DecoderNamespace;
    let AssemblerPredicate = Gen.AssemblerPredicate;
  }
}

// Emits all three real encodings for one opcode: plain VOP3P, DPP16 and DPP8.
multiclass VOP3P_Realtriple<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
                            string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic>
    : VOP3P_Real_Base<Gen, op, backing_ps_name, asmName>,
      VOP3P_Real_dpp<Gen, op, backing_ps_name, asmName>,
      VOP3P_Real_dpp8<Gen, op, backing_ps_name, asmName>;

//===----------------------------------------------------------------------===//
// GFX12
//===----------------------------------------------------------------------===//

multiclass VOP3P_Real_gfx12<bits<7> op> : VOP3P_Real_Base<GFX12Gen, op>;

multiclass VOP3P_Real_with_name_gfx12<bits<7> op,
                                      string backing_ps_name = NAME,
                                      string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> :
  VOP3P_Real_with_name<GFX12Gen, op, backing_ps_name, asmName>;

// GFX12 renamed v_pk_min/max_f16 to the *_num_* forms; the old mnemonics
// remain accepted via the alias emitted by VOP3P_Real_with_name.
defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;

defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;

defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x24>;
defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x25>;
defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x26>;
defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x27>;

//===----------------------------------------------------------------------===//
// GFX11
//===----------------------------------------------------------------------===//

multiclass VOP3P_Real_gfx11_gfx12<bits<7> op> :
  VOP3P_Real_Base<GFX11Gen, op>, VOP3P_Real_Base<GFX12Gen, op>;

defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11_gfx12<0x16>;
defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11_gfx12<0x18>;
defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11_gfx12<0x1a>;

// Pre-GFX11 dot-product mnemonics are accepted as aliases of the IU forms.
let AssemblerPredicate = isGFX11Plus in {
  def : AMDGPUMnemonicAlias<"v_dot4_i32_i8", "v_dot4_i32_iu8">;
  def : AMDGPUMnemonicAlias<"v_dot8_i32_i4", "v_dot8_i32_iu4">;
}

// GFX11 WMMA real encodings, one per wave size, each in its own decoder
// namespace.
multiclass VOP3P_Real_WMMA <bits<7> op> {
  let WaveSizePredicate = isWave32, DecoderNamespace = "GFX11" in {
    defm _twoaddr_w32 : VOP3P_Real_Base <GFX11Gen, op>;
  }
  let WaveSizePredicate = isWave64, DecoderNamespace = "GFX11W64" in {
    defm _twoaddr_w64 : VOP3P_Real_Base <GFX11Gen, op>;
  }
}

defm V_WMMA_F32_16X16X16_F16 : VOP3P_Real_WMMA <0x040>;
defm V_WMMA_F32_16X16X16_BF16 : VOP3P_Real_WMMA <0x041>;
defm V_WMMA_F16_16X16X16_F16 : VOP3P_Real_WMMA <0x042>;
defm V_WMMA_BF16_16X16X16_BF16 : VOP3P_Real_WMMA <0x043>;
defm V_WMMA_I32_16X16X16_IU8 : VOP3P_Real_WMMA <0x044>;
defm V_WMMA_I32_16X16X16_IU4 : VOP3P_Real_WMMA <0x045>;

//===----------------------------------------------------------------------===//
// GFX8 (VI)
//===----------------------------------------------------------------------===//

// GFX8/GFX9 ("VI" encoding family) real encoding for a VOP3P pseudo.
multiclass VOP3P_Real_vi<bits<7> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
            VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
    let AssemblerPredicate = HasVOP3PInsts;
    let DecoderNamespace = "GFX8";
    let VOP3P = 1;
  }
}

// MAI (matrix/accumulator) real encoding; the op_sel_hi bits are left
// undefined ('?') in this encoding.
multiclass VOP3P_Real_MAI<bits<7> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
    let AssemblerPredicate = HasMAIInsts;
    let DecoderNamespace = "GFX8";
    let Inst{14} = ?; // op_sel_hi(2)
    let Inst{59} = ?; // op_sel_hi(0)
    let Inst{60} = ?; // op_sel_hi(1)
  }
}

let Constraints = "" in {
// gfx90a MFMA real encodings. Two records per opcode: one from the base
// pseudo ("_acd") and one from the "_vgprcd" pseudo ("_vcd"); the final
// VOP3Pe_MAI operand (1 vs 0) selects between the two forms.
// NOTE(review): presumably acd/vcd distinguishes AGPR- vs VGPR-based
// src2/dst operands — confirm against the VOP3Pe_MAI definition.
multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> {
  let SubtargetPredicate = isGFX90AOnly,
      AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in {
  def _gfx90a_acd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>,
                    VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, 1>;

  def _gfx90a_vcd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64"), SIEncodingFamily.GFX90A>,
                    VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64").Pfl, 0>;
  } // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A"
}
}

multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string Op,
                                          VOP3_Pseudo PS_ACD =
!cast<VOP3_Pseudo>(Op # "_e64"),
                                          VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(Op # "_vgprcd" # "_e64"),
                                          VOPProfile Pfl_ACD = PS_ACD.Pfl,
                                          VOPProfile Pfl_VCD = PS_VCD.Pfl> {
  // Assembly aliases from 'NameTo' onto the gfx940 acd/vcd records of 'Op'.
  // Only emitted when the alias actually differs from the printed name.
  if !ne(NameFrom, NameTo) then {
    let SubtargetPredicate = PS_ACD.SubtargetPredicate,
        OtherPredicates = PS_ACD.OtherPredicates in {
      def : InstAlias <NameTo # " " # PS_ACD.AsmOperands,
                       (!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst,
                        Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2,
                        CBSZ:$cbsz, ABID:$abid, blgp:$blgp)>, PredicateControl;
      def : InstAlias <NameTo # " " # PS_VCD.AsmOperands,
                       (!cast<VOP3P_Real>(Op # "_gfx940_vcd") Pfl_VCD.DstRC:$vdst,
                        Pfl_VCD.Src0RC64:$src0, Pfl_VCD.Src1RC64:$src1, Pfl_VCD.Src2RC64:$src2,
                        CBSZ:$cbsz, ABID:$abid, blgp:$blgp)>, PredicateControl;
    }
  }
}

// gfx940 MFMA real encodings (acd and vcd forms), printed as 'Name'.
// Also emits assembler aliases from the pseudo's own mnemonic and, for
// "_1k" variants, from the mnemonic with "_1k" stripped.
multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
                                  VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
                                  VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
  let AssemblerPredicate = isGFX940Plus,
      DecoderNamespace = "GFX940",
      AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
    def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
                      VOP3Pe_MAI <op, PS_ACD.Pfl, 1>;

    def _gfx940_vcd : VOP3P_Real<PS_VCD, SIEncodingFamily.GFX940>,
                      VOP3Pe_MAI <op, PS_VCD.Pfl, 0>;
  } // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940"

  let SubtargetPredicate = PS_ACD.SubtargetPredicate,
      OtherPredicates = PS_ACD.OtherPredicates,
      AssemblerPredicate = isGFX940Plus
      in {
    defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>;

    if !ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic) then
      defm : VOP3P_Real_MFMA_gfx940_aliases<Name, !subst("_1k", "", PS_ACD.Mnemonic), NAME>;
  }
}

// F8F6F4 MFMA real encodings. NAME is expected to end in a "_fN_fM"
// source-format suffix; F8F8Name rewrites it to the matching "_f8_f8"
// record, which the size-table helper references.
multiclass VOP3P_Real_MFMA_F8F6F4_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
                                         VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
                                         VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {

  defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8";

  let AssemblerPredicate = isGFX940Plus,
      DecoderNamespace = "GFX940",
      AsmString = Name # PS_ACD.AsmOperands,
      Constraints = "" in {
    def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
                      VOP3Pe_MAI <op, PS_ACD.Pfl, 1>,
                      MFMA_F8F6F4_WithSizeTable_Helper<PS_ACD, F8F8Name#"_gfx940_acd">;

    def _gfx940_vcd : VOP3P_Real<PS_VCD, SIEncodingFamily.GFX940>,
                      VOP3Pe_MAI <op, PS_VCD.Pfl, 0>,
                      MFMA_F8F6F4_WithSizeTable_Helper<PS_VCD, F8F8Name#"_gfx940_vcd">;
  } // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940"
}

// gfx950 MFMAs reuse the gfx940 encoding scheme under gfx950 predicates.
multiclass VOP3P_Real_MFMA_gfx950<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
                                  VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
                                  VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
  let SubtargetPredicate = HasGFX950Insts,
      AssemblerPredicate = HasGFX950Insts in {
    defm "" : VOP3P_Real_MFMA_gfx940<op, Name, PS_ACD, PS_VCD>;
  }
}


// Expands one F8F6F4 opcode over all nine source-format combinations.
multiclass VOP3P_Real_MFMA_F8F6F4_gfx950_mc<bits<7> op, string Name> {
  defm _f8_f8 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;

  let isAsmParserOnly = true in { // Disable ambiguous disassembly.
    defm _f8_f6 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
    defm _f6_f8 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
    defm _f8_f4 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
    defm _f4_f8 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
    defm _f6_f6 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
    defm _f6_f4 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
    defm _f4_f6 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
    defm _f4_f4 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
  }
}

// Scaled MFMA (VOP3PX encoding) real records for gfx950; same acd/vcd
// pairing and f8_f8 size-table linkage as the unscaled F8F6F4 forms.
multiclass VOP3PX_Real_ScaledMFMA<bits<7> op> {
  defvar PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64");
  defvar PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64");
  defvar Name = PS_ACD.Mnemonic;
  defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8";

  let SubtargetPredicate = HasGFX950Insts,
      DecoderNamespace = "GFX940",
      AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
    def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
                      VOP3PXe <op, PS_ACD.Pfl, /*acc_cd=*/1>,
                      MFMA_F8F6F4_WithSizeTable_Helper<PS_ACD, F8F8Name#"_gfx940_acd">;

    def _gfx940_vcd : VOP3P_Real<PS_VCD, SIEncodingFamily.GFX940>,
                      VOP3PXe <op, PS_VCD.Pfl, /*acc_cd=*/0>,
                      MFMA_F8F6F4_WithSizeTable_Helper<PS_VCD, F8F8Name#"_gfx940_vcd">;
  }
}

// Expands one scaled-MFMA opcode over all nine source-format combinations;
// only the f8_f8 form is decodable, the rest are assembler-only.
multiclass VOP3PX_Real_ScaledMFMA_F8F6F4_mc<bits<7> op> {
  defm _f8_f8 : VOP3PX_Real_ScaledMFMA<op>;

  let isAsmParserOnly = 1 in { // Disable ambiguous disassembly.
    defm _f8_f6 : VOP3PX_Real_ScaledMFMA<op>;
    defm _f6_f8 : VOP3PX_Real_ScaledMFMA<op>;
    defm _f8_f4 : VOP3PX_Real_ScaledMFMA<op>;
    defm _f4_f8 : VOP3PX_Real_ScaledMFMA<op>;
    defm _f6_f6 : VOP3PX_Real_ScaledMFMA<op>;
    defm _f6_f4 : VOP3PX_Real_ScaledMFMA<op>;
    defm _f4_f6 : VOP3PX_Real_ScaledMFMA<op>;
    defm _f4_f4 : VOP3PX_Real_ScaledMFMA<op>;
  }
}

// MFMA real encoding for the VI encoding family (GFX8/GFX9, excluding
// gfx90a which has its own records above).
multiclass VOP3P_Real_MFMA_vi<bits<7> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
    let SubtargetPredicate = isGFX8GFX9NotGFX90A;
    let AssemblerPredicate = HasMAIInsts;
    let DecoderNamespace = "GFX8";
    let Constraints = "";
  }
}

multiclass VOP3P_Real_MFMA_vi_gfx90a<bits<7> op> :
  VOP3P_Real_MFMA_gfx90a <op>,
  VOP3P_Real_MFMA_vi <op>;

// Emits the VI, gfx90a and gfx940 real encodings for one MFMA opcode.
multiclass VOP3P_Real_MFMA<bits<7> op, string GFX940Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> :
  VOP3P_Real_MFMA_vi_gfx90a <op>,
  VOP3P_Real_MFMA_gfx940 <op, GFX940Name>;

// SMFMAC (sparse MFMA) real encoding plus an accepted alias mnemonic.
multiclass VOP3P_Real_SMFMAC<bits<7> op, string alias> {
  def _gfx940 : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
                VOP3Pe_SMFMAC <op> {
    let AssemblerPredicate = isGFX940Plus;
    let DecoderNamespace = "GFX8";
  }
  def : AMDGPUMnemonicAlias<alias, !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> {
    let AssemblerPredicate = isGFX940Plus;
  }
}

defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>;
defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>;
defm V_PK_ADD_I16 : VOP3P_Real_vi <0x02>;
defm V_PK_SUB_I16 : VOP3P_Real_vi <0x03>;
defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x04>;
defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x05>;
defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x06>;
defm V_PK_MAX_I16 : VOP3P_Real_vi <0x07>;
defm V_PK_MIN_I16 : VOP3P_Real_vi <0x08>;
defm V_PK_MAD_U16 : VOP3P_Real_vi <0x09>;

defm V_PK_ADD_U16 :
VOP3P_Real_vi <0x0a>;
defm V_PK_SUB_U16 : VOP3P_Real_vi <0x0b>;
defm V_PK_MAX_U16 : VOP3P_Real_vi <0x0c>;
defm V_PK_MIN_U16 : VOP3P_Real_vi <0x0d>;
defm V_PK_FMA_F16 : VOP3P_Real_vi <0x0e>;
defm V_PK_ADD_F16 : VOP3P_Real_vi <0x0f>;
defm V_PK_MUL_F16 : VOP3P_Real_vi <0x10>;
defm V_PK_MIN_F16 : VOP3P_Real_vi <0x11>;
defm V_PK_MAX_F16 : VOP3P_Real_vi <0x12>;

defm V_PK_MINIMUM3_F16 : VOP3P_Real_vi <0x1b>;
defm V_PK_MAXIMUM3_F16 : VOP3P_Real_vi <0x1c>;

defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x20>;
defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x21>;
defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x22>;

let OtherPredicates = [HasFmaMixInsts],
    DecoderNamespace = "GFX9_DL" in {
// The mad_mix instructions were renamed and their behaviors changed,
// but the opcode stayed the same so we need to put these in a
// different DecoderNamespace to avoid the ambiguity.
defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x20>;
defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x21>;
defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>;
}

defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>;

defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x29>;
defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>;

defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x28>;
defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x2a>;

defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>;
defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x59>;
// MFMA opcodes; the string is the gfx940 assembly mnemonic for the
// instruction (the pre-gfx940 mnemonic stays available as an alias).
defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MFMA <0x40, "v_mfma_f32_32x32x1_2b_f32">;
defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MFMA <0x41, "v_mfma_f32_16x16x1_4b_f32">;
defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MFMA <0x42, "v_mfma_f32_4x4x1_16b_f32">;
defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MFMA <0x44, "v_mfma_f32_32x32x2_f32">;
defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MFMA <0x45, "v_mfma_f32_16x16x4_f32">;
defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MFMA <0x48, "v_mfma_f32_32x32x4_2b_f16">;
defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MFMA <0x49, "v_mfma_f32_16x16x4_4b_f16">;
defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MFMA <0x4a, "v_mfma_f32_4x4x4_16b_f16">;
defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MFMA <0x4c, "v_mfma_f32_32x32x8_f16">;
defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MFMA <0x4d, "v_mfma_f32_16x16x16_f16">;
defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50, "v_mfma_i32_32x32x4_2b_i8">;
defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51, "v_mfma_i32_16x16x4_4b_i8">;
defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52, "v_mfma_i32_4x4x4_16b_i8">;

defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MFMA_vi_gfx90a <0x55>;
defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MFMA_vi_gfx90a <0x54>;
defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x68>;
defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x69>;
defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6b>;
defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6c>;
defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6d>;

defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x63>;
defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x64>;
defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x65>;
defm V_MFMA_F32_32X32X8BF16_1K : VOP3P_Real_MFMA_gfx90a <0x66>;
defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>;
defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;

defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
defm V_MFMA_F32_16X16X32_BF16 : VOP3P_Real_MFMA_gfx950 <0x35, "v_mfma_f32_16x16x32_bf16">;
defm V_MFMA_I32_16X16X64_I8 : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
defm V_MFMA_F32_32X32X16_BF16 :
VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
defm V_MFMA_I32_32X32X32_I8 : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;

defm V_MFMA_LD_SCALE_B32 : VOP3P_Real_vi <0x2c>;
// Note: each F8F6F4 opcode is shared by the unscaled and the scaled
// (V_MFMA_SCALE_*) form; the scaled form uses the VOP3PX encoding.
defm V_MFMA_F32_16X16X128_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2d, "v_mfma_f32_16x16x128_f8f6f4">;
defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2d>;
defm V_MFMA_F32_32X32X64_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2e, "v_mfma_f32_32x32x64_f8f6f4">;
defm V_MFMA_SCALE_F32_32X32X64_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2e>;
defm V_DOT2_F32_BF16 : VOP3P_Real_vi<0x1a>;

defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;

defm V_MFMA_F32_16X16X32_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x70>;
defm V_MFMA_F32_16X16X32_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x71>;
defm V_MFMA_F32_16X16X32_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x72>;
defm V_MFMA_F32_16X16X32_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x73>;
defm V_MFMA_F32_32X32X16_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x74>;
defm V_MFMA_F32_32X32X16_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x75>;
defm V_MFMA_F32_32X32X16_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x76>;
defm V_MFMA_F32_32X32X16_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x77>;

defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">;
defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">;
defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5f, "v_mfma_f32_4x4x4_16b_bf16">;
defm V_MFMA_F32_32X32X8BF16_1K : VOP3P_Real_MFMA_gfx940 <0x60, "v_mfma_f32_32x32x8_bf16">;
defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx940 <0x61, "v_mfma_f32_16x16x16_bf16">;

defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx940 <0x6e, "v_mfma_f64_16x16x4_f64">;
defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx940 <0x6f, "v_mfma_f64_4x4x4_4b_f64">;

// SMFMAC opcodes; the string is an additional mnemonic accepted by the
// assembler as an alias for each instruction.
defm V_SMFMAC_F32_16X16X32_F16 : VOP3P_Real_SMFMAC <0x62, "v_smfmac_f32_16x16x32f16">;
defm V_SMFMAC_F32_32X32X16_F16 : VOP3P_Real_SMFMAC <0x64, "v_smfmac_f32_32x32x16f16">;
defm V_SMFMAC_F32_16X16X32_BF16 : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x16x32bf16">;
defm V_SMFMAC_F32_32X32X16_BF16 : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">;
defm V_SMFMAC_I32_16X16X64_I8 : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">;
defm V_SMFMAC_I32_32X32X32_I8 : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">;
defm V_SMFMAC_F32_16X16X64_BF8_BF8 : VOP3P_Real_SMFMAC <0x78, "v_smfmac_f32_16x16x64bf8bf8">;
defm V_SMFMAC_F32_16X16X64_BF8_FP8 : VOP3P_Real_SMFMAC <0x79, "v_smfmac_f32_16x16x64bf8fp8">;
defm V_SMFMAC_F32_16X16X64_FP8_BF8 : VOP3P_Real_SMFMAC <0x7a, "v_smfmac_f32_16x16x64fp8bf8">;
defm V_SMFMAC_F32_16X16X64_FP8_FP8 : VOP3P_Real_SMFMAC <0x7b, "v_smfmac_f32_16x16x64fp8fp8">;
defm V_SMFMAC_F32_32X32X32_BF8_BF8 : VOP3P_Real_SMFMAC <0x7c, "v_smfmac_f32_32x32x32bf8bf8">;
defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x32x32bf8fp8">;
defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">;
defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">;

defm V_SMFMAC_F32_16X16X64_F16 : VOP3P_Real_SMFMAC <0x5a, "v_smfmac_f32_16x16x64f16">;
defm V_SMFMAC_F32_32X32X32_F16 : VOP3P_Real_SMFMAC <0x5b, "v_smfmac_f32_32x32x32f16">;
defm V_SMFMAC_F32_16X16X64_BF16 : VOP3P_Real_SMFMAC <0x39, "v_smfmac_f32_16x16x64bf16">;
defm V_SMFMAC_F32_32X32X32_BF16 : VOP3P_Real_SMFMAC <0x46, "v_smfmac_f32_32x32x32bf16">;
defm
V_SMFMAC_I32_16X16X128_I8 : VOP3P_Real_SMFMAC <0x3a, "v_smfmac_i32_16x16x128i8">;
defm V_SMFMAC_I32_32X32X64_I8 : VOP3P_Real_SMFMAC <0x47, "v_smfmac_i32_32x32x64i8">;

defm V_SMFMAC_F32_16X16X128_BF8_BF8 : VOP3P_Real_SMFMAC <0x3b, "v_smfmac_f32_16x16x128bf8bf8">;
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : VOP3P_Real_SMFMAC <0x3c, "v_smfmac_f32_16x16x128bf8fp8">;
defm V_SMFMAC_F32_16X16X128_FP8_BF8 : VOP3P_Real_SMFMAC <0x3d, "v_smfmac_f32_16x16x128fp8bf8">;
defm V_SMFMAC_F32_16X16X128_FP8_FP8 : VOP3P_Real_SMFMAC <0x43, "v_smfmac_f32_16x16x128fp8fp8">;
defm V_SMFMAC_F32_32X32X64_BF8_BF8 : VOP3P_Real_SMFMAC <0x4b, "v_smfmac_f32_32x32x64bf8bf8">;
defm V_SMFMAC_F32_32X32X64_BF8_FP8 : VOP3P_Real_SMFMAC <0x4e, "v_smfmac_f32_32x32x64bf8fp8">;
defm V_SMFMAC_F32_32X32X64_FP8_BF8 : VOP3P_Real_SMFMAC <0x4f, "v_smfmac_f32_32x32x64fp8bf8">;
defm V_SMFMAC_F32_32X32X64_FP8_FP8 : VOP3P_Real_SMFMAC <0x53, "v_smfmac_f32_32x32x64fp8fp8">;

defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>;
defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>;

//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//

// GFX10 real encoding for a VOP3P pseudo.
let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in {
  multiclass VOP3P_Real_gfx10<bits<7> op> {
    def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
                 VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
  }
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1

// Convenience multiclasses covering multiple generations per opcode; the
// "_Triple" variant additionally emits the GFX11/GFX12 DPP16/DPP8 forms.
multiclass VOP3P_Real_gfx10_gfx11<bits<7> op> :
  VOP3P_Real_gfx10<op>, VOP3P_Real_Base<GFX11Gen, op>;

multiclass VOP3P_Real_gfx10_gfx11_gfx12<bits<7> op> :
  VOP3P_Real_gfx10_gfx11<op>, VOP3P_Real_Base<GFX12Gen, op>;

multiclass VOP3P_Real_gfx10_gfx11_gfx12_Triple<bits<7> op> :
  VOP3P_Real_gfx10<op>, VOP3P_Realtriple<GFX11Gen, op>,
  VOP3P_Realtriple<GFX12Gen, op>;

defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x00>;
defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x01>;
defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x02>;
defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x03>;
defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12<0x04>;
defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12<0x05>;
defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x06>;
defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x07>;
defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x08>;
defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x09>;
defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0a>;
defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0b>;
defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0c>;
defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0d>;
defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0e>;
defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0f>;
defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x10>;
// min/max stop at GFX11; GFX12 re-encodes them as the *_num_* forms above.
defm V_PK_MIN_F16 : VOP3P_Real_gfx10_gfx11<0x11>;
defm V_PK_MAX_F16 : VOP3P_Real_gfx10_gfx11<0x12>;
defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x20>;
defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x21>;
defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x22>;

defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>;
defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;

defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x13>;
defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11_gfx12<0x17>;
defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11_gfx12<0x19>;

defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x16>;
defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x18>;