//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
// all the instruction definitions were originally commented out. Instructions
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//

class GCNPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl;

class UniformSextInreg<ValueType VT> : PatFrag<
  (ops node:$src),
  (sext_inreg $src, VT),
  [{ return !N->isDivergent(); }]>;

class DivergentSextInreg<ValueType VT> : PatFrag<
  (ops node:$src),
  (sext_inreg $src, VT),
  [{ return N->isDivergent(); }]>;

include "SOPInstructions.td"
include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
include "EXPInstructions.td"
include "DSDIRInstructions.td"
include "VINTERPInstructions.td"

//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//

// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
def VINTRPDst : VINTRPDstOperand <VGPR_32>;

let Uses = [MODE, M0, EXEC] in {

// FIXME: Specify SchedRW for VINTRP instructions.

multiclass V_INTERP_P1_F32_m : VINTRP_m <
  0x00000000,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
  "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;

let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in {

defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus]

let OtherPredicates = [has16BankLDS, isNotGFX90APlus],
    Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {

defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has16BankLDS, isNotGFX90APlus],
  //     Constraints = "@earlyclobber $vdst", isAsmParserOnly=1

let OtherPredicates = [isNotGFX90APlus] in {
let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {

defm V_INTERP_P2_F32 : VINTRP_m <
  0x00000001,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$src0, VGPR_32:$vsrc, InterpAttr:$attr,
       InterpAttrChan:$attrchan),
  "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"

defm V_INTERP_MOV_F32 : VINTRP_m <
  0x00000002,
  (outs VINTRPDst:$vdst),
  (ins InterpSlot:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
  "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End OtherPredicates = [isNotGFX90APlus]

} // End Uses = [MODE, M0, EXEC]

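// Illustrative note (not from the original source): per-pixel attribute
// interpolation pairs these instructions; IR roughly of the form
//   %p1 = call float @llvm.amdgcn.interp.p1(float %i, i32 %chan, i32 %attr, i32 %m0)
//   %p2 = call float @llvm.amdgcn.interp.p2(float %p1, float %j, i32 %chan, i32 %attr, i32 %m0)
// selects to a v_interp_p1_f32 / v_interp_p2_f32 pair, with M0 holding the
// interpolation state that must be set up beforehand (e.g. via SI_INIT_M0).
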
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//

// Insert a branch to an endpgm block to use as a fallback trap.
def ENDPGM_TRAP : SPseudoInstSI<
  (outs), (ins),
  [(AMDGPUendpgm_trap)],
  "ENDPGM_TRAP"> {
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
}

def SIMULATED_TRAP : SPseudoInstSI<(outs), (ins), [(AMDGPUsimulated_trap)],
                                   "SIMULATED_TRAP"> {
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
}

def ATOMIC_FENCE : SPseudoInstSI<
  (outs), (ins i32imm:$ordering, i32imm:$scope),
  [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
  "ATOMIC_FENCE $ordering, $scope"> {
  let hasSideEffects = 1;
}

let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
  (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
  let isPseudo = 1;
  let isCodeGenOnly = 1;
  let usesCustomInserter = 1;
}

// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                      (ins VSrc_b64:$src0)> {
  let isReMaterializable = 1;
  let isAsCheapAsAMove = 1;
  let isMoveImm = 1;
  let SchedRW = [Write64Bit];
  let Size = 4;
  let UseNamedOperandTable = 1;
}

// 64-bit vector move with dpp. Expanded post-RA.
def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
  let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}

// 64-bit scalar move immediate instruction. This is used to avoid initializing
// the subregisters separately and to allow rematerialization.
def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
                                          (ins i64imm:$src0)> {
  let isReMaterializable = 1;
  let isAsCheapAsAMove = 1;
  let isMoveImm = 1;
  let SchedRW = [WriteSALU, Write64Bit];
  let Size = 4;
  let Uses = [];
  let UseNamedOperandTable = 1;
}

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
// turned into a copy by the WQM pass, but does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

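// Illustrative sketch (not from the original source): @llvm.amdgcn.wqm selects
// to the WQM pseudo above, and the SIWholeQuadMode pass marks the computation
// feeding it as requiring whole quad mode before rewriting the pseudo into a
// plain COPY, e.g. roughly
//   %wqm:vgpr_32 = WQM %val:vgpr_32   -->   %wqm:vgpr_32 = COPY %val:vgpr_32
// SOFT_WQM is rewritten the same way but does not by itself force WQM.
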
// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't
// accidentally clobber inactive channels of $vdst.
let Constraints = "@earlyclobber $vdst" in {
def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
}

} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

def WWM_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src)> {
  let hasSideEffects = 0;
  let isAsCheapAsAMove = 1;
  let isConvergent = 1;
}

def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

let usesCustomInserter = 1 in {
let WaveSizePredicate = isWave32 in
def S_INVERSE_BALLOT_U32 : SPseudoInstSI<
  (outs SReg_32:$sdst), (ins SSrc_b32:$mask),
  [(set i1:$sdst, (int_amdgcn_inverse_ballot i32:$mask))]
>;

let WaveSizePredicate = isWave64 in
def S_INVERSE_BALLOT_U64 : SPseudoInstSI<
  (outs SReg_64:$sdst), (ins SSrc_b64:$mask),
  [(set i1:$sdst, (int_amdgcn_inverse_ballot i64:$mask))]
>;
} // End usesCustomInserter = 1

// Pseudo instructions used for @llvm.fptrunc.round. The final codegen is done
// in the ModeRegister pass.
let Uses = [MODE, EXEC] in {
def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32:$src0, i32imm:$round)>;

def FPTRUNC_ROUND_F32_F64_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VReg_64:$src0, i32imm:$round)>;
} // End Uses = [MODE, EXEC]

def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))),
              (FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>;

def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))),
              (FPTRUNC_ROUND_F32_F64_PSEUDO $src0, (as_hw_round_mode $round))>;

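// Illustrative example (not from the original source): IR along the lines of
//   %h = call half @llvm.fptrunc.round.f16.f32(float %x, metadata !"round.towardzero")
// matches the first pattern above and selects to FPTRUNC_ROUND_F16_F32_PSEUDO,
// with the rounding-mode metadata encoded into the $round immediate; the
// ModeRegister pass later materializes the required MODE register switches
// around the conversion.
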
// Invert the exec mask and overwrite the inactive lanes of dst with the
// inactive operand, restoring exec after we're done.
let isConvergent = 1 in
def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>;

foreach vt = Reg32Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
     (V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>;
}

def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
     (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
  def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
    (ins VSrc_b32: $src, VSrc_b32:$strategy),
    [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
  }

  def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
    (ins VSrc_b32: $src, VSrc_b32:$strategy),
    [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
  }
}

let usesCustomInserter = 1, Defs = [VCC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))]
>;

def V_SUB_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))]
>;
} // End usesCustomInserter = 1, Defs = [VCC]

let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
>;

def S_SUB_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
>;

def S_ADD_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_SUB_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_UADDO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

def S_USUBO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

let OtherPredicates = [HasShaderCyclesHiLoRegisters] in
def GET_SHADERCYCLESHILO : SPseudoInstSI<
  (outs SReg_64:$sdst), (ins),
  [(set SReg_64:$sdst, (i64 (readcyclecounter)))]
>;

} // End usesCustomInserter = 1, Defs = [SCC]

let usesCustomInserter = 1 in {
def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
  [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End usesCustomInserter = 1

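// Illustrative note (not from the original source): the UniformBinFrag /
// DivergentBinFrag fragments above steer 64-bit integer adds by divergence.
// A uniform i64 add selects to S_ADD_U64_PSEUDO, which the custom inserter
// later expands into an s_add_u32 + s_addc_u32 pair, while a divergent one
// selects to V_ADD_U64_PSEUDO and becomes a v_add_co_u32 + v_addc_co_u32 pair
// carrying through VCC.
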
// Wrap an instruction by duplicating it, except for setting isTerminator.
class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
      base_inst.OutOperandList,
      base_inst.InOperandList> {
  let Uses = base_inst.Uses;
  let Defs = base_inst.Defs;
  let isTerminator = 1;
  let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
  let hasSideEffects = base_inst.hasSideEffects;
  let UseNamedOperandTable = base_inst.UseNamedOperandTable;
  let CodeSize = base_inst.CodeSize;
  let SchedRW = base_inst.SchedRW;
}

let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;
}

let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
def S_AND_SAVEEXEC_B32_term : WrapTerminatorInst<S_AND_SAVEEXEC_B32>;
}


def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
  [(int_amdgcn_wave_barrier)]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask),
  [(int_amdgcn_sched_barrier (i32 timm:$mask))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def SCHED_GROUP_BARRIER : SPseudoInstSI<
  (outs),
  (ins i32imm:$mask, i32imm:$size, i32imm:$syncid),
  [(int_amdgcn_sched_group_barrier (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
  [(int_amdgcn_iglp_opt (i32 timm:$mask))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.

// The control flow intrinsics have been extended to work under an unstructured
// CFG, so duplicating them is technically legal. In practice, however, marking
// them non-duplicable gives better code generation, so we mark them
// isNotDuplicable in the hope of better codegen and a simplified CFG during
// the Machine IR optimization stages.
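//
// Illustrative sketch (not from the original source) of how the structurizer
// output typically uses these pseudos for a divergent branch:
//   %mask = SI_IF %cond, %bb.flow      ; save exec, mask off inactive lanes
//   ...                                ; "then" block
//   %mask2 = SI_ELSE %mask, %bb.merge  ; flip to the complementary lanes
//   ...                                ; "else" block
//   SI_END_CF %mask2                   ; restore the saved exec mask
// SI_LOOP / SI_IF_BREAK play the analogous role for divergent loop back-edges.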

let isTerminator = 1, isNotDuplicable = 1 in {

def SI_IF: CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
  [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
  let Constraints = "";
  let Size = 12;
  let hasSideEffects = 1;
  let IsNeverUniform = 1;
}

def SI_ELSE : CFPseudoInstSI <
  (outs SReg_1:$dst),
  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
  let Size = 12;
  let hasSideEffects = 1;
  let IsNeverUniform = 1;
}

def SI_WATERFALL_LOOP : CFPseudoInstSI <
  (outs),
  (ins brtarget:$target), [], 1> {
  let Size = 8;
  let isBranch = 1;
  let Defs = [];
}

def SI_LOOP : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved, brtarget:$target),
  [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
  let Size = 8;
  let isBranch = 1;
  let hasSideEffects = 1;
  let IsNeverUniform = 1;
}

} // End isTerminator = 1, isNotDuplicable = 1

def SI_END_CF : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved), [], 1, 1> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
  let hasSideEffects = 1;
  let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
  let mayLoad = 1; // FIXME: Should not need memory flags
  let mayStore = 1;
}

def SI_IF_BREAK : CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
  let Size = 4;
  let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

// Branch to the early termination block of the shader if SCC is 0.
// This uses SCC from a previous SALU operation, i.e. the update of
// a mask of live lanes after a kill/demote operation.
// Only valid in pixel shaders.
def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
  let Uses = [EXEC, SCC];
}

let Uses = [EXEC] in {

multiclass PseudoInstKill <dag ins> {
  // Even though this pseudo can usually be expanded without an SCC def, we
  // conservatively assume that it has an SCC def, both because it is sometimes
  // required in degenerate cases (when V_CMPX cannot be used due to constant
  // bus limitations) and because it allows us to avoid having to track SCC
  // liveness across basic blocks.
  let Defs = [EXEC, SCC] in
  def _PSEUDO : PseudoInstSI <(outs), ins> {
    let isConvergent = 1;
    let usesCustomInserter = 1;
  }

  let Defs = [EXEC, SCC] in
  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
    let isTerminator = 1;
  }
}

defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
let Defs = [VCC] in
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

let Defs = [EXEC, VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src),
  [], " ; illegal copy $src to $dst">;

} // End Uses = [EXEC]

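// Illustrative example (not from the original source): a pixel-shader kill
//   call void @llvm.amdgcn.kill(i1 %keep)
// matches the patterns further below and selects to
//   SI_KILL_I1_PSEUDO %keep, 0
// which is later expanded into the appropriate s_andn2/s_and of EXEC
// (or a v_cmpx form for the F32 compare-with-immediate variant).
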
// Branch on undef scc. Used to avoid intermediate copy from
// IMPLICIT_DEF to SCC.
def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins SOPPBrTarget:$simm16)> {
  let isTerminator = 1;
  let usesCustomInserter = 1;
  let isBranch = 1;
}

def SI_PS_LIVE : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_ps_live))]> {
  let SALU = 1;
}

let Uses = [EXEC] in {
def SI_LIVE_MASK : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_live_mask))]> {
  let SALU = 1;
}
let Defs = [EXEC, SCC] in {
// Demote: Turn a pixel shader thread into a helper lane.
def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
} // End Defs = [EXEC, SCC]
} // End Uses = [EXEC]

def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
  [(int_amdgcn_unreachable)],
  "; divergent unreachable"> {
  let Size = 0;
  let hasNoSchedulingInfo = 1;
  let FixedSize = 1;
  let isMeta = 1;
  let maybeAtomic = 0;
}

// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
  let Defs = [M0];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

def SI_INIT_EXEC : SPseudoInstSI <
  (outs), (ins i64imm:$src),
  [(int_amdgcn_init_exec (i64 timm:$src))]> {
  let Defs = [EXEC];
  let isAsCheapAsAMove = 1;
}

def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
  (outs), (ins SSrc_b32:$input, i32imm:$shift),
  [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
  let Defs = [EXEC];
}

// Sets EXEC to all lanes and returns the previous EXEC.
def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
  let Defs = [EXEC];
  let Uses = [EXEC];

  let isConvergent = 1;
}

// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
  (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let hasNoSchedulingInfo = 1;
  let DisableWQM = 1;
  let FixedSize = 1;

  // TODO: Should this be true?
  let isMeta = 0;
}

// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
  (outs), (ins), [(AMDGPUret_glue)],
  "; return"> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let SchedRW = [WriteBranch];
}

// Call pseudo without an output register.
//
// This version is only needed so we can fill in the output register
// in the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
  (outs), (ins SSrc_b64:$src0, unknown:$callee),
  [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
  let Size = 4;
  let isCall = 1;
  let SchedRW = [WriteBranch];
  let usesCustomInserter = 1;
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

def : GCNPat<
  (AMDGPUcall i64:$src0, (i64 0)),
  (SI_CALL_ISEL $src0, (i64 0))
>;

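// Illustrative note (not from the original source): a direct call such as
//   call void @callee()
// selects to SI_CALL_ISEL $sgpr_pair, @callee; the custom inserter then
// rewrites it into SI_CALL, which carries the return-address output register,
// and SI_CALL is ultimately emitted as s_swappc_b64 (see the comment below).
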
// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
  (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
  let Size = 4;
  let FixedSize = 1;
  let isCall = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
  (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
  [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
  let Size = 4;
  let FixedSize = 1;
  let isCall = 1;
  let isTerminator = 1;
  let isReturn = 1;
  let isBarrier = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

// Tail call handling pseudo
def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;

// Handle selecting indirect tail calls
def : GCNPat<
  (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
  (SI_TCRETURN CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
>;

// Handle selecting indirect tail calls for AMDGPU_gfx
def : GCNPat<
  (AMDGPUtc_return_gfx i64:$src0, (i64 0), (i32 timm:$fpdiff)),
  (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
>;

// Pseudo for the llvm.amdgcn.cs.chain intrinsic.
// This is essentially a tail call, but it also takes a mask to put in EXEC
// right before jumping to the callee.
class SI_CS_CHAIN_TC<
    ValueType execvt, Predicate wavesizepred,
    RegisterOperand execrc = getSOPSrcForVT<execvt>.ret>
    : SPseudoInstSI <(outs),
      (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)> {
  let FixedSize = 0;
  let isCall = 1;
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  let isConvergent = 1;

  let WaveSizePredicate = wavesizepred;
}

def SI_CS_CHAIN_TC_W32 : SI_CS_CHAIN_TC<i32, isWave32>;
def SI_CS_CHAIN_TC_W64 : SI_CS_CHAIN_TC<i64, isWave64>;

// Handle selecting direct & indirect calls via SI_CS_CHAIN_TC_W32/64
multiclass si_cs_chain_tc_pattern<
  dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> {
def : GCNPat<
  (AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec),
  (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
>;
}

multiclass si_cs_chain_tc_patterns<
  ValueType execvt,
  RegisterOperand execrc = getSOPSrcForVT<execvt>.ret,
  Instruction tc = !if(!eq(execvt, i32), SI_CS_CHAIN_TC_W32, SI_CS_CHAIN_TC_W64)
  > {
  defm direct: si_cs_chain_tc_pattern<(tglobaladdr:$callee), execvt, execrc, tc>;
  defm indirect: si_cs_chain_tc_pattern<(i64 0), execvt, execrc, tc>;
}

defm : si_cs_chain_tc_patterns<i32>;
defm : si_cs_chain_tc_patterns<i64>;

def ADJCALLSTACKUP : SPseudoInstSI<
  (outs), (ins i32imm:$amt0, i32imm:$amt1),
  [(callseq_start timm:$amt0, timm:$amt1)],
  "; adjcallstackup $amt0 $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let FixedSize = 1;
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

def ADJCALLSTACKDOWN : SPseudoInstSI<
  (outs), (ins i32imm:$amt1, i32imm:$amt2),
  [(callseq_end timm:$amt1, timm:$amt2)],
  "; adjcallstackdown $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

let Defs = [M0, EXEC, SCC],
  UseNamedOperandTable = 1 in {

// SI_INDIRECT_SRC/DST are only used by the legacy SelectionDAG indirect
// addressing implementation.
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
  (outs VGPR_32:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
  let usesCustomInserter = 1;
}

class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
  (outs rc:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
  let Constraints = "$src = $vdst";
  let usesCustomInserter = 1;
}

def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>;
def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>;
def SI_INDIRECT_SRC_V11 : SI_INDIRECT_SRC<VReg_352>;
def SI_INDIRECT_SRC_V12 : SI_INDIRECT_SRC<VReg_384>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;

def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST<VReg_288>;
def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>;
def SI_INDIRECT_DST_V11 : SI_INDIRECT_DST<VReg_352>;
def SI_INDIRECT_DST_V12 : SI_INDIRECT_DST<VReg_384>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;

} // End Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1

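// Illustrative note (not from the original source): with the SelectionDAG
// path, an extractelement/insertelement with a non-constant index on a VGPR
// vector, e.g.
//   %elt = extractelement <8 x float> %vec, i32 %idx
// selects to SI_INDIRECT_SRC_V8; the custom inserter then expands it using an
// M0-based movrel (or GPR-indexing) sequence, wrapping it in a waterfall loop
// when the index is divergent.
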
// This is a pseudo variant of the v_movreld_b32 instruction in which the
// vector operand appears only twice, once as def and once as use. Using this
// pseudo avoids problems with the Two Address instructions pass.
class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                       RegisterOperand val_ty> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let Uses = [M0];
}

class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> {
  let VALU = 1;
  let VOP1 = 1;
  let Uses = [M0, EXEC];
}

class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                         RegisterOperand val_ty> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> {
  let SALU = 1;
  let SOP1 = 1;
  let Uses = [M0];
}

class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>;
class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>;

def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V11 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_352>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V12 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V9 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_288>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V10 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_320>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V11 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_352>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V12 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_384>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>;

// These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these
// pseudos we avoid spills or copies being inserted within indirect sequences
// that switch the VGPR indexing mode. Spills to accvgprs could be affected by
// this mode switching.

class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_352>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;

class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> {
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V11 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_352>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V12 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;

multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
  let UseNamedOperandTable = 1, Spill = 1, SALU = 1, Uses = [EXEC] in {
    def _SAVE : PseudoInstSI <
      (outs),
      (ins sgpr_class:$data, i32imm:$addr)> {
      let mayStore = 1;
      let mayLoad = 0;
    }

    def _RESTORE : PseudoInstSI <
      (outs sgpr_class:$data),
      (ins i32imm:$addr)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
  } // End UseNamedOperandTable = 1, Spill = 1, SALU = 1, Uses = [EXEC]
}

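// Illustrative note (not from the original source): SI_SPILL_S*_SAVE/_RESTORE
// are created through storeRegToStackSlot / loadRegFromStackSlot and later
// lowered (SILowerSGPRSpills / SIRegisterInfo) into v_writelane_b32 /
// v_readlane_b32 sequences into a reserved lane VGPR, or into scratch memory
// when no lane VGPR is available, e.g. roughly
//   SI_SPILL_S64_SAVE $sgpr4_sgpr5, %stack.0
//   ...
//   $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0
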
// You cannot use M0 as the output of v_readlane_b32 instructions or
// use it in the sdata operand of SMEM instructions. We still need to
// be able to spill the physical register m0, so allow it for
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S288 : SI_SPILL_SGPR <SReg_288>;
defm SI_SPILL_S320 : SI_SPILL_SGPR <SReg_320>;
defm SI_SPILL_S352 : SI_SPILL_SGPR <SReg_352>;
defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;

let Spill = 1, VALU = 1, isConvergent = 1 in {
def SI_SPILL_S32_TO_VGPR : PseudoInstSI <(outs VGPR_32:$vdst),
  (ins SReg_32:$src0, i32imm:$src1, VGPR_32:$vdst_in)> {
  let Size = 4;
  let FixedSize = 1;
  let IsNeverUniform = 1;
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
  let hasExtraDefRegAllocReq = 1;
  let Constraints = "$vdst = $vdst_in";
}

def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
  (ins VGPR_32:$src0, i32imm:$src1)> {
  let Size = 4;
  let FixedSize = 1;
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
  let hasExtraSrcRegAllocReq = 1;
}
} // End Spill = 1, VALU = 1, isConvergent = 1

// VGPR or AGPR spill instructions. In the case of AGPR spilling, a temporary
// register and an extra instruction are needed to move between VGPR and AGPR.
// UsesTmp adds to the total size of an expanded spill in this case.
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
  let UseNamedOperandTable = 1, Spill = 1, VALU = 1,
      SchedRW = [WriteVMEM] in {
    def _SAVE : VPseudoInstSI <
      (outs),
      (ins vgpr_class:$vdata, i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 1;
      let mayLoad = 0;
      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }

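    // Worked example of the MaxSize formula above (added note, not from the
    // original source): !srl(Size, 5) is the number of 32-bit subregisters,
    // !shl(..., !add(UsesTmp, 3)) multiplies by 8 bytes per subregister (16
    // when a temp move is needed), and 8 bytes of setup are added. For a plain
    // VReg_128 spill this gives 4 * 8 + 8 = 40 bytes; for an AReg_1024 spill
    // with UsesTmp the raw value 32 * 16 + 8 = 520 exceeds the unsigned-char
    // Size field and is clamped to 252.
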
    def _RESTORE : VPseudoInstSI <
      (outs vgpr_class:$vdata),
      (ins i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 0;
      let mayLoad = 1;

      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }
  } // End UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM]
}

defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>;
defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>;
defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>;
defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;

defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>;
defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>;
defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>;
defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>;
defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>;
defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;

defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>;
defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>;
defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>;
defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>;
defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>;
defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>;
defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>;
defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;

let isConvergent = 1 in {
  defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>;
  defm SI_SPILL_WWM_AV32 : SI_SPILL_VGPR <AV_32, 1>;
}

let isReMaterializable = 1, isAsCheapAsAMove = 1 in
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
  (outs SReg_64:$dst),
  (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
  [(set SReg_64:$dst,
      (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
  let Defs = [SCC];
}

def : GCNPat <
  (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
  (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;

def : GCNPat<
  (AMDGPUtrap timm:$trapid),
  (S_TRAP $trapid)
>;

def : GCNPat<
  (AMDGPUelse i1:$src, bb:$target),
  (SI_ELSE $src, $target)
>;

def : GCNPat <
  (int_amdgcn_kill i1:$src),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;

def : GCNPat <
  (int_amdgcn_kill (i1 (not i1:$src))),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
>;

let SubtargetPredicate = NotHasSALUFloatInsts in
def : GCNPat <
  (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
  (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;

def : GCNPat <
  (int_amdgcn_wqm_demote i1:$src),
  (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
>;

def : GCNPat <
  (int_amdgcn_wqm_demote (i1 (not i1:$src))),
  (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
>;

// TODO: we could add more variants for other types of conditionals

def : GCNPat <
  (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

def : GCNPat <
  (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//

multiclass f16_to_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
  // f16_to_fp patterns
  def : GCNPat <
    (f32 (any_f16_to_fp i32:$src0)),
    (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
    (cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
    (cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
  >;

  def : GCNPat <
    (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
    (cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
    (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
  >;

  // fp_to_fp16 patterns
  def : GCNPat <
    (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
  >;

  // This is only used on targets without half support
  // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
  def : GCNPat <
    (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
  >;
}
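
// Added note (not from the original source): the masked f16_to_fp patterns
// above fold integer sign-bit arithmetic on the packed half into source
// modifiers on the conversion: (and x, 0x7fff) clears the f16 sign bit and
// becomes SRCMODS.ABS, (or x, 0x8000) forces it and becomes SRCMODS.NEG_ABS,
// and (xor x, 0x8000) flips it and becomes SRCMODS.NEG. The _oneuse fragments
// keep the fold from firing when the masked value has other users.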

let True16Predicate = NotHasTrue16BitInsts in
defm : f16_to_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;

let True16Predicate = UseFakeTrue16Insts in
defm : f16_to_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;

multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64,
                       Instruction cvt_f32_f16_inst_e64,
                       RegOrImmOperand VSrc> {
  def : GCNPat <
    (f64 (any_fpextend f16:$src)),
    (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
  >;

  def : GCNPat <
    (i32 (fp_to_sint f16:$src)),
    (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
  >;

  def : GCNPat <
    (i32 (fp_to_uint f16:$src)),
    (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
  >;

  def : GCNPat <
    (f16 (sint_to_fp i32:$src)),
    (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_I32_e32 VSrc_b32:$src))
  >;

  def : GCNPat <
    (f16 (uint_to_fp i32:$src)),
    (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
  >;
}

let True16Predicate = NotHasTrue16BitInsts in
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64, VSrc_b32>;

let True16Predicate = UseRealTrue16Insts in
defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64, VSrcT_b16>;

let True16Predicate = UseFakeTrue16Insts in
defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64, VSrc_b16>;

//===----------------------------------------------------------------------===//
// VOP2 Patterns
//===----------------------------------------------------------------------===//

// NoMods pattern used for mac. If there are any source modifiers then it's
// better to select mad instead of mac.
class FMADPat <ValueType vt, Instruction inst>
  : GCNPat <(vt (any_fmad (vt (VOP3NoMods vt:$src0)),
                          (vt (VOP3NoMods vt:$src1)),
                          (vt (VOP3NoMods vt:$src2)))),
            (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
let OtherPredicates = [HasMadMacF32Insts] in
def : FMADPat <f32, V_MAC_F32_e64>;

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select mad instead of mac.
let SubtargetPredicate = isGFX6GFX7GFX10,
    OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
      (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0),
                                    (VOP3NoMods f32:$src1)),
                 (VOP3NoMods f32:$src2))),
      (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                            SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
let SubtargetPredicate = HasFmaLegacy32 in
def : GCNPat <
      (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
                                  (VOP3NoMods f32:$src1),
                                  (VOP3NoMods f32:$src2))),
      (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                             SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

let SubtargetPredicate = Has16BitInsts in
def : FMADPat <f16, V_MAC_F16_e64>;
} // AddedComplexity = 9

let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
      (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod),
                                    (VOP3Mods f32:$src1, i32:$src1_mod)),
                 (VOP3Mods f32:$src2, i32:$src2_mod))),
      (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1,
                            $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

class VOPSelectModsPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, (VOP3ModsNonCanonicalizing vt:$src1, i32:$src1_mods),
                        (VOP3ModsNonCanonicalizing vt:$src2, i32:$src2_mods))),
  (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
                     FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
>;

class VOPSelectPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, vt:$src1, vt:$src2)),
  (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
>;
class VOPSelectPat_t16 <ValueType vt> : GCNPat <
  (vt (select i1:$src0, vt:$src1, vt:$src2)),
  (V_CNDMASK_B16_t16_e64 0, VSrcT_b16:$src2, 0, VSrcT_b16:$src1, SSrc_i1:$src0)
>;

def : VOPSelectModsPat <i32>;
def : VOPSelectModsPat <f32>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in {
  def : VOPSelectPat <f16>;
  def : VOPSelectPat <i16>;
} // End True16Predicate = p
let True16Predicate = UseRealTrue16Insts in {
  def : VOPSelectPat_t16 <f16>;
  def : VOPSelectPat_t16 <i16>;
} // End True16Predicate = UseRealTrue16Insts

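// Added note (not from the original source): in the select patterns above the
// false value ($src2) is placed in the V_CNDMASK src0 slot and the true value
// ($src1) in src1, since v_cndmask_b32 selects src1 for lanes where the
// condition mask bit is set and src0 otherwise.
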
let AddedComplexity = 1 in {
def : GCNPat <
  (i32 (add (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)), i32:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;
}

def : GCNPat <
  (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)),
  (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
>;

def : GCNPat <
  (i16 (add (i16 (trunc (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)))), i16:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;

def : GCNPat <
  (i64 (DivergentUnaryFrag<ctpop> i64:$src)),
  (REG_SEQUENCE VReg_64,
    (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)),
      (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0,
    (i32 (V_MOV_B32_e32 (i32 0))), sub1)
>;

/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting  **********/
/********** ============================================ **********/

// Special case for 2 element vectors. REG_SEQUENCE produces better code
// than an INSERT_SUBREG.
multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> {
  def : GCNPat <
    (insertelt vec_type:$vec, elem_type:$elem, 0),
    (REG_SEQUENCE RC, $elem, sub0, (elem_type (EXTRACT_SUBREG $vec, sub1)), sub1)
  >;

  def : GCNPat <
    (insertelt vec_type:$vec, elem_type:$elem, 1),
    (REG_SEQUENCE RC, (elem_type (EXTRACT_SUBREG $vec, sub0)), sub0, $elem, sub1)
  >;
}

foreach Index = 0-1 in {
  def Extract_Element_v2i32_#Index : Extract_Element <
    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v2f32_#Index : Extract_Element <
    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

defm : Insert_Element_V2 <SReg_64, i32, v2i32>;
defm : Insert_Element_V2 <SReg_64, f32, v2f32>;

foreach Index = 0-2 in {
  def Extract_Element_v3i32_#Index : Extract_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3i32_#Index : Insert_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v3f32_#Index : Extract_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3f32_#Index : Insert_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-3 in {
  def Extract_Element_v4i32_#Index : Extract_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4i32_#Index : Insert_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v4f32_#Index : Extract_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4f32_#Index : Insert_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-4 in {
  def Extract_Element_v5i32_#Index : Extract_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5i32_#Index : Insert_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v5f32_#Index : Extract_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5f32_#Index : Insert_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-5 in {
  def Extract_Element_v6i32_#Index : Extract_Element <
    i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v6i32_#Index : Insert_Element <
    i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v6f32_#Index : Extract_Element <
    f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v6f32_#Index : Insert_Element <
    f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-6 in {
  def Extract_Element_v7i32_#Index : Extract_Element <
    i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v7i32_#Index : Insert_Element <
    i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v7f32_#Index : Extract_Element <
    f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v7f32_#Index : Insert_Element <
    f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-7 in {
  def Extract_Element_v8i32_#Index : Extract_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8i32_#Index : Insert_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v8f32_#Index : Extract_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8f32_#Index : Insert_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-8 in {
  def Extract_Element_v9i32_#Index : Extract_Element <
    i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v9i32_#Index : Insert_Element <
    i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v9f32_#Index : Extract_Element <
    f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v9f32_#Index : Insert_Element <
    f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-9 in {
  def Extract_Element_v10i32_#Index : Extract_Element <
    i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v10i32_#Index : Insert_Element <
    i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v10f32_#Index : Extract_Element <
    f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v10f32_#Index : Insert_Element <
    f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-10 in {
  def Extract_Element_v11i32_#Index : Extract_Element <
    i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v11i32_#Index : Insert_Element <
    i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v11f32_#Index : Extract_Element <
    f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v11f32_#Index : Insert_Element <
    f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-11 in {
  def Extract_Element_v12i32_#Index : Extract_Element <
    i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v12i32_#Index : Insert_Element <
    i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v12f32_#Index : Extract_Element <
    f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v12f32_#Index : Insert_Element <
    f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-15 in {
  def Extract_Element_v16i32_#Index : Extract_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16i32_#Index : Insert_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v16f32_#Index : Extract_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16f32_#Index : Insert_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}


foreach Index = 0-31 in {
  def Extract_Element_v32i32_#Index : Extract_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32i32_#Index : Insert_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v32f32_#Index : Extract_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32f32_#Index : Insert_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

// FIXME: Why do we only have some of these type combinations for SReg and
// VReg?
// 16-bit bitcast
def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <f16, bf16, VGPR_32>;
def : BitConvert <bf16, f16, VGPR_32>;

def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;
def : BitConvert <f16, bf16, SReg_32>;
def : BitConvert <bf16, f16, SReg_32>;

def : BitConvert <i16, bf16, VGPR_32>;
def : BitConvert <bf16, i16, VGPR_32>;
def : BitConvert <i16, bf16, SReg_32>;
def : BitConvert <bf16, i16, SReg_32>;

// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <f32, i32, SReg_32>;
def : BitConvert <v2i16, i32, SReg_32>;
def : BitConvert <i32, v2i16, SReg_32>;
def : BitConvert <v2f16, i32, SReg_32>;
def : BitConvert <i32, v2f16, SReg_32>;
def : BitConvert <v2i16, v2f16, SReg_32>;
def : BitConvert <v2f16, v2i16, SReg_32>;
def : BitConvert <v2f16, f32, SReg_32>;
def : BitConvert <f32, v2f16, SReg_32>;
def : BitConvert <v2i16, f32, SReg_32>;
def : BitConvert <f32, v2i16, SReg_32>;
def : BitConvert <v2bf16, i32, SReg_32>;
def : BitConvert <i32, v2bf16, SReg_32>;
def : BitConvert <v2bf16, i32, VGPR_32>;
def : BitConvert <i32, v2bf16, VGPR_32>;
def : BitConvert <v2bf16, v2i16, SReg_32>;
def : BitConvert <v2i16, v2bf16, SReg_32>;
def : BitConvert <v2bf16, v2i16, VGPR_32>;
def : BitConvert <v2i16, v2bf16, VGPR_32>;
def : BitConvert <v2bf16, v2f16, SReg_32>;
def : BitConvert <v2f16, v2bf16, SReg_32>;
def : BitConvert <v2bf16, v2f16, VGPR_32>;
def : BitConvert <v2f16, v2bf16, VGPR_32>;
def : BitConvert <f32, v2bf16, VGPR_32>;
def : BitConvert <v2bf16, f32, VGPR_32>;
def : BitConvert <f32, v2bf16, SReg_32>;
def : BitConvert <v2bf16, f32, SReg_32>;


// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
def : BitConvert <f64, i64, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
def : BitConvert <v2f32, i64, VReg_64>;
1577def : BitConvert <f64, v2f32, VReg_64>; 1578def : BitConvert <v2f32, f64, VReg_64>; 1579def : BitConvert <f64, v2i32, VReg_64>; 1580def : BitConvert <v2i32, f64, VReg_64>; 1581def : BitConvert <v4i16, v4f16, VReg_64>; 1582def : BitConvert <v4f16, v4i16, VReg_64>; 1583def : BitConvert <v4bf16, v2i32, VReg_64>; 1584def : BitConvert <v2i32, v4bf16, VReg_64>; 1585def : BitConvert <v4bf16, i64, VReg_64>; 1586def : BitConvert <i64, v4bf16, VReg_64>; 1587def : BitConvert <v4bf16, v4i16, VReg_64>; 1588def : BitConvert <v4i16, v4bf16, VReg_64>; 1589def : BitConvert <v4bf16, v4f16, VReg_64>; 1590def : BitConvert <v4f16, v4bf16, VReg_64>; 1591def : BitConvert <v4bf16, v2f32, VReg_64>; 1592def : BitConvert <v2f32, v4bf16, VReg_64>; 1593def : BitConvert <v4bf16, f64, VReg_64>; 1594def : BitConvert <f64, v4bf16, VReg_64>; 1595 1596 1597// FIXME: Make SGPR 1598def : BitConvert <v2i32, v4f16, VReg_64>; 1599def : BitConvert <v4f16, v2i32, VReg_64>; 1600def : BitConvert <v2i32, v4f16, VReg_64>; 1601def : BitConvert <v2i32, v4i16, VReg_64>; 1602def : BitConvert <v4i16, v2i32, VReg_64>; 1603def : BitConvert <v2f32, v4f16, VReg_64>; 1604def : BitConvert <v4f16, v2f32, VReg_64>; 1605def : BitConvert <v2f32, v4i16, VReg_64>; 1606def : BitConvert <v4i16, v2f32, VReg_64>; 1607def : BitConvert <v4i16, f64, VReg_64>; 1608def : BitConvert <v4f16, f64, VReg_64>; 1609def : BitConvert <f64, v4i16, VReg_64>; 1610def : BitConvert <f64, v4f16, VReg_64>; 1611def : BitConvert <v4i16, i64, VReg_64>; 1612def : BitConvert <v4f16, i64, VReg_64>; 1613def : BitConvert <i64, v4i16, VReg_64>; 1614def : BitConvert <i64, v4f16, VReg_64>; 1615 1616def : BitConvert <v4i32, v4f32, VReg_128>; 1617def : BitConvert <v4f32, v4i32, VReg_128>; 1618 1619// 96-bit bitcast 1620def : BitConvert <v3i32, v3f32, SGPR_96>; 1621def : BitConvert <v3f32, v3i32, SGPR_96>; 1622 1623// 128-bit bitcast 1624def : BitConvert <v2i64, v4i32, SReg_128>; 1625def : BitConvert <v4i32, v2i64, SReg_128>; 1626def : BitConvert <v2f64, v4f32, VReg_128>; 1627def : BitConvert <v2f64, v4i32, VReg_128>; 1628def : BitConvert <v4f32, v2f64, VReg_128>; 1629def : BitConvert <v4i32, v2f64, VReg_128>; 1630def : BitConvert <v2i64, v2f64, VReg_128>; 1631def : BitConvert <v2f64, v2i64, VReg_128>; 1632def : BitConvert <v4f32, v2i64, VReg_128>; 1633def : BitConvert <v2i64, v4f32, VReg_128>; 1634def : BitConvert <v8i16, v4i32, SReg_128>; 1635def : BitConvert <v4i32, v8i16, SReg_128>; 1636def : BitConvert <v8f16, v4f32, VReg_128>; 1637def : BitConvert <v8f16, v4i32, VReg_128>; 1638def : BitConvert <v4f32, v8f16, VReg_128>; 1639def : BitConvert <v4i32, v8f16, VReg_128>; 1640def : BitConvert <v8i16, v8f16, VReg_128>; 1641def : BitConvert <v8f16, v8i16, VReg_128>; 1642def : BitConvert <v4f32, v8i16, VReg_128>; 1643def : BitConvert <v8i16, v4f32, VReg_128>; 1644def : BitConvert <v8i16, v8f16, SReg_128>; 1645def : BitConvert <v8i16, v2i64, SReg_128>; 1646def : BitConvert <v8i16, v2f64, SReg_128>; 1647def : BitConvert <v8f16, v2i64, SReg_128>; 1648def : BitConvert <v8f16, v2f64, SReg_128>; 1649def : BitConvert <v8f16, v8i16, SReg_128>; 1650def : BitConvert <v2i64, v8i16, SReg_128>; 1651def : BitConvert <v2f64, v8i16, SReg_128>; 1652def : BitConvert <v2i64, v8f16, SReg_128>; 1653def : BitConvert <v2f64, v8f16, SReg_128>; 1654 1655def : BitConvert <v4i32, v8bf16, SReg_128>; 1656def : BitConvert <v8bf16, v4i32, SReg_128>; 1657def : BitConvert <v4i32, v8bf16, VReg_128>; 1658def : BitConvert <v8bf16, v4i32, VReg_128>; 1659 1660def : BitConvert <v4f32, v8bf16, SReg_128>; 1661def : BitConvert 
<v8bf16, v4f32, SReg_128>; 1662def : BitConvert <v4f32, v8bf16, VReg_128>; 1663def : BitConvert <v8bf16, v4f32, VReg_128>; 1664 1665def : BitConvert <v8i16, v8bf16, SReg_128>; 1666def : BitConvert <v8bf16, v8i16, SReg_128>; 1667def : BitConvert <v8i16, v8bf16, VReg_128>; 1668def : BitConvert <v8bf16, v8i16, VReg_128>; 1669 1670def : BitConvert <v8f16, v8bf16, SReg_128>; 1671def : BitConvert <v8bf16, v8f16, SReg_128>; 1672def : BitConvert <v8f16, v8bf16, VReg_128>; 1673def : BitConvert <v8bf16, v8f16, VReg_128>; 1674 1675def : BitConvert <v2f64, v8bf16, SReg_128>; 1676def : BitConvert <v8bf16, v2f64, SReg_128>; 1677def : BitConvert <v2f64, v8bf16, VReg_128>; 1678def : BitConvert <v8bf16, v2f64, VReg_128>; 1679 1680def : BitConvert <v2i64, v8bf16, SReg_128>; 1681def : BitConvert <v8bf16, v2i64, SReg_128>; 1682def : BitConvert <v2i64, v8bf16, VReg_128>; 1683def : BitConvert <v8bf16, v2i64, VReg_128>; 1684 1685 1686// 160-bit bitcast 1687def : BitConvert <v5i32, v5f32, SReg_160>; 1688def : BitConvert <v5f32, v5i32, SReg_160>; 1689def : BitConvert <v5i32, v5f32, VReg_160>; 1690def : BitConvert <v5f32, v5i32, VReg_160>; 1691 1692// 192-bit bitcast 1693def : BitConvert <v6i32, v6f32, SReg_192>; 1694def : BitConvert <v6f32, v6i32, SReg_192>; 1695def : BitConvert <v6i32, v6f32, VReg_192>; 1696def : BitConvert <v6f32, v6i32, VReg_192>; 1697def : BitConvert <v3i64, v3f64, VReg_192>; 1698def : BitConvert <v3f64, v3i64, VReg_192>; 1699def : BitConvert <v3i64, v6i32, VReg_192>; 1700def : BitConvert <v3i64, v6f32, VReg_192>; 1701def : BitConvert <v3f64, v6i32, VReg_192>; 1702def : BitConvert <v3f64, v6f32, VReg_192>; 1703def : BitConvert <v6i32, v3i64, VReg_192>; 1704def : BitConvert <v6f32, v3i64, VReg_192>; 1705def : BitConvert <v6i32, v3f64, VReg_192>; 1706def : BitConvert <v6f32, v3f64, VReg_192>; 1707 1708// 224-bit bitcast 1709def : BitConvert <v7i32, v7f32, SReg_224>; 1710def : BitConvert <v7f32, v7i32, SReg_224>; 1711def : BitConvert <v7i32, v7f32, VReg_224>; 1712def : BitConvert <v7f32, v7i32, VReg_224>; 1713 1714// 256-bit bitcast 1715def : BitConvert <v8i32, v8f32, SReg_256>; 1716def : BitConvert <v8f32, v8i32, SReg_256>; 1717def : BitConvert <v8i32, v8f32, VReg_256>; 1718def : BitConvert <v8f32, v8i32, VReg_256>; 1719def : BitConvert <v4i64, v4f64, VReg_256>; 1720def : BitConvert <v4f64, v4i64, VReg_256>; 1721def : BitConvert <v4i64, v8i32, VReg_256>; 1722def : BitConvert <v4i64, v8f32, VReg_256>; 1723def : BitConvert <v4f64, v8i32, VReg_256>; 1724def : BitConvert <v4f64, v8f32, VReg_256>; 1725def : BitConvert <v8i32, v4i64, VReg_256>; 1726def : BitConvert <v8f32, v4i64, VReg_256>; 1727def : BitConvert <v8i32, v4f64, VReg_256>; 1728def : BitConvert <v8f32, v4f64, VReg_256>; 1729def : BitConvert <v16i16, v16f16, SReg_256>; 1730def : BitConvert <v16f16, v16i16, SReg_256>; 1731def : BitConvert <v16i16, v16f16, VReg_256>; 1732def : BitConvert <v16f16, v16i16, VReg_256>; 1733def : BitConvert <v16f16, v8i32, VReg_256>; 1734def : BitConvert <v16i16, v8i32, VReg_256>; 1735def : BitConvert <v16f16, v8f32, VReg_256>; 1736def : BitConvert <v16i16, v8f32, VReg_256>; 1737def : BitConvert <v8i32, v16f16, VReg_256>; 1738def : BitConvert <v8i32, v16i16, VReg_256>; 1739def : BitConvert <v8f32, v16f16, VReg_256>; 1740def : BitConvert <v8f32, v16i16, VReg_256>; 1741def : BitConvert <v16f16, v4i64, VReg_256>; 1742def : BitConvert <v16i16, v4i64, VReg_256>; 1743def : BitConvert <v16f16, v4f64, VReg_256>; 1744def : BitConvert <v16i16, v4f64, VReg_256>; 1745def : BitConvert <v4i64, v16f16, VReg_256>; 1746def : 
BitConvert <v4i64, v16i16, VReg_256>;
def : BitConvert <v4f64, v16f16, VReg_256>;
def : BitConvert <v4f64, v16i16, VReg_256>;

def : BitConvert <v8i32, v16bf16, VReg_256>;
def : BitConvert <v16bf16, v8i32, VReg_256>;
def : BitConvert <v8f32, v16bf16, VReg_256>;
def : BitConvert <v16bf16, v8f32, VReg_256>;
def : BitConvert <v4i64, v16bf16, VReg_256>;
def : BitConvert <v16bf16, v4i64, VReg_256>;
def : BitConvert <v4f64, v16bf16, VReg_256>;
def : BitConvert <v16bf16, v4f64, VReg_256>;

def : BitConvert <v16i16, v16bf16, SReg_256>;
def : BitConvert <v16bf16, v16i16, SReg_256>;
def : BitConvert <v16i16, v16bf16, VReg_256>;
def : BitConvert <v16bf16, v16i16, VReg_256>;

def : BitConvert <v16f16, v16bf16, SReg_256>;
def : BitConvert <v16bf16, v16f16, SReg_256>;
def : BitConvert <v16f16, v16bf16, VReg_256>;
def : BitConvert <v16bf16, v16f16, VReg_256>;

// 288-bit bitcast
def : BitConvert <v9i32, v9f32, SReg_288>;
def : BitConvert <v9f32, v9i32, SReg_288>;
def : BitConvert <v9i32, v9f32, VReg_288>;
def : BitConvert <v9f32, v9i32, VReg_288>;

// 320-bit bitcast
def : BitConvert <v10i32, v10f32, SReg_320>;
def : BitConvert <v10f32, v10i32, SReg_320>;
def : BitConvert <v10i32, v10f32, VReg_320>;
def : BitConvert <v10f32, v10i32, VReg_320>;

// 352-bit bitcast
def : BitConvert <v11i32, v11f32, SReg_352>;
def : BitConvert <v11f32, v11i32, SReg_352>;
def : BitConvert <v11i32, v11f32, VReg_352>;
def : BitConvert <v11f32, v11i32, VReg_352>;

// 384-bit bitcast
def : BitConvert <v12i32, v12f32, SReg_384>;
def : BitConvert <v12f32, v12i32, SReg_384>;
def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;

// 512-bit bitcast
def : BitConvert <v32f16, v32i16, VReg_512>;
def : BitConvert <v32i16, v32f16, VReg_512>;
def : BitConvert <v32f16, v16i32, VReg_512>;
def : BitConvert <v32f16, v16f32, VReg_512>;
def : BitConvert <v16f32, v32f16, VReg_512>;
def : BitConvert <v16i32, v32f16, VReg_512>;
def : BitConvert <v32i16, v16i32, VReg_512>;
def : BitConvert <v32i16, v16f32, VReg_512>;
def : BitConvert <v16f32, v32i16, VReg_512>;
def : BitConvert <v16i32, v32i16, VReg_512>;
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
def : BitConvert <v8i64, v8f64, VReg_512>;
def : BitConvert <v8f64, v8i64, VReg_512>;
def : BitConvert <v8i64, v16i32, VReg_512>;
def : BitConvert <v8f64, v16i32, VReg_512>;
def : BitConvert <v16i32, v8i64, VReg_512>;
def : BitConvert <v16i32, v8f64, VReg_512>;
def : BitConvert <v8i64, v16f32, VReg_512>;
def : BitConvert <v8f64, v16f32, VReg_512>;
def : BitConvert <v16f32, v8i64, VReg_512>;
def : BitConvert <v16f32, v8f64, VReg_512>;

def : BitConvert <v32bf16, v32i16, VReg_512>;
def : BitConvert <v32i16, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v32i16, SReg_512>;
def : BitConvert <v32i16, v32bf16, SReg_512>;

def : BitConvert <v32bf16, v32f16, VReg_512>;
def : BitConvert <v32f16, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v32f16, SReg_512>;
def : BitConvert <v32f16, v32bf16, SReg_512>;

def : BitConvert <v32bf16, v16i32, VReg_512>;
def : BitConvert <v16i32, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v16i32, SReg_512>;
def : BitConvert
<v16i32, v32bf16, SReg_512>; 1839 1840def : BitConvert <v32bf16, v16f32, VReg_512>; 1841def : BitConvert <v16f32, v32bf16, VReg_512>; 1842def : BitConvert <v32bf16, v16f32, SReg_512>; 1843def : BitConvert <v16f32, v32bf16, SReg_512>; 1844 1845def : BitConvert <v32bf16, v8f64, VReg_512>; 1846def : BitConvert <v8f64, v32bf16, VReg_512>; 1847def : BitConvert <v32bf16, v8f64, SReg_512>; 1848def : BitConvert <v8f64, v32bf16, SReg_512>; 1849 1850def : BitConvert <v32bf16, v8i64, VReg_512>; 1851def : BitConvert <v8i64, v32bf16, VReg_512>; 1852def : BitConvert <v32bf16, v8i64, SReg_512>; 1853def : BitConvert <v8i64, v32bf16, SReg_512>; 1854 1855// 1024-bit bitcast 1856def : BitConvert <v32i32, v32f32, VReg_1024>; 1857def : BitConvert <v32f32, v32i32, VReg_1024>; 1858def : BitConvert <v16i64, v16f64, VReg_1024>; 1859def : BitConvert <v16f64, v16i64, VReg_1024>; 1860def : BitConvert <v16i64, v32i32, VReg_1024>; 1861def : BitConvert <v32i32, v16i64, VReg_1024>; 1862def : BitConvert <v16f64, v32f32, VReg_1024>; 1863def : BitConvert <v32f32, v16f64, VReg_1024>; 1864def : BitConvert <v16i64, v32f32, VReg_1024>; 1865def : BitConvert <v32i32, v16f64, VReg_1024>; 1866def : BitConvert <v16f64, v32i32, VReg_1024>; 1867def : BitConvert <v32f32, v16i64, VReg_1024>; 1868 1869 1870/********** =================== **********/ 1871/********** Src & Dst modifiers **********/ 1872/********** =================== **********/ 1873 1874 1875// If denormals are not enabled, it only impacts the compare of the 1876// inputs. The output result is not flushed. 1877class ClampPat<Instruction inst, ValueType vt> : GCNPat < 1878 (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), 1879 (inst i32:$src0_modifiers, vt:$src0, 1880 i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) 1881>; 1882 1883def : ClampPat<V_MAX_F32_e64, f32>; 1884let SubtargetPredicate = isNotGFX12Plus in 1885def : ClampPat<V_MAX_F64_e64, f64>; 1886let SubtargetPredicate = isGFX12Plus in 1887def : ClampPat<V_MAX_NUM_F64_e64, f64>; 1888let SubtargetPredicate = NotHasTrue16BitInsts in 1889def : ClampPat<V_MAX_F16_e64, f16>; 1890let SubtargetPredicate = UseRealTrue16Insts in 1891def : ClampPat<V_MAX_F16_t16_e64, f16>; 1892let SubtargetPredicate = UseFakeTrue16Insts in 1893def : ClampPat<V_MAX_F16_fake16_e64, f16>; 1894 1895let SubtargetPredicate = HasVOP3PInsts in { 1896def : GCNPat < 1897 (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))), 1898 (V_PK_MAX_F16 $src0_modifiers, $src0, 1899 $src0_modifiers, $src0, DSTCLAMP.ENABLE) 1900>; 1901} 1902 1903 1904/********** ================================ **********/ 1905/********** Floating point absolute/negative **********/ 1906/********** ================================ **********/ 1907 1908def : GCNPat < 1909 (UniformUnaryFrag<fneg> (fabs (f32 SReg_32:$src))), 1910 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit 1911>; 1912 1913def : GCNPat < 1914 (UniformUnaryFrag<fabs> (f32 SReg_32:$src)), 1915 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) 1916>; 1917 1918def : GCNPat < 1919 (UniformUnaryFrag<fneg> (f32 SReg_32:$src)), 1920 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) 1921>; 1922 1923foreach fp16vt = [f16, bf16] in { 1924def : GCNPat < 1925 (UniformUnaryFrag<fneg> (fp16vt SReg_32:$src)), 1926 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) 1927>; 1928 1929def : GCNPat < 1930 (UniformUnaryFrag<fabs> (fp16vt SReg_32:$src)), 1931 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) 1932>; 1933 1934def : GCNPat < 1935 
(UniformUnaryFrag<fneg> (fabs (fp16vt SReg_32:$src))), 1936 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit 1937>; 1938} // End foreach fp16vt = ... 1939 1940def : GCNPat < 1941 (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)), 1942 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) 1943>; 1944 1945def : GCNPat < 1946 (UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)), 1947 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) 1948>; 1949 1950// This is really (fneg (fabs v2f16:$src)) 1951// 1952// fabs is not reported as free because there is modifier for it in 1953// VOP3P instructions, so it is turned into the bit op. 1954def : GCNPat < 1955 (UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), 1956 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1957>; 1958 1959def : GCNPat < 1960 (UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))), 1961 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1962>; 1963 1964 1965// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead 1966// of the real value. 1967def : GCNPat < 1968 (UniformUnaryFrag<fneg> (v2f32 SReg_64:$src)), 1969 (v2f32 (REG_SEQUENCE SReg_64, 1970 (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), 1971 (i32 (S_MOV_B32 (i32 0x80000000)))), 1972 SReg_32)), sub0, 1973 (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), 1974 (i32 (S_MOV_B32 (i32 0x80000000)))), 1975 SReg_32)), sub1)) 1976>; 1977 1978def : GCNPat < 1979 (UniformUnaryFrag<fabs> (v2f32 SReg_64:$src)), 1980 (v2f32 (REG_SEQUENCE SReg_64, 1981 (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub0)), 1982 (i32 (S_MOV_B32 (i32 0x7fffffff)))), 1983 SReg_32)), sub0, 1984 (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub1)), 1985 (i32 (S_MOV_B32 (i32 0x7fffffff)))), 1986 SReg_32)), sub1)) 1987>; 1988 1989def : GCNPat < 1990 (UniformUnaryFrag<fneg> (fabs (v2f32 SReg_64:$src))), 1991 (v2f32 (REG_SEQUENCE SReg_64, 1992 (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), 1993 (i32 (S_MOV_B32 (i32 0x80000000)))), 1994 SReg_32)), sub0, 1995 (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), 1996 (i32 (S_MOV_B32 (i32 0x80000000)))), 1997 SReg_32)), sub1)) 1998>; 1999 2000// FIXME: Use S_BITSET0_B32/B64? 2001def : GCNPat < 2002 (UniformUnaryFrag<fabs> (f64 SReg_64:$src)), 2003 (REG_SEQUENCE SReg_64, 2004 (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 2005 sub0, 2006 (i32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 2007 (S_MOV_B32 (i32 0x7fffffff))), SReg_32)), // Set sign bit. 2008 sub1) 2009>; 2010 2011def : GCNPat < 2012 (UniformUnaryFrag<fneg> (f64 SReg_64:$src)), 2013 (REG_SEQUENCE SReg_64, 2014 (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 2015 sub0, 2016 (i32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 2017 (i32 (S_MOV_B32 (i32 0x80000000)))), SReg_32)), 2018 sub1) 2019>; 2020 2021def : GCNPat < 2022 (UniformUnaryFrag<fneg> (fabs (f64 SReg_64:$src))), 2023 (REG_SEQUENCE SReg_64, 2024 (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 2025 sub0, 2026 (i32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 2027 (S_MOV_B32 (i32 0x80000000))), SReg_32)),// Set sign bit. 
    sub1)
>;


def : GCNPat <
  (fneg (fabs (f32 VGPR_32:$src))),
  (V_OR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) // Set sign bit
>;

def : GCNPat <
  (fabs (f32 VGPR_32:$src)),
  (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (f32 VGPR_32:$src)),
  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
>;

foreach fp16vt = [f16, bf16] in {
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let SubtargetPredicate = p in {
def : GCNPat <
  (fabs (fp16vt VGPR_32:$src)),
  (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (fp16vt VGPR_32:$src)),
  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (fabs (fp16vt VGPR_32:$src))),
  (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
>;
}

let SubtargetPredicate = UseRealTrue16Insts in {
def : GCNPat <
  (fabs (fp16vt VGPR_16:$src)),
  (V_AND_B16_t16_e64 (i32 0), (i16 0x7fff), (i32 0), VGPR_16:$src)
>;

def : GCNPat <
  (fneg (fp16vt VGPR_16:$src)),
  (V_XOR_B16_t16_e64 (i32 0), (i16 0x8000), (i32 0), VGPR_16:$src)
>;

def : GCNPat <
  (fneg (fabs (fp16vt VGPR_16:$src))),
  (V_OR_B16_t16_e64 (i32 0), (i16 0x8000), (i32 0), VGPR_16:$src) // Set sign bit
>;
} // End SubtargetPredicate = UseRealTrue16Insts
} // End foreach fp16vt = ...

def : GCNPat <
  (fneg (v2f16 VGPR_32:$src)),
  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (v2f16 VGPR_32:$src)),
  (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (v2f16 (fabs VGPR_32:$src))),
  (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_AND_B32_e64 (i32 (S_MOV_B32 (i32 0x7fffffff))),
                   (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
    sub1)
>;

def : GCNPat <
  (fneg (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_XOR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))),
                   (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
    sub1)
>;

def : GCNPat <
  (fneg (fabs (f64 VReg_64:$src))),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_OR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))),
                  (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
    sub1)
>;

def : GCNPat <
  (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)),
  (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, VReg_64:$src,
                11 /* OP_SEL_1 | NEG_LO | NEG_HI */, (i64 0),
                0, 0, 0, 0, 0)
> {
  let SubtargetPredicate = HasPackedFP32Ops;
}

foreach fp16vt = [f16, bf16] in {

def : GCNPat <
  (fcopysign fp16vt:$src0, fp16vt:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;

def : GCNPat <
  (fcopysign f32:$src0, fp16vt:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
                 (V_LSHLREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign f64:$src0, fp16vt:$src1),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
    (V_BFI_B32_e64 (S_MOV_B32
(i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
                   (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
>;

def : GCNPat <
  (fcopysign fp16vt:$src0, f32:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
                 (V_LSHRREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign fp16vt:$src0, f64:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
                 (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
} // End foreach fp16vt = [f16, bf16]

/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/

// FIXME: Remove VGPRImm. Should be inferrable from register bank.

foreach vt = [i32, p3, p5, p6, p2] in {
  def : GCNPat <
    (VGPRImm<(vt imm)>:$imm),
    (V_MOV_B32_e32 imm:$imm)
  >;

  def : GCNPat <
    (vt imm:$imm),
    (S_MOV_B32 imm:$imm)
  >;
}

// FIXME: The register bank of the frame index should depend on the
// users, and transitive users of the add. We may require an
// unnecessary copy from SGPR to VGPR.
def : GCNPat <
  (VGPRImm<(p5 frameindex)>:$fi),
  (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
>;

def : GCNPat <
  (p5 frameindex:$fi),
  (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
>;

def : GCNPat <
  (VGPRImm<(SIlds tglobaladdr:$ga)>),
  (V_MOV_B32_e32 $ga)
>;

def : GCNPat <
  (SIlds tglobaladdr:$ga),
  (S_MOV_B32 $ga)
>;

foreach pred = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in {
  let True16Predicate = pred in {
    def : GCNPat <
      (VGPRImm<(i16 imm)>:$imm),
      (V_MOV_B32_e32 imm:$imm)
    >;
  }

  // FIXME: Workaround for ordering issue with peephole optimizer where
  // a register class copy interferes with immediate folding. Should
  // use s_mov_b32, which can be shrunk to s_movk_i32

  foreach vt = [f16, bf16] in {
    def : GCNPat <
      (VGPRImm<(vt fpimm)>:$imm),
      (V_MOV_B32_e32 (vt (bitcast_fpimm_to_i32 $imm)))
    >;
  }
}

let True16Predicate = UseRealTrue16Insts in {
  def : GCNPat <
    (VGPRImm<(i16 imm)>:$imm),
    (V_MOV_B16_t16_e64 0, imm:$imm, 0)
  >;

  foreach vt = [f16, bf16] in {
    def : GCNPat <
      (VGPRImm<(vt fpimm)>:$imm),
      (V_MOV_B16_t16_e64 0, $imm, 0)
    >;
  }
}

// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
// immediate and will be expanded as needed, but we will only use these patterns
// for values which can be encoded.
def : GCNPat <
  (VGPRImm<(i64 imm)>:$imm),
  (V_MOV_B64_PSEUDO imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(f64 fpimm)>:$imm),
  (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
>;

def : GCNPat <
  (i64 imm:$imm),
  (S_MOV_B64_IMM_PSEUDO imm:$imm)
>;

def : GCNPat <
  (f64 fpimm:$imm),
  (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm)))
>;

def : GCNPat <
  (f32 fpimm:$imm),
  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (f16 fpimm:$imm),
  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (VGPRImm<(bf16 fpimm)>:$imm),
  (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (bf16 fpimm:$imm),
  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (VGPRImm<(f32 fpimm)>:$imm),
  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (f32 fpimm:$imm),
  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

foreach vt = [i64, p1, p0, p4] in { // FIXME: Should accept arbitrary addrspace
  def : GCNPat <
    (VGPRImm<(vt imm)>:$imm),
    (V_MOV_B64_PSEUDO imm:$imm)
  >;

  def : GCNPat <
    (vt InlineImm64:$imm),
    (S_MOV_B64 InlineImm64:$imm)
  >;

  def : GCNPat <
    (vt imm:$imm),
    (S_MOV_B64_IMM_PSEUDO imm:$imm)
  >;
}

def : GCNPat <
  (VGPRImm<(f64 fpimm)>:$imm),
  (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
>;

// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
// immediate and will be expanded as needed, but we will only use these patterns
// for values which can be encoded.
def : GCNPat <
  (f64 InlineImmFP64:$imm),
  (S_MOV_B64 (i64 (bitcast_fpimm_to_i64 $imm)))
>;

def : GCNPat <
  (f64 fpimm:$imm),
  (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm)))
>;

// Set to sign-extended 64-bit value (true = -1, false = 0)
def : GCNPat <(i1 imm:$imm),
  (S_MOV_B64 imm:$imm)> {
  let WaveSizePredicate = isWave64;
}

def : GCNPat <(i1 imm:$imm),
  (S_MOV_B32 imm:$imm)> {
  let WaveSizePredicate = isWave32;
}

/********** ================== **********/
/********** Intrinsic Patterns **********/
/********** ================== **********/

def : GCNPat <
  (f32 (fpow (VOP3Mods f32:$src0, i32:$src0_mods), (VOP3Mods f32:$src1, i32:$src1_mods))),
  (V_EXP_F32_e64 SRCMODS.NONE, (V_MUL_LEGACY_F32_e64 $src1_mods, $src1, SRCMODS.NONE, (V_LOG_F32_e64 $src0_mods, $src0), 0, 0))
>;

def : GCNPat <
  (i32 (sext i1:$src0)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 -1), i1:$src0)
>;

class Ext32Pat <SDNode ext> : GCNPat <
  (i32 (ext i1:$src0)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0)
>;

def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;

// The multiplication scales from [0,1) to the unsigned integer range,
// rounding down a bit to avoid unwanted overflow.
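// Roughly, in C (an illustrative sketch of the expansion selected below; the
// magic constant is 2^32 - 512, i.e. slightly under 2^32):
//   uint32_t urecip(uint32_t d) {
//     float r = 1.0f / (float)d;              // V_CVT_F32_U32 + V_RCP_IFLAG_F32
//     return (uint32_t)(4294966784.0f * r);   // V_MUL_F32 + V_CVT_U32_F32
//   }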
2372def : GCNPat < 2373 (AMDGPUurecip i32:$src0), 2374 (V_CVT_U32_F32_e32 2375 (V_MUL_F32_e32 (i32 CONST.FP_4294966784), 2376 (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) 2377>; 2378 2379//===----------------------------------------------------------------------===// 2380// VOP3 Patterns 2381//===----------------------------------------------------------------------===// 2382 2383def : IMad24Pat<V_MAD_I32_I24_e64, 1>; 2384def : UMad24Pat<V_MAD_U32_U24_e64, 1>; 2385 2386// BFI patterns 2387 2388def BFIImm32 : PatFrag< 2389 (ops node:$x, node:$y, node:$z), 2390 (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))), 2391 [{ 2392 auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1)); 2393 auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1)); 2394 return X && NotX && 2395 ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue(); 2396 }] 2397>; 2398 2399 2400// Definition from ISA doc: 2401// (y & x) | (z & ~x) 2402def : AMDGPUPatIgnoreCopies < 2403 (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), 2404 (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), 2405 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), 2406 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) 2407>; 2408 2409// (y & C) | (z & ~C) 2410def : AMDGPUPatIgnoreCopies < 2411 (BFIImm32 i32:$x, i32:$y, i32:$z), 2412 (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) 2413>; 2414 2415// 64-bit version 2416def : AMDGPUPatIgnoreCopies < 2417 (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), 2418 (REG_SEQUENCE VReg_64, 2419 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 2420 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), 2421 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, 2422 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 2423 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), 2424 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) 2425>; 2426 2427// SHA-256 Ch function 2428// z ^ (x & (y ^ z)) 2429def : AMDGPUPatIgnoreCopies < 2430 (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), 2431 (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), 2432 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), 2433 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) 2434>; 2435 2436// 64-bit version 2437def : AMDGPUPatIgnoreCopies < 2438 (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), 2439 (REG_SEQUENCE VReg_64, 2440 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 2441 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), 2442 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, 2443 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 2444 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), 2445 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) 2446>; 2447 2448def : AMDGPUPat < 2449 (fcopysign f32:$src0, f32:$src1), 2450 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1) 2451>; 2452 2453def : AMDGPUPat < 2454 (fcopysign f32:$src0, f64:$src1), 2455 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, 2456 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) 2457>; 2458 2459def : AMDGPUPat < 2460 (fcopysign f64:$src0, f64:$src1), 2461 (REG_SEQUENCE SReg_64, 2462 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 2463 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), 2464 (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)), 2465 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1) 2466>; 2467 2468def : AMDGPUPat < 2469 (fcopysign f64:$src0, f32:$src1), 2470 (REG_SEQUENCE SReg_64, 2471 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 2472 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), 2473 (i32 
(EXTRACT_SUBREG SReg_64:$src0, sub1)), 2474 $src1), sub1) 2475>; 2476 2477def : ROTRPattern <V_ALIGNBIT_B32_e64>; 2478 2479def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), 2480 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 2481 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 2482 2483def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), 2484 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 2485 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 2486 2487/********** ====================== **********/ 2488/********** Indirect addressing **********/ 2489/********** ====================== **********/ 2490 2491multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { 2492 // Extract with offset 2493 def : GCNPat< 2494 (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), 2495 (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) 2496 >; 2497 2498 // Insert with offset 2499 def : GCNPat< 2500 (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), 2501 (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) 2502 >; 2503} 2504 2505defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; 2506defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; 2507defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; 2508defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">; 2509defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">; 2510defm : SI_INDIRECT_Pattern <v11f32, f32, "V11">; 2511defm : SI_INDIRECT_Pattern <v12f32, f32, "V12">; 2512defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; 2513defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">; 2514 2515defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; 2516defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; 2517defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; 2518defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">; 2519defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">; 2520defm : SI_INDIRECT_Pattern <v11i32, i32, "V11">; 2521defm : SI_INDIRECT_Pattern <v12i32, i32, "V12">; 2522defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; 2523defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">; 2524 2525//===----------------------------------------------------------------------===// 2526// SAD Patterns 2527//===----------------------------------------------------------------------===// 2528 2529def : GCNPat < 2530 (add (sub_oneuse (umax i32:$src0, i32:$src1), 2531 (umin i32:$src0, i32:$src1)), 2532 i32:$src2), 2533 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 2534>; 2535 2536def : GCNPat < 2537 (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)), 2538 (sub i32:$src0, i32:$src1), 2539 (sub i32:$src1, i32:$src0)), 2540 i32:$src2), 2541 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 2542>; 2543 2544//===----------------------------------------------------------------------===// 2545// Conversion Patterns 2546//===----------------------------------------------------------------------===// 2547def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)), 2548 (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 2549 2550// Handle sext_inreg in i64 2551def : GCNPat < 2552 (i64 (UniformSextInreg<i1> i64:$src)), 2553 (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 2554>; 2555 2556def : GCNPat < 2557 (i16 (UniformSextInreg<i1> i16:$src)), 2558 (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 2559>; 2560 2561def : GCNPat < 2562 (i16 (UniformSextInreg<i8> i16:$src)), 2563 (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 2564>; 2565 2566def : GCNPat < 2567 (i64 (UniformSextInreg<i8> 
i64:$src)), 2568 (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 2569>; 2570 2571def : GCNPat < 2572 (i64 (UniformSextInreg<i16> i64:$src)), 2573 (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 2574>; 2575 2576def : GCNPat < 2577 (i64 (UniformSextInreg<i32> i64:$src)), 2578 (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 2579>; 2580 2581def : GCNPat< 2582 (i32 (DivergentSextInreg<i1> i32:$src)), 2583 (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>; 2584 2585def : GCNPat < 2586 (i16 (DivergentSextInreg<i1> i16:$src)), 2587 (V_BFE_I32_e64 $src, (i32 0), (i32 1)) 2588>; 2589 2590def : GCNPat < 2591 (i16 (DivergentSextInreg<i8> i16:$src)), 2592 (V_BFE_I32_e64 $src, (i32 0), (i32 8)) 2593>; 2594 2595def : GCNPat< 2596 (i32 (DivergentSextInreg<i8> i32:$src)), 2597 (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8)) 2598>; 2599 2600def : GCNPat < 2601 (i32 (DivergentSextInreg<i16> i32:$src)), 2602 (V_BFE_I32_e64 $src, (i32 0), (i32 16)) 2603>; 2604 2605def : GCNPat < 2606 (i64 (DivergentSextInreg<i1> i64:$src)), 2607 (REG_SEQUENCE VReg_64, 2608 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0, 2609 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1) 2610>; 2611 2612def : GCNPat < 2613 (i64 (DivergentSextInreg<i8> i64:$src)), 2614 (REG_SEQUENCE VReg_64, 2615 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0, 2616 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1) 2617>; 2618 2619def : GCNPat < 2620 (i64 (DivergentSextInreg<i16> i64:$src)), 2621 (REG_SEQUENCE VReg_64, 2622 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0, 2623 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1) 2624>; 2625 2626def : GCNPat < 2627 (i64 (DivergentSextInreg<i32> i64:$src)), 2628 (REG_SEQUENCE VReg_64, 2629 (i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0, 2630 (V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1) 2631>; 2632 2633def : GCNPat < 2634 (i64 (zext i32:$src)), 2635 (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) 2636>; 2637 2638def : GCNPat < 2639 (i64 (anyext i32:$src)), 2640 (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) 2641>; 2642 2643class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < 2644 (i64 (ext i1:$src)), 2645 (REG_SEQUENCE VReg_64, 2646 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2647 /*src1mod*/(i32 0), /*src1*/(i32 1), $src), 2648 sub0, (S_MOV_B32 (i32 0)), sub1) 2649>; 2650 2651 2652def : ZExt_i64_i1_Pat<zext>; 2653def : ZExt_i64_i1_Pat<anyext>; 2654 2655// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that 2656// REG_SEQUENCE patterns don't support instructions with multiple outputs. 
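// (S_ASHR_I32 in the uniform sext pattern below is such a case: it also
// writes SCC, so its result goes through COPY_TO_REGCLASS before feeding
// the REG_SEQUENCE.)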
def : GCNPat <
  (i64 (UniformUnaryFrag<sext> i32:$src)),
    (REG_SEQUENCE SReg_64, $src, sub0,
    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;

def : GCNPat <
  (i64 (DivergentUnaryFrag<sext> i32:$src)),
    (REG_SEQUENCE VReg_64, $src, sub0,
    (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1)
>;

def : GCNPat <
  (i64 (sext i1:$src)),
  (REG_SEQUENCE VReg_64,
    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                       /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0,
    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                       /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1)
>;

class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat <
  (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
  (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;

let OtherPredicates = [NotHasTrue16BitInsts] in {
  def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
  def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
} // end OtherPredicates = [NotHasTrue16BitInsts]

let OtherPredicates = [HasTrue16BitInsts] in {
  def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
  def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
} // end OtherPredicates = [HasTrue16BitInsts]

def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;

// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
// comparisons may write to a pair of SGPRs or a single SGPR, so treat
// these as 32 or 64-bit comparisons. When legalizing SGPR copies,
// instructions whose results are copied from SCC into these operations
// will be moved to the VALU.
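//
// Note that the i1 add and sub patterns below also select to s_xor:
// addition and subtraction modulo 2 are both equivalent to exclusive-or.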
2704 2705let WaveSizePredicate = isWave64 in { 2706def : GCNPat < 2707 (i1 (and i1:$src0, i1:$src1)), 2708 (S_AND_B64 $src0, $src1) 2709>; 2710 2711def : GCNPat < 2712 (i1 (or i1:$src0, i1:$src1)), 2713 (S_OR_B64 $src0, $src1) 2714>; 2715 2716def : GCNPat < 2717 (i1 (xor i1:$src0, i1:$src1)), 2718 (S_XOR_B64 $src0, $src1) 2719>; 2720 2721def : GCNPat < 2722 (i1 (add i1:$src0, i1:$src1)), 2723 (S_XOR_B64 $src0, $src1) 2724>; 2725 2726def : GCNPat < 2727 (i1 (sub i1:$src0, i1:$src1)), 2728 (S_XOR_B64 $src0, $src1) 2729>; 2730 2731let AddedComplexity = 1 in { 2732def : GCNPat < 2733 (i1 (add i1:$src0, (i1 -1))), 2734 (S_NOT_B64 $src0) 2735>; 2736 2737def : GCNPat < 2738 (i1 (sub i1:$src0, (i1 -1))), 2739 (S_NOT_B64 $src0) 2740>; 2741} 2742} // end isWave64 2743 2744let WaveSizePredicate = isWave32 in { 2745def : GCNPat < 2746 (i1 (and i1:$src0, i1:$src1)), 2747 (S_AND_B32 $src0, $src1) 2748>; 2749 2750def : GCNPat < 2751 (i1 (or i1:$src0, i1:$src1)), 2752 (S_OR_B32 $src0, $src1) 2753>; 2754 2755def : GCNPat < 2756 (i1 (xor i1:$src0, i1:$src1)), 2757 (S_XOR_B32 $src0, $src1) 2758>; 2759 2760def : GCNPat < 2761 (i1 (add i1:$src0, i1:$src1)), 2762 (S_XOR_B32 $src0, $src1) 2763>; 2764 2765def : GCNPat < 2766 (i1 (sub i1:$src0, i1:$src1)), 2767 (S_XOR_B32 $src0, $src1) 2768>; 2769 2770let AddedComplexity = 1 in { 2771def : GCNPat < 2772 (i1 (add i1:$src0, (i1 -1))), 2773 (S_NOT_B32 $src0) 2774>; 2775 2776def : GCNPat < 2777 (i1 (sub i1:$src0, (i1 -1))), 2778 (S_NOT_B32 $src0) 2779>; 2780} 2781} // end isWave32 2782 2783def : GCNPat < 2784 (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))), 2785 (V_NOT_B32_e32 $src0) 2786>; 2787 2788def : GCNPat < 2789 (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))), 2790 (REG_SEQUENCE VReg_64, 2791 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0, 2792 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1 2793 ) 2794>; 2795 2796let SubtargetPredicate = NotHasTrue16BitInsts in 2797def : GCNPat < 2798 (f16 (sint_to_fp i1:$src)), 2799 (V_CVT_F16_F32_e32 ( 2800 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2801 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2802 SSrc_i1:$src)) 2803>; 2804 2805let True16Predicate = UseRealTrue16Insts in 2806def : GCNPat < 2807 (f16 (sint_to_fp i1:$src)), 2808 (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0, 2809 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2810 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2811 SSrc_i1:$src), 2812 /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0) 2813>; 2814 2815let True16Predicate = UseFakeTrue16Insts in 2816def : GCNPat < 2817 (f16 (sint_to_fp i1:$src)), 2818 (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0, 2819 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2820 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2821 SSrc_i1:$src), 2822 /*clamp*/ 0, /*omod*/ 0) 2823>; 2824 2825let True16Predicate = NotHasTrue16BitInsts in 2826def : GCNPat < 2827 (f16 (uint_to_fp i1:$src)), 2828 (V_CVT_F16_F32_e32 ( 2829 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2830 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2831 SSrc_i1:$src)) 2832>; 2833 2834let True16Predicate = UseRealTrue16Insts in 2835def : GCNPat < 2836 (f16 (uint_to_fp i1:$src)), 2837 (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0, 2838 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2839 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2840 SSrc_i1:$src), 2841 /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0) 2842>; 2843 2844let True16Predicate = UseFakeTrue16Insts in 2845def : GCNPat < 2846 
(f16 (uint_to_fp i1:$src)),
  (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
                         SSrc_i1:$src),
      /*clamp*/ 0, /*omod*/ 0)
>;

def : GCNPat <
  (f32 (sint_to_fp i1:$src)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
                     SSrc_i1:$src)
>;

def : GCNPat <
  (f32 (uint_to_fp i1:$src)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
                     SSrc_i1:$src)
>;

def : GCNPat <
  (f64 (sint_to_fp i1:$src)),
  (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                                        /*src1mod*/(i32 0), /*src1*/(i32 -1),
                                        SSrc_i1:$src))
>;

def : GCNPat <
  (f64 (uint_to_fp i1:$src)),
  (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                                        /*src1mod*/(i32 0), /*src1*/(i32 1),
                                        SSrc_i1:$src))
>;

//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//

// Eliminate a zero extension from an fp16 operation if it already
// zeros the high bits of the 32-bit register.
//
// This is complicated on gfx9+. Some instructions maintain the legacy
// zeroing behavior, but others preserve the high bits. Some have a
// control bit to change the behavior. We can't simply say with
// certainty what the source behavior is without more context on how
// the src is lowered. e.g. fptrunc + fma may or may not be lowered to a
// v_fma_mix* instruction, which does not zero the high bits.
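//
// The fp16_zeros_high_16bits fragment used further below is therefore
// conservative: it is expected to match only sources known to write zeros
// to the high half.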
2895def : GCNPat< 2896 (i32 (DivergentUnaryFrag<abs> i32:$src)), 2897 (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>; 2898 2899let AddedComplexity = 1 in { 2900def : GCNPat< 2901 (i32 (DivergentUnaryFrag<abs> i32:$src)), 2902 (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{ 2903 let SubtargetPredicate = HasAddNoCarryInsts; 2904} 2905} // AddedComplexity = 1 2906 2907def : GCNPat< 2908 (i32 (DivergentUnaryFrag<zext> i16:$src)), 2909 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src) 2910>; 2911 2912def : GCNPat< 2913 (i64 (DivergentUnaryFrag<zext> i16:$src)), 2914 (REG_SEQUENCE VReg_64, 2915 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0, 2916 (S_MOV_B32 (i32 0)), sub1) 2917>; 2918 2919def : GCNPat< 2920 (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), 2921 (COPY VSrc_b16:$src)>; 2922 2923def : GCNPat < 2924 (i32 (trunc i64:$a)), 2925 (EXTRACT_SUBREG $a, sub0) 2926>; 2927 2928def : GCNPat < 2929 (i1 (UniformUnaryFrag<trunc> i32:$a)), 2930 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) 2931>; 2932 2933def : GCNPat < 2934 (i1 (UniformUnaryFrag<trunc> i16:$a)), 2935 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) 2936>; 2937 2938def : GCNPat < 2939 (i1 (UniformUnaryFrag<trunc> i64:$a)), 2940 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), 2941 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) 2942>; 2943 2944def : GCNPat < 2945 (i1 (DivergentUnaryFrag<trunc> i32:$a)), 2946 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 2947>; 2948 2949def : GCNPat < 2950 (i1 (DivergentUnaryFrag<trunc> i16:$a)), 2951 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 2952>; 2953 2954def IMMBitSelConst : SDNodeXForm<imm, [{ 2955 return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N), 2956 MVT::i32); 2957}]>; 2958 2959// Matching separate SRL and TRUNC instructions 2960// with dependent operands (SRL dest is source of TRUNC) 2961// generates three instructions. However, by using bit shifts, 2962// the V_LSHRREV_B32_e64 result can be directly used in the 2963// operand of the V_AND_B32_e64 instruction: 2964// (trunc i32 (srl i32 $a, i32 $b)) -> 2965// v_and_b32_e64 $a, (1 << $b), $a 2966// v_cmp_ne_u32_e64 $a, 0, $a 2967 2968// Handle the VALU case. 2969def : GCNPat < 2970 (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), 2971 (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a), 2972 (i32 0)) 2973>; 2974 2975// Handle the scalar case. 2976def : GCNPat < 2977 (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), 2978 (S_CMP_LG_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a), 2979 (i32 0)) 2980>; 2981 2982def : GCNPat < 2983 (i1 (DivergentUnaryFrag<trunc> i64:$a)), 2984 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), 2985 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) 2986>; 2987 2988def : GCNPat < 2989 (i32 (bswap i32:$a)), 2990 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2991 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)), 2992 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8))) 2993>; 2994 2995// FIXME: This should have been narrowed to i32 during legalization. 
2996// This pattern should also be skipped for GlobalISel 2997def : GCNPat < 2998 (i64 (bswap i64:$a)), 2999 (REG_SEQUENCE VReg_64, 3000 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 3001 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 3002 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 3003 (i32 24)), 3004 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 3005 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 3006 (i32 8))), 3007 sub0, 3008 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 3009 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 3010 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 3011 (i32 24)), 3012 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 3013 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 3014 (i32 8))), 3015 sub1) 3016>; 3017 3018// FIXME: The AddedComplexity should not be needed, but in GlobalISel 3019// the BFI pattern ends up taking precedence without it. 3020let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { 3021// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24) 3022// 3023// My reading of the manual suggests we should be using src0 for the 3024// register value, but this is what seems to work. 3025def : GCNPat < 3026 (i32 (bswap i32:$a)), 3027 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203))) 3028>; 3029 3030// FIXME: This should have been narrowed to i32 during legalization. 3031// This pattern should also be skipped for GlobalISel 3032def : GCNPat < 3033 (i64 (bswap i64:$a)), 3034 (REG_SEQUENCE VReg_64, 3035 (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1), 3036 (S_MOV_B32 (i32 0x00010203))), 3037 sub0, 3038 (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0), 3039 (S_MOV_B32 (i32 0x00010203))), 3040 sub1) 3041>; 3042 3043// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) 3044// The 12s emit 0s. 3045foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in 3046let True16Predicate = p in { 3047def : GCNPat < 3048 (i16 (bswap i16:$a)), 3049 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 3050>; 3051 3052def : GCNPat < 3053 (i32 (zext (bswap i16:$a))), 3054 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 3055>; 3056} 3057 3058let True16Predicate = UseRealTrue16Insts in { 3059def : GCNPat < 3060 (i16 (bswap i16:$a)), 3061 (EXTRACT_SUBREG (V_PERM_B32_e64 (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001))), lo16) 3062>; 3063 3064def : GCNPat < 3065 (i32 (zext (bswap i16:$a))), 3066 (V_PERM_B32_e64 (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001))) 3067>; 3068} 3069 3070// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24) 3071def : GCNPat < 3072 (v2i16 (bswap v2i16:$a)), 3073 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001))) 3074>; 3075 3076} 3077 3078def : GCNPat< 3079 (i64 (DivergentUnaryFrag<bitreverse> i64:$a)), 3080 (REG_SEQUENCE VReg_64, 3081 (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0, 3082 (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>; 3083 3084// If fcanonicalize's operand is implicitly canonicalized, we only need a copy. 3085let AddedComplexity = 8 in { 3086foreach vt = [f16, v2f16, f32, v2f32, f64] in { 3087 def : GCNPat< 3088 (fcanonicalize (vt is_canonicalized:$src)), 3089 (COPY vt:$src) 3090 >; 3091} 3092} 3093 3094// Prefer selecting to max when legal, but using mul is always valid. 
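// (Multiplying by 1.0 quiets signaling NaNs and flushes denormals according
// to the current mode, which is presumably why the mul form is always usable;
// the max form is only correct where v_max honors the denormal mode.)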
3095let AddedComplexity = -5 in { 3096 3097let True16Predicate = NotHasTrue16BitInsts in { 3098def : GCNPat< 3099 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 3100 (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) 3101>; 3102 3103def : GCNPat< 3104 (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), 3105 (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) 3106>; 3107} // End True16Predicate 3108 3109let True16Predicate = UseRealTrue16Insts in { 3110def : GCNPat< 3111 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 3112 (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) 3113>; 3114 3115def : GCNPat< 3116 (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), 3117 (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) 3118>; 3119} // End True16Predicate 3120 3121let True16Predicate = UseFakeTrue16Insts in { 3122def : GCNPat< 3123 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 3124 (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) 3125>; 3126 3127def : GCNPat< 3128 (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), 3129 (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) 3130>; 3131} // End True16Predicate 3132 3133def : GCNPat< 3134 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), 3135 (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) 3136>; 3137 3138def : GCNPat< 3139 (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), 3140 (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src) 3141>; 3142 3143def : GCNPat< 3144 (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))), 3145 (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src) 3146>; 3147 3148let SubtargetPredicate = HasPackedFP32Ops in { 3149def : GCNPat< 3150 (fcanonicalize (v2f32 (VOP3PMods v2f32:$src, i32:$src_mods))), 3151 (V_PK_MUL_F32 0, (i64 CONST.FP32_ONE), $src_mods, $src) 3152>; 3153} 3154 3155// TODO: Handle fneg like other types. 
let SubtargetPredicate = isNotGFX12Plus in {
def : GCNPat<
  (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
  (V_MUL_F64_e64 0, (i64 CONST.FP64_ONE), $src_mods, $src)
>;
}
} // End AddedComplexity = -5

multiclass SelectCanonicalizeAsMax<
  list<Predicate> f32_preds = [],
  list<Predicate> f64_preds = [],
  list<Predicate> f16_preds = []> {
  def : GCNPat<
    (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
    (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f32_preds;
  }

  def : GCNPat<
    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
    (V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = !listconcat(f64_preds, [isNotGFX12Plus]);
  }

  def : GCNPat<
    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
    (V_MAX_NUM_F64_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = !listconcat(f64_preds, [isGFX12Plus]);
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]);
    let True16Predicate = NotHasTrue16BitInsts;
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_t16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]);
    let True16Predicate = UseRealTrue16Insts;
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_fake16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]);
    let True16Predicate = UseFakeTrue16Insts;
  }

  def : GCNPat<
    (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
    (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
    // FIXME: Should have VOP3P subtarget predicate
    let OtherPredicates = f16_preds;
  }
}

// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
// mode, and would never flush. For f64, it's faster to implement
// this with a max. For f16/f32 it's a wash, but prefer max when
// valid.
//
// FIXME: Lowering f32/f16 with max is worse since we can use a
// smaller encoding if the input is fneg'd. It also adds an extra
// register use.
let SubtargetPredicate = HasMinMaxDenormModes in {
  defm : SelectCanonicalizeAsMax<[], [], []>;
} // End SubtargetPredicate = HasMinMaxDenormModes

let SubtargetPredicate = NotHasMinMaxDenormModes in {
  // Use the max lowering if we don't need to flush.

  // FIXME: We don't use this for f32 as a workaround for the
  // library being compiled with the default ieee mode, but
  // potentially being called from flushing kernels. Really we should
  // not be mixing code expecting different default FP modes, but mul
  // works in any FP environment.
  defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
} // End SubtargetPredicate = NotHasMinMaxDenormModes


let OtherPredicates = [HasDLInsts] in {
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
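// (The likely reason: FMAC ties the addend to the destination, so the
// modifier-free form can later be shrunk to the smaller e32 encoding; once
// VOP3 source modifiers are needed that saving is gone, and FMA avoids
// clobbering the addend.)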
3242def : GCNPat < 3243 (fma (f32 (VOP3NoMods f32:$src0)), 3244 (f32 (VOP3NoMods f32:$src1)), 3245 (f32 (VOP3NoMods f32:$src2))), 3246 (V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 3247 SRCMODS.NONE, $src2) 3248>; 3249} // End OtherPredicates = [HasDLInsts] 3250 3251let SubtargetPredicate = isGFX10Plus in { 3252// Don't allow source modifiers. If there are any source modifiers then it's 3253// better to select fma instead of fmac. 3254let True16Predicate = NotHasTrue16BitInsts in 3255def : GCNPat < 3256 (fma (f16 (VOP3NoMods f32:$src0)), 3257 (f16 (VOP3NoMods f32:$src1)), 3258 (f16 (VOP3NoMods f32:$src2))), 3259 (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 3260 SRCMODS.NONE, $src2) 3261>; 3262let True16Predicate = UseFakeTrue16Insts in 3263def : GCNPat < 3264 (fma (f16 (VOP3NoMods f16:$src0)), 3265 (f16 (VOP3NoMods f16:$src1)), 3266 (f16 (VOP3NoMods f16:$src2))), 3267 (V_FMAC_F16_fake16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 3268 SRCMODS.NONE, $src2) 3269>; 3270} 3271 3272let OtherPredicates = [HasFmacF64Inst] in 3273// Don't allow source modifiers. If there are any source modifiers then it's 3274// better to select fma instead of fmac. 3275def : GCNPat < 3276 (fma (f64 (VOP3NoMods f64:$src0)), 3277 (f64 (VOP3NoMods f64:$src1)), 3278 (f64 (VOP3NoMods f64:$src2))), 3279 (V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 3280 SRCMODS.NONE, $src2) 3281>; 3282 3283// COPY is workaround tablegen bug from multiple outputs 3284// from S_LSHL_B32's multiple outputs from implicit scc def. 3285let AddedComplexity = 1 in { 3286def : GCNPat < 3287 (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), 3288 (S_LSHL_B32 SReg_32:$src1, (i16 16)) 3289>; 3290 3291def : GCNPat < 3292 (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))), 3293 (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1)) 3294>; 3295 3296 3297def : GCNPat < 3298 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), 3299 (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) 3300>; 3301 3302def : GCNPat < 3303 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))), 3304 (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) 3305>; 3306 3307def : GCNPat < 3308 (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), 3309 (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) 3310>; 3311 3312def : GCNPat < 3313 (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), 3314 (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) 3315>; 3316 3317foreach vecTy = [v2i16, v2f16, v2bf16] in { 3318 3319defvar Ty = vecTy.ElementType; 3320 3321def : GCNPat < 3322 (vecTy (UniformBinFrag<build_vector> (Ty SReg_32:$src0), (Ty undef))), 3323 (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) 3324>; 3325 3326def : GCNPat < 3327 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))), 3328 (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32) 3329>; 3330 3331def : GCNPat < 3332 (vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))), 3333 (S_LSHL_B32 SReg_32:$src1, (i32 16)) 3334>; 3335 3336def : GCNPat < 3337 (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))), 3338 (vecTy (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) 3339>; 3340} // End foreach Ty = ... 
3341} 3342 3343let SubtargetPredicate = HasVOP3PInsts in { 3344def : GCNPat < 3345 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))), 3346 (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) 3347>; 3348 3349// With multiple uses of the shift, this will duplicate the shift and 3350// increase register pressure. 3351def : GCNPat < 3352 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 3353 (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1)) 3354>; 3355 3356def : GCNPat < 3357 (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), 3358 (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 3359 (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) 3360>; 3361 3362 3363foreach vecTy = [v2i16, v2f16, v2bf16] in { 3364 3365defvar Ty = vecTy.ElementType; 3366defvar immzeroTy = !if(!eq(Ty, i16), immzero, fpimmzero); 3367 3368def : GCNPat < 3369 (vecTy (UniformBinFrag<build_vector> (Ty SReg_32:$src0), (Ty SReg_32:$src1))), 3370 (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) 3371>; 3372 3373// Take the lower 16 bits from each VGPR_32 and concat them 3374def : GCNPat < 3375 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))), 3376 (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100))) 3377>; 3378 3379 3380// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] 3381// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) 3382def : GCNPat < 3383 (vecTy (DivergentBinFrag<build_vector> (Ty (immzeroTy)), 3384 (Ty !if(!eq(Ty, i16), 3385 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 3386 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 3387 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b) 3388>; 3389 3390 3391// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] 3392// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) 3393def : GCNPat < 3394 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), 3395 (Ty !if(!eq(Ty, i16), 3396 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 3397 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 3398 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b) 3399>; 3400 3401 3402// Take the upper 16 bits from V[0] and the lower 16 bits from V[1] 3403// Special case, can use V_ALIGNBIT (always uses encoded literal) 3404def : GCNPat < 3405 (vecTy (DivergentBinFrag<build_vector> 3406 (Ty !if(!eq(Ty, i16), 3407 (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 3408 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), 3409 (Ty VGPR_32:$b))), 3410 (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) 3411>; 3412 3413// Take the upper 16 bits from each VGPR_32 and concat them 3414def : GCNPat < 3415 (vecTy (DivergentBinFrag<build_vector> 3416 (Ty !if(!eq(Ty, i16), 3417 (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 3418 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), 3419 (Ty !if(!eq(Ty, i16), 3420 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 3421 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 3422 (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302))) 3423>; 3424 3425 3426} // end foreach Ty 3427 3428} // End SubtargetPredicate = HasVOP3PInsts 3429 3430let AddedComplexity = 5 in { 3431class PackB32Pat<Instruction inst> : GCNPat < 3432 (v2f16 (is_canonicalized_2<build_vector> (f16 (VOP3Mods (f16 
VGPR_32:$src0), i32:$src0_mods)), 3433 (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), 3434 (inst $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) 3435>; 3436} 3437let SubtargetPredicate = isGFX9Plus in { 3438let True16Predicate = NotHasTrue16BitInsts in 3439 def : PackB32Pat<V_PACK_B32_F16_e64>; 3440 3441let True16Predicate = UseRealTrue16Insts in 3442 def : PackB32Pat<V_PACK_B32_F16_t16_e64>; 3443 3444let True16Predicate = UseFakeTrue16Insts in 3445 def : PackB32Pat<V_PACK_B32_F16_fake16_e64>; 3446} // End SubtargetPredicate = isGFX9Plus 3447 3448// With multiple uses of the shift, this will duplicate the shift and 3449// increase register pressure. 3450let SubtargetPredicate = isGFX11Plus in 3451def : GCNPat < 3452 (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 SReg_32:$src1))), 3453 (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1)) 3454>; 3455 3456 3457def : GCNPat < 3458 (v2f16 (scalar_to_vector f16:$src0)), 3459 (COPY $src0) 3460>; 3461 3462def : GCNPat < 3463 (v2i16 (scalar_to_vector i16:$src0)), 3464 (COPY $src0) 3465>; 3466 3467def : GCNPat < 3468 (v4i16 (scalar_to_vector i16:$src0)), 3469 (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) 3470>; 3471 3472def : GCNPat < 3473 (v4f16 (scalar_to_vector f16:$src0)), 3474 (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) 3475>; 3476 3477def : GCNPat < 3478 (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, 3479 timm:$bank_mask, timm:$bound_ctrl)), 3480 (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src, 3481 (as_i32timm $dpp_ctrl), (as_i32timm $row_mask), 3482 (as_i32timm $bank_mask), 3483 (as_i1timm $bound_ctrl)) 3484>; 3485 3486foreach vt = Reg64Types.types in { 3487def : GCNPat < 3488 (vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl, timm:$row_mask, 3489 timm:$bank_mask, timm:$bound_ctrl)), 3490 (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl), 3491 (as_i32timm $row_mask), (as_i32timm $bank_mask), 3492 (as_i1timm $bound_ctrl)) 3493>; 3494} 3495 3496//===----------------------------------------------------------------------===// 3497// Fract Patterns 3498//===----------------------------------------------------------------------===// 3499 3500let SubtargetPredicate = isGFX6 in { 3501 3502// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is 3503// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient 3504// way to implement it is using V_FRACT_F64. 3505// The workaround for the V_FRACT bug is: 3506// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3507 3508// Convert floor(x) to (x - fract(x)) 3509 3510// Don't bother handling this for GlobalISel, it's handled during 3511// lowering. 3512// 3513// FIXME: DAG should also custom lower this. 
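// Roughly, the sequence selected below is (a sketch; register assignments
// are illustrative only):
//   v_mov_b64 (pseudo)  v[4:5], 0x3fefffffffffffff
//   v_fract_f64         v[2:3], v[0:1]
//   v_min_f64           v[2:3], v[2:3], v[4:5]
//   v_cmp_class_f64     vcc, v[0:1], 3      ; class mask 3 = signaling/quiet NaN
//   v_cndmask (64-bit pseudo) v[2:3], v[2:3], v[0:1], vcc
//   v_add_f64           v[0:1], v[0:1], -v[2:3]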
3514def : GCNPat < 3515 (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), 3516 (V_ADD_F64_e64 3517 $mods, 3518 $x, 3519 SRCMODS.NEG, 3520 (V_CNDMASK_B64_PSEUDO 3521 (V_MIN_F64_e64 3522 SRCMODS.NONE, 3523 (V_FRACT_F64_e64 $mods, $x), 3524 SRCMODS.NONE, 3525 (V_MOV_B64_PSEUDO (i64 0x3fefffffffffffff))), 3526 $x, 3527 (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/)))) 3528>; 3529 3530} // End SubtargetPredicates = isGFX6 3531 3532//============================================================================// 3533// Miscellaneous Optimization Patterns 3534//============================================================================// 3535 3536// Undo sub x, c -> add x, -c canonicalization since c is more likely 3537// an inline immediate than -c. 3538// TODO: Also do for 64-bit. 3539def : GCNPat< 3540 (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), 3541 (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1) 3542>; 3543 3544def : GCNPat< 3545 (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), 3546 (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 3547 let SubtargetPredicate = HasAddNoCarryInsts; 3548} 3549 3550def : GCNPat< 3551 (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), 3552 (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 3553 let SubtargetPredicate = NotHasAddNoCarryInsts; 3554} 3555 3556 3557// Avoid pointlessly materializing a constant in VGPR. 3558// FIXME: Should also do this for readlane, but tablegen crashes on 3559// the ignored src1. 3560def : GCNPat< 3561 (i32 (int_amdgcn_readfirstlane (i32 imm:$src))), 3562 (S_MOV_B32 SReg_32:$src) 3563>; 3564 3565multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> { 3566 def : GCNPat < 3567 (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), 3568 (BFM $a, $b) 3569 >; 3570 3571 def : GCNPat < 3572 (vt (ADD (vt (shl 1, vt:$a)), -1)), 3573 (BFM $a, (i32 0)) 3574 >; 3575} 3576 3577defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>; 3578// FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>; 3579defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>; 3580 3581// Bitfield extract patterns 3582 3583def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{ 3584 return isMask_32(Imm); 3585}]>; 3586 3587def IMMPopCount : SDNodeXForm<imm, [{ 3588 return CurDAG->getTargetConstant(llvm::popcount(N->getZExtValue()), SDLoc(N), 3589 MVT::i32); 3590}]>; 3591 3592def : AMDGPUPat < 3593 (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)), 3594 IMMZeroBasedBitfieldMask:$mask), 3595 (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask))) 3596>; 3597 3598// x & ((1 << y) - 1) 3599def : AMDGPUPat < 3600 (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)), 3601 (V_BFE_U32_e64 $src, (i32 0), $width) 3602>; 3603 3604// x & ~(-1 << y) 3605def : AMDGPUPat < 3606 (DivergentBinFrag<and> i32:$src, 3607 (xor_oneuse (shl_oneuse -1, i32:$width), -1)), 3608 (V_BFE_U32_e64 $src, (i32 0), $width) 3609>; 3610 3611def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{ 3612 return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5; 3613}]>; 3614 3615// x & (-1 >> (bitwidth - y)) 3616def : AMDGPUPat < 3617 (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, uint5Bits:$width))), 3618 (V_BFE_U32_e64 $src, (i32 0), $width) 3619>; 3620 3621// SHA-256 Ma patterns 3622 3623// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y 3624def : AMDGPUPatIgnoreCopies 
< 3625 (DivergentBinFrag<or> (and i32:$x, i32:$z), 3626 (and i32:$y, (or i32:$x, i32:$z))), 3627 (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), 3628 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)), 3629 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32), 3630 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)) 3631>; 3632 3633def : AMDGPUPatIgnoreCopies < 3634 (DivergentBinFrag<or> (and i64:$x, i64:$z), 3635 (and i64:$y, (or i64:$x, i64:$z))), 3636 (REG_SEQUENCE VReg_64, 3637 (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 3638 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), 3639 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)), 3640 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0, 3641 (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 3642 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), 3643 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)), 3644 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1) 3645>; 3646 3647multiclass IntMed3Pat<Instruction med3Inst, 3648 SDPatternOperator min, 3649 SDPatternOperator max> { 3650 3651 // This matches 16 permutations of 3652 // min(max(a, b), max(min(a, b), c)) 3653 def : AMDGPUPat < 3654 (min (max i32:$src0, i32:$src1), 3655 (max (min i32:$src0, i32:$src1), i32:$src2)), 3656 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 3657>; 3658 3659 // This matches 16 permutations of 3660 // max(min(x, y), min(max(x, y), z)) 3661 def : AMDGPUPat < 3662 (max (min i32:$src0, i32:$src1), 3663 (min (max i32:$src0, i32:$src1), i32:$src2)), 3664 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 3665>; 3666} 3667 3668defm : IntMed3Pat<V_MED3_I32_e64, smin, smax>; 3669defm : IntMed3Pat<V_MED3_U32_e64, umin, umax>; 3670 3671multiclass FPMed3Pat<ValueType vt, 3672 Instruction med3Inst> { 3673 // This matches 16 permutations of max(min(x, y), min(max(x, y), z)) 3674 def : GCNPat< 3675 (fmaxnum_like_nnan 3676 (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3677 (VOP3Mods vt:$src1, i32:$src1_mods)), 3678 (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3679 (VOP3Mods vt:$src1, i32:$src1_mods)), 3680 (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), 3681 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3682 DSTCLAMP.NONE, DSTOMOD.NONE)>; 3683 3684 3685 // This matches 16 permutations of min(max(x, y), max(min(x, y), z)) 3686 def : GCNPat< 3687 (fminnum_like_nnan 3688 (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3689 (VOP3Mods vt:$src1, i32:$src1_mods)), 3690 (fmaxnum_like (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3691 (VOP3Mods vt:$src1, i32:$src1_mods)), 3692 (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), 3693 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3694 DSTCLAMP.NONE, DSTOMOD.NONE)>; 3695} 3696 3697multiclass Int16Med3Pat<Instruction med3Inst, 3698 SDPatternOperator min, 3699 SDPatternOperator max, 3700 RegisterOperand outputSrcType> { 3701 // This matches 16 permutations of 3702 // max(min(x, y), min(max(x, y), z)) 3703 def : GCNPat < 3704 (max (min i16:$src0, i16:$src1), 3705 (min (max i16:$src0, i16:$src1), i16:$src2)), 3706 (med3Inst SRCMODS.NONE, outputSrcType:$src0, SRCMODS.NONE, outputSrcType:$src1, 3707 SRCMODS.NONE, outputSrcType:$src2, DSTCLAMP.NONE) 3708>; 3709 3710 // This matches 16 permutations of 3711 // min(max(a, b), max(min(a, b), c)) 3712 def : GCNPat < 3713 (min (max i16:$src0, i16:$src1), 3714 (max (min i16:$src0, i16:$src1), i16:$src2)), 3715 (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) 3716>; 
3717} 3718 3719defm : FPMed3Pat<f32, V_MED3_F32_e64>; 3720 3721let SubtargetPredicate = HasMed3_16 in { 3722let True16Predicate = NotHasTrue16BitInsts in 3723defm : FPMed3Pat<f16, V_MED3_F16_e64>; 3724let True16Predicate = UseRealTrue16Insts in 3725defm : FPMed3Pat<f16, V_MED3_F16_t16_e64>; 3726let True16Predicate = UseFakeTrue16Insts in 3727defm : FPMed3Pat<f16, V_MED3_F16_fake16_e64>; 3728} 3729 3730class 3731IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max, 3732 SDPatternOperator max_or_min_oneuse> : AMDGPUPat < 3733 (DivergentBinFrag<min_or_max> (max_or_min_oneuse i32:$src0, i32:$src1), 3734 i32:$src2), 3735 (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 3736>; 3737 3738class 3739FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max, 3740 SDPatternOperator max_or_min_oneuse> : GCNPat < 3741 (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods), 3742 (VOP3Mods vt:$src1, i32:$src1_mods)), 3743 (vt (VOP3Mods vt:$src2, i32:$src2_mods))), 3744 (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3745 DSTCLAMP.NONE, DSTOMOD.NONE) 3746>; 3747 3748class 3749FPMinCanonMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max, 3750 SDPatternOperator max_or_min_oneuse> : GCNPat < 3751 (min_or_max (is_canonicalized_1<fcanonicalize> 3752 (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods), 3753 (VOP3Mods vt:$src1, i32:$src1_mods))), 3754 (vt (VOP3Mods vt:$src2, i32:$src2_mods))), 3755 (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3756 DSTCLAMP.NONE, DSTOMOD.NONE) 3757>; 3758 3759let OtherPredicates = [isGFX11Plus] in { 3760def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>; 3761def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>; 3762def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>; 3763def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>; 3764def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; 3765def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; 3766def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; 3767def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; 3768} 3769 3770let True16Predicate = UseFakeTrue16Insts in { 3771def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; 3772def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; 3773def : FPMinCanonMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; 3774def : FPMinCanonMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; 3775} 3776 3777let SubtargetPredicate = isGFX9Plus in { 3778let True16Predicate = NotHasTrue16BitInsts in { 3779 defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, VSrc_b16>; 3780 defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, VSrc_b16>; 3781} 3782let True16Predicate = UseRealTrue16Insts in { 3783 defm : Int16Med3Pat<V_MED3_I16_t16_e64, smin, smax, VSrcT_b16>; 3784 defm : Int16Med3Pat<V_MED3_U16_t16_e64, umin, umax, VSrcT_b16>; 3785} 3786let True16Predicate = UseFakeTrue16Insts in { 3787 defm : Int16Med3Pat<V_MED3_I16_fake16_e64, smin, smax, VSrc_b16>; 3788 defm : Int16Med3Pat<V_MED3_U16_fake16_e64, umin, umax, VSrc_b16>; 3789} 3790} // End SubtargetPredicate = [isGFX9Plus] 3791 3792let SubtargetPredicate = isGFX12Plus in { 3793def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>; 3794def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, 
DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3795def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3796def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3797}
3798
3799let True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus in {
3800def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3801def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3802def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3803def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3804}
3805
3806// Convert a floating-point power of 2 to the integer exponent.
3807def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
3808 const auto &APF = N->getValueAPF();
3809 int Log2 = APF.getExactLog2Abs();
3810 assert(Log2 != INT_MIN);
3811 return CurDAG->getSignedTargetConstant(Log2, SDLoc(N), MVT::i32);
3812}]>;
3813
3814// Check if a floating point value is a power of 2 floating-point
3815// immediate where it's preferable to emit the multiply as an
3816// ldexp. We skip over 0.5 to 4.0 as those are inline immediates
3817// anyway.
3818def fpimm_pos_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
3819 if (Imm.isNegative())
3820 return false;
3821
3822 int Exp = Imm.getExactLog2Abs();
3823 // Prefer leaving the FP inline immediates as they are.
3824 // 0.5, 1.0, 2.0, 4.0
3825
3826 // For f64 ldexp is always better than materializing a 64-bit
3827 // constant.
3828 return Exp != INT_MIN && (Exp < -1 || Exp > 2);
3829 }], FPPow2ToExponentXForm
3830>;
3831
3832def fpimm_neg_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
3833 if (!Imm.isNegative())
3834 return false;
3835 int Exp = Imm.getExactLog2Abs();
3836 // Prefer leaving the FP inline immediates as they are.
3837 // 0.5, 1.0, 2.0, 4.0
3838
3839 // For f64 ldexp is always better than materializing a 64-bit
3840 // constant.
3841 return Exp != INT_MIN && (Exp < -1 || Exp > 2);
3842 }], FPPow2ToExponentXForm
3843>;
3844
3845// f64 is different because we also want to handle cases that may
3846// require materialization of the exponent.
3847// TODO: If we know f64 ops are fast, prefer add (ldexp x, N), y over fma
3848// TODO: For f32/f16, it's not a clear win on code size to use ldexp
3849// in place of mul since we have to use the vop3 form. Are there power
3850// savings or some other reason to prefer ldexp over mul?
3851def : GCNPat<
3852 (any_fmul (f64 (VOP3Mods f64:$src0, i32:$src0_mods)),
3853 fpimm_pos_pow2_prefer_ldexp_f64:$src1),
3854 (V_LDEXP_F64_e64 i32:$src0_mods, VSrc_b64:$src0,
3855 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
3856>;
3857
3858def : GCNPat<
3859 (any_fmul f64:$src0, fpimm_neg_pow2_prefer_ldexp_f64:$src1),
3860 (V_LDEXP_F64_e64 SRCMODS.NEG, VSrc_b64:$src0,
3861 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
3862>;
3863
3864// We want to avoid using VOP3Mods, which could pull in another fneg
3865// that would then need to be re-negated (which should never happen in
3866// practice). I don't see a way to apply an SDNodeXForm that accounts
3867// for a second operand.
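// For example (a sketch), fmul (fabs x), -8.0 selects to ldexp(-|x|, 3):
//   s_mov_b32   s0, 3
//   v_ldexp_f64 v[0:1], -|v[0:1]|, s0
// i.e. the NEG_ABS source modifier supplies the sign and the magnitude is
// scaled by the materialized exponent.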
3868def : GCNPat< 3869 (any_fmul (fabs f64:$src0), fpimm_neg_pow2_prefer_ldexp_f64:$src1), 3870 (V_LDEXP_F64_e64 SRCMODS.NEG_ABS, VSrc_b64:$src0, 3871 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1)))) 3872>; 3873 3874class AMDGPUGenericInstruction : GenericInstruction { 3875 let Namespace = "AMDGPU"; 3876} 3877 3878// Convert a wave address to a swizzled vector address (i.e. this is 3879// for copying the stack pointer to a vector address appropriate to 3880// use in the offset field of mubuf instructions). 3881def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction { 3882 let OutOperandList = (outs type0:$dst); 3883 let InOperandList = (ins type0:$src); 3884 let hasSideEffects = 0; 3885} 3886 3887// Returns -1 if the input is zero. 3888def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { 3889 let OutOperandList = (outs type0:$dst); 3890 let InOperandList = (ins type1:$src); 3891 let hasSideEffects = 0; 3892} 3893 3894// Returns -1 if the input is zero. 3895def G_AMDGPU_FFBL_B32 : AMDGPUGenericInstruction { 3896 let OutOperandList = (outs type0:$dst); 3897 let InOperandList = (ins type1:$src); 3898 let hasSideEffects = 0; 3899} 3900 3901def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction { 3902 let OutOperandList = (outs type0:$dst); 3903 let InOperandList = (ins type1:$src); 3904 let hasSideEffects = 0; 3905} 3906 3907class BufferLoadGenericInstruction : AMDGPUGenericInstruction { 3908 let OutOperandList = (outs type0:$dst); 3909 let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, 3910 type2:$soffset, untyped_imm_0:$offset, 3911 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 3912 let hasSideEffects = 0; 3913 let mayLoad = 1; 3914} 3915 3916class TBufferLoadGenericInstruction : AMDGPUGenericInstruction { 3917 let OutOperandList = (outs type0:$dst); 3918 let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, 3919 type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format, 3920 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 3921 let hasSideEffects = 0; 3922 let mayLoad = 1; 3923} 3924 3925def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction; 3926def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction; 3927def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction; 3928def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction; 3929def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction; 3930def G_AMDGPU_BUFFER_LOAD_UBYTE_TFE : BufferLoadGenericInstruction; 3931def G_AMDGPU_BUFFER_LOAD_SBYTE_TFE : BufferLoadGenericInstruction; 3932def G_AMDGPU_BUFFER_LOAD_USHORT_TFE : BufferLoadGenericInstruction; 3933def G_AMDGPU_BUFFER_LOAD_SSHORT_TFE : BufferLoadGenericInstruction; 3934def G_AMDGPU_BUFFER_LOAD_TFE : BufferLoadGenericInstruction; 3935def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction; 3936def G_AMDGPU_BUFFER_LOAD_FORMAT_TFE : BufferLoadGenericInstruction; 3937def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction; 3938def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction; 3939def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction; 3940 3941class BufferStoreGenericInstruction : AMDGPUGenericInstruction { 3942 let OutOperandList = (outs); 3943 let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, 3944 type2:$soffset, untyped_imm_0:$offset, 3945 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 3946 let hasSideEffects = 0; 3947 let mayStore = 1; 3948} 3949 3950class TBufferStoreGenericInstruction : AMDGPUGenericInstruction { 3951 let OutOperandList = (outs); 
3952 let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
3953 type2:$soffset, untyped_imm_0:$offset,
3954 untyped_imm_0:$format,
3955 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
3956 let hasSideEffects = 0;
3957 let mayStore = 1;
3958}
3959
3960def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
3961def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
3962def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
3963def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
3964def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
3965def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
3966def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;
3967
3968def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
3969 let OutOperandList = (outs type0:$dst);
3970 let InOperandList = (ins type0:$src0, type0:$src1);
3971 let hasSideEffects = 0;
3972}
3973
3974def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
3975 let OutOperandList = (outs type0:$dst);
3976 let InOperandList = (ins type0:$src0, type0:$src1);
3977 let hasSideEffects = 0;
3978}
3979
3980foreach N = 0-3 in {
3981def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
3982 let OutOperandList = (outs type0:$dst);
3983 let InOperandList = (ins type0:$src0);
3984 let hasSideEffects = 0;
3985}
3986}
3987
3988def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
3989 let OutOperandList = (outs type0:$dst);
3990 let InOperandList = (ins type0:$src0, type0:$src1);
3991 let hasSideEffects = 0;
3992}
3993
3994def G_AMDGPU_SMED3 : AMDGPUGenericInstruction {
3995 let OutOperandList = (outs type0:$dst);
3996 let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
3997 let hasSideEffects = 0;
3998}
3999
4000def G_AMDGPU_UMED3 : AMDGPUGenericInstruction {
4001 let OutOperandList = (outs type0:$dst);
4002 let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
4003 let hasSideEffects = 0;
4004}
4005
4006def G_AMDGPU_FMED3 : AMDGPUGenericInstruction {
4007 let OutOperandList = (outs type0:$dst);
4008 let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
4009 let hasSideEffects = 0;
4010}
4011
4012def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
4013 let OutOperandList = (outs type0:$dst);
4014 let InOperandList = (ins type0:$src);
4015 let hasSideEffects = 0;
4016}
4017
4018// Integer multiply-add: arg0 * arg1 + arg2.
4019//
4020// arg0 and arg1 are 32-bit integers (interpreted as signed or unsigned),
4021// arg2 is a 64-bit integer. Result is a 64-bit integer and a 1-bit carry-out.
4022class G_AMDGPU_MAD_64_32 : AMDGPUGenericInstruction {
4023 let OutOperandList = (outs type0:$dst, type1:$carry_out);
4024 let InOperandList = (ins type2:$arg0, type2:$arg1, type0:$arg2);
4025 let hasSideEffects = 0;
4026}
4027
4028def G_AMDGPU_MAD_U64_U32 : G_AMDGPU_MAD_64_32;
4029def G_AMDGPU_MAD_I64_I32 : G_AMDGPU_MAD_64_32;
4030
4031// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
4032// operand. Expects a MachineMemOperand in addition to explicit
4033// operands.
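// A rough sketch of the generic-MIR form (types are illustrative):
//   %oldval:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG %addr:_(p1), %cmpval_newval:_(<2 x s32>)
// with a MachineMemOperand describing the atomic load/store attached.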
4034def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction { 4035 let OutOperandList = (outs type0:$oldval); 4036 let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval); 4037 let hasSideEffects = 0; 4038 let mayLoad = 1; 4039 let mayStore = 1; 4040} 4041 4042class BufferAtomicGenericInstruction : AMDGPUGenericInstruction { 4043 let OutOperandList = (outs type0:$dst); 4044 let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, 4045 type2:$soffset, untyped_imm_0:$offset, 4046 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 4047 let hasSideEffects = 0; 4048 let mayLoad = 1; 4049 let mayStore = 1; 4050} 4051 4052def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction; 4053def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction; 4054def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction; 4055def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction; 4056def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction; 4057def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; 4058def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; 4059def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; 4060def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction; 4061def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; 4062def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; 4063def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; 4064def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; 4065def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; 4066def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction; 4067def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction; 4068 4069def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { 4070 let OutOperandList = (outs type0:$dst); 4071 let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex, 4072 type2:$voffset, type2:$soffset, untyped_imm_0:$offset, 4073 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 4074 let hasSideEffects = 0; 4075 let mayLoad = 1; 4076 let mayStore = 1; 4077} 4078 4079// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as 4080// a workaround for the intrinsic being defined as readnone, but 4081// really needs a memory operand. 
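// (In practice the generic opcode is created with mayLoad set and an explicit
// MachineMemOperand attached, so scheduling and memory dependency analysis can
// still see the access that the readnone intrinsic definition would hide.)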
4082 4083class SBufferLoadInstruction : AMDGPUGenericInstruction { 4084 let OutOperandList = (outs type0:$dst); 4085 let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy); 4086 let hasSideEffects = 0; 4087 let mayLoad = 1; 4088 let mayStore = 0; 4089} 4090 4091def G_AMDGPU_S_BUFFER_LOAD : SBufferLoadInstruction; 4092def G_AMDGPU_S_BUFFER_LOAD_SBYTE : SBufferLoadInstruction; 4093def G_AMDGPU_S_BUFFER_LOAD_UBYTE : SBufferLoadInstruction; 4094def G_AMDGPU_S_BUFFER_LOAD_SSHORT : SBufferLoadInstruction; 4095def G_AMDGPU_S_BUFFER_LOAD_USHORT : SBufferLoadInstruction; 4096 4097class SBufferPrefetchInstruction : AMDGPUGenericInstruction { 4098 let OutOperandList = (outs); 4099 let InOperandList = (ins type0:$rsrc, untyped_imm_0:$offset, type1:$len); 4100 let hasSideEffects = 0; 4101 let mayLoad = 1; 4102 let mayStore = 1; 4103} 4104 4105def G_AMDGPU_S_BUFFER_PREFETCH : SBufferPrefetchInstruction; 4106 4107def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction { 4108 let OutOperandList = (outs type0:$dst); 4109 let InOperandList = (ins type0:$src0, type0:$src1); 4110 let hasSideEffects = 0; 4111} 4112 4113def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction { 4114 let OutOperandList = (outs type0:$dst); 4115 let InOperandList = (ins type0:$src0, type0:$src1); 4116 let hasSideEffects = 0; 4117} 4118 4119// This is equivalent to the G_INTRINSIC*, but the operands may have 4120// been legalized depending on the subtarget requirements. 4121def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { 4122 let OutOperandList = (outs type0:$dst); 4123 let InOperandList = (ins unknown:$intrin, variable_ops); 4124 let hasSideEffects = 0; 4125 let mayLoad = 1; 4126 4127 // FIXME: Use separate opcode for atomics. 4128 let mayStore = 1; 4129} 4130 4131def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction { 4132 let OutOperandList = (outs type0:$dst); 4133 let InOperandList = (ins unknown:$intrin, variable_ops); 4134 let hasSideEffects = 0; 4135 let mayLoad = 1; 4136 4137 // FIXME: Use separate opcode for atomics. 4138 let mayStore = 1; 4139} 4140 4141def G_AMDGPU_INTRIN_IMAGE_LOAD_NORET : AMDGPUGenericInstruction { 4142 let OutOperandList = (outs); 4143 let InOperandList = (ins unknown:$intrin, variable_ops); 4144 let hasSideEffects = 0; 4145 let mayLoad = 1; 4146 let mayStore = 1; 4147} 4148 4149// This is equivalent to the G_INTRINSIC*, but the operands may have 4150// been legalized depending on the subtarget requirements. 4151def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction { 4152 let OutOperandList = (outs); 4153 let InOperandList = (ins unknown:$intrin, variable_ops); 4154 let hasSideEffects = 0; 4155 let mayStore = 1; 4156} 4157 4158def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction { 4159 let OutOperandList = (outs); 4160 let InOperandList = (ins unknown:$intrin, variable_ops); 4161 let hasSideEffects = 0; 4162 let mayStore = 1; 4163} 4164 4165def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction { 4166 let OutOperandList = (outs type0:$dst); 4167 let InOperandList = (ins unknown:$intrin, variable_ops); 4168 let hasSideEffects = 0; 4169 let mayLoad = 1; 4170 let mayStore = 0; 4171} 4172 4173// Generic instruction for SI_CALL, so we can select the register bank and insert a waterfall loop 4174// if necessary. 
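// (Sketch of the rationale: if the callee ends up divergent in a VGPR, the
// waterfall expansion repeatedly uses v_readfirstlane_b32 to pick one target,
// masks EXEC down to the lanes holding that value, makes the call, and loops
// until all active lanes have been serviced.)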
4175def G_SI_CALL : AMDGPUGenericInstruction {
4176 let OutOperandList = (outs SReg_64:$dst);
4177 let InOperandList = (ins type0:$src0, unknown:$callee);
4178 let Size = 4;
4179 let isCall = 1;
4180 let UseNamedOperandTable = 1;
4181 let SchedRW = [WriteBranch];
4182 // TODO: Should really base this on the call target
4183 let isConvergent = 1;
4184}
4185
4186// Uniform in vgpr: a vgpr with the same value in all active lanes.
4187
4188// $dst = $src0 != 0, selected as:
4189// $dst(SCC) = s_cmp_lg $src0, 0
4190// src0 is either exec or 0 (same value for all active lanes),
4191// for example the result of a comparison of two uniform-in-vgpr values.
4192def G_AMDGPU_COPY_SCC_VCC : AMDGPUGenericInstruction {
4193 let OutOperandList = (outs type0:$dst);
4194 let InOperandList = (ins type1:$src0);
4195 let hasSideEffects = 0;
4196}
4197
4198// $dst = $src0 ? exec : 0, selected as:
4199// SCC = COPY $src0
4200// $dst(SReg_32/64) = s_cselect exec, 0
4201def G_AMDGPU_COPY_VCC_SCC : AMDGPUGenericInstruction {
4202 let OutOperandList = (outs type0:$dst);
4203 let InOperandList = (ins type1:$src0);
4204 let hasSideEffects = 0;
4205}
4206
4207// Move a uniform-in-vgpr value to an sgpr. Selected as v_readfirstlane_b32.
4208// Reading ANY lane instead of the FIRST (active) lane allows the vgpr-to-sgpr-
4209// back-to-vgpr combine, since the vgpr has the same value in all active lanes:
4210// vgprDst = COPY (G_AMDGPU_READANYLANE vgprSrc) -> vgprDst = sgprSrc
4211def G_AMDGPU_READANYLANE : AMDGPUGenericInstruction {
4212 let OutOperandList = (outs type0:$dst);
4213 let InOperandList = (ins type0:$src0);
4214 let hasSideEffects = 0;
4215}
4216
4217//============================================================================//
4218// Dummy Instructions
4219//============================================================================//
4220
4221def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
4222 let Inst{31-0} = 0x00000000;
4223 let FixedSize = 1;
4224 let Size = 4;
4225 let Uses = [EXEC];
4226 let hasSideEffects = 1;
4227 let SubtargetPredicate = isGFX10Plus;
4228}
4229