//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
            .addImm(1)
            .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
        }
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  unsigned CmpOpc =
      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
                          .addReg(I.getOperand(1).getReg())
                          .addImm(0);
  if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
    return false;

  Register DstReg = I.getOperand(0).getReg();
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);

  if (Arg) {
    const int64_t Value = Arg->Value.getZExtValue();
    if (Value == 0) {
      unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else {
      assert(Value == 1);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  unsigned SelectOpcode =
      STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
                             .addReg(TRI.getExec())
                             .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
                 .addReg(SrcReg);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // If inputs have register bank, assign corresponding reg class.
  // Note: registers don't need to have the same reg bank.
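  // Each incoming value below is constrained to a class matching its own
  // bank; the result register uses DefRC computed above.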
  for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
    const Register SrcReg = I.getOperand(i).getReg();

    const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
    if (RB) {
      const LLT SrcTy = MRI->getType(SrcReg);
      const TargetRegisterClass *SrcRC =
          TRI.getRegClassForTypeOnBank(SrcTy, *RB);
      if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
            .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
          BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
              .add(I.getOperand(1))
              .add(I.getOperand(2))
              .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2)
      .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);


  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
      .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
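  // Widen the nominal result size so the 32-bit subregister computation below
  // still selects a full 32-bit channel.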
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
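  // getRegSplitParts returns, in order, the subregister indices that split
  // SrcRC into DstSize / 8 byte pieces; one copy is emitted per destination.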
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
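  // Select to the target IMPLICIT_DEF when the operand has neither a class nor
  // a bank yet, or when it can be constrained to the computed class.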
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
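    // Mask the lane index to the wave size; the masked value always fits in an
    // inline immediate.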
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
      getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
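    // SI_IF_BREAK's destination and both sources are lane masks; all three are
    // constrained to the wave-mask register class below.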
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  if (Size != 16 && Size != 32 && Size != 64)
    return -1;

  if (Size == 16 && !ST.has16BitInsts())
    return -1;

  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
                          unsigned FakeS16Opc, unsigned S32Opc,
                          unsigned S64Opc) {
    if (Size == 16)
      // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code
      return ST.hasTrue16BitInsts()
                 ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc
                 : S16Opc;
    if (Size == 32)
      return S32Opc;
    return S64Opc;
  };

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
                  AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
                  AMDGPU::V_CMP_NE_U64_e64);
  case CmpInst::ICMP_EQ:
    return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
                  AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
                  AMDGPU::V_CMP_EQ_U64_e64);
  case CmpInst::ICMP_SGT:
    return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
                  AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
                  AMDGPU::V_CMP_GT_I64_e64);
  case CmpInst::ICMP_SGE:
    return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
                  AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
                  AMDGPU::V_CMP_GE_I64_e64);
  case CmpInst::ICMP_SLT:
    return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
                  AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
                  AMDGPU::V_CMP_LT_I64_e64);
  case CmpInst::ICMP_SLE:
    return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
                  AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
                  AMDGPU::V_CMP_LE_I64_e64);
  case CmpInst::ICMP_UGT:
    return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
                  AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
                  AMDGPU::V_CMP_GT_U64_e64);
  case CmpInst::ICMP_UGE:
    return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
                  AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
                  AMDGPU::V_CMP_GE_U64_e64);
  case CmpInst::ICMP_ULT:
    return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
                  AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
                  AMDGPU::V_CMP_LT_U64_e64);
  case CmpInst::ICMP_ULE:
    return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
                  AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
                  AMDGPU::V_CMP_LE_U64_e64);

  case CmpInst::FCMP_OEQ:
    return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
                  AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
                  AMDGPU::V_CMP_EQ_F64_e64);
  case CmpInst::FCMP_OGT:
    return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
                  AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
                  AMDGPU::V_CMP_GT_F64_e64);
  case CmpInst::FCMP_OGE:
    return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
                  AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
                  AMDGPU::V_CMP_GE_F64_e64);
  case CmpInst::FCMP_OLT:
    return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
                  AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
                  AMDGPU::V_CMP_LT_F64_e64);
  case CmpInst::FCMP_OLE:
    return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
                  AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
                  AMDGPU::V_CMP_LE_F64_e64);
  case CmpInst::FCMP_ONE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
                  AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_ORD:
    return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
                  AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
                  AMDGPU::V_CMP_O_F64_e64);
  case CmpInst::FCMP_UNO:
    return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
                  AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
                  AMDGPU::V_CMP_U_F64_e64);
  case CmpInst::FCMP_UEQ:
    return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
                  AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
                  AMDGPU::V_CMP_NLG_F64_e64);
  case CmpInst::FCMP_UGT:
    return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
                  AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
                  AMDGPU::V_CMP_NLE_F64_e64);
  case CmpInst::FCMP_UGE:
    return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
                  AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
                  AMDGPU::V_CMP_NLT_F64_e64);
  case CmpInst::FCMP_ULT:
    return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
                  AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
                  AMDGPU::V_CMP_NGE_F64_e64);
  case CmpInst::FCMP_ULE:
    return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
                  AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
                  AMDGPU::V_CMP_NGT_F64_e64);
  case CmpInst::FCMP_UNE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
                  AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_TRUE:
    return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
                  AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
                  AMDGPU::V_CMP_TRU_F64_e64);
  case CmpInst::FCMP_FALSE:
    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
                  AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
                  AMDGPU::V_CMP_F_F64_e64);
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}

bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  if (I.getOpcode() == AMDGPU::G_FCMP)
    return false;

  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  LLT DstTy = MRI->getType(Dst);
  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder SelectedMI;
  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
  Register Src0Reg =
      copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
  Register Src1Reg =
      copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
    SelectedMI.addImm(Src0Mods);
  SelectedMI.addReg(Src0Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
    SelectedMI.addImm(Src1Mods);
  SelectedMI.addReg(Src1Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
    SelectedMI.addImm(0); // clamp
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
    SelectedMI.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
    return false;

  I.eraseFromParent();
  return true;
}

// Ballot has to zero the bits in the input lane mask that are zero in the
// current exec; this is done as an AND with exec. For inputs produced by
// instructions that implicitly use the same exec (for example compares in the
// same basic block, or an SCC to VCC copy), a plain copy is enough.
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
                                    MachineBasicBlock *MBB) {
  MachineInstr *MI = MRI.getVRegDef(Reg);
  if (MI->getParent() != MBB)
    return false;

  // Lane mask generated by SCC to VCC copy.
  if (MI->getOpcode() == AMDGPU::COPY) {
    auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
    auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
    if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
        SrcRB->getID() == AMDGPU::SGPRRegBankID)
      return true;
  }

  // Lane mask generated using compare with same exec.
  if (isa<GAnyCmp>(MI))
    return true;

  Register LHS, RHS;
  // Look through AND.
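  // An AND of lane masks is itself exec-masked if either operand already was.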
  if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
    return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
           isLaneMaskFromSameBlock(RHS, MRI, MBB);

  return false;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(2).getReg();
  const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
  const unsigned WaveSize = STI.getWavefrontSize();

  // In the common case, the return type matches the wave size.
  // However we also support emitting i64 ballots in wave32 mode.
  if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
    return false;

  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(SrcReg, *MRI);

  Register Dst = DstReg;
  // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
  if (BallotSize != WaveSize) {
    Dst = MRI->createVirtualRegister(TRI.getBoolRC());
  }

  if (Arg) {
    const int64_t Value = Arg->Value.getZExtValue();
    if (Value == 0) {
      // Dst = S_MOV 0
      unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
    } else {
      // Dst = COPY EXEC
      assert(Value == 1);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
    }
    if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
      return false;
  } else {
    if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
      // Dst = COPY SrcReg
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
      if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
        return false;
    } else {
      // Dst = S_AND SrcReg, EXEC
      unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
      auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
                     .addReg(SrcReg)
                     .addReg(TRI.getExec())
                     .setOperandDead(3); // Dead scc
      if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
        return false;
    }
  }

  // i64 ballot on Wave32: zero-extend i32 ballot to i64.
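  // For example (a sketch; virtual register names are illustrative), an i64
  // ballot in wave32 mode ends up as roughly:
  //   %lo:sreg_32  = S_AND_B32 %src, $exec_lo
  //   %hi:sreg_32  = S_MOV_B32 0
  //   %dst:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1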
  if (BallotSize != WaveSize) {
    Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
        .addReg(Dst)
        .addImm(AMDGPU::sub0)
        .addReg(HiReg)
        .addImm(AMDGPU::sub1);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto *RelocSymbol = cast<GlobalVariable>(
      M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
      .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1675 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1676 1677 MachineBasicBlock *MBB = I.getParent(); 1678 const DebugLoc &DL = I.getDebugLoc(); 1679 1680 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg); 1681 1682 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) { 1683 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1684 MIB.addImm(MFI->getLDSSize()); 1685 } else { 1686 Module *M = MF->getFunction().getParent(); 1687 const GlobalValue *GV = 1688 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize); 1689 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); 1690 } 1691 1692 I.eraseFromParent(); 1693 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1694 } 1695 1696 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { 1697 MachineBasicBlock *MBB = I.getParent(); 1698 MachineFunction &MF = *MBB->getParent(); 1699 const DebugLoc &DL = I.getDebugLoc(); 1700 1701 MachineOperand &Dst = I.getOperand(0); 1702 Register DstReg = Dst.getReg(); 1703 unsigned Depth = I.getOperand(2).getImm(); 1704 1705 const TargetRegisterClass *RC 1706 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 1707 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) || 1708 !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) 1709 return false; 1710 1711 // Check for kernel and shader functions 1712 if (Depth != 0 || 1713 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { 1714 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1715 .addImm(0); 1716 I.eraseFromParent(); 1717 return true; 1718 } 1719 1720 MachineFrameInfo &MFI = MF.getFrameInfo(); 1721 // There is a call to @llvm.returnaddress in this function 1722 MFI.setReturnAddressIsTaken(true); 1723 1724 // Get the return address reg and mark it as an implicit live-in 1725 Register ReturnAddrReg = TRI.getReturnAddressReg(MF); 1726 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg, 1727 AMDGPU::SReg_64RegClass, DL); 1728 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg) 1729 .addReg(LiveIn); 1730 I.eraseFromParent(); 1731 return true; 1732 } 1733 1734 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { 1735 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick 1736 // SelectionDAG uses for wave32 vs wave64. 
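  // Sketch of the selected form (for exposition only; operand names are
  // illustrative): the intrinsic's lane-mask operand is forwarded unchanged,
  //   SI_END_CF %saved_exec   ; sreg_64 in wave64, sreg_32 in wave32
  // and the later control-flow lowering is expected to expand SI_END_CF into
  // an s_or of exec with the saved mask.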
1737 MachineBasicBlock *BB = MI.getParent(); 1738 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 1739 .add(MI.getOperand(1)); 1740 1741 Register Reg = MI.getOperand(1).getReg(); 1742 MI.eraseFromParent(); 1743 1744 if (!MRI->getRegClassOrNull(Reg)) 1745 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 1746 return true; 1747 } 1748 1749 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 1750 MachineInstr &MI, Intrinsic::ID IntrID) const { 1751 MachineBasicBlock *MBB = MI.getParent(); 1752 MachineFunction *MF = MBB->getParent(); 1753 const DebugLoc &DL = MI.getDebugLoc(); 1754 1755 unsigned IndexOperand = MI.getOperand(7).getImm(); 1756 bool WaveRelease = MI.getOperand(8).getImm() != 0; 1757 bool WaveDone = MI.getOperand(9).getImm() != 0; 1758 1759 if (WaveDone && !WaveRelease) 1760 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 1761 1762 unsigned OrderedCountIndex = IndexOperand & 0x3f; 1763 IndexOperand &= ~0x3f; 1764 unsigned CountDw = 0; 1765 1766 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 1767 CountDw = (IndexOperand >> 24) & 0xf; 1768 IndexOperand &= ~(0xf << 24); 1769 1770 if (CountDw < 1 || CountDw > 4) { 1771 report_fatal_error( 1772 "ds_ordered_count: dword count must be between 1 and 4"); 1773 } 1774 } 1775 1776 if (IndexOperand) 1777 report_fatal_error("ds_ordered_count: bad index operand"); 1778 1779 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1; 1780 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF); 1781 1782 unsigned Offset0 = OrderedCountIndex << 2; 1783 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); 1784 1785 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 1786 Offset1 |= (CountDw - 1) << 6; 1787 1788 if (STI.getGeneration() < AMDGPUSubtarget::GFX11) 1789 Offset1 |= ShaderType << 2; 1790 1791 unsigned Offset = Offset0 | (Offset1 << 8); 1792 1793 Register M0Val = MI.getOperand(2).getReg(); 1794 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1795 .addReg(M0Val); 1796 1797 Register DstReg = MI.getOperand(0).getReg(); 1798 Register ValReg = MI.getOperand(3).getReg(); 1799 MachineInstrBuilder DS = 1800 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 1801 .addReg(ValReg) 1802 .addImm(Offset) 1803 .cloneMemRefs(MI); 1804 1805 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 1806 return false; 1807 1808 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 1809 MI.eraseFromParent(); 1810 return Ret; 1811 } 1812 1813 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 1814 switch (IntrID) { 1815 case Intrinsic::amdgcn_ds_gws_init: 1816 return AMDGPU::DS_GWS_INIT; 1817 case Intrinsic::amdgcn_ds_gws_barrier: 1818 return AMDGPU::DS_GWS_BARRIER; 1819 case Intrinsic::amdgcn_ds_gws_sema_v: 1820 return AMDGPU::DS_GWS_SEMA_V; 1821 case Intrinsic::amdgcn_ds_gws_sema_br: 1822 return AMDGPU::DS_GWS_SEMA_BR; 1823 case Intrinsic::amdgcn_ds_gws_sema_p: 1824 return AMDGPU::DS_GWS_SEMA_P; 1825 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1826 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 1827 default: 1828 llvm_unreachable("not a gws intrinsic"); 1829 } 1830 } 1831 1832 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 1833 Intrinsic::ID IID) const { 1834 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 1835 !STI.hasGWSSemaReleaseAll())) 1836 return false; 1837 1838 // intrinsic ID, vsrc, offset 1839 const bool HasVSrc = MI.getNumOperands() == 3; 1840 assert(HasVSrc 
|| MI.getNumOperands() == 2); 1841 1842 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 1843 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 1844 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 1845 return false; 1846 1847 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1848 unsigned ImmOffset; 1849 1850 MachineBasicBlock *MBB = MI.getParent(); 1851 const DebugLoc &DL = MI.getDebugLoc(); 1852 1853 MachineInstr *Readfirstlane = nullptr; 1854 1855 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1856 // incoming offset, in case there's an add of a constant. We'll have to put it 1857 // back later. 1858 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1859 Readfirstlane = OffsetDef; 1860 BaseOffset = OffsetDef->getOperand(1).getReg(); 1861 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1862 } 1863 1864 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1865 // If we have a constant offset, try to use the 0 in m0 as the base. 1866 // TODO: Look into changing the default m0 initialization value. If the 1867 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1868 // the immediate offset. 1869 1870 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1871 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1872 .addImm(0); 1873 } else { 1874 std::tie(BaseOffset, ImmOffset) = 1875 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB); 1876 1877 if (Readfirstlane) { 1878 // We have the constant offset now, so put the readfirstlane back on the 1879 // variable component. 1880 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1881 return false; 1882 1883 Readfirstlane->getOperand(1).setReg(BaseOffset); 1884 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1885 } else { 1886 if (!RBI.constrainGenericRegister(BaseOffset, 1887 AMDGPU::SReg_32RegClass, *MRI)) 1888 return false; 1889 } 1890 1891 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1892 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1893 .addReg(BaseOffset) 1894 .addImm(16) 1895 .setOperandDead(3); // Dead scc 1896 1897 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1898 .addReg(M0Base); 1899 } 1900 1901 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1902 // offset field) % 64. Some versions of the programming guide omit the m0 1903 // part, or claim it's from offset 0. 1904 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1905 1906 if (HasVSrc) { 1907 Register VSrc = MI.getOperand(1).getReg(); 1908 MIB.addReg(VSrc); 1909 1910 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1911 return false; 1912 } 1913 1914 MIB.addImm(ImmOffset) 1915 .cloneMemRefs(MI); 1916 1917 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0); 1918 1919 MI.eraseFromParent(); 1920 return true; 1921 } 1922 1923 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1924 bool IsAppend) const { 1925 Register PtrBase = MI.getOperand(2).getReg(); 1926 LLT PtrTy = MRI->getType(PtrBase); 1927 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1928 1929 unsigned Offset; 1930 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1931 1932 // TODO: Should this try to look through readfirstlane like GWS? 
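  // Rough shape of the output (illustrative, not taken from the source):
  //   $m0  = COPY %ptr_base:sreg_32
  //   %dst = DS_APPEND offset:<imm> gds:<0|1>
  // i.e. the uniform base address goes through m0 and only a small constant
  // component is folded into the instruction's offset field.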
1933 if (!isDSOffsetLegal(PtrBase, Offset)) { 1934 PtrBase = MI.getOperand(2).getReg(); 1935 Offset = 0; 1936 } 1937 1938 MachineBasicBlock *MBB = MI.getParent(); 1939 const DebugLoc &DL = MI.getDebugLoc(); 1940 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1941 1942 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1943 .addReg(PtrBase); 1944 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI)) 1945 return false; 1946 1947 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1948 .addImm(Offset) 1949 .addImm(IsGDS ? -1 : 0) 1950 .cloneMemRefs(MI); 1951 MI.eraseFromParent(); 1952 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1953 } 1954 1955 bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const { 1956 MachineFunction *MF = MI.getParent()->getParent(); 1957 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>(); 1958 1959 MFInfo->setInitWholeWave(); 1960 return selectImpl(MI, *CoverageInfo); 1961 } 1962 1963 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { 1964 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); 1965 if (TM.getOptLevel() > CodeGenOptLevel::None) { 1966 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; 1967 if (WGSize <= STI.getWavefrontSize()) { 1968 // If the workgroup fits in a wave, remove s_barrier_signal and lower 1969 // s_barrier/s_barrier_wait to wave_barrier. 1970 if (IntrinsicID == Intrinsic::amdgcn_s_barrier || 1971 IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) { 1972 MachineBasicBlock *MBB = MI.getParent(); 1973 const DebugLoc &DL = MI.getDebugLoc(); 1974 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); 1975 } 1976 MI.eraseFromParent(); 1977 return true; 1978 } 1979 } 1980 1981 if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { 1982 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait 1983 MachineBasicBlock *MBB = MI.getParent(); 1984 const DebugLoc &DL = MI.getDebugLoc(); 1985 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) 1986 .addImm(AMDGPU::Barrier::WORKGROUP); 1987 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT)) 1988 .addImm(AMDGPU::Barrier::WORKGROUP); 1989 MI.eraseFromParent(); 1990 return true; 1991 } 1992 1993 return selectImpl(MI, *CoverageInfo); 1994 } 1995 1996 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, 1997 bool &IsTexFail) { 1998 if (TexFailCtrl) 1999 IsTexFail = true; 2000 2001 TFE = (TexFailCtrl & 0x1) ? true : false; 2002 TexFailCtrl &= ~(uint64_t)0x1; 2003 LWE = (TexFailCtrl & 0x2) ? 
true : false; 2004 TexFailCtrl &= ~(uint64_t)0x2; 2005 2006 return TexFailCtrl == 0; 2007 } 2008 2009 bool AMDGPUInstructionSelector::selectImageIntrinsic( 2010 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 2011 MachineBasicBlock *MBB = MI.getParent(); 2012 const DebugLoc &DL = MI.getDebugLoc(); 2013 2014 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 2015 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 2016 2017 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); 2018 unsigned IntrOpcode = Intr->BaseOpcode; 2019 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); 2020 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); 2021 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); 2022 2023 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; 2024 2025 Register VDataIn, VDataOut; 2026 LLT VDataTy; 2027 int NumVDataDwords = -1; 2028 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || 2029 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16; 2030 2031 bool Unorm; 2032 if (!BaseOpcode->Sampler) 2033 Unorm = true; 2034 else 2035 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0; 2036 2037 bool TFE; 2038 bool LWE; 2039 bool IsTexFail = false; 2040 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(), 2041 TFE, LWE, IsTexFail)) 2042 return false; 2043 2044 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm(); 2045 const bool IsA16 = (Flags & 1) != 0; 2046 const bool IsG16 = (Flags & 2) != 0; 2047 2048 // A16 implies 16 bit gradients if subtarget doesn't support G16 2049 if (IsA16 && !STI.hasG16() && !IsG16) 2050 return false; 2051 2052 unsigned DMask = 0; 2053 unsigned DMaskLanes = 0; 2054 2055 if (BaseOpcode->Atomic) { 2056 VDataOut = MI.getOperand(0).getReg(); 2057 VDataIn = MI.getOperand(2).getReg(); 2058 LLT Ty = MRI->getType(VDataIn); 2059 2060 // Be careful to allow atomic swap on 16-bit element vectors. 2061 const bool Is64Bit = BaseOpcode->AtomicX2 ? 2062 Ty.getSizeInBits() == 128 : 2063 Ty.getSizeInBits() == 64; 2064 2065 if (BaseOpcode->AtomicX2) { 2066 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister); 2067 2068 DMask = Is64Bit ? 0xf : 0x3; 2069 NumVDataDwords = Is64Bit ? 4 : 2; 2070 } else { 2071 DMask = Is64Bit ? 0x3 : 0x1; 2072 NumVDataDwords = Is64Bit ? 2 : 1; 2073 } 2074 } else { 2075 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 2076 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask); 2077 2078 if (BaseOpcode->Store) { 2079 VDataIn = MI.getOperand(1).getReg(); 2080 VDataTy = MRI->getType(VDataIn); 2081 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32; 2082 } else if (BaseOpcode->NoReturn) { 2083 NumVDataDwords = 0; 2084 } else { 2085 VDataOut = MI.getOperand(0).getReg(); 2086 VDataTy = MRI->getType(VDataOut); 2087 NumVDataDwords = DMaskLanes; 2088 2089 if (IsD16 && !STI.hasUnpackedD16VMem()) 2090 NumVDataDwords = (DMaskLanes + 1) / 2; 2091 } 2092 } 2093 2094 // Set G16 opcode 2095 if (Subtarget->hasG16() && IsG16) { 2096 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = 2097 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); 2098 assert(G16MappingInfo); 2099 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16 2100 } 2101 2102 // TODO: Check this in verifier. 
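  // (Informal note: with tfe/lwe set the hardware appends an extra status
  // dword to the returned data -- see the ++NumVDataDwords below -- so a fetch
  // whose dmask selects no lanes would have no destination to attach it to;
  // the legalizer is expected to guarantee at least one lane here.)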
2103 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); 2104 2105 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); 2106 if (BaseOpcode->Atomic) 2107 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization 2108 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | 2109 AMDGPU::CPol::VOLATILE)) 2110 return false; 2111 2112 int NumVAddrRegs = 0; 2113 int NumVAddrDwords = 0; 2114 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 2115 // Skip the $noregs and 0s inserted during legalization. 2116 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I); 2117 if (!AddrOp.isReg()) 2118 continue; // XXX - Break? 2119 2120 Register Addr = AddrOp.getReg(); 2121 if (!Addr) 2122 break; 2123 2124 ++NumVAddrRegs; 2125 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32; 2126 } 2127 2128 // The legalizer preprocessed the intrinsic arguments. If we aren't using 2129 // NSA, these should have been packed into a single value in the first 2130 // address register 2131 const bool UseNSA = 2132 NumVAddrRegs != 1 && 2133 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs 2134 : NumVAddrDwords == NumVAddrRegs); 2135 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { 2136 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n"); 2137 return false; 2138 } 2139 2140 if (IsTexFail) 2141 ++NumVDataDwords; 2142 2143 int Opcode = -1; 2144 if (IsGFX12Plus) { 2145 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, 2146 NumVDataDwords, NumVAddrDwords); 2147 } else if (IsGFX11Plus) { 2148 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 2149 UseNSA ? AMDGPU::MIMGEncGfx11NSA 2150 : AMDGPU::MIMGEncGfx11Default, 2151 NumVDataDwords, NumVAddrDwords); 2152 } else if (IsGFX10Plus) { 2153 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 2154 UseNSA ? AMDGPU::MIMGEncGfx10NSA 2155 : AMDGPU::MIMGEncGfx10Default, 2156 NumVDataDwords, NumVAddrDwords); 2157 } else { 2158 if (Subtarget->hasGFX90AInsts()) { 2159 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, 2160 NumVDataDwords, NumVAddrDwords); 2161 if (Opcode == -1) { 2162 LLVM_DEBUG( 2163 dbgs() 2164 << "requested image instruction is not supported on this GPU\n"); 2165 return false; 2166 } 2167 } 2168 if (Opcode == -1 && 2169 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2170 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, 2171 NumVDataDwords, NumVAddrDwords); 2172 if (Opcode == -1) 2173 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, 2174 NumVDataDwords, NumVAddrDwords); 2175 } 2176 if (Opcode == -1) 2177 return false; 2178 2179 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode)) 2180 .cloneMemRefs(MI); 2181 2182 if (VDataOut) { 2183 if (BaseOpcode->AtomicX2) { 2184 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64; 2185 2186 Register TmpReg = MRI->createVirtualRegister( 2187 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 2188 unsigned SubReg = Is64 ? 
AMDGPU::sub0_sub1 : AMDGPU::sub0; 2189 2190 MIB.addDef(TmpReg); 2191 if (!MRI->use_empty(VDataOut)) { 2192 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) 2193 .addReg(TmpReg, RegState::Kill, SubReg); 2194 } 2195 2196 } else { 2197 MIB.addDef(VDataOut); // vdata output 2198 } 2199 } 2200 2201 if (VDataIn) 2202 MIB.addReg(VDataIn); // vdata input 2203 2204 for (int I = 0; I != NumVAddrRegs; ++I) { 2205 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I); 2206 if (SrcOp.isReg()) { 2207 assert(SrcOp.getReg() != 0); 2208 MIB.addReg(SrcOp.getReg()); 2209 } 2210 } 2211 2212 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg()); 2213 if (BaseOpcode->Sampler) 2214 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg()); 2215 2216 MIB.addImm(DMask); // dmask 2217 2218 if (IsGFX10Plus) 2219 MIB.addImm(DimInfo->Encoding); 2220 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm)) 2221 MIB.addImm(Unorm); 2222 2223 MIB.addImm(CPol); 2224 MIB.addImm(IsA16 && // a16 or r128 2225 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); 2226 if (IsGFX10Plus) 2227 MIB.addImm(IsA16 ? -1 : 0); 2228 2229 if (!Subtarget->hasGFX90AInsts()) { 2230 MIB.addImm(TFE); // tfe 2231 } else if (TFE) { 2232 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n"); 2233 return false; 2234 } 2235 2236 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe)) 2237 MIB.addImm(LWE); // lwe 2238 if (!IsGFX10Plus) 2239 MIB.addImm(DimInfo->DA ? -1 : 0); 2240 if (BaseOpcode->HasD16) 2241 MIB.addImm(IsD16 ? -1 : 0); 2242 2243 MI.eraseFromParent(); 2244 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2245 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr); 2246 return true; 2247 } 2248 2249 // We need to handle this here because tablegen doesn't support matching 2250 // instructions with multiple outputs. 
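// For reference (operand names are illustrative, not from the source), the
// intrinsic maps onto a single instruction with two results:
//   %vdst:vgpr_32, %addr_out:vgpr_32 =
//       DS_BVH_STACK_RTN_B32 %addr_in, %data0, %data1, offset
// which is why it cannot be expressed as a single-output tablegen pattern.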
2251 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( 2252 MachineInstr &MI) const { 2253 Register Dst0 = MI.getOperand(0).getReg(); 2254 Register Dst1 = MI.getOperand(1).getReg(); 2255 2256 const DebugLoc &DL = MI.getDebugLoc(); 2257 MachineBasicBlock *MBB = MI.getParent(); 2258 2259 Register Addr = MI.getOperand(3).getReg(); 2260 Register Data0 = MI.getOperand(4).getReg(); 2261 Register Data1 = MI.getOperand(5).getReg(); 2262 unsigned Offset = MI.getOperand(6).getImm(); 2263 2264 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0) 2265 .addDef(Dst1) 2266 .addUse(Addr) 2267 .addUse(Data0) 2268 .addUse(Data1) 2269 .addImm(Offset) 2270 .cloneMemRefs(MI); 2271 2272 MI.eraseFromParent(); 2273 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2274 } 2275 2276 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 2277 MachineInstr &I) const { 2278 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); 2279 switch (IntrinsicID) { 2280 case Intrinsic::amdgcn_end_cf: 2281 return selectEndCfIntrinsic(I); 2282 case Intrinsic::amdgcn_ds_ordered_add: 2283 case Intrinsic::amdgcn_ds_ordered_swap: 2284 return selectDSOrderedIntrinsic(I, IntrinsicID); 2285 case Intrinsic::amdgcn_ds_gws_init: 2286 case Intrinsic::amdgcn_ds_gws_barrier: 2287 case Intrinsic::amdgcn_ds_gws_sema_v: 2288 case Intrinsic::amdgcn_ds_gws_sema_br: 2289 case Intrinsic::amdgcn_ds_gws_sema_p: 2290 case Intrinsic::amdgcn_ds_gws_sema_release_all: 2291 return selectDSGWSIntrinsic(I, IntrinsicID); 2292 case Intrinsic::amdgcn_ds_append: 2293 return selectDSAppendConsume(I, true); 2294 case Intrinsic::amdgcn_ds_consume: 2295 return selectDSAppendConsume(I, false); 2296 case Intrinsic::amdgcn_init_whole_wave: 2297 return selectInitWholeWave(I); 2298 case Intrinsic::amdgcn_s_barrier: 2299 case Intrinsic::amdgcn_s_barrier_signal: 2300 case Intrinsic::amdgcn_s_barrier_wait: 2301 return selectSBarrier(I); 2302 case Intrinsic::amdgcn_raw_buffer_load_lds: 2303 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: 2304 case Intrinsic::amdgcn_struct_buffer_load_lds: 2305 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: 2306 return selectBufferLoadLds(I); 2307 case Intrinsic::amdgcn_global_load_lds: 2308 return selectGlobalLoadLds(I); 2309 case Intrinsic::amdgcn_exp_compr: 2310 if (!STI.hasCompressedExport()) { 2311 Function &F = I.getMF()->getFunction(); 2312 DiagnosticInfoUnsupported NoFpRet( 2313 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error); 2314 F.getContext().diagnose(NoFpRet); 2315 return false; 2316 } 2317 break; 2318 case Intrinsic::amdgcn_ds_bvh_stack_rtn: 2319 return selectDSBvhStackIntrinsic(I); 2320 case Intrinsic::amdgcn_s_barrier_init: 2321 case Intrinsic::amdgcn_s_barrier_signal_var: 2322 return selectNamedBarrierInit(I, IntrinsicID); 2323 case Intrinsic::amdgcn_s_barrier_join: 2324 case Intrinsic::amdgcn_s_get_named_barrier_state: 2325 return selectNamedBarrierInst(I, IntrinsicID); 2326 case Intrinsic::amdgcn_s_get_barrier_state: 2327 return selectSGetBarrierState(I, IntrinsicID); 2328 case Intrinsic::amdgcn_s_barrier_signal_isfirst: 2329 return selectSBarrierSignalIsfirst(I, IntrinsicID); 2330 } 2331 return selectImpl(I, *CoverageInfo); 2332 } 2333 2334 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 2335 if (selectImpl(I, *CoverageInfo)) 2336 return true; 2337 2338 MachineBasicBlock *BB = I.getParent(); 2339 const DebugLoc &DL = I.getDebugLoc(); 2340 2341 Register DstReg = I.getOperand(0).getReg(); 2342 unsigned Size = 
RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it, so manually set the register class here.
    if (!MRI->getRegClassOrNull(CCReg))
        MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
            .add(I.getOperand(2))
            .add(I.getOperand(3));

    bool Ret = false;
    Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
    Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

  MachineInstr *Select =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .add(I.getOperand(3))
          .addImm(0)
          .add(I.getOperand(2))
          .add(I.getOperand(1));

  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const LLT S1 = LLT::scalar(1);

  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *DstRB;
  if (DstTy == S1) {
    // This is a special case. We don't treat s1 for legalization artifacts as
    // vcc booleans.
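    // e.g. an s1 G_TRUNC created as a legalization artifact keeps the register
    // bank of its wider source (SGPR or VGPR) rather than being treated as a
    // wave-wide VCC mask.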
2397 DstRB = SrcRB; 2398 } else { 2399 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2400 if (SrcRB != DstRB) 2401 return false; 2402 } 2403 2404 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2405 2406 unsigned DstSize = DstTy.getSizeInBits(); 2407 unsigned SrcSize = SrcTy.getSizeInBits(); 2408 2409 const TargetRegisterClass *SrcRC = 2410 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB); 2411 const TargetRegisterClass *DstRC = 2412 TRI.getRegClassForSizeOnBank(DstSize, *DstRB); 2413 if (!SrcRC || !DstRC) 2414 return false; 2415 2416 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2417 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 2418 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 2419 return false; 2420 } 2421 2422 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) { 2423 assert(STI.useRealTrue16Insts()); 2424 const DebugLoc &DL = I.getDebugLoc(); 2425 MachineBasicBlock *MBB = I.getParent(); 2426 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg) 2427 .addReg(SrcReg, 0, AMDGPU::lo16); 2428 I.eraseFromParent(); 2429 return true; 2430 } 2431 2432 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) { 2433 MachineBasicBlock *MBB = I.getParent(); 2434 const DebugLoc &DL = I.getDebugLoc(); 2435 2436 Register LoReg = MRI->createVirtualRegister(DstRC); 2437 Register HiReg = MRI->createVirtualRegister(DstRC); 2438 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 2439 .addReg(SrcReg, 0, AMDGPU::sub0); 2440 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 2441 .addReg(SrcReg, 0, AMDGPU::sub1); 2442 2443 if (IsVALU && STI.hasSDWA()) { 2444 // Write the low 16-bits of the high element into the high 16-bits of the 2445 // low element. 2446 MachineInstr *MovSDWA = 2447 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2448 .addImm(0) // $src0_modifiers 2449 .addReg(HiReg) // $src0 2450 .addImm(0) // $clamp 2451 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2452 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2453 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2454 .addReg(LoReg, RegState::Implicit); 2455 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2456 } else { 2457 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 2458 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 2459 Register ImmReg = MRI->createVirtualRegister(DstRC); 2460 if (IsVALU) { 2461 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 2462 .addImm(16) 2463 .addReg(HiReg); 2464 } else { 2465 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 2466 .addReg(HiReg) 2467 .addImm(16) 2468 .setOperandDead(3); // Dead scc 2469 } 2470 2471 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2472 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2473 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 2474 2475 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 2476 .addImm(0xffff); 2477 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 2478 .addReg(LoReg) 2479 .addReg(ImmReg); 2480 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 2481 .addReg(TmpReg0) 2482 .addReg(TmpReg1); 2483 2484 if (!IsVALU) { 2485 And.setOperandDead(3); // Dead scc 2486 Or.setOperandDead(3); // Dead scc 2487 } 2488 } 2489 2490 I.eraseFromParent(); 2491 return true; 2492 } 2493 2494 if (!DstTy.isScalar()) 2495 return false; 2496 2497 if (SrcSize > 32) { 2498 unsigned SubRegIdx = 2499 DstSize < 32 ? 
AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32); 2500 if (SubRegIdx == AMDGPU::NoSubRegister) 2501 return false; 2502 2503 // Deal with weird cases where the class only partially supports the subreg 2504 // index. 2505 const TargetRegisterClass *SrcWithSubRC 2506 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 2507 if (!SrcWithSubRC) 2508 return false; 2509 2510 if (SrcWithSubRC != SrcRC) { 2511 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 2512 return false; 2513 } 2514 2515 I.getOperand(1).setSubReg(SubRegIdx); 2516 } 2517 2518 I.setDesc(TII.get(TargetOpcode::COPY)); 2519 return true; 2520 } 2521 2522 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 2523 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 2524 Mask = maskTrailingOnes<unsigned>(Size); 2525 int SignedMask = static_cast<int>(Mask); 2526 return SignedMask >= -16 && SignedMask <= 64; 2527 } 2528 2529 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 2530 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 2531 Register Reg, const MachineRegisterInfo &MRI, 2532 const TargetRegisterInfo &TRI) const { 2533 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 2534 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank)) 2535 return RB; 2536 2537 // Ignore the type, since we don't use vcc in artifacts. 2538 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank)) 2539 return &RBI.getRegBankFromRegClass(*RC, LLT()); 2540 return nullptr; 2541 } 2542 2543 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 2544 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 2545 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 2546 const DebugLoc &DL = I.getDebugLoc(); 2547 MachineBasicBlock &MBB = *I.getParent(); 2548 const Register DstReg = I.getOperand(0).getReg(); 2549 const Register SrcReg = I.getOperand(1).getReg(); 2550 2551 const LLT DstTy = MRI->getType(DstReg); 2552 const LLT SrcTy = MRI->getType(SrcReg); 2553 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 2554 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 2555 const unsigned DstSize = DstTy.getSizeInBits(); 2556 if (!DstTy.isScalar()) 2557 return false; 2558 2559 // Artifact casts should never use vcc. 2560 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 2561 2562 // FIXME: This should probably be illegal and split earlier. 2563 if (I.getOpcode() == AMDGPU::G_ANYEXT) { 2564 if (DstSize <= 32) 2565 return selectCOPY(I); 2566 2567 const TargetRegisterClass *SrcRC = 2568 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank); 2569 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 2570 const TargetRegisterClass *DstRC = 2571 TRI.getRegClassForSizeOnBank(DstSize, *DstBank); 2572 2573 Register UndefReg = MRI->createVirtualRegister(SrcRC); 2574 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2575 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2576 .addReg(SrcReg) 2577 .addImm(AMDGPU::sub0) 2578 .addReg(UndefReg) 2579 .addImm(AMDGPU::sub1); 2580 I.eraseFromParent(); 2581 2582 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && 2583 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); 2584 } 2585 2586 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 2587 // 64-bit should have been split up in RegBankSelect 2588 2589 // Try to use an and with a mask if it will save code size. 
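    // For example (illustrative), a zero-extend from 4 bits can be selected as
    //   v_and_b32 %dst, 15, %src
    // since 15 is an inline immediate, while a 16-bit mask (0xffff) is not
    // inline and falls back to v_bfe_u32; see shouldUseAndMask above.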
2590 unsigned Mask; 2591 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2592 MachineInstr *ExtI = 2593 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 2594 .addImm(Mask) 2595 .addReg(SrcReg); 2596 I.eraseFromParent(); 2597 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2598 } 2599 2600 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; 2601 MachineInstr *ExtI = 2602 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 2603 .addReg(SrcReg) 2604 .addImm(0) // Offset 2605 .addImm(SrcSize); // Width 2606 I.eraseFromParent(); 2607 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2608 } 2609 2610 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 2611 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 2612 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 2613 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 2614 return false; 2615 2616 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 2617 const unsigned SextOpc = SrcSize == 8 ? 2618 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 2619 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 2620 .addReg(SrcReg); 2621 I.eraseFromParent(); 2622 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2623 } 2624 2625 // Using a single 32-bit SALU to calculate the high half is smaller than 2626 // S_BFE with a literal constant operand. 2627 if (DstSize > 32 && SrcSize == 32) { 2628 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2629 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; 2630 if (Signed) { 2631 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) 2632 .addReg(SrcReg, 0, SubReg) 2633 .addImm(31) 2634 .setOperandDead(3); // Dead scc 2635 } else { 2636 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) 2637 .addImm(0); 2638 } 2639 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2640 .addReg(SrcReg, 0, SubReg) 2641 .addImm(AMDGPU::sub0) 2642 .addReg(HiReg) 2643 .addImm(AMDGPU::sub1); 2644 I.eraseFromParent(); 2645 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, 2646 *MRI); 2647 } 2648 2649 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 2650 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 2651 2652 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 2653 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 2654 // We need a 64-bit register source, but the high bits don't matter. 2655 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 2656 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2657 unsigned SubReg = InReg ? 
AMDGPU::sub0 : AMDGPU::NoSubRegister; 2658 2659 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2660 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 2661 .addReg(SrcReg, 0, SubReg) 2662 .addImm(AMDGPU::sub0) 2663 .addReg(UndefReg) 2664 .addImm(AMDGPU::sub1); 2665 2666 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 2667 .addReg(ExtReg) 2668 .addImm(SrcSize << 16); 2669 2670 I.eraseFromParent(); 2671 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 2672 } 2673 2674 unsigned Mask; 2675 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2676 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 2677 .addReg(SrcReg) 2678 .addImm(Mask) 2679 .setOperandDead(3); // Dead scc 2680 } else { 2681 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 2682 .addReg(SrcReg) 2683 .addImm(SrcSize << 16); 2684 } 2685 2686 I.eraseFromParent(); 2687 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2688 } 2689 2690 return false; 2691 } 2692 2693 static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) { 2694 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg; 2695 } 2696 2697 static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) { 2698 Register BitcastSrc; 2699 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc)))) 2700 Reg = BitcastSrc; 2701 return Reg; 2702 } 2703 2704 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, 2705 Register &Out) { 2706 Register Trunc; 2707 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc)))) 2708 return false; 2709 2710 Register LShlSrc; 2711 Register Cst; 2712 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) { 2713 Cst = stripCopy(Cst, MRI); 2714 if (mi_match(Cst, MRI, m_SpecificICst(16))) { 2715 Out = stripBitCast(LShlSrc, MRI); 2716 return true; 2717 } 2718 } 2719 2720 MachineInstr *Shuffle = MRI.getVRegDef(Trunc); 2721 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR) 2722 return false; 2723 2724 assert(MRI.getType(Shuffle->getOperand(0).getReg()) == 2725 LLT::fixed_vector(2, 16)); 2726 2727 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask(); 2728 assert(Mask.size() == 2); 2729 2730 if (Mask[0] == 1 && Mask[1] <= 1) { 2731 Out = Shuffle->getOperand(0).getReg(); 2732 return true; 2733 } 2734 2735 return false; 2736 } 2737 2738 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const { 2739 if (!Subtarget->hasSALUFloatInsts()) 2740 return false; 2741 2742 Register Dst = I.getOperand(0).getReg(); 2743 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2744 if (DstRB->getID() != AMDGPU::SGPRRegBankID) 2745 return false; 2746 2747 Register Src = I.getOperand(1).getReg(); 2748 2749 if (MRI->getType(Dst) == LLT::scalar(32) && 2750 MRI->getType(Src) == LLT::scalar(16)) { 2751 if (isExtractHiElt(*MRI, Src, Src)) { 2752 MachineBasicBlock *BB = I.getParent(); 2753 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst) 2754 .addUse(Src); 2755 I.eraseFromParent(); 2756 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); 2757 } 2758 } 2759 2760 return false; 2761 } 2762 2763 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2764 // Only manually handle the f64 SGPR case. 2765 // 2766 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2767 // the bit ops theoretically have a second result due to the implicit def of 2768 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2769 // that is easy by disabling the check. 
The result works, but uses a
// nonsensical sreg32orlds_and_sreg_1 regclass.
//
// The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
// the variadic REG_SEQUENCE operands.

  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
  if (Fabs)
    Src = Fabs->getOperand(1).getReg();

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x80000000);

  // Set or toggle sign bit.
  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg)
    .setOperandDead(3); // Dead scc
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}

// FIXME: This is a workaround for the same tablegen problems as G_FNEG
bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x7fffffff);

  // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
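  // Sketch of the result (virtual register names are illustrative):
  //   %lo  = COPY %src.sub0
  //   %hi  = COPY %src.sub1
  //   %k   = S_MOV_B32 0x7fffffff
  //   %hi2 = S_AND_B32 %hi, %k        ; clears the f64 sign bit
  //   %dst = REG_SEQUENCE %lo, sub0, %hi2, sub1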
2848 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) 2849 .addReg(HiReg) 2850 .addReg(ConstReg) 2851 .setOperandDead(3); // Dead scc 2852 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2853 .addReg(LoReg) 2854 .addImm(AMDGPU::sub0) 2855 .addReg(OpReg) 2856 .addImm(AMDGPU::sub1); 2857 2858 MI.eraseFromParent(); 2859 return true; 2860 } 2861 2862 static bool isConstant(const MachineInstr &MI) { 2863 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 2864 } 2865 2866 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 2867 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 2868 2869 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1; 2870 const MachineInstr *PtrMI = 2871 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg()); 2872 2873 assert(PtrMI); 2874 2875 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 2876 return; 2877 2878 GEPInfo GEPInfo; 2879 2880 for (unsigned i = 1; i != 3; ++i) { 2881 const MachineOperand &GEPOp = PtrMI->getOperand(i); 2882 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 2883 assert(OpDef); 2884 if (i == 2 && isConstant(*OpDef)) { 2885 // TODO: Could handle constant base + variable offset, but a combine 2886 // probably should have commuted it. 2887 assert(GEPInfo.Imm == 0); 2888 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 2889 continue; 2890 } 2891 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 2892 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 2893 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 2894 else 2895 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 2896 } 2897 2898 AddrInfo.push_back(GEPInfo); 2899 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 2900 } 2901 2902 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const { 2903 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID; 2904 } 2905 2906 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 2907 if (!MI.hasOneMemOperand()) 2908 return false; 2909 2910 const MachineMemOperand *MMO = *MI.memoperands_begin(); 2911 const Value *Ptr = MMO->getValue(); 2912 2913 // UndefValue means this is a load of a kernel input. These are uniform. 2914 // Sometimes LDS instructions have constant pointers. 2915 // If Ptr is null, then that means this mem operand contains a 2916 // PseudoSourceValue like GOT. 2917 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 2918 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 2919 return true; 2920 2921 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 2922 return true; 2923 2924 if (MI.getOpcode() == AMDGPU::G_PREFETCH) 2925 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() == 2926 AMDGPU::SGPRRegBankID; 2927 2928 const Instruction *I = dyn_cast<Instruction>(Ptr); 2929 return I && I->getMetadata("amdgpu.uniform"); 2930 } 2931 2932 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 2933 for (const GEPInfo &GEPInfo : AddrInfo) { 2934 if (!GEPInfo.VgprParts.empty()) 2935 return true; 2936 } 2937 return false; 2938 } 2939 2940 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 2941 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 2942 unsigned AS = PtrTy.getAddressSpace(); 2943 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 2944 STI.ldsRequiresM0Init()) { 2945 MachineBasicBlock *BB = I.getParent(); 2946 2947 // If DS instructions require M0 initialization, insert it before selecting. 
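    // (Informal note: on these subtargets m0 bounds DS addressing; writing -1
    // here, as done just below, disables the clamp so the whole LDS aperture
    // is addressable.)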
2948 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 2949 .addImm(-1); 2950 } 2951 } 2952 2953 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( 2954 MachineInstr &I) const { 2955 initM0(I); 2956 return selectImpl(I, *CoverageInfo); 2957 } 2958 2959 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) { 2960 if (Reg.isPhysical()) 2961 return false; 2962 2963 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg); 2964 const unsigned Opcode = MI.getOpcode(); 2965 2966 if (Opcode == AMDGPU::COPY) 2967 return isVCmpResult(MI.getOperand(1).getReg(), MRI); 2968 2969 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR || 2970 Opcode == AMDGPU::G_XOR) 2971 return isVCmpResult(MI.getOperand(1).getReg(), MRI) && 2972 isVCmpResult(MI.getOperand(2).getReg(), MRI); 2973 2974 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) 2975 return GI->is(Intrinsic::amdgcn_class); 2976 2977 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP; 2978 } 2979 2980 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2981 MachineBasicBlock *BB = I.getParent(); 2982 MachineOperand &CondOp = I.getOperand(0); 2983 Register CondReg = CondOp.getReg(); 2984 const DebugLoc &DL = I.getDebugLoc(); 2985 2986 unsigned BrOpcode; 2987 Register CondPhysReg; 2988 const TargetRegisterClass *ConstrainRC; 2989 2990 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2991 // whether the branch is uniform when selecting the instruction. In 2992 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2993 // RegBankSelect knows what it's doing if the branch condition is scc, even 2994 // though it currently does not. 2995 if (!isVCC(CondReg, *MRI)) { 2996 if (MRI->getType(CondReg) != LLT::scalar(32)) 2997 return false; 2998 2999 CondPhysReg = AMDGPU::SCC; 3000 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 3001 ConstrainRC = &AMDGPU::SReg_32RegClass; 3002 } else { 3003 // FIXME: Should scc->vcc copies and with exec? 3004 3005 // Unless the value of CondReg is a result of a V_CMP* instruction then we 3006 // need to insert an and with exec. 3007 if (!isVCmpResult(CondReg, *MRI)) { 3008 const bool Is64 = STI.isWave64(); 3009 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 3010 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO; 3011 3012 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC()); 3013 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg) 3014 .addReg(CondReg) 3015 .addReg(Exec) 3016 .setOperandDead(3); // Dead scc 3017 CondReg = TmpReg; 3018 } 3019 3020 CondPhysReg = TRI.getVCC(); 3021 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 3022 ConstrainRC = TRI.getBoolRC(); 3023 } 3024 3025 if (!MRI->getRegClassOrNull(CondReg)) 3026 MRI->setRegClass(CondReg, ConstrainRC); 3027 3028 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 3029 .addReg(CondReg); 3030 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 3031 .addMBB(I.getOperand(1).getMBB()); 3032 3033 I.eraseFromParent(); 3034 return true; 3035 } 3036 3037 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE( 3038 MachineInstr &I) const { 3039 Register DstReg = I.getOperand(0).getReg(); 3040 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3041 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 3042 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 3043 if (IsVGPR) 3044 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 3045 3046 return RBI.constrainGenericRegister( 3047 DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 3048 } 3049 3050 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 3051 Register DstReg = I.getOperand(0).getReg(); 3052 Register SrcReg = I.getOperand(1).getReg(); 3053 Register MaskReg = I.getOperand(2).getReg(); 3054 LLT Ty = MRI->getType(DstReg); 3055 LLT MaskTy = MRI->getType(MaskReg); 3056 MachineBasicBlock *BB = I.getParent(); 3057 const DebugLoc &DL = I.getDebugLoc(); 3058 3059 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3060 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 3061 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 3062 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 3063 if (DstRB != SrcRB) // Should only happen for hand written MIR. 3064 return false; 3065 3066 // Try to avoid emitting a bit operation when we only need to touch half of 3067 // the 64-bit pointer. 3068 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64); 3069 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 3070 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 3071 3072 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32; 3073 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32; 3074 3075 if (!IsVGPR && Ty.getSizeInBits() == 64 && 3076 !CanCopyLow32 && !CanCopyHi32) { 3077 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg) 3078 .addReg(SrcReg) 3079 .addReg(MaskReg) 3080 .setOperandDead(3); // Dead scc 3081 I.eraseFromParent(); 3082 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3083 } 3084 3085 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 3086 const TargetRegisterClass &RegRC 3087 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 3088 3089 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB); 3090 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB); 3091 const TargetRegisterClass *MaskRC = 3092 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB); 3093 3094 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 3095 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 3096 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 3097 return false; 3098 3099 if (Ty.getSizeInBits() == 32) { 3100 assert(MaskTy.getSizeInBits() == 32 && 3101 "ptrmask should have been narrowed during legalize"); 3102 3103 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 3104 .addReg(SrcReg) 3105 .addReg(MaskReg); 3106 3107 if (!IsVGPR) 3108 NewOp.setOperandDead(3); // Dead scc 3109 I.eraseFromParent(); 3110 return true; 3111 } 3112 3113 Register HiReg = MRI->createVirtualRegister(&RegRC); 3114 Register LoReg = MRI->createVirtualRegister(&RegRC); 3115 3116 // Extract the subregisters from the source pointer. 3117 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 3118 .addReg(SrcReg, 0, AMDGPU::sub0); 3119 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 3120 .addReg(SrcReg, 0, AMDGPU::sub1); 3121 3122 Register MaskedLo, MaskedHi; 3123 3124 if (CanCopyLow32) { 3125 // If all the bits in the low half are 1, we only need a copy for it. 3126 MaskedLo = LoReg; 3127 } else { 3128 // Extract the mask subregister and apply the and. 
3129 Register MaskLo = MRI->createVirtualRegister(&RegRC); 3130 MaskedLo = MRI->createVirtualRegister(&RegRC); 3131 3132 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 3133 .addReg(MaskReg, 0, AMDGPU::sub0); 3134 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 3135 .addReg(LoReg) 3136 .addReg(MaskLo); 3137 } 3138 3139 if (CanCopyHi32) { 3140 // If all the bits in the high half are 1, we only need a copy for it. 3141 MaskedHi = HiReg; 3142 } else { 3143 Register MaskHi = MRI->createVirtualRegister(&RegRC); 3144 MaskedHi = MRI->createVirtualRegister(&RegRC); 3145 3146 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 3147 .addReg(MaskReg, 0, AMDGPU::sub1); 3148 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 3149 .addReg(HiReg) 3150 .addReg(MaskHi); 3151 } 3152 3153 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 3154 .addReg(MaskedLo) 3155 .addImm(AMDGPU::sub0) 3156 .addReg(MaskedHi) 3157 .addImm(AMDGPU::sub1); 3158 I.eraseFromParent(); 3159 return true; 3160 } 3161 3162 /// Return the register to use for the index value, and the subregister to use 3163 /// for the indirectly accessed register. 3164 static std::pair<Register, unsigned> 3165 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, 3166 const TargetRegisterClass *SuperRC, Register IdxReg, 3167 unsigned EltSize, GISelKnownBits &KnownBits) { 3168 Register IdxBaseReg; 3169 int Offset; 3170 3171 std::tie(IdxBaseReg, Offset) = 3172 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits); 3173 if (IdxBaseReg == AMDGPU::NoRegister) { 3174 // This will happen if the index is a known constant. This should ordinarily 3175 // be legalized out, but handle it as a register just in case. 3176 assert(Offset == 0); 3177 IdxBaseReg = IdxReg; 3178 } 3179 3180 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 3181 3182 // Skip out of bounds offsets, or else we would end up using an undefined 3183 // register. 3184 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 3185 return std::pair(IdxReg, SubRegs[0]); 3186 return std::pair(IdxBaseReg, SubRegs[Offset]); 3187 } 3188 3189 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 3190 MachineInstr &MI) const { 3191 Register DstReg = MI.getOperand(0).getReg(); 3192 Register SrcReg = MI.getOperand(1).getReg(); 3193 Register IdxReg = MI.getOperand(2).getReg(); 3194 3195 LLT DstTy = MRI->getType(DstReg); 3196 LLT SrcTy = MRI->getType(SrcReg); 3197 3198 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3199 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 3200 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 3201 3202 // The index must be scalar. If it wasn't RegBankSelect should have moved this 3203 // into a waterfall loop. 
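  // For an SGPR vector with a uniform index, the selected form is roughly
  // (a sketch; registers are illustrative):
  //   $m0  = COPY %idx
  //   %dst = S_MOVRELS_B32 %vec.sub0, implicit %vec
  // where m0 supplies the dynamic element offset relative to sub0.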
3204 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 3205 return false; 3206 3207 const TargetRegisterClass *SrcRC = 3208 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB); 3209 const TargetRegisterClass *DstRC = 3210 TRI.getRegClassForTypeOnBank(DstTy, *DstRB); 3211 if (!SrcRC || !DstRC) 3212 return false; 3213 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 3214 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 3215 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 3216 return false; 3217 3218 MachineBasicBlock *BB = MI.getParent(); 3219 const DebugLoc &DL = MI.getDebugLoc(); 3220 const bool Is64 = DstTy.getSizeInBits() == 64; 3221 3222 unsigned SubReg; 3223 std::tie(IdxReg, SubReg) = computeIndirectRegIndex( 3224 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB); 3225 3226 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 3227 if (DstTy.getSizeInBits() != 32 && !Is64) 3228 return false; 3229 3230 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3231 .addReg(IdxReg); 3232 3233 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 3234 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 3235 .addReg(SrcReg, 0, SubReg) 3236 .addReg(SrcReg, RegState::Implicit); 3237 MI.eraseFromParent(); 3238 return true; 3239 } 3240 3241 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 3242 return false; 3243 3244 if (!STI.useVGPRIndexMode()) { 3245 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3246 .addReg(IdxReg); 3247 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 3248 .addReg(SrcReg, 0, SubReg) 3249 .addReg(SrcReg, RegState::Implicit); 3250 MI.eraseFromParent(); 3251 return true; 3252 } 3253 3254 const MCInstrDesc &GPRIDXDesc = 3255 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true); 3256 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 3257 .addReg(SrcReg) 3258 .addReg(IdxReg) 3259 .addImm(SubReg); 3260 3261 MI.eraseFromParent(); 3262 return true; 3263 } 3264 3265 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 3266 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 3267 MachineInstr &MI) const { 3268 Register DstReg = MI.getOperand(0).getReg(); 3269 Register VecReg = MI.getOperand(1).getReg(); 3270 Register ValReg = MI.getOperand(2).getReg(); 3271 Register IdxReg = MI.getOperand(3).getReg(); 3272 3273 LLT VecTy = MRI->getType(DstReg); 3274 LLT ValTy = MRI->getType(ValReg); 3275 unsigned VecSize = VecTy.getSizeInBits(); 3276 unsigned ValSize = ValTy.getSizeInBits(); 3277 3278 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 3279 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 3280 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 3281 3282 assert(VecTy.getElementType() == ValTy); 3283 3284 // The index must be scalar. If it wasn't RegBankSelect should have moved this 3285 // into a waterfall loop. 
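  // As with the extract case, the write goes either through M0 and the
  // movrel pseudo returned by getIndirectRegWriteMovRelPseudo(), or through
  // the GPR-index-mode pseudo when the vector lives in VGPRs and
  // STI.useVGPRIndexMode() is true. Sketch (illustrative only):
  //   $m0  = COPY %idx
  //   %dst = <indirect reg-write movrel pseudo> %vec, %val, sub0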
3286 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 3287 return false; 3288 3289 const TargetRegisterClass *VecRC = 3290 TRI.getRegClassForTypeOnBank(VecTy, *VecRB); 3291 const TargetRegisterClass *ValRC = 3292 TRI.getRegClassForTypeOnBank(ValTy, *ValRB); 3293 3294 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 3295 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 3296 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 3297 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 3298 return false; 3299 3300 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 3301 return false; 3302 3303 unsigned SubReg; 3304 std::tie(IdxReg, SubReg) = 3305 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB); 3306 3307 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 3308 STI.useVGPRIndexMode(); 3309 3310 MachineBasicBlock *BB = MI.getParent(); 3311 const DebugLoc &DL = MI.getDebugLoc(); 3312 3313 if (!IndexMode) { 3314 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3315 .addReg(IdxReg); 3316 3317 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo( 3318 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID); 3319 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 3320 .addReg(VecReg) 3321 .addReg(ValReg) 3322 .addImm(SubReg); 3323 MI.eraseFromParent(); 3324 return true; 3325 } 3326 3327 const MCInstrDesc &GPRIDXDesc = 3328 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 3329 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 3330 .addReg(VecReg) 3331 .addReg(ValReg) 3332 .addReg(IdxReg) 3333 .addImm(SubReg); 3334 3335 MI.eraseFromParent(); 3336 return true; 3337 } 3338 3339 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { 3340 assert(!AMDGPU::isGFX12Plus(STI)); 3341 unsigned Opc; 3342 unsigned Size = MI.getOperand(3).getImm(); 3343 3344 // The struct intrinsic variants add one additional operand over raw. 3345 const bool HasVIndex = MI.getNumOperands() == 9; 3346 Register VIndex; 3347 int OpOffset = 0; 3348 if (HasVIndex) { 3349 VIndex = MI.getOperand(4).getReg(); 3350 OpOffset = 1; 3351 } 3352 3353 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3354 std::optional<ValueAndVReg> MaybeVOffset = 3355 getIConstantVRegValWithLookThrough(VOffset, *MRI); 3356 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue(); 3357 3358 switch (Size) { 3359 default: 3360 return false; 3361 case 1: 3362 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN 3363 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN 3364 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN 3365 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; 3366 break; 3367 case 2: 3368 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN 3369 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN 3370 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN 3371 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; 3372 break; 3373 case 4: 3374 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN 3375 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN 3376 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN 3377 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; 3378 break; 3379 case 12: 3380 if (!Subtarget->hasLDSLoadB96_B128()) 3381 return false; 3382 3383 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN 3384 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN 3385 : HasVOffset ? 
AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN 3386 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET; 3387 break; 3388 case 16: 3389 if (!Subtarget->hasLDSLoadB96_B128()) 3390 return false; 3391 3392 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN 3393 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN 3394 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN 3395 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET; 3396 break; 3397 } 3398 3399 MachineBasicBlock *MBB = MI.getParent(); 3400 const DebugLoc &DL = MI.getDebugLoc(); 3401 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3402 .add(MI.getOperand(2)); 3403 3404 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)); 3405 3406 if (HasVIndex && HasVOffset) { 3407 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); 3408 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) 3409 .addReg(VIndex) 3410 .addImm(AMDGPU::sub0) 3411 .addReg(VOffset) 3412 .addImm(AMDGPU::sub1); 3413 3414 MIB.addReg(IdxReg); 3415 } else if (HasVIndex) { 3416 MIB.addReg(VIndex); 3417 } else if (HasVOffset) { 3418 MIB.addReg(VOffset); 3419 } 3420 3421 MIB.add(MI.getOperand(1)); // rsrc 3422 MIB.add(MI.getOperand(5 + OpOffset)); // soffset 3423 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset 3424 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); 3425 unsigned Aux = MI.getOperand(7 + OpOffset).getImm(); 3426 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL 3427 : AMDGPU::CPol::ALL_pregfx12)); // cpol 3428 MIB.addImm( 3429 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12) 3430 ? 1 3431 : 0); // swz 3432 3433 MachineMemOperand *LoadMMO = *MI.memoperands_begin(); 3434 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 3435 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm(); 3436 MachinePointerInfo StorePtrI = LoadPtrI; 3437 StorePtrI.V = nullptr; 3438 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 3439 3440 auto F = LoadMMO->getFlags() & 3441 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 3442 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, 3443 Size, LoadMMO->getBaseAlign()); 3444 3445 MachineMemOperand *StoreMMO = 3446 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, 3447 sizeof(int32_t), LoadMMO->getBaseAlign()); 3448 3449 MIB.setMemRefs({LoadMMO, StoreMMO}); 3450 3451 MI.eraseFromParent(); 3452 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3453 } 3454 3455 /// Match a zero extend from a 32-bit value to 64-bits. 3456 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { 3457 Register ZExtSrc; 3458 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) 3459 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? 
ZExtSrc : Register(); 3460 3461 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) 3462 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 3463 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) 3464 return Register(); 3465 3466 assert(Def->getNumOperands() == 3 && 3467 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); 3468 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { 3469 return Def->getOperand(1).getReg(); 3470 } 3471 3472 return Register(); 3473 } 3474 3475 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ 3476 unsigned Opc; 3477 unsigned Size = MI.getOperand(3).getImm(); 3478 3479 switch (Size) { 3480 default: 3481 return false; 3482 case 1: 3483 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; 3484 break; 3485 case 2: 3486 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; 3487 break; 3488 case 4: 3489 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; 3490 break; 3491 case 12: 3492 if (!Subtarget->hasLDSLoadB96_B128()) 3493 return false; 3494 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3; 3495 break; 3496 case 16: 3497 if (!Subtarget->hasLDSLoadB96_B128()) 3498 return false; 3499 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4; 3500 break; 3501 } 3502 3503 MachineBasicBlock *MBB = MI.getParent(); 3504 const DebugLoc &DL = MI.getDebugLoc(); 3505 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3506 .add(MI.getOperand(2)); 3507 3508 Register Addr = MI.getOperand(1).getReg(); 3509 Register VOffset; 3510 // Try to split SAddr and VOffset. Global and LDS pointers share the same 3511 // immediate offset, so we cannot use a regular SelectGlobalSAddr(). 3512 if (!isSGPR(Addr)) { 3513 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3514 if (isSGPR(AddrDef->Reg)) { 3515 Addr = AddrDef->Reg; 3516 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 3517 Register SAddr = 3518 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); 3519 if (isSGPR(SAddr)) { 3520 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); 3521 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { 3522 Addr = SAddr; 3523 VOffset = Off; 3524 } 3525 } 3526 } 3527 } 3528 3529 if (isSGPR(Addr)) { 3530 Opc = AMDGPU::getGlobalSaddrOp(Opc); 3531 if (!VOffset) { 3532 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3533 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset) 3534 .addImm(0); 3535 } 3536 } 3537 3538 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) 3539 .addReg(Addr); 3540 3541 if (isSGPR(Addr)) 3542 MIB.addReg(VOffset); 3543 3544 MIB.add(MI.getOperand(4)) // offset 3545 .add(MI.getOperand(5)); // cpol 3546 3547 MachineMemOperand *LoadMMO = *MI.memoperands_begin(); 3548 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 3549 LoadPtrI.Offset = MI.getOperand(4).getImm(); 3550 MachinePointerInfo StorePtrI = LoadPtrI; 3551 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 3552 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 3553 auto F = LoadMMO->getFlags() & 3554 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 3555 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, 3556 Size, LoadMMO->getBaseAlign()); 3557 MachineMemOperand *StoreMMO = 3558 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, 3559 sizeof(int32_t), Align(4)); 3560 3561 MIB.setMemRefs({LoadMMO, StoreMMO}); 3562 3563 MI.eraseFromParent(); 3564 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3565 } 3566 3567 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ 3568 
MI.setDesc(TII.get(MI.getOperand(1).getImm())); 3569 MI.removeOperand(1); 3570 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3571 return true; 3572 } 3573 3574 // FIXME: This should be removed and let the patterns select. We just need the 3575 // AGPR/VGPR combination versions. 3576 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { 3577 unsigned Opc; 3578 switch (cast<GIntrinsic>(MI).getIntrinsicID()) { 3579 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 3580 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; 3581 break; 3582 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 3583 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64; 3584 break; 3585 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 3586 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64; 3587 break; 3588 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 3589 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64; 3590 break; 3591 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 3592 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64; 3593 break; 3594 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: 3595 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64; 3596 break; 3597 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: 3598 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64; 3599 break; 3600 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: 3601 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64; 3602 break; 3603 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: 3604 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64; 3605 break; 3606 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: 3607 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64; 3608 break; 3609 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: 3610 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64; 3611 break; 3612 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: 3613 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64; 3614 break; 3615 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: 3616 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64; 3617 break; 3618 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: 3619 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64; 3620 break; 3621 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16: 3622 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64; 3623 break; 3624 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16: 3625 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64; 3626 break; 3627 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16: 3628 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64; 3629 break; 3630 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16: 3631 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64; 3632 break; 3633 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8: 3634 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64; 3635 break; 3636 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8: 3637 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64; 3638 break; 3639 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8: 3640 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64; 3641 break; 3642 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8: 3643 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64; 3644 break; 3645 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8: 3646 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64; 3647 break; 3648 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8: 3649 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64; 3650 break; 3651 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8: 3652 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64; 3653 break; 3654 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: 3655 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64; 3656 
break; 3657 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: 3658 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64; 3659 break; 3660 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: 3661 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64; 3662 break; 3663 default: 3664 llvm_unreachable("unhandled smfmac intrinsic"); 3665 } 3666 3667 auto VDst_In = MI.getOperand(4); 3668 3669 MI.setDesc(TII.get(Opc)); 3670 MI.removeOperand(4); // VDst_In 3671 MI.removeOperand(1); // Intrinsic ID 3672 MI.addOperand(VDst_In); // Readd VDst_In to the end 3673 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3674 return true; 3675 } 3676 3677 bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin( 3678 MachineInstr &MI, Intrinsic::ID IntrID) const { 3679 if (IntrID == Intrinsic::amdgcn_permlane16_swap && 3680 !Subtarget->hasPermlane16Swap()) 3681 return false; 3682 if (IntrID == Intrinsic::amdgcn_permlane32_swap && 3683 !Subtarget->hasPermlane32Swap()) 3684 return false; 3685 3686 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap 3687 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64 3688 : AMDGPU::V_PERMLANE32_SWAP_B32_e64; 3689 3690 MI.removeOperand(2); 3691 MI.setDesc(TII.get(Opcode)); 3692 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 3693 3694 MachineOperand &FI = MI.getOperand(4); 3695 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0); 3696 3697 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); 3698 } 3699 3700 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { 3701 Register DstReg = MI.getOperand(0).getReg(); 3702 Register SrcReg = MI.getOperand(1).getReg(); 3703 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3704 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 3705 MachineBasicBlock *MBB = MI.getParent(); 3706 const DebugLoc &DL = MI.getDebugLoc(); 3707 3708 if (IsVALU) { 3709 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 3710 .addImm(Subtarget->getWavefrontSizeLog2()) 3711 .addReg(SrcReg); 3712 } else { 3713 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 3714 .addReg(SrcReg) 3715 .addImm(Subtarget->getWavefrontSizeLog2()) 3716 .setOperandDead(3); // Dead scc 3717 } 3718 3719 const TargetRegisterClass &RC = 3720 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 3721 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 3722 return false; 3723 3724 MI.eraseFromParent(); 3725 return true; 3726 } 3727 3728 // Match BITOP3 operation and return a number of matched instructions plus 3729 // truth table. 
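// Worked example (for illustration, not taken from a test): the SrcBits
// constants below assign src0 -> 0xf0, src1 -> 0xcc, src2 -> 0xaa, i.e. the
// value of each source across the eight input combinations. The truth table
// for a matched expression is then simply the same bitwise expression
// evaluated on those constants, e.g.
//   (a & b) | c  ->  (0xf0 & 0xcc) | 0xaa = 0xea
//   a ^ b ^ c    ->   0xf0 ^ 0xcc ^ 0xaa  = 0x96
// and a 'not' of an already-seen source just flips its bits (~0xf0 = 0x0f).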
3730 static std::pair<unsigned, uint8_t> BitOp3_Op(Register R, 3731 SmallVectorImpl<Register> &Src, 3732 const MachineRegisterInfo &MRI) { 3733 unsigned NumOpcodes = 0; 3734 uint8_t LHSBits, RHSBits; 3735 3736 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool { 3737 // Define truth table given Src0, Src1, Src2 bits permutations: 3738 // 0 0 0 3739 // 0 0 1 3740 // 0 1 0 3741 // 0 1 1 3742 // 1 0 0 3743 // 1 0 1 3744 // 1 1 0 3745 // 1 1 1 3746 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa }; 3747 3748 if (mi_match(Op, MRI, m_AllOnesInt())) { 3749 Bits = 0xff; 3750 return true; 3751 } 3752 if (mi_match(Op, MRI, m_ZeroInt())) { 3753 Bits = 0; 3754 return true; 3755 } 3756 3757 for (unsigned I = 0; I < Src.size(); ++I) { 3758 // Try to find existing reused operand 3759 if (Src[I] == Op) { 3760 Bits = SrcBits[I]; 3761 return true; 3762 } 3763 // Try to replace parent operator 3764 if (Src[I] == R) { 3765 Bits = SrcBits[I]; 3766 Src[I] = Op; 3767 return true; 3768 } 3769 } 3770 3771 if (Src.size() == 3) { 3772 // No room left for operands. Try one last time, there can be a 'not' of 3773 // one of our source operands. In this case we can compute the bits 3774 // without growing Src vector. 3775 Register LHS; 3776 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) { 3777 LHS = getSrcRegIgnoringCopies(LHS, MRI); 3778 for (unsigned I = 0; I < Src.size(); ++I) { 3779 if (Src[I] == LHS) { 3780 Bits = ~SrcBits[I]; 3781 return true; 3782 } 3783 } 3784 } 3785 3786 return false; 3787 } 3788 3789 Bits = SrcBits[Src.size()]; 3790 Src.push_back(Op); 3791 return true; 3792 }; 3793 3794 MachineInstr *MI = MRI.getVRegDef(R); 3795 switch (MI->getOpcode()) { 3796 case TargetOpcode::G_AND: 3797 case TargetOpcode::G_OR: 3798 case TargetOpcode::G_XOR: { 3799 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI); 3800 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI); 3801 3802 SmallVector<Register, 3> Backup(Src.begin(), Src.end()); 3803 if (!getOperandBits(LHS, LHSBits) || 3804 !getOperandBits(RHS, RHSBits)) { 3805 Src = Backup; 3806 return std::make_pair(0, 0); 3807 } 3808 3809 // Recursion is naturally limited by the size of the operand vector. 3810 auto Op = BitOp3_Op(LHS, Src, MRI); 3811 if (Op.first) { 3812 NumOpcodes += Op.first; 3813 LHSBits = Op.second; 3814 } 3815 3816 Op = BitOp3_Op(RHS, Src, MRI); 3817 if (Op.first) { 3818 NumOpcodes += Op.first; 3819 RHSBits = Op.second; 3820 } 3821 break; 3822 } 3823 default: 3824 return std::make_pair(0, 0); 3825 } 3826 3827 uint8_t TTbl; 3828 switch (MI->getOpcode()) { 3829 case TargetOpcode::G_AND: 3830 TTbl = LHSBits & RHSBits; 3831 break; 3832 case TargetOpcode::G_OR: 3833 TTbl = LHSBits | RHSBits; 3834 break; 3835 case TargetOpcode::G_XOR: 3836 TTbl = LHSBits ^ RHSBits; 3837 break; 3838 default: 3839 break; 3840 } 3841 3842 return std::make_pair(NumOpcodes + 1, TTbl); 3843 } 3844 3845 bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const { 3846 if (!Subtarget->hasBitOp3Insts()) 3847 return false; 3848 3849 Register DstReg = MI.getOperand(0).getReg(); 3850 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3851 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 3852 if (!IsVALU) 3853 return false; 3854 3855 SmallVector<Register, 3> Src; 3856 uint8_t TTbl; 3857 unsigned NumOpcodes; 3858 3859 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI); 3860 3861 // Src.empty() case can happen if all operands are all zero or all ones. 
3862 // Normally it should have been optimized out before reaching this point. 3863 if (NumOpcodes < 2 || Src.empty()) 3864 return false; 3865 3866 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32); 3867 if (NumOpcodes == 2 && IsB32) { 3868 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes 3869 // asm more readable. This cannot be modeled with AddedComplexity because 3870 // the selector does not know how many operations we matched. 3871 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) || 3872 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) || 3873 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg()))) 3874 return false; 3875 } else if (NumOpcodes < 4) { 3876 // For a uniform case the threshold should be higher to account for moves 3877 // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two 3878 // can stay in SGPRs, with a readfirstlane afterwards. 3879 return false; 3880 } 3881 3882 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64; 3883 unsigned CBL = STI.getConstantBusLimit(Opc); 3884 MachineBasicBlock *MBB = MI.getParent(); 3885 const DebugLoc &DL = MI.getDebugLoc(); 3886 3887 for (unsigned I = 0; I < Src.size(); ++I) { 3888 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI); 3889 if (RB->getID() != AMDGPU::SGPRRegBankID) 3890 continue; 3891 if (CBL > 0) { 3892 --CBL; 3893 continue; 3894 } 3895 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3896 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg) 3897 .addReg(Src[I]); 3898 Src[I] = NewReg; 3899 } 3900 3901 // Last operand can be ignored, turning a ternary operation into a binary. 3902 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace 3903 // 'c' with 'a' here without changing the answer. In some pathological 3904 // cases it should be possible to get an operation with a single operand 3905 // too, if the optimizer does not catch it.
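  // E.g. if only two distinct registers were matched (say for (a & b) ^ a),
  // Src is padded from {x, y} to {x, y, x}; the truth table computed above
  // only ever used the first two SrcBits values, so its result does not
  // depend on the third input and the duplicate is harmless. (Illustrative
  // description, not a transcript of a real selection.)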
3906 while (Src.size() < 3) 3907 Src.push_back(Src[0]); 3908 3909 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg); 3910 if (!IsB32) 3911 MIB.addImm(0); // src_mod0 3912 MIB.addReg(Src[0]); 3913 if (!IsB32) 3914 MIB.addImm(0); // src_mod1 3915 MIB.addReg(Src[1]); 3916 if (!IsB32) 3917 MIB.addImm(0); // src_mod2 3918 MIB.addReg(Src[2]) 3919 .addImm(TTbl); 3920 if (!IsB32) 3921 MIB.addImm(0); // op_sel 3922 3923 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3924 MI.eraseFromParent(); 3925 3926 return true; 3927 } 3928 3929 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const { 3930 Register SrcReg = MI.getOperand(0).getReg(); 3931 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) 3932 return false; 3933 3934 MachineInstr *DefMI = MRI->getVRegDef(SrcReg); 3935 Register SP = 3936 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore(); 3937 Register WaveAddr = getWaveAddress(DefMI); 3938 MachineBasicBlock *MBB = MI.getParent(); 3939 const DebugLoc &DL = MI.getDebugLoc(); 3940 3941 if (!WaveAddr) { 3942 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3943 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr) 3944 .addReg(SrcReg) 3945 .addImm(Subtarget->getWavefrontSizeLog2()) 3946 .setOperandDead(3); // Dead scc 3947 } 3948 3949 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP) 3950 .addReg(WaveAddr); 3951 3952 MI.eraseFromParent(); 3953 return true; 3954 } 3955 3956 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 3957 3958 if (!I.isPreISelOpcode()) { 3959 if (I.isCopy()) 3960 return selectCOPY(I); 3961 return true; 3962 } 3963 3964 switch (I.getOpcode()) { 3965 case TargetOpcode::G_AND: 3966 case TargetOpcode::G_OR: 3967 case TargetOpcode::G_XOR: 3968 if (selectBITOP3(I)) 3969 return true; 3970 if (selectImpl(I, *CoverageInfo)) 3971 return true; 3972 return selectG_AND_OR_XOR(I); 3973 case TargetOpcode::G_ADD: 3974 case TargetOpcode::G_SUB: 3975 case TargetOpcode::G_PTR_ADD: 3976 if (selectImpl(I, *CoverageInfo)) 3977 return true; 3978 return selectG_ADD_SUB(I); 3979 case TargetOpcode::G_UADDO: 3980 case TargetOpcode::G_USUBO: 3981 case TargetOpcode::G_UADDE: 3982 case TargetOpcode::G_USUBE: 3983 return selectG_UADDO_USUBO_UADDE_USUBE(I); 3984 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3985 case AMDGPU::G_AMDGPU_MAD_I64_I32: 3986 return selectG_AMDGPU_MAD_64_32(I); 3987 case TargetOpcode::G_INTTOPTR: 3988 case TargetOpcode::G_BITCAST: 3989 case TargetOpcode::G_PTRTOINT: 3990 case TargetOpcode::G_FREEZE: 3991 return selectCOPY(I); 3992 case TargetOpcode::G_FNEG: 3993 if (selectImpl(I, *CoverageInfo)) 3994 return true; 3995 return selectG_FNEG(I); 3996 case TargetOpcode::G_FABS: 3997 if (selectImpl(I, *CoverageInfo)) 3998 return true; 3999 return selectG_FABS(I); 4000 case TargetOpcode::G_EXTRACT: 4001 return selectG_EXTRACT(I); 4002 case TargetOpcode::G_MERGE_VALUES: 4003 case TargetOpcode::G_CONCAT_VECTORS: 4004 return selectG_MERGE_VALUES(I); 4005 case TargetOpcode::G_UNMERGE_VALUES: 4006 return selectG_UNMERGE_VALUES(I); 4007 case TargetOpcode::G_BUILD_VECTOR: 4008 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 4009 return selectG_BUILD_VECTOR(I); 4010 case TargetOpcode::G_IMPLICIT_DEF: 4011 return selectG_IMPLICIT_DEF(I); 4012 case TargetOpcode::G_INSERT: 4013 return selectG_INSERT(I); 4014 case TargetOpcode::G_INTRINSIC: 4015 case TargetOpcode::G_INTRINSIC_CONVERGENT: 4016 return selectG_INTRINSIC(I); 4017 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 4018 case 
TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: 4019 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 4020 case TargetOpcode::G_ICMP: 4021 case TargetOpcode::G_FCMP: 4022 if (selectG_ICMP_or_FCMP(I)) 4023 return true; 4024 return selectImpl(I, *CoverageInfo); 4025 case TargetOpcode::G_LOAD: 4026 case TargetOpcode::G_ZEXTLOAD: 4027 case TargetOpcode::G_SEXTLOAD: 4028 case TargetOpcode::G_STORE: 4029 case TargetOpcode::G_ATOMIC_CMPXCHG: 4030 case TargetOpcode::G_ATOMICRMW_XCHG: 4031 case TargetOpcode::G_ATOMICRMW_ADD: 4032 case TargetOpcode::G_ATOMICRMW_SUB: 4033 case TargetOpcode::G_ATOMICRMW_AND: 4034 case TargetOpcode::G_ATOMICRMW_OR: 4035 case TargetOpcode::G_ATOMICRMW_XOR: 4036 case TargetOpcode::G_ATOMICRMW_MIN: 4037 case TargetOpcode::G_ATOMICRMW_MAX: 4038 case TargetOpcode::G_ATOMICRMW_UMIN: 4039 case TargetOpcode::G_ATOMICRMW_UMAX: 4040 case TargetOpcode::G_ATOMICRMW_UINC_WRAP: 4041 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: 4042 case TargetOpcode::G_ATOMICRMW_FADD: 4043 case TargetOpcode::G_ATOMICRMW_FMIN: 4044 case TargetOpcode::G_ATOMICRMW_FMAX: 4045 return selectG_LOAD_STORE_ATOMICRMW(I); 4046 case TargetOpcode::G_SELECT: 4047 return selectG_SELECT(I); 4048 case TargetOpcode::G_TRUNC: 4049 return selectG_TRUNC(I); 4050 case TargetOpcode::G_SEXT: 4051 case TargetOpcode::G_ZEXT: 4052 case TargetOpcode::G_ANYEXT: 4053 case TargetOpcode::G_SEXT_INREG: 4054 // This is a workaround. For extension from type i1, `selectImpl()` uses 4055 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type 4056 // i1 can only be hold in a SGPR class. 4057 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) && 4058 selectImpl(I, *CoverageInfo)) 4059 return true; 4060 return selectG_SZA_EXT(I); 4061 case TargetOpcode::G_FPEXT: 4062 if (selectG_FPEXT(I)) 4063 return true; 4064 return selectImpl(I, *CoverageInfo); 4065 case TargetOpcode::G_BRCOND: 4066 return selectG_BRCOND(I); 4067 case TargetOpcode::G_GLOBAL_VALUE: 4068 return selectG_GLOBAL_VALUE(I); 4069 case TargetOpcode::G_PTRMASK: 4070 return selectG_PTRMASK(I); 4071 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 4072 return selectG_EXTRACT_VECTOR_ELT(I); 4073 case TargetOpcode::G_INSERT_VECTOR_ELT: 4074 return selectG_INSERT_VECTOR_ELT(I); 4075 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4076 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 4077 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET: 4078 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 4079 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 4080 const AMDGPU::ImageDimIntrinsicInfo *Intr = 4081 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I)); 4082 assert(Intr && "not an image intrinsic with image pseudo"); 4083 return selectImageIntrinsic(I, Intr); 4084 } 4085 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: 4086 return selectBVHIntrinsic(I); 4087 case AMDGPU::G_SBFX: 4088 case AMDGPU::G_UBFX: 4089 return selectG_SBFX_UBFX(I); 4090 case AMDGPU::G_SI_CALL: 4091 I.setDesc(TII.get(AMDGPU::SI_CALL)); 4092 return true; 4093 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: 4094 return selectWaveAddress(I); 4095 case AMDGPU::G_STACKRESTORE: 4096 return selectStackRestore(I); 4097 case AMDGPU::G_PHI: 4098 return selectPHI(I); 4099 case AMDGPU::G_AMDGPU_COPY_SCC_VCC: 4100 return selectCOPY_SCC_VCC(I); 4101 case AMDGPU::G_AMDGPU_COPY_VCC_SCC: 4102 return selectCOPY_VCC_SCC(I); 4103 case AMDGPU::G_AMDGPU_READANYLANE: 4104 return selectReadAnyLane(I); 4105 case TargetOpcode::G_CONSTANT: 4106 case TargetOpcode::G_FCONSTANT: 4107 default: 4108 return selectImpl(I, *CoverageInfo); 4109 } 4110 return 
false; 4111 } 4112 4113 InstructionSelector::ComplexRendererFns 4114 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 4115 return {{ 4116 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 4117 }}; 4118 4119 } 4120 4121 std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( 4122 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const { 4123 unsigned Mods = 0; 4124 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 4125 4126 if (MI->getOpcode() == AMDGPU::G_FNEG) { 4127 Src = MI->getOperand(1).getReg(); 4128 Mods |= SISrcMods::NEG; 4129 MI = getDefIgnoringCopies(Src, *MRI); 4130 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) { 4131 // Fold fsub [+-]0 into fneg. This may not have folded depending on the 4132 // denormal mode, but we're implicitly canonicalizing in a source operand. 4133 const ConstantFP *LHS = 4134 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI); 4135 if (LHS && LHS->isZero()) { 4136 Mods |= SISrcMods::NEG; 4137 Src = MI->getOperand(2).getReg(); 4138 } 4139 } 4140 4141 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) { 4142 Src = MI->getOperand(1).getReg(); 4143 Mods |= SISrcMods::ABS; 4144 } 4145 4146 if (OpSel) 4147 Mods |= SISrcMods::OP_SEL_0; 4148 4149 return std::pair(Src, Mods); 4150 } 4151 4152 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded( 4153 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, 4154 bool ForceVGPR) const { 4155 if ((Mods != 0 || ForceVGPR) && 4156 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 4157 4158 // If we looked through copies to find source modifiers on an SGPR operand, 4159 // we now have an SGPR register source. To avoid potentially violating the 4160 // constant bus restriction, we need to insert a copy to a VGPR. 4161 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg()); 4162 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(), 4163 TII.get(AMDGPU::COPY), VGPRSrc) 4164 .addReg(Src); 4165 Src = VGPRSrc; 4166 } 4167 4168 return Src; 4169 } 4170 4171 /// 4172 /// This will select either an SGPR or VGPR operand and will save us from 4173 /// having to write an extra tablegen pattern. 
4174 InstructionSelector::ComplexRendererFns 4175 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 4176 return {{ 4177 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 4178 }}; 4179 } 4180 4181 InstructionSelector::ComplexRendererFns 4182 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 4183 Register Src; 4184 unsigned Mods; 4185 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 4186 4187 return {{ 4188 [=](MachineInstrBuilder &MIB) { 4189 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4190 }, 4191 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 4192 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 4193 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 4194 }}; 4195 } 4196 4197 InstructionSelector::ComplexRendererFns 4198 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { 4199 Register Src; 4200 unsigned Mods; 4201 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), 4202 /*IsCanonicalizing=*/true, 4203 /*AllowAbs=*/false); 4204 4205 return {{ 4206 [=](MachineInstrBuilder &MIB) { 4207 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4208 }, 4209 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 4210 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 4211 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 4212 }}; 4213 } 4214 4215 InstructionSelector::ComplexRendererFns 4216 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 4217 return {{ 4218 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 4219 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 4220 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 4221 }}; 4222 } 4223 4224 InstructionSelector::ComplexRendererFns 4225 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 4226 Register Src; 4227 unsigned Mods; 4228 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 4229 4230 return {{ 4231 [=](MachineInstrBuilder &MIB) { 4232 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4233 }, 4234 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4235 }}; 4236 } 4237 4238 InstructionSelector::ComplexRendererFns 4239 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing( 4240 MachineOperand &Root) const { 4241 Register Src; 4242 unsigned Mods; 4243 std::tie(Src, Mods) = 4244 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false); 4245 4246 return {{ 4247 [=](MachineInstrBuilder &MIB) { 4248 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4249 }, 4250 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4251 }}; 4252 } 4253 4254 InstructionSelector::ComplexRendererFns 4255 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { 4256 Register Src; 4257 unsigned Mods; 4258 std::tie(Src, Mods) = 4259 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, 4260 /*AllowAbs=*/false); 4261 4262 return {{ 4263 [=](MachineInstrBuilder &MIB) { 4264 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4265 }, 4266 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4267 }}; 4268 } 4269 4270 InstructionSelector::ComplexRendererFns 4271 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 4272 Register Reg = Root.getReg(); 4273 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 4274 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS) 4275 return {}; 4276 return {{ 4277 [=](MachineInstrBuilder &MIB) { 
MIB.addReg(Reg); }, 4278 }}; 4279 } 4280 4281 std::pair<Register, unsigned> 4282 AMDGPUInstructionSelector::selectVOP3PModsImpl( 4283 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { 4284 unsigned Mods = 0; 4285 MachineInstr *MI = MRI.getVRegDef(Src); 4286 4287 if (MI->getOpcode() == AMDGPU::G_FNEG && 4288 // It's possible to see an f32 fneg here, but unlikely. 4289 // TODO: Treat f32 fneg as only high bit. 4290 MRI.getType(Src) == LLT::fixed_vector(2, 16)) { 4291 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 4292 Src = MI->getOperand(1).getReg(); 4293 MI = MRI.getVRegDef(Src); 4294 } 4295 4296 // TODO: Handle G_FSUB 0 as fneg 4297 4298 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 4299 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() 4300 4301 // Packed instructions do not have abs modifiers. 4302 Mods |= SISrcMods::OP_SEL_1; 4303 4304 return std::pair(Src, Mods); 4305 } 4306 4307 InstructionSelector::ComplexRendererFns 4308 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 4309 MachineRegisterInfo &MRI 4310 = Root.getParent()->getParent()->getParent()->getRegInfo(); 4311 4312 Register Src; 4313 unsigned Mods; 4314 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 4315 4316 return {{ 4317 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4318 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4319 }}; 4320 } 4321 4322 InstructionSelector::ComplexRendererFns 4323 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { 4324 MachineRegisterInfo &MRI 4325 = Root.getParent()->getParent()->getParent()->getRegInfo(); 4326 4327 Register Src; 4328 unsigned Mods; 4329 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true); 4330 4331 return {{ 4332 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4333 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4334 }}; 4335 } 4336 4337 InstructionSelector::ComplexRendererFns 4338 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { 4339 // Literal i1 value set in intrinsic, represents SrcMods for the next operand. 4340 // Value is in Imm operand as i1 sign extended to int64_t. 4341 // 1(-1) promotes packed values to signed, 0 treats them as unsigned. 
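  // Illustration: an incoming immediate of -1 yields
  // SISrcMods::OP_SEL_1 | SISrcMods::NEG, while 0 yields just
  // SISrcMods::OP_SEL_1, the default for packed operands.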
4342 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && 4343 "expected i1 value"); 4344 unsigned Mods = SISrcMods::OP_SEL_1; 4345 if (Root.getImm() == -1) 4346 Mods ^= SISrcMods::NEG; 4347 return {{ 4348 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4349 }}; 4350 } 4351 4352 InstructionSelector::ComplexRendererFns 4353 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( 4354 MachineOperand &Root) const { 4355 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && 4356 "expected i1 value"); 4357 unsigned Mods = SISrcMods::OP_SEL_1; 4358 if (Root.getImm() != 0) 4359 Mods |= SISrcMods::OP_SEL_0; 4360 4361 return {{ 4362 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4363 }}; 4364 } 4365 4366 static Register buildRegSequence(SmallVectorImpl<Register> &Elts, 4367 MachineInstr *InsertPt, 4368 MachineRegisterInfo &MRI) { 4369 const TargetRegisterClass *DstRegClass; 4370 switch (Elts.size()) { 4371 case 8: 4372 DstRegClass = &AMDGPU::VReg_256RegClass; 4373 break; 4374 case 4: 4375 DstRegClass = &AMDGPU::VReg_128RegClass; 4376 break; 4377 case 2: 4378 DstRegClass = &AMDGPU::VReg_64RegClass; 4379 break; 4380 default: 4381 llvm_unreachable("unhandled Reg sequence size"); 4382 } 4383 4384 MachineIRBuilder B(*InsertPt); 4385 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE) 4386 .addDef(MRI.createVirtualRegister(DstRegClass)); 4387 for (unsigned i = 0; i < Elts.size(); ++i) { 4388 MIB.addReg(Elts[i]); 4389 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i)); 4390 } 4391 return MIB->getOperand(0).getReg(); 4392 } 4393 4394 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, 4395 SmallVectorImpl<Register> &Elts, Register &Src, 4396 MachineInstr *InsertPt, 4397 MachineRegisterInfo &MRI) { 4398 if (ModOpcode == TargetOpcode::G_FNEG) { 4399 Mods |= SISrcMods::NEG; 4400 // Check if all elements also have abs modifier 4401 SmallVector<Register, 8> NegAbsElts; 4402 for (auto El : Elts) { 4403 Register FabsSrc; 4404 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc)))) 4405 break; 4406 NegAbsElts.push_back(FabsSrc); 4407 } 4408 if (Elts.size() != NegAbsElts.size()) { 4409 // Neg 4410 Src = buildRegSequence(Elts, InsertPt, MRI); 4411 } else { 4412 // Neg and Abs 4413 Mods |= SISrcMods::NEG_HI; 4414 Src = buildRegSequence(NegAbsElts, InsertPt, MRI); 4415 } 4416 } else { 4417 assert(ModOpcode == TargetOpcode::G_FABS); 4418 // Abs 4419 Mods |= SISrcMods::NEG_HI; 4420 Src = buildRegSequence(Elts, InsertPt, MRI); 4421 } 4422 } 4423 4424 InstructionSelector::ComplexRendererFns 4425 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { 4426 Register Src = Root.getReg(); 4427 unsigned Mods = SISrcMods::OP_SEL_1; 4428 SmallVector<Register, 8> EltsF32; 4429 4430 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) { 4431 assert(BV->getNumSources() > 0); 4432 // Based on first element decide which mod we match, neg or abs 4433 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0)); 4434 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) 4435 ? 
AMDGPU::G_FNEG 4436 : AMDGPU::G_FABS; 4437 for (unsigned i = 0; i < BV->getNumSources(); ++i) { 4438 ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); 4439 if (ElF32->getOpcode() != ModOpcode) 4440 break; 4441 EltsF32.push_back(ElF32->getOperand(1).getReg()); 4442 } 4443 4444 // All elements had ModOpcode modifier 4445 if (BV->getNumSources() == EltsF32.size()) { 4446 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(), 4447 *MRI); 4448 } 4449 } 4450 4451 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4452 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; 4453 } 4454 4455 InstructionSelector::ComplexRendererFns 4456 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const { 4457 Register Src = Root.getReg(); 4458 unsigned Mods = SISrcMods::OP_SEL_1; 4459 SmallVector<Register, 8> EltsV2F16; 4460 4461 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) { 4462 for (unsigned i = 0; i < CV->getNumSources(); ++i) { 4463 Register FNegSrc; 4464 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc)))) 4465 break; 4466 EltsV2F16.push_back(FNegSrc); 4467 } 4468 4469 // All elements had ModOpcode modifier 4470 if (CV->getNumSources() == EltsV2F16.size()) { 4471 Mods |= SISrcMods::NEG; 4472 Mods |= SISrcMods::NEG_HI; 4473 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI); 4474 } 4475 } 4476 4477 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4478 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; 4479 } 4480 4481 InstructionSelector::ComplexRendererFns 4482 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { 4483 Register Src = Root.getReg(); 4484 unsigned Mods = SISrcMods::OP_SEL_1; 4485 SmallVector<Register, 8> EltsV2F16; 4486 4487 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) { 4488 assert(CV->getNumSources() > 0); 4489 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0)); 4490 // Based on first element decide which mod we match, neg or abs 4491 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) 4492 ? AMDGPU::G_FNEG 4493 : AMDGPU::G_FABS; 4494 4495 for (unsigned i = 0; i < CV->getNumSources(); ++i) { 4496 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); 4497 if (ElV2F16->getOpcode() != ModOpcode) 4498 break; 4499 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg()); 4500 } 4501 4502 // All elements had ModOpcode modifier 4503 if (CV->getNumSources() == EltsV2F16.size()) { 4504 MachineIRBuilder B(*Root.getParent()); 4505 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(), 4506 *MRI); 4507 } 4508 } 4509 4510 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4511 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; 4512 } 4513 4514 InstructionSelector::ComplexRendererFns 4515 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const { 4516 std::optional<FPValueAndVReg> FPValReg; 4517 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) { 4518 if (TII.isInlineConstant(FPValReg->Value)) { 4519 return {{[=](MachineInstrBuilder &MIB) { 4520 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue()); 4521 }}}; 4522 } 4523 // Non-inlineable splat floats should not fall-through for integer immediate 4524 // checks. 
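    // For instance (illustrative): a splat of 1.0 is an inline constant and
    // is rendered above as its bit pattern, while a splat of something like
    // 0.1f is not inlineable; bailing out here keeps its bits from being
    // re-tested as an integer splat below.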
4525 return {}; 4526 } 4527 4528 APInt ICst; 4529 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) { 4530 if (TII.isInlineConstant(ICst)) { 4531 return { 4532 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}}; 4533 } 4534 } 4535 4536 return {}; 4537 } 4538 4539 InstructionSelector::ComplexRendererFns 4540 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const { 4541 Register Src = 4542 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); 4543 unsigned Key = 0; 4544 4545 Register ShiftSrc; 4546 std::optional<ValueAndVReg> ShiftAmt; 4547 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && 4548 MRI->getType(ShiftSrc).getSizeInBits() == 32 && 4549 ShiftAmt->Value.getZExtValue() % 8 == 0) { 4550 Key = ShiftAmt->Value.getZExtValue() / 8; 4551 Src = ShiftSrc; 4552 } 4553 4554 return {{ 4555 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4556 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key 4557 }}; 4558 } 4559 4560 InstructionSelector::ComplexRendererFns 4561 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { 4562 4563 Register Src = 4564 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); 4565 unsigned Key = 0; 4566 4567 Register ShiftSrc; 4568 std::optional<ValueAndVReg> ShiftAmt; 4569 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && 4570 MRI->getType(ShiftSrc).getSizeInBits() == 32 && 4571 ShiftAmt->Value.getZExtValue() == 16) { 4572 Src = ShiftSrc; 4573 Key = 1; 4574 } 4575 4576 return {{ 4577 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4578 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key 4579 }}; 4580 } 4581 4582 InstructionSelector::ComplexRendererFns 4583 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 4584 Register Src; 4585 unsigned Mods; 4586 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 4587 4588 // FIXME: Handle op_sel 4589 return {{ 4590 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4591 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4592 }}; 4593 } 4594 4595 InstructionSelector::ComplexRendererFns 4596 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { 4597 Register Src; 4598 unsigned Mods; 4599 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), 4600 /*IsCanonicalizing=*/true, 4601 /*AllowAbs=*/false, 4602 /*OpSel=*/false); 4603 4604 return {{ 4605 [=](MachineInstrBuilder &MIB) { 4606 MIB.addReg( 4607 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true)); 4608 }, 4609 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 4610 }}; 4611 } 4612 4613 InstructionSelector::ComplexRendererFns 4614 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { 4615 Register Src; 4616 unsigned Mods; 4617 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), 4618 /*IsCanonicalizing=*/true, 4619 /*AllowAbs=*/false, 4620 /*OpSel=*/true); 4621 4622 return {{ 4623 [=](MachineInstrBuilder &MIB) { 4624 MIB.addReg( 4625 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true)); 4626 }, 4627 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 4628 }}; 4629 } 4630 4631 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, 4632 Register &Base, 4633 Register *SOffset, 4634 int64_t *Offset) const { 4635 MachineInstr *MI = Root.getParent(); 4636 MachineBasicBlock *MBB = MI->getParent(); 4637 4638 // FIXME: We should shrink the GEP if the offset is known 
to be <= 32-bits, 4639 // then we can select all ptr + 32-bit offsets. 4640 SmallVector<GEPInfo, 4> AddrInfo; 4641 getAddrModeInfo(*MI, *MRI, AddrInfo); 4642 4643 if (AddrInfo.empty()) 4644 return false; 4645 4646 const GEPInfo &GEPI = AddrInfo[0]; 4647 std::optional<int64_t> EncodedImm; 4648 4649 if (SOffset && Offset) { 4650 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, 4651 /*HasSOffset=*/true); 4652 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm && 4653 AddrInfo.size() > 1) { 4654 const GEPInfo &GEPI2 = AddrInfo[1]; 4655 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) { 4656 if (Register OffsetReg = 4657 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) { 4658 Base = GEPI2.SgprParts[0]; 4659 *SOffset = OffsetReg; 4660 *Offset = *EncodedImm; 4661 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI)) 4662 return true; 4663 4664 // For unbuffered smem loads, it is illegal for the Immediate Offset 4665 // to be negative if the resulting (Offset + (M0 or SOffset or zero)) 4666 // is negative. Handle the case where the Immediate Offset + SOffset 4667 // is negative. 4668 auto SKnown = KB->getKnownBits(*SOffset); 4669 if (*Offset + SKnown.getMinValue().getSExtValue() < 0) 4670 return false; 4671 4672 return true; 4673 } 4674 } 4675 } 4676 return false; 4677 } 4678 4679 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, 4680 /*HasSOffset=*/false); 4681 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) { 4682 Base = GEPI.SgprParts[0]; 4683 *Offset = *EncodedImm; 4684 return true; 4685 } 4686 4687 // SGPR offset is unsigned. 4688 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) && 4689 GEPI.Imm != 0) { 4690 // If we make it this far we have a load with a 32-bit immediate offset. 4691 // It is OK to select this using an SGPR offset, because we have already 4692 // failed trying to select this load into one of the _IMM variants since 4693 // the _IMM patterns are considered before the _SGPR patterns.
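    // Sketch of this fallback (illustrative, not verified output):
    //   %soff:sreg_32 = S_MOV_B32 <imm>
    // and the load is then selected with the _SGPR (or _SGPR_IMM) form using
    // %soff as the scalar offset.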
4694 Base = GEPI.SgprParts[0]; 4695 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 4696 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset) 4697 .addImm(GEPI.Imm); 4698 return true; 4699 } 4700 4701 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) { 4702 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) { 4703 Base = GEPI.SgprParts[0]; 4704 *SOffset = OffsetReg; 4705 return true; 4706 } 4707 } 4708 4709 return false; 4710 } 4711 4712 InstructionSelector::ComplexRendererFns 4713 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 4714 Register Base; 4715 int64_t Offset; 4716 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset)) 4717 return std::nullopt; 4718 4719 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, 4720 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; 4721 } 4722 4723 InstructionSelector::ComplexRendererFns 4724 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 4725 SmallVector<GEPInfo, 4> AddrInfo; 4726 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 4727 4728 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 4729 return std::nullopt; 4730 4731 const GEPInfo &GEPInfo = AddrInfo[0]; 4732 Register PtrReg = GEPInfo.SgprParts[0]; 4733 std::optional<int64_t> EncodedImm = 4734 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 4735 if (!EncodedImm) 4736 return std::nullopt; 4737 4738 return {{ 4739 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 4740 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 4741 }}; 4742 } 4743 4744 InstructionSelector::ComplexRendererFns 4745 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 4746 Register Base, SOffset; 4747 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr)) 4748 return std::nullopt; 4749 4750 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, 4751 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; 4752 } 4753 4754 InstructionSelector::ComplexRendererFns 4755 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const { 4756 Register Base, SOffset; 4757 int64_t Offset; 4758 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset)) 4759 return std::nullopt; 4760 4761 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, 4762 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, 4763 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; 4764 } 4765 4766 std::pair<Register, int> 4767 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, 4768 uint64_t FlatVariant) const { 4769 MachineInstr *MI = Root.getParent(); 4770 4771 auto Default = std::pair(Root.getReg(), 0); 4772 4773 if (!STI.hasFlatInstOffsets()) 4774 return Default; 4775 4776 Register PtrBase; 4777 int64_t ConstOffset; 4778 std::tie(PtrBase, ConstOffset) = 4779 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 4780 4781 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && 4782 !isFlatScratchBaseLegal(Root.getReg()))) 4783 return Default; 4784 4785 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 4786 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant)) 4787 return Default; 4788 4789 return std::pair(PtrBase, ConstOffset); 4790 } 4791 4792 InstructionSelector::ComplexRendererFns 4793 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 4794 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT); 4795 4796 return {{ 4797 
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, 4798 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, 4799 }}; 4800 } 4801 4802 InstructionSelector::ComplexRendererFns 4803 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const { 4804 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal); 4805 4806 return {{ 4807 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, 4808 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, 4809 }}; 4810 } 4811 4812 InstructionSelector::ComplexRendererFns 4813 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { 4814 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch); 4815 4816 return {{ 4817 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, 4818 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, 4819 }}; 4820 } 4821 4822 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) 4823 InstructionSelector::ComplexRendererFns 4824 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { 4825 Register Addr = Root.getReg(); 4826 Register PtrBase; 4827 int64_t ConstOffset; 4828 int64_t ImmOffset = 0; 4829 4830 // Match the immediate offset first, which canonically is moved as low as 4831 // possible. 4832 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 4833 4834 if (ConstOffset != 0) { 4835 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, 4836 SIInstrFlags::FlatGlobal)) { 4837 Addr = PtrBase; 4838 ImmOffset = ConstOffset; 4839 } else { 4840 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); 4841 if (isSGPR(PtrBaseDef->Reg)) { 4842 if (ConstOffset > 0) { 4843 // Offset is too large. 4844 // 4845 // saddr + large_offset -> saddr + 4846 // (voffset = large_offset & ~MaxOffset) + 4847 // (large_offset & MaxOffset); 4848 int64_t SplitImmOffset, RemainderOffset; 4849 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( 4850 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); 4851 4852 if (isUInt<32>(RemainderOffset)) { 4853 MachineInstr *MI = Root.getParent(); 4854 MachineBasicBlock *MBB = MI->getParent(); 4855 Register HighBits = 4856 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4857 4858 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 4859 HighBits) 4860 .addImm(RemainderOffset); 4861 4862 return {{ 4863 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr 4864 [=](MachineInstrBuilder &MIB) { 4865 MIB.addReg(HighBits); 4866 }, // voffset 4867 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, 4868 }}; 4869 } 4870 } 4871 4872 // We are adding a 64 bit SGPR and a constant. If constant bus limit 4873 // is 1 we would need to perform 1 or 2 extra moves for each half of 4874 // the constant and it is better to do a scalar add and then issue a 4875 // single VALU instruction to materialize zero. Otherwise it is less 4876 // instructions to perform VALU adds with immediates or inline literals. 4877 unsigned NumLiterals = 4878 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) + 4879 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset))); 4880 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) 4881 return std::nullopt; 4882 } 4883 } 4884 } 4885 4886 // Match the variable offset. 
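  // The target form is roughly (illustrative syntax):
  //   global_load_dword %dst, %voffset, %saddr offset:ImmOffset
  // where %saddr is a 64-bit SGPR pair, %voffset is a 32-bit VGPR known to
  // hold a zero-extended 32-bit value, and ImmOffset is the constant matched
  // above.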
4887 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 4888 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 4889 // Look through the SGPR->VGPR copy. 4890 Register SAddr = 4891 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); 4892 4893 if (isSGPR(SAddr)) { 4894 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); 4895 4896 // It's possible voffset is an SGPR here, but the copy to VGPR will be 4897 // inserted later. 4898 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { 4899 return {{[=](MachineInstrBuilder &MIB) { // saddr 4900 MIB.addReg(SAddr); 4901 }, 4902 [=](MachineInstrBuilder &MIB) { // voffset 4903 MIB.addReg(VOffset); 4904 }, 4905 [=](MachineInstrBuilder &MIB) { // offset 4906 MIB.addImm(ImmOffset); 4907 }}}; 4908 } 4909 } 4910 } 4911 4912 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and 4913 // drop this. 4914 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || 4915 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg)) 4916 return std::nullopt; 4917 4918 // It's cheaper to materialize a single 32-bit zero for vaddr than the two 4919 // moves required to copy a 64-bit SGPR to VGPR. 4920 MachineInstr *MI = Root.getParent(); 4921 MachineBasicBlock *MBB = MI->getParent(); 4922 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4923 4924 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) 4925 .addImm(0); 4926 4927 return {{ 4928 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr 4929 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset 4930 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 4931 }}; 4932 } 4933 4934 InstructionSelector::ComplexRendererFns 4935 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { 4936 Register Addr = Root.getReg(); 4937 Register PtrBase; 4938 int64_t ConstOffset; 4939 int64_t ImmOffset = 0; 4940 4941 // Match the immediate offset first, which canonically is moved as low as 4942 // possible. 
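  // e.g. for (G_PTR_ADD %base, G_CONSTANT 16) the 16 is folded into the
  // instruction's immediate offset field, provided the address passes
  // isFlatScratchBaseLegal and 16 is a legal FLAT scratch offset for the
  // subtarget.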
4943 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 4944 4945 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && 4946 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, 4947 SIInstrFlags::FlatScratch)) { 4948 Addr = PtrBase; 4949 ImmOffset = ConstOffset; 4950 } 4951 4952 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 4953 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 4954 int FI = AddrDef->MI->getOperand(1).getIndex(); 4955 return {{ 4956 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 4957 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 4958 }}; 4959 } 4960 4961 Register SAddr = AddrDef->Reg; 4962 4963 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 4964 Register LHS = AddrDef->MI->getOperand(1).getReg(); 4965 Register RHS = AddrDef->MI->getOperand(2).getReg(); 4966 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 4967 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI); 4968 4969 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && 4970 isSGPR(RHSDef->Reg)) { 4971 int FI = LHSDef->MI->getOperand(1).getIndex(); 4972 MachineInstr &I = *Root.getParent(); 4973 MachineBasicBlock *BB = I.getParent(); 4974 const DebugLoc &DL = I.getDebugLoc(); 4975 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 4976 4977 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr) 4978 .addFrameIndex(FI) 4979 .addReg(RHSDef->Reg) 4980 .setOperandDead(3); // Dead scc 4981 } 4982 } 4983 4984 if (!isSGPR(SAddr)) 4985 return std::nullopt; 4986 4987 return {{ 4988 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr 4989 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 4990 }}; 4991 } 4992 4993 // Check whether the flat scratch SVS swizzle bug affects this access. 4994 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug( 4995 Register VAddr, Register SAddr, uint64_t ImmOffset) const { 4996 if (!Subtarget->hasFlatScratchSVSSwizzleBug()) 4997 return false; 4998 4999 // The bug affects the swizzling of SVS accesses if there is any carry out 5000 // from the two low order bits (i.e. from bit 1 into bit 2) when adding 5001 // voffset to (soffset + inst_offset). 5002 auto VKnown = KB->getKnownBits(VAddr); 5003 auto SKnown = KnownBits::add(KB->getKnownBits(SAddr), 5004 KnownBits::makeConstant(APInt(32, ImmOffset))); 5005 uint64_t VMax = VKnown.getMaxValue().getZExtValue(); 5006 uint64_t SMax = SKnown.getMaxValue().getZExtValue(); 5007 return (VMax & 3) + (SMax & 3) >= 4; 5008 } 5009 5010 InstructionSelector::ComplexRendererFns 5011 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { 5012 Register Addr = Root.getReg(); 5013 Register PtrBase; 5014 int64_t ConstOffset; 5015 int64_t ImmOffset = 0; 5016 5017 // Match the immediate offset first, which canonically is moved as low as 5018 // possible. 
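  // The SVS form needs a VGPR component for vaddr and an SGPR (or frame
  // index) component for saddr; which base-legality check is applied below
  // depends on whether a constant offset was actually folded out of the
  // address.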
5019 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 5020 5021 Register OrigAddr = Addr; 5022 if (ConstOffset != 0 && 5023 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { 5024 Addr = PtrBase; 5025 ImmOffset = ConstOffset; 5026 } 5027 5028 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 5029 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) 5030 return std::nullopt; 5031 5032 Register RHS = AddrDef->MI->getOperand(2).getReg(); 5033 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) 5034 return std::nullopt; 5035 5036 Register LHS = AddrDef->MI->getOperand(1).getReg(); 5037 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 5038 5039 if (OrigAddr != Addr) { 5040 if (!isFlatScratchBaseLegalSVImm(OrigAddr)) 5041 return std::nullopt; 5042 } else { 5043 if (!isFlatScratchBaseLegalSV(OrigAddr)) 5044 return std::nullopt; 5045 } 5046 5047 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) 5048 return std::nullopt; 5049 5050 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 5051 int FI = LHSDef->MI->getOperand(1).getIndex(); 5052 return {{ 5053 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr 5054 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 5055 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 5056 }}; 5057 } 5058 5059 if (!isSGPR(LHS)) 5060 return std::nullopt; 5061 5062 return {{ 5063 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr 5064 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr 5065 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 5066 }}; 5067 } 5068 5069 InstructionSelector::ComplexRendererFns 5070 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 5071 MachineInstr *MI = Root.getParent(); 5072 MachineBasicBlock *MBB = MI->getParent(); 5073 MachineFunction *MF = MBB->getParent(); 5074 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 5075 5076 int64_t Offset = 0; 5077 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 5078 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 5079 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5080 5081 // TODO: Should this be inside the render function? The iterator seems to 5082 // move. 5083 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget); 5084 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 5085 HighBits) 5086 .addImm(Offset & ~MaxOffset); 5087 5088 return {{[=](MachineInstrBuilder &MIB) { // rsrc 5089 MIB.addReg(Info->getScratchRSrcReg()); 5090 }, 5091 [=](MachineInstrBuilder &MIB) { // vaddr 5092 MIB.addReg(HighBits); 5093 }, 5094 [=](MachineInstrBuilder &MIB) { // soffset 5095 // Use constant zero for soffset and rely on eliminateFrameIndex 5096 // to choose the appropriate frame register if need be. 5097 MIB.addImm(0); 5098 }, 5099 [=](MachineInstrBuilder &MIB) { // offset 5100 MIB.addImm(Offset & MaxOffset); 5101 }}}; 5102 } 5103 5104 assert(Offset == 0 || Offset == -1); 5105 5106 // Try to fold a frame index directly into the MUBUF vaddr field, and any 5107 // offsets. 
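  // e.g. (G_PTR_ADD (G_FRAME_INDEX %stack.0), G_CONSTANT 16) is selected with
  // the frame index in vaddr and 16 in the immediate offset field, as long as
  // 16 is a legal MUBUF immediate and the base sign check (when required by
  // the subtarget) succeeds.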
5108 std::optional<int> FI; 5109 Register VAddr = Root.getReg(); 5110 5111 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 5112 Register PtrBase; 5113 int64_t ConstOffset; 5114 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI); 5115 if (ConstOffset != 0) { 5116 if (TII.isLegalMUBUFImmOffset(ConstOffset) && 5117 (!STI.privateMemoryResourceIsRangeChecked() || 5118 KB->signBitIsZero(PtrBase))) { 5119 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase); 5120 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX) 5121 FI = PtrBaseDef->getOperand(1).getIndex(); 5122 else 5123 VAddr = PtrBase; 5124 Offset = ConstOffset; 5125 } 5126 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { 5127 FI = RootDef->getOperand(1).getIndex(); 5128 } 5129 5130 return {{[=](MachineInstrBuilder &MIB) { // rsrc 5131 MIB.addReg(Info->getScratchRSrcReg()); 5132 }, 5133 [=](MachineInstrBuilder &MIB) { // vaddr 5134 if (FI) 5135 MIB.addFrameIndex(*FI); 5136 else 5137 MIB.addReg(VAddr); 5138 }, 5139 [=](MachineInstrBuilder &MIB) { // soffset 5140 // Use constant zero for soffset and rely on eliminateFrameIndex 5141 // to choose the appropriate frame register if need be. 5142 MIB.addImm(0); 5143 }, 5144 [=](MachineInstrBuilder &MIB) { // offset 5145 MIB.addImm(Offset); 5146 }}}; 5147 } 5148 5149 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, 5150 int64_t Offset) const { 5151 if (!isUInt<16>(Offset)) 5152 return false; 5153 5154 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 5155 return true; 5156 5157 // On Southern Islands instruction with a negative base value and an offset 5158 // don't seem to work. 5159 return KB->signBitIsZero(Base); 5160 } 5161 5162 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, 5163 int64_t Offset1, 5164 unsigned Size) const { 5165 if (Offset0 % Size != 0 || Offset1 % Size != 0) 5166 return false; 5167 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size)) 5168 return false; 5169 5170 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 5171 return true; 5172 5173 // On Southern Islands instruction with a negative base value and an offset 5174 // don't seem to work. 5175 return KB->signBitIsZero(Base); 5176 } 5177 5178 // Return whether the operation has NoUnsignedWrap property. 5179 static bool isNoUnsignedWrap(MachineInstr *Addr) { 5180 return Addr->getOpcode() == TargetOpcode::G_OR || 5181 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD && 5182 Addr->getFlag(MachineInstr::NoUWrap)); 5183 } 5184 5185 // Check that the base address of flat scratch load/store in the form of `base + 5186 // offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware 5187 // requirement). We always treat the first operand as the base address here. 5188 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const { 5189 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); 5190 5191 if (isNoUnsignedWrap(AddrMI)) 5192 return true; 5193 5194 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative 5195 // values. 
5196 if (STI.hasSignedScratchOffsets()) 5197 return true; 5198 5199 Register LHS = AddrMI->getOperand(1).getReg(); 5200 Register RHS = AddrMI->getOperand(2).getReg(); 5201 5202 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { 5203 std::optional<ValueAndVReg> RhsValReg = 5204 getIConstantVRegValWithLookThrough(RHS, *MRI); 5205 // If the immediate offset is negative and within certain range, the base 5206 // address cannot also be negative. If the base is also negative, the sum 5207 // would be either negative or much larger than the valid range of scratch 5208 // memory a thread can access. 5209 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 && 5210 RhsValReg->Value.getSExtValue() > -0x40000000) 5211 return true; 5212 } 5213 5214 return KB->signBitIsZero(LHS); 5215 } 5216 5217 // Check address value in SGPR/VGPR are legal for flat scratch in the form 5218 // of: SGPR + VGPR. 5219 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { 5220 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); 5221 5222 if (isNoUnsignedWrap(AddrMI)) 5223 return true; 5224 5225 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative 5226 // values. 5227 if (STI.hasSignedScratchOffsets()) 5228 return true; 5229 5230 Register LHS = AddrMI->getOperand(1).getReg(); 5231 Register RHS = AddrMI->getOperand(2).getReg(); 5232 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); 5233 } 5234 5235 // Check address value in SGPR/VGPR are legal for flat scratch in the form 5236 // of: SGPR + VGPR + Imm. 5237 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm( 5238 Register Addr) const { 5239 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative 5240 // values. 5241 if (STI.hasSignedScratchOffsets()) 5242 return true; 5243 5244 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); 5245 Register Base = AddrMI->getOperand(1).getReg(); 5246 std::optional<DefinitionAndSourceRegister> BaseDef = 5247 getDefSrcRegIgnoringCopies(Base, *MRI); 5248 std::optional<ValueAndVReg> RHSOffset = 5249 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI); 5250 assert(RHSOffset); 5251 5252 // If the immediate offset is negative and within certain range, the base 5253 // address cannot also be negative. If the base is also negative, the sum 5254 // would be either negative or much larger than the valid range of scratch 5255 // memory a thread can access. 
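  // For example, with an offset in (-0x40000000, 0), a base with the sign bit
  // set (>= 0x80000000 as unsigned) would still leave the sum above
  // 0x40000000, far beyond any per-thread scratch allocation, so that pairing
  // can be ruled out for well-formed code.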
5256 if (isNoUnsignedWrap(BaseDef->MI) && 5257 (isNoUnsignedWrap(AddrMI) || 5258 (RHSOffset->Value.getSExtValue() < 0 && 5259 RHSOffset->Value.getSExtValue() > -0x40000000))) 5260 return true; 5261 5262 Register LHS = BaseDef->MI->getOperand(1).getReg(); 5263 Register RHS = BaseDef->MI->getOperand(2).getReg(); 5264 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); 5265 } 5266 5267 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, 5268 unsigned ShAmtBits) const { 5269 assert(MI.getOpcode() == TargetOpcode::G_AND); 5270 5271 std::optional<APInt> RHS = 5272 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI); 5273 if (!RHS) 5274 return false; 5275 5276 if (RHS->countr_one() >= ShAmtBits) 5277 return true; 5278 5279 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg()); 5280 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits; 5281 } 5282 5283 InstructionSelector::ComplexRendererFns 5284 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 5285 MachineOperand &Root) const { 5286 Register Reg = Root.getReg(); 5287 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 5288 5289 std::optional<DefinitionAndSourceRegister> Def = 5290 getDefSrcRegIgnoringCopies(Reg, *MRI); 5291 assert(Def && "this shouldn't be an optional result"); 5292 Reg = Def->Reg; 5293 5294 if (Register WaveBase = getWaveAddress(Def->MI)) { 5295 return {{ 5296 [=](MachineInstrBuilder &MIB) { // rsrc 5297 MIB.addReg(Info->getScratchRSrcReg()); 5298 }, 5299 [=](MachineInstrBuilder &MIB) { // soffset 5300 MIB.addReg(WaveBase); 5301 }, 5302 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset 5303 }}; 5304 } 5305 5306 int64_t Offset = 0; 5307 5308 // FIXME: Copy check is a hack 5309 Register BasePtr; 5310 if (mi_match(Reg, *MRI, 5311 m_GPtrAdd(m_Reg(BasePtr), 5312 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) { 5313 if (!TII.isLegalMUBUFImmOffset(Offset)) 5314 return {}; 5315 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI); 5316 Register WaveBase = getWaveAddress(BasePtrDef); 5317 if (!WaveBase) 5318 return {}; 5319 5320 return {{ 5321 [=](MachineInstrBuilder &MIB) { // rsrc 5322 MIB.addReg(Info->getScratchRSrcReg()); 5323 }, 5324 [=](MachineInstrBuilder &MIB) { // soffset 5325 MIB.addReg(WaveBase); 5326 }, 5327 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 5328 }}; 5329 } 5330 5331 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 5332 !TII.isLegalMUBUFImmOffset(Offset)) 5333 return {}; 5334 5335 return {{ 5336 [=](MachineInstrBuilder &MIB) { // rsrc 5337 MIB.addReg(Info->getScratchRSrcReg()); 5338 }, 5339 [=](MachineInstrBuilder &MIB) { // soffset 5340 MIB.addImm(0); 5341 }, 5342 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 5343 }}; 5344 } 5345 5346 std::pair<Register, unsigned> 5347 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 5348 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 5349 int64_t ConstAddr = 0; 5350 5351 Register PtrBase; 5352 int64_t Offset; 5353 std::tie(PtrBase, Offset) = 5354 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 5355 5356 if (Offset) { 5357 if (isDSOffsetLegal(PtrBase, Offset)) { 5358 // (add n0, c0) 5359 return std::pair(PtrBase, Offset); 5360 } 5361 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 5362 // TODO 5363 5364 5365 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 5366 // TODO 5367 5368 } 5369 5370 return std::pair(Root.getReg(), 0); 5371 } 5372 5373 
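/// Render the (base, offset) pair from selectDS1Addr1OffsetImpl as the
/// address register and 16-bit unsigned offset of a DS instruction; e.g.
/// (G_PTR_ADD %ptr, 64) renders as addr = %ptr, offset = 64 when the offset
/// passes isDSOffsetLegal.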
InstructionSelector::ComplexRendererFns 5374 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 5375 Register Reg; 5376 unsigned Offset; 5377 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 5378 return {{ 5379 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 5380 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 5381 }}; 5382 } 5383 5384 InstructionSelector::ComplexRendererFns 5385 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 5386 return selectDSReadWrite2(Root, 4); 5387 } 5388 5389 InstructionSelector::ComplexRendererFns 5390 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const { 5391 return selectDSReadWrite2(Root, 8); 5392 } 5393 5394 InstructionSelector::ComplexRendererFns 5395 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root, 5396 unsigned Size) const { 5397 Register Reg; 5398 unsigned Offset; 5399 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size); 5400 return {{ 5401 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 5402 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, 5403 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } 5404 }}; 5405 } 5406 5407 std::pair<Register, unsigned> 5408 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, 5409 unsigned Size) const { 5410 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 5411 int64_t ConstAddr = 0; 5412 5413 Register PtrBase; 5414 int64_t Offset; 5415 std::tie(PtrBase, Offset) = 5416 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 5417 5418 if (Offset) { 5419 int64_t OffsetValue0 = Offset; 5420 int64_t OffsetValue1 = Offset + Size; 5421 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) { 5422 // (add n0, c0) 5423 return std::pair(PtrBase, OffsetValue0 / Size); 5424 } 5425 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 5426 // TODO 5427 5428 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 5429 // TODO 5430 5431 } 5432 5433 return std::pair(Root.getReg(), 0); 5434 } 5435 5436 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 5437 /// the base value with the constant offset. There may be intervening copies 5438 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 5439 /// not match the pattern. 5440 std::pair<Register, int64_t> 5441 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 5442 Register Root, const MachineRegisterInfo &MRI) const { 5443 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI); 5444 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 5445 return {Root, 0}; 5446 5447 MachineOperand &RHS = RootI->getOperand(2); 5448 std::optional<ValueAndVReg> MaybeOffset = 5449 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); 5450 if (!MaybeOffset) 5451 return {Root, 0}; 5452 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()}; 5453 } 5454 5455 static void addZeroImm(MachineInstrBuilder &MIB) { 5456 MIB.addImm(0); 5457 } 5458 5459 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 5460 /// BasePtr is not valid, a null base pointer will be used. 
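/// The 128-bit descriptor is assembled as sub0_sub1 = BasePtr (or an
/// S_MOV_B64 of 0 when no base is given) and sub2_sub3 = { FormatLo,
/// FormatHi }, so the constant half can be CSE'd across descriptors.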
5461 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, 5462 uint32_t FormatLo, uint32_t FormatHi, 5463 Register BasePtr) { 5464 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5465 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5466 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 5467 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 5468 5469 B.buildInstr(AMDGPU::S_MOV_B32) 5470 .addDef(RSrc2) 5471 .addImm(FormatLo); 5472 B.buildInstr(AMDGPU::S_MOV_B32) 5473 .addDef(RSrc3) 5474 .addImm(FormatHi); 5475 5476 // Build the half of the subregister with the constants before building the 5477 // full 128-bit register. If we are building multiple resource descriptors, 5478 // this will allow CSEing of the 2-component register. 5479 B.buildInstr(AMDGPU::REG_SEQUENCE) 5480 .addDef(RSrcHi) 5481 .addReg(RSrc2) 5482 .addImm(AMDGPU::sub0) 5483 .addReg(RSrc3) 5484 .addImm(AMDGPU::sub1); 5485 5486 Register RSrcLo = BasePtr; 5487 if (!BasePtr) { 5488 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 5489 B.buildInstr(AMDGPU::S_MOV_B64) 5490 .addDef(RSrcLo) 5491 .addImm(0); 5492 } 5493 5494 B.buildInstr(AMDGPU::REG_SEQUENCE) 5495 .addDef(RSrc) 5496 .addReg(RSrcLo) 5497 .addImm(AMDGPU::sub0_sub1) 5498 .addReg(RSrcHi) 5499 .addImm(AMDGPU::sub2_sub3); 5500 5501 return RSrc; 5502 } 5503 5504 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 5505 const SIInstrInfo &TII, Register BasePtr) { 5506 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 5507 5508 // FIXME: Why are half the "default" bits ignored based on the addressing 5509 // mode? 5510 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr); 5511 } 5512 5513 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 5514 const SIInstrInfo &TII, Register BasePtr) { 5515 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 5516 5517 // FIXME: Why are half the "default" bits ignored based on the addressing 5518 // mode? 5519 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr); 5520 } 5521 5522 AMDGPUInstructionSelector::MUBUFAddressData 5523 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { 5524 MUBUFAddressData Data; 5525 Data.N0 = Src; 5526 5527 Register PtrBase; 5528 int64_t Offset; 5529 5530 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); 5531 if (isUInt<32>(Offset)) { 5532 Data.N0 = PtrBase; 5533 Data.Offset = Offset; 5534 } 5535 5536 if (MachineInstr *InputAdd 5537 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) { 5538 Data.N2 = InputAdd->getOperand(1).getReg(); 5539 Data.N3 = InputAdd->getOperand(2).getReg(); 5540 5541 // FIXME: Need to fix extra SGPR->VGPRcopies inserted 5542 // FIXME: Don't know this was defined by operand 0 5543 // 5544 // TODO: Remove this when we have copy folding optimizations after 5545 // RegBankSelect. 5546 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg(); 5547 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg(); 5548 } 5549 5550 return Data; 5551 } 5552 5553 /// Return if the addr64 mubuf mode should be used for the given address. 
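/// addr64 is chosen when the parsed address has a (ptr_add N2, N3) component,
/// or when the lone base N0 lives in a VGPR and therefore cannot be placed in
/// the SRD base pointer.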
5554 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { 5555 // (ptr_add N2, N3) -> addr64, or 5556 // (ptr_add (ptr_add N2, N3), C1) -> addr64 5557 if (Addr.N2) 5558 return true; 5559 5560 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); 5561 return N0Bank->getID() == AMDGPU::VGPRRegBankID; 5562 } 5563 5564 /// Split an immediate offset \p ImmOffset depending on whether it fits in the 5565 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable 5566 /// component. 5567 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( 5568 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { 5569 if (TII.isLegalMUBUFImmOffset(ImmOffset)) 5570 return; 5571 5572 // Illegal offset, store it in soffset. 5573 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 5574 B.buildInstr(AMDGPU::S_MOV_B32) 5575 .addDef(SOffset) 5576 .addImm(ImmOffset); 5577 ImmOffset = 0; 5578 } 5579 5580 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( 5581 MachineOperand &Root, Register &VAddr, Register &RSrcReg, 5582 Register &SOffset, int64_t &Offset) const { 5583 // FIXME: Predicates should stop this from reaching here. 5584 // addr64 bit was removed for volcanic islands. 5585 if (!STI.hasAddr64() || STI.useFlatForGlobal()) 5586 return false; 5587 5588 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 5589 if (!shouldUseAddr64(AddrData)) 5590 return false; 5591 5592 Register N0 = AddrData.N0; 5593 Register N2 = AddrData.N2; 5594 Register N3 = AddrData.N3; 5595 Offset = AddrData.Offset; 5596 5597 // Base pointer for the SRD. 5598 Register SRDPtr; 5599 5600 if (N2) { 5601 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 5602 assert(N3); 5603 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 5604 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 5605 // addr64, and construct the default resource from a 0 address. 5606 VAddr = N0; 5607 } else { 5608 SRDPtr = N3; 5609 VAddr = N2; 5610 } 5611 } else { 5612 // N2 is not divergent. 5613 SRDPtr = N2; 5614 VAddr = N3; 5615 } 5616 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 5617 // Use the default null pointer in the resource 5618 VAddr = N0; 5619 } else { 5620 // N0 -> offset, or 5621 // (N0 + C1) -> offset 5622 SRDPtr = N0; 5623 } 5624 5625 MachineIRBuilder B(*Root.getParent()); 5626 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 5627 splitIllegalMUBUFOffset(B, SOffset, Offset); 5628 return true; 5629 } 5630 5631 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 5632 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 5633 int64_t &Offset) const { 5634 5635 // FIXME: Pattern should not reach here. 5636 if (STI.useFlatForGlobal()) 5637 return false; 5638 5639 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 5640 if (shouldUseAddr64(AddrData)) 5641 return false; 5642 5643 // N0 -> offset, or 5644 // (N0 + C1) -> offset 5645 Register SRDPtr = AddrData.N0; 5646 Offset = AddrData.Offset; 5647 5648 // TODO: Look through extensions for 32-bit soffset. 
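  // Any offset that does not fit the MUBUF immediate field is moved into
  // soffset by splitIllegalMUBUFOffset below: the value is materialized with
  // S_MOV_B32 and the immediate offset is reset to zero.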
5649 MachineIRBuilder B(*Root.getParent()); 5650 5651 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 5652 splitIllegalMUBUFOffset(B, SOffset, Offset); 5653 return true; 5654 } 5655 5656 InstructionSelector::ComplexRendererFns 5657 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 5658 Register VAddr; 5659 Register RSrcReg; 5660 Register SOffset; 5661 int64_t Offset = 0; 5662 5663 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 5664 return {}; 5665 5666 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 5667 // pattern. 5668 return {{ 5669 [=](MachineInstrBuilder &MIB) { // rsrc 5670 MIB.addReg(RSrcReg); 5671 }, 5672 [=](MachineInstrBuilder &MIB) { // vaddr 5673 MIB.addReg(VAddr); 5674 }, 5675 [=](MachineInstrBuilder &MIB) { // soffset 5676 if (SOffset) 5677 MIB.addReg(SOffset); 5678 else if (STI.hasRestrictedSOffset()) 5679 MIB.addReg(AMDGPU::SGPR_NULL); 5680 else 5681 MIB.addImm(0); 5682 }, 5683 [=](MachineInstrBuilder &MIB) { // offset 5684 MIB.addImm(Offset); 5685 }, 5686 addZeroImm, // cpol 5687 addZeroImm, // tfe 5688 addZeroImm // swz 5689 }}; 5690 } 5691 5692 InstructionSelector::ComplexRendererFns 5693 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 5694 Register RSrcReg; 5695 Register SOffset; 5696 int64_t Offset = 0; 5697 5698 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 5699 return {}; 5700 5701 return {{ 5702 [=](MachineInstrBuilder &MIB) { // rsrc 5703 MIB.addReg(RSrcReg); 5704 }, 5705 [=](MachineInstrBuilder &MIB) { // soffset 5706 if (SOffset) 5707 MIB.addReg(SOffset); 5708 else if (STI.hasRestrictedSOffset()) 5709 MIB.addReg(AMDGPU::SGPR_NULL); 5710 else 5711 MIB.addImm(0); 5712 }, 5713 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 5714 addZeroImm, // cpol 5715 addZeroImm, // tfe 5716 addZeroImm, // swz 5717 }}; 5718 } 5719 5720 InstructionSelector::ComplexRendererFns 5721 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const { 5722 5723 Register SOffset = Root.getReg(); 5724 5725 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt())) 5726 SOffset = AMDGPU::SGPR_NULL; 5727 5728 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; 5729 } 5730 5731 /// Get an immediate that must be 32-bits, and treated as zero extended. 5732 static std::optional<uint64_t> 5733 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) { 5734 // getIConstantVRegVal sexts any values, so see if that matters. 5735 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI); 5736 if (!OffsetVal || !isInt<32>(*OffsetVal)) 5737 return std::nullopt; 5738 return Lo_32(*OffsetVal); 5739 } 5740 5741 InstructionSelector::ComplexRendererFns 5742 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const { 5743 std::optional<uint64_t> OffsetVal = 5744 Root.isImm() ? 
Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI); 5745 if (!OffsetVal) 5746 return {}; 5747 5748 std::optional<int64_t> EncodedImm = 5749 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true); 5750 if (!EncodedImm) 5751 return {}; 5752 5753 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 5754 } 5755 5756 InstructionSelector::ComplexRendererFns 5757 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const { 5758 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 5759 5760 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 5761 if (!OffsetVal) 5762 return {}; 5763 5764 std::optional<int64_t> EncodedImm = 5765 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal); 5766 if (!EncodedImm) 5767 return {}; 5768 5769 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 5770 } 5771 5772 InstructionSelector::ComplexRendererFns 5773 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { 5774 // Match the (soffset + offset) pair as a 32-bit register base and 5775 // an immediate offset. 5776 Register SOffset; 5777 unsigned Offset; 5778 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset( 5779 *MRI, Root.getReg(), KB, /*CheckNUW*/ true); 5780 if (!SOffset) 5781 return std::nullopt; 5782 5783 std::optional<int64_t> EncodedOffset = 5784 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true); 5785 if (!EncodedOffset) 5786 return std::nullopt; 5787 5788 assert(MRI->getType(SOffset) == LLT::scalar(32)); 5789 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, 5790 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}}; 5791 } 5792 5793 std::pair<Register, unsigned> 5794 AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, 5795 bool &Matched) const { 5796 Matched = false; 5797 5798 Register Src; 5799 unsigned Mods; 5800 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 5801 5802 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) { 5803 assert(MRI->getType(Src) == LLT::scalar(16)); 5804 5805 // Only change Src if src modifier could be gained. In such cases new Src 5806 // could be sgpr but this does not violate constant bus restriction for 5807 // instruction that is being selected. 5808 Src = stripBitCast(Src, *MRI); 5809 5810 const auto CheckAbsNeg = [&]() { 5811 // Be careful about folding modifiers if we already have an abs. fneg is 5812 // applied last, so we don't want to apply an earlier fneg. 5813 if ((Mods & SISrcMods::ABS) == 0) { 5814 unsigned ModsTmp; 5815 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src); 5816 5817 if ((ModsTmp & SISrcMods::NEG) != 0) 5818 Mods ^= SISrcMods::NEG; 5819 5820 if ((ModsTmp & SISrcMods::ABS) != 0) 5821 Mods |= SISrcMods::ABS; 5822 } 5823 }; 5824 5825 CheckAbsNeg(); 5826 5827 // op_sel/op_sel_hi decide the source type and source. 5828 // If the source's op_sel_hi is set, it indicates to do a conversion from 5829 // fp16. If the sources's op_sel is set, it picks the high half of the 5830 // source register. 
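    // So a source that came from a G_FPEXT of the low half sets only
    // OP_SEL_1 (treat the source as fp16), while an extract of the high half
    // additionally sets OP_SEL_0 to pick the upper 16 bits of the register.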
5831 5832 Mods |= SISrcMods::OP_SEL_1; 5833 5834 if (isExtractHiElt(*MRI, Src, Src)) { 5835 Mods |= SISrcMods::OP_SEL_0; 5836 CheckAbsNeg(); 5837 } 5838 5839 Matched = true; 5840 } 5841 5842 return {Src, Mods}; 5843 } 5844 5845 InstructionSelector::ComplexRendererFns 5846 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt( 5847 MachineOperand &Root) const { 5848 Register Src; 5849 unsigned Mods; 5850 bool Matched; 5851 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched); 5852 if (!Matched) 5853 return {}; 5854 5855 return {{ 5856 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 5857 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 5858 }}; 5859 } 5860 5861 InstructionSelector::ComplexRendererFns 5862 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const { 5863 Register Src; 5864 unsigned Mods; 5865 bool Matched; 5866 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched); 5867 5868 return {{ 5869 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 5870 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 5871 }}; 5872 } 5873 5874 bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst( 5875 MachineInstr &I, Intrinsic::ID IntrID) const { 5876 MachineBasicBlock *MBB = I.getParent(); 5877 const DebugLoc &DL = I.getDebugLoc(); 5878 Register CCReg = I.getOperand(0).getReg(); 5879 5880 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM)) 5881 .addImm(I.getOperand(2).getImm()); 5882 5883 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC); 5884 5885 I.eraseFromParent(); 5886 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass, 5887 *MRI); 5888 } 5889 5890 bool AMDGPUInstructionSelector::selectSGetBarrierState( 5891 MachineInstr &I, Intrinsic::ID IntrID) const { 5892 MachineBasicBlock *MBB = I.getParent(); 5893 const DebugLoc &DL = I.getDebugLoc(); 5894 MachineOperand BarOp = I.getOperand(2); 5895 std::optional<int64_t> BarValImm = 5896 getIConstantVRegSExtVal(BarOp.getReg(), *MRI); 5897 5898 if (!BarValImm) { 5899 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 5900 .addReg(BarOp.getReg()); 5901 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); 5902 } 5903 MachineInstrBuilder MIB; 5904 unsigned Opc = BarValImm ? 
AMDGPU::S_GET_BARRIER_STATE_IMM 5905 : AMDGPU::S_GET_BARRIER_STATE_M0; 5906 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); 5907 5908 auto DstReg = I.getOperand(0).getReg(); 5909 const TargetRegisterClass *DstRC = 5910 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI); 5911 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 5912 return false; 5913 MIB.addDef(DstReg); 5914 if (BarValImm) { 5915 MIB.addImm(*BarValImm); 5916 } 5917 I.eraseFromParent(); 5918 return true; 5919 } 5920 5921 unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { 5922 if (HasInlineConst) { 5923 switch (IntrID) { 5924 default: 5925 llvm_unreachable("not a named barrier op"); 5926 case Intrinsic::amdgcn_s_barrier_join: 5927 return AMDGPU::S_BARRIER_JOIN_IMM; 5928 case Intrinsic::amdgcn_s_get_named_barrier_state: 5929 return AMDGPU::S_GET_BARRIER_STATE_IMM; 5930 }; 5931 } else { 5932 switch (IntrID) { 5933 default: 5934 llvm_unreachable("not a named barrier op"); 5935 case Intrinsic::amdgcn_s_barrier_join: 5936 return AMDGPU::S_BARRIER_JOIN_M0; 5937 case Intrinsic::amdgcn_s_get_named_barrier_state: 5938 return AMDGPU::S_GET_BARRIER_STATE_M0; 5939 }; 5940 } 5941 } 5942 5943 bool AMDGPUInstructionSelector::selectNamedBarrierInit( 5944 MachineInstr &I, Intrinsic::ID IntrID) const { 5945 MachineBasicBlock *MBB = I.getParent(); 5946 const DebugLoc &DL = I.getDebugLoc(); 5947 MachineOperand BarOp = I.getOperand(1); 5948 MachineOperand CntOp = I.getOperand(2); 5949 5950 // BarID = (BarOp >> 4) & 0x3F 5951 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 5952 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0) 5953 .add(BarOp) 5954 .addImm(4u) 5955 .setOperandDead(3); // Dead scc 5956 5957 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 5958 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1) 5959 .addReg(TmpReg0) 5960 .addImm(0x3F) 5961 .setOperandDead(3); // Dead scc 5962 5963 // MO = ((CntOp & 0x3F) << shAmt) | BarID 5964 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 5965 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2) 5966 .add(CntOp) 5967 .addImm(0x3F) 5968 .setOperandDead(3); // Dead scc 5969 5970 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 5971 constexpr unsigned ShAmt = 16; 5972 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3) 5973 .addReg(TmpReg2) 5974 .addImm(ShAmt) 5975 .setOperandDead(3); // Dead scc 5976 5977 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 5978 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4) 5979 .addReg(TmpReg1) 5980 .addReg(TmpReg3) 5981 .setOperandDead(3); // Dead scc; 5982 5983 auto CopyMIB = 5984 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4); 5985 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); 5986 5987 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init 5988 ? AMDGPU::S_BARRIER_INIT_M0 5989 : AMDGPU::S_BARRIER_SIGNAL_M0; 5990 MachineInstrBuilder MIB; 5991 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); 5992 5993 I.eraseFromParent(); 5994 return true; 5995 } 5996 5997 bool AMDGPUInstructionSelector::selectNamedBarrierInst( 5998 MachineInstr &I, Intrinsic::ID IntrID) const { 5999 MachineBasicBlock *MBB = I.getParent(); 6000 const DebugLoc &DL = I.getDebugLoc(); 6001 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state 6002 ? 
I.getOperand(2) 6003 : I.getOperand(1); 6004 std::optional<int64_t> BarValImm = 6005 getIConstantVRegSExtVal(BarOp.getReg(), *MRI); 6006 6007 if (!BarValImm) { 6008 // BarID = (BarOp >> 4) & 0x3F 6009 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 6010 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0) 6011 .addReg(BarOp.getReg()) 6012 .addImm(4u) 6013 .setOperandDead(3); // Dead scc; 6014 6015 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 6016 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1) 6017 .addReg(TmpReg0) 6018 .addImm(0x3F) 6019 .setOperandDead(3); // Dead scc; 6020 6021 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 6022 .addReg(TmpReg1); 6023 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); 6024 } 6025 6026 MachineInstrBuilder MIB; 6027 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID); 6028 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); 6029 6030 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) { 6031 auto DstReg = I.getOperand(0).getReg(); 6032 const TargetRegisterClass *DstRC = 6033 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI); 6034 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 6035 return false; 6036 MIB.addDef(DstReg); 6037 } 6038 6039 if (BarValImm) { 6040 auto BarId = ((*BarValImm) >> 4) & 0x3F; 6041 MIB.addImm(BarId); 6042 } 6043 6044 I.eraseFromParent(); 6045 return true; 6046 } 6047 6048 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, 6049 const MachineInstr &MI, 6050 int OpIdx) const { 6051 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 6052 "Expected G_CONSTANT"); 6053 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue()); 6054 } 6055 6056 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, 6057 const MachineInstr &MI, 6058 int OpIdx) const { 6059 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 6060 "Expected G_CONSTANT"); 6061 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); 6062 } 6063 6064 void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB, 6065 const MachineInstr &MI, 6066 int OpIdx) const { 6067 const MachineOperand &Op = MI.getOperand(1); 6068 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1); 6069 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 6070 } 6071 6072 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 6073 const MachineInstr &MI, 6074 int OpIdx) const { 6075 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 6076 "Expected G_CONSTANT"); 6077 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount()); 6078 } 6079 6080 /// This only really exists to satisfy DAG type checking machinery, so is a 6081 /// no-op here. 
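/// The matched operand is simply re-emitted as an immediate, looking through
/// a G_CONSTANT-defined register when the operand is a register rather than
/// an immediate.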
6082 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 6083 const MachineInstr &MI, 6084 int OpIdx) const { 6085 const MachineOperand &Op = MI.getOperand(OpIdx); 6086 int64_t Imm; 6087 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm))) 6088 MIB.addImm(Imm); 6089 else 6090 MIB.addImm(Op.getImm()); 6091 } 6092 6093 void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB, 6094 const MachineInstr &MI, 6095 int OpIdx) const { 6096 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0); 6097 } 6098 6099 void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB, 6100 const MachineInstr &MI, 6101 int OpIdx) const { 6102 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6103 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0); 6104 } 6105 6106 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0( 6107 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6108 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6109 MIB.addImm( 6110 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); 6111 } 6112 6113 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1( 6114 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6115 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6116 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) 6117 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) 6118 : (int64_t)SISrcMods::DST_OP_SEL); 6119 } 6120 6121 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0( 6122 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6123 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6124 MIB.addImm( 6125 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); 6126 } 6127 6128 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1( 6129 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6130 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6131 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) 6132 ? (int64_t)(SISrcMods::OP_SEL_0) 6133 : 0); 6134 } 6135 6136 void AMDGPUInstructionSelector::renderDstSelToOpSelXForm( 6137 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6138 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6139 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL) 6140 : 0); 6141 } 6142 6143 void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm( 6144 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6145 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6146 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0) 6147 : 0); 6148 } 6149 6150 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0( 6151 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6152 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6153 MIB.addImm( 6154 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); 6155 } 6156 6157 void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm( 6158 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6159 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6160 MIB.addImm( 6161 (MI.getOperand(OpIdx).getImm() & 0x2) ? 
(int64_t)SISrcMods::DST_OP_SEL : 0); 6162 } 6163 6164 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, 6165 const MachineInstr &MI, 6166 int OpIdx) const { 6167 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6168 MIB.addImm(MI.getOperand(OpIdx).getImm() & 6169 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL 6170 : AMDGPU::CPol::ALL_pregfx12)); 6171 } 6172 6173 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 6174 const MachineInstr &MI, 6175 int OpIdx) const { 6176 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6177 const bool Swizzle = MI.getOperand(OpIdx).getImm() & 6178 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ 6179 : AMDGPU::CPol::SWZ_pregfx12); 6180 MIB.addImm(Swizzle); 6181 } 6182 6183 void AMDGPUInstructionSelector::renderExtractCpolSetGLC( 6184 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6185 assert(OpIdx >= 0 && "expected to match an immediate operand"); 6186 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() & 6187 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL 6188 : AMDGPU::CPol::ALL_pregfx12); 6189 MIB.addImm(Cpol | AMDGPU::CPol::GLC); 6190 } 6191 6192 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, 6193 const MachineInstr &MI, 6194 int OpIdx) const { 6195 MIB.addFrameIndex(MI.getOperand(1).getIndex()); 6196 } 6197 6198 void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB, 6199 const MachineInstr &MI, 6200 int OpIdx) const { 6201 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF(); 6202 int ExpVal = APF.getExactLog2Abs(); 6203 assert(ExpVal != INT_MIN); 6204 MIB.addImm(ExpVal); 6205 } 6206 6207 void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, 6208 const MachineInstr &MI, 6209 int OpIdx) const { 6210 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3 6211 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0 6212 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1 6213 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2 6214 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); 6215 } 6216 6217 /// Convert from 2-bit value to enum values used for op_sel* source modifiers. 6218 void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand( 6219 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { 6220 unsigned Val = MI.getOperand(OpIdx).getImm(); 6221 unsigned New = 0; 6222 if (Val & 0x1) 6223 New |= SISrcMods::OP_SEL_0; 6224 if (Val & 0x2) 6225 New |= SISrcMods::OP_SEL_1; 6226 MIB.addImm(New); 6227 } 6228 6229 bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const { 6230 return TII.isInlineConstant(Imm); 6231 } 6232 6233 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 6234 return TII.isInlineConstant(Imm); 6235 } 6236
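// Both isInlineImmediate overloads defer to SIInstrInfo::isInlineConstant,
// i.e. they report whether the value can be encoded directly in the
// instruction as an inline constant (roughly, small integers and a handful of
// common floating-point values) rather than requiring a separate literal.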