//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

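// Pull in the TableGen-erated matcher implementation. The generated code is
// written against the AMDGPUSubtarget name, so alias it to GCNSubtarget for
// the duration of the include; this selector is only used for GCN targets.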
#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage &CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}

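// Return true if \p Reg is a virtual register that should be treated as a
// wave-mask (vcc) boolean, i.e. one bit per lane, rather than an ordinary
// 32-bit SGPR boolean.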
bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

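// Select a generic COPY. The interesting case is a copy into a vcc boolean:
// a copy from SCC only needs its register class constrained, while a copy
// from a non-vcc s1 value must be re-materialized, either as a wave-wide
// s_mov of 0/-1 for constants, or as a compare that sets the bit in every
// active lane.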
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        unsigned AndOpc =
            TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
        BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
            .addImm(1)
            .addReg(SrcReg);
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

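// Extract the 32-bit half of a 64-bit operand selected by \p SubIdx
// (AMDGPU::sub0 or AMDGPU::sub1). Register operands become a subregister
// copy into a fresh register of \p SubRC; immediate operands are split
// arithmetically.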
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

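// Map a generic G_AND/G_OR/G_XOR opcode to the scalar ALU instruction of the
// requested width.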
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

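// Bitwise ops are only handled here for the SGPR and VCC banks. A vcc-bank
// boolean is wave-sized, so wave64 needs the 64-bit opcode even though the
// IR type is s1.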
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

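// Select G_ADD/G_SUB. A 32-bit operation maps onto a single scalar or vector
// add/sub; a 64-bit operation is split into a low half that defines a carry
// and a high half that consumes it, recombined with a REG_SEQUENCE.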
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

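// Select the carry/overflow variants of add and sub. A vcc-bank carry uses
// the VALU carry instructions directly; an SGPR-bank carry is routed through
// SCC, copying the incoming carry into SCC and the defined carry back out.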
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

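// Select the 64-bit multiply-add pseudos G_AMDGPU_MAD_U64_U32 /
// G_AMDGPU_MAD_I64_I32. Subtargets for which hasMADIntraFwdBug() returns
// true use the dedicated gfx11 encodings instead of the generic ones.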
bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

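// Manually select G_FMA/G_FMAD as v_fma_mix_f32/v_mad_mix_f32 when at least
// one source is an f16-to-f32 conversion that the mix source modifiers can
// fold away.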
bool AMDGPUInstructionSelector::selectG_FMA_FMAD(MachineInstr &I) const {
  assert(I.getOpcode() == AMDGPU::G_FMA || I.getOpcode() == AMDGPU::G_FMAD);

  // Try to manually select MAD_MIX/FMA_MIX.
  Register Dst = I.getOperand(0).getReg();
  LLT ResultTy = MRI->getType(Dst);
  bool IsFMA = I.getOpcode() == AMDGPU::G_FMA;
  if (ResultTy != LLT::scalar(32) ||
      (IsFMA ? !Subtarget->hasFmaMixInsts() : !Subtarget->hasMadMixInsts()))
    return false;

  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
  // using the conversion from f16.
  bool MatchedSrc0, MatchedSrc1, MatchedSrc2;
  auto [Src0, Src0Mods] =
      selectVOP3PMadMixModsImpl(I.getOperand(1), MatchedSrc0);
  auto [Src1, Src1Mods] =
      selectVOP3PMadMixModsImpl(I.getOperand(2), MatchedSrc1);
  auto [Src2, Src2Mods] =
      selectVOP3PMadMixModsImpl(I.getOperand(3), MatchedSrc2);

#ifndef NDEBUG
  const SIMachineFunctionInfo *MFI =
      I.getMF()->getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
  assert((IsFMA || !Mode.allFP32Denormals()) &&
         "fmad selected with denormals enabled");
#endif

  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).
  if (!MatchedSrc0 && !MatchedSrc1 && !MatchedSrc2)
    return false;

  const unsigned OpC = IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32;
  MachineInstr *MixInst =
      BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpC), Dst)
          .addImm(Src0Mods)
          .addReg(copyToVGPRIfSrcFolded(Src0, Src0Mods, I.getOperand(1), &I))
          .addImm(Src1Mods)
          .addReg(copyToVGPRIfSrcFolded(Src1, Src1Mods, I.getOperand(2), &I))
          .addImm(Src2Mods)
          .addReg(copyToVGPRIfSrcFolded(Src2, Src2Mods, I.getOperand(3), &I))
          .addImm(0)
          .addImm(0)
          .addImm(0);

  if (!constrainSelectedInstRegOperands(*MixInst, TII, TRI, RBI))
    return false;

  I.eraseFromParent();
  return true;
}

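// Lower a merge of 32-bit-or-wider pieces into a single REG_SEQUENCE;
// narrower sources fall back to the imported TableGen patterns.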
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

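// Lower G_UNMERGE_VALUES into one subregister copy per destination, relying
// on SGPR and VGPR classes using the same subregister indices for equally
// sized pieces.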
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // A G_BUILD_VECTOR with sources of >= 32 bits is handled by the
  // G_MERGE_VALUES path.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  // => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  // => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  // => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  // => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16);

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

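// Select G_INSERT as an INSERT_SUBREG. This only works for 32-bit-aligned,
// 32-bit-multiple-sized inserts that correspond to a real subregister index.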
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
      .addReg(Src0Reg)
      .addReg(Src1Reg)
      .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

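// llvm.amdgcn.interp.p1.f16 needs manual selection on subtargets with 16 LDS
// banks, where it expands to v_interp_mov + v_interp_p1lv; everything else
// uses the imported patterns.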
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

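// Dispatch for side-effect-free G_INTRINSIC instructions that need manual
// selection; anything not handled here falls through to the TableGen-erated
// matcher.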
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    return selectSMFMACIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  if (Size != 16 && Size != 32 && Size != 64)
    return -1;

  if (Size == 16 && !ST.has16BitInsts())
    return -1;

  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
                          unsigned S64Opc) {
    if (Size == 16)
      return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
    if (Size == 32)
      return S32Opc;
    return S64Opc;
  };
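
  // For example, Select(V_CMP_NE_U16_e64, V_CMP_NE_U16_t16_e64,
  // V_CMP_NE_U32_e64, V_CMP_NE_U64_e64) resolves a 16-bit compare to the
  // true16 variant on subtargets with true16 instructions and to the plain
  // 16-bit opcode otherwise; 32- and 64-bit compares pick purely by size.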

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
                  AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
  case CmpInst::ICMP_EQ:
    return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
                  AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
  case CmpInst::ICMP_SGT:
    return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
                  AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
  case CmpInst::ICMP_SGE:
    return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
                  AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
  case CmpInst::ICMP_SLT:
    return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
                  AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
  case CmpInst::ICMP_SLE:
    return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
                  AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
  case CmpInst::ICMP_UGT:
    return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
                  AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
  case CmpInst::ICMP_UGE:
    return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
                  AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
  case CmpInst::ICMP_ULT:
    return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
                  AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
  case CmpInst::ICMP_ULE:
    return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
                  AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);

  case CmpInst::FCMP_OEQ:
    return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
                  AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
  case CmpInst::FCMP_OGT:
    return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
                  AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
  case CmpInst::FCMP_OGE:
    return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
                  AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
  case CmpInst::FCMP_OLT:
    return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
                  AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
  case CmpInst::FCMP_OLE:
    return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
                  AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
  case CmpInst::FCMP_ONE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_ORD:
    return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
                  AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
  case CmpInst::FCMP_UNO:
    return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
                  AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
  case CmpInst::FCMP_UEQ:
    return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
                  AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
  case CmpInst::FCMP_UGT:
    return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
                  AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
  case CmpInst::FCMP_UGE:
    return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
                  AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
  case CmpInst::FCMP_ULT:
    return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
                  AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
  case CmpInst::FCMP_ULE:
    return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
                  AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
  case CmpInst::FCMP_UNE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_TRUE:
    return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
                  AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
  case CmpInst::FCMP_FALSE:
    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
                  AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

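// Note on the two compare flavors used below: a scalar S_CMP writes its
// result to SCC, which is then copied into the destination and constrained
// to an SGPR class, while a VALU V_CMP writes a per-lane mask directly into
// a wave-mask (boolean) register.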
bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  LLT DstTy = MRI->getType(Dst);
  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstr *SelectedMI;
  if (CmpInst::isFPPredicate(Pred)) {
    MachineOperand &LHS = I.getOperand(2);
    MachineOperand &RHS = I.getOperand(3);
    auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
    auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
    Register Src0Reg =
        copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
    Register Src1Reg =
        copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
    SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
                     .addImm(Src0Mods)
                     .addReg(Src0Reg)
                     .addImm(Src1Mods)
                     .addReg(Src1Reg)
                     .addImm(0); // clamp
  } else {
    SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
                     .add(I.getOperand(2))
                     .add(I.getOperand(3));
  }

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;

  if (Size != STI.getWavefrontSize())
    return false;

  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);

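  // A ballot of a constant condition folds: ballot(false) is 0 and
  // ballot(true) is the full EXEC mask; any other constant is rejected.
  // A non-constant input is already a wave-sized mask at this point (checked
  // above), so a plain copy suffices.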
  if (Arg) {
    const int64_t Value = Arg->Value.getSExtValue();
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) { // all ones
      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
    } else
      return false;
  } else {
    Register SrcReg = I.getOperand(2).getReg();
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
      M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
    AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);

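  // On AMDHSA and AMDPAL the function's static LDS usage is known here, so
  // it can be emitted as a plain immediate. For other OSes the value is not
  // known at this point, so a reference to the groupstaticsize declaration is
  // emitted and later resolved through an absolute low-32-bit relocation.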
  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    MIB.addImm(MFI->getLDSSize());
  } else {
    Module *M = MF->getFunction().getParent();
    const GlobalValue *GV
      = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
  }

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass, DL);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
      .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

selectDSOrderedIntrinsic(MachineInstr & MI,Intrinsic::ID IntrID) const151209467b48Spatrick bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
151309467b48Spatrick MachineInstr &MI, Intrinsic::ID IntrID) const {
151409467b48Spatrick MachineBasicBlock *MBB = MI.getParent();
151509467b48Spatrick MachineFunction *MF = MBB->getParent();
151609467b48Spatrick const DebugLoc &DL = MI.getDebugLoc();
151709467b48Spatrick
151809467b48Spatrick unsigned IndexOperand = MI.getOperand(7).getImm();
151909467b48Spatrick bool WaveRelease = MI.getOperand(8).getImm() != 0;
152009467b48Spatrick bool WaveDone = MI.getOperand(9).getImm() != 0;
152109467b48Spatrick
152209467b48Spatrick if (WaveDone && !WaveRelease)
152309467b48Spatrick report_fatal_error("ds_ordered_count: wave_done requires wave_release");
152409467b48Spatrick
152509467b48Spatrick unsigned OrderedCountIndex = IndexOperand & 0x3f;
152609467b48Spatrick IndexOperand &= ~0x3f;
152709467b48Spatrick unsigned CountDw = 0;
152809467b48Spatrick
152909467b48Spatrick if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
153009467b48Spatrick CountDw = (IndexOperand >> 24) & 0xf;
153109467b48Spatrick IndexOperand &= ~(0xf << 24);
153209467b48Spatrick
153309467b48Spatrick if (CountDw < 1 || CountDw > 4) {
153409467b48Spatrick report_fatal_error(
153509467b48Spatrick "ds_ordered_count: dword count must be between 1 and 4");
153609467b48Spatrick }
153709467b48Spatrick }
153809467b48Spatrick
153909467b48Spatrick if (IndexOperand)
154009467b48Spatrick report_fatal_error("ds_ordered_count: bad index operand");
154109467b48Spatrick
154209467b48Spatrick unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
154373471bf0Spatrick unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
154409467b48Spatrick
154509467b48Spatrick unsigned Offset0 = OrderedCountIndex << 2;
1546*d415bd75Srobert unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
154709467b48Spatrick
154809467b48Spatrick if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
154909467b48Spatrick Offset1 |= (CountDw - 1) << 6;
155009467b48Spatrick
1551*d415bd75Srobert if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1552*d415bd75Srobert Offset1 |= ShaderType << 2;
1553*d415bd75Srobert
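  // Putting the pieces together, the 16-bit offset field encodes (a sketch
  // derived from the shifts above):
  //   bits [7:0]   dword-aligned ordered-count index (index << 2)
  //   bit  [8]     wave_release
  //   bit  [9]     wave_done
  //   bits [11:10] shader type (pre-GFX11 only)
  //   bit  [12]    instruction (0 = ordered_add, 1 = ordered_swap)
  //   bits [15:14] dword count - 1 (GFX10+)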
  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KnownBits);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

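    // The variable part of the GWS resource offset is taken from m0[21:16]
    // (see the note below), hence the shift of the base into the high half.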
    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);

    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .cloneMemRefs(MI);

  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
                 .addImm(Offset)
                 .addImm(IsGDS ? -1 : 0)
                 .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

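// If the entire workgroup fits in a single wave, its lanes execute in
// lockstep and are trivially synchronized already, so at -O1 and above the
// barrier can be dropped to a WAVE_BARRIER pseudo (a scheduling-only barrier
// that should expand to no machine code).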
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
  if (TM.getOptLevel() > CodeGenOpt::None) {
    unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
    if (WGSize <= STI.getWavefrontSize()) {
      MachineBasicBlock *MBB = MI.getParent();
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
      MI.eraseFromParent();
      return true;
    }
  }
  return selectImpl(MI, *CoverageInfo);
}

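// TexFailCtrl is a two-bit field: bit 0 enables TFE (texture fail enable)
// and bit 1 enables LWE (LOD warning enable). Any other set bit is invalid
// and rejects the selection.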
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) != 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) != 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
    MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);

  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
               MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;

  bool Unorm;
  if (!BaseOpcode->Sampler)
    Unorm = true;
  else
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16-bit gradients if the subtarget doesn't support G16
  if (IsA16 && !STI.hasG16() && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

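      // With packed D16, each result dword holds two 16-bit components, so
      // round the enabled lane count up to whole dwords.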
      if (IsD16 && !STI.hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
    }
  }

  // Set G16 opcode
  if (IsG16 && !IsA16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
  if (CPol & ~AMDGPU::CPol::ALL)
    return false;

  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  if (IsTexFail)
    ++NumVDataDwords;

  int Opcode = -1;
  if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
        LLVM_DEBUG(
            dbgs()
            << "requested image instruction is not supported on this GPU\n");
        return false;
      }
    }
    if (Opcode == -1 &&
        STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      if (!MRI->use_empty(VDataOut)) {
        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
            .addReg(TmpReg, RegState::Kill, SubReg);
      }

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int I = 0; I != NumVAddrRegs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());

  MIB.addImm(DMask); // dmask

  if (IsGFX10Plus)
    MIB.addImm(DimInfo->Encoding);
  MIB.addImm(Unorm);

  MIB.addImm(CPol);
  MIB.addImm(IsA16 && // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10Plus)
    MIB.addImm(IsA16 ? -1 : 0);

  if (!Subtarget->hasGFX90AInsts()) {
    MIB.addImm(TFE); // tfe
  } else if (TFE) {
    LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
    return false;
  }

  MIB.addImm(LWE); // lwe
  if (!IsGFX10Plus)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  if (IsTexFail) {
    // An image load instruction with TFE/LWE only conditionally writes to its
    // result registers. Initialize them to zero so that we always get well
    // defined result values.
    assert(VDataOut && !VDataIn);
    Register Tied = MRI->cloneVirtualRegister(VDataOut);
    Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
      .addImm(0);
    auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
    if (STI.usePRTStrictNull()) {
      // With enable-prt-strict-null enabled, initialize all result registers to
      // zero.
      auto RegSeq =
          BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
      for (auto Sub : Parts)
        RegSeq.addReg(Zero).addImm(Sub);
    } else {
      // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
      // result register.
      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
      auto RegSeq =
          BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
      for (auto Sub : Parts.drop_back(1))
        RegSeq.addReg(Undef).addImm(Sub);
      RegSeq.addReg(Zero).addImm(Parts.back());
    }
    MIB.addReg(Tied, RegState::Implicit);
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
  }

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
  return true;
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
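// For reference (a sketch from the operand indices used below; the exact
// semantics of each operand are an assumption here): operands 0 and 1 are
// the two results, operand 2 the intrinsic ID, and operands 3-6 the stack
// address, the two data operands, and an immediate offset.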
bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
    MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Addr = MI.getOperand(3).getReg();
  Register Data0 = MI.getOperand(4).getReg();
  Register Data1 = MI.getOperand(5).getReg();
  unsigned Offset = MI.getOperand(6).getImm();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
                 .addDef(Dst1)
                 .addUse(Addr)
                 .addUse(Data0)
                 .addUse(Data1)
                 .addImm(Offset)
                 .cloneMemRefs(MI);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  case Intrinsic::amdgcn_s_barrier:
    return selectSBarrier(I);
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
    return selectBufferLoadLds(I);
  case Intrinsic::amdgcn_global_load_lds:
    return selectGlobalLoadLds(I);
  case Intrinsic::amdgcn_exp_compr:
    if (!STI.hasCompressedExport()) {
      Function &F = I.getMF()->getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
      F.getContext().diagnose(NoFpRet);
      return false;
    }
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
    return selectDSBvhStackIntrinsic(I);
  }
  return selectImpl(I, *CoverageInfo);
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it, so the register class has to be set manually here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
            .add(I.getOperand(2))
            .add(I.getOperand(3));

    bool Ret = false;
    Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
    Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

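  // v_cndmask_b32 selects src1 where the condition bit is set and src0 where
  // it is clear, so the G_SELECT false value (operand 3) goes in src0 and the
  // true value (operand 2) in src1.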
  MachineInstr *Select =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .add(I.getOperand(3))
        .addImm(0)
        .add(I.getOperand(2))
        .add(I.getOperand(1));

  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

sizeToSubRegIndex(unsigned Size)212309467b48Spatrick static int sizeToSubRegIndex(unsigned Size) {
212409467b48Spatrick switch (Size) {
212509467b48Spatrick case 32:
212609467b48Spatrick return AMDGPU::sub0;
212709467b48Spatrick case 64:
212809467b48Spatrick return AMDGPU::sub0_sub1;
212909467b48Spatrick case 96:
213009467b48Spatrick return AMDGPU::sub0_sub1_sub2;
213109467b48Spatrick case 128:
213209467b48Spatrick return AMDGPU::sub0_sub1_sub2_sub3;
213309467b48Spatrick case 256:
213409467b48Spatrick return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
213509467b48Spatrick default:
213609467b48Spatrick if (Size < 32)
213709467b48Spatrick return AMDGPU::sub0;
213809467b48Spatrick if (Size > 256)
213909467b48Spatrick return -1;
214009467b48Spatrick return sizeToSubRegIndex(PowerOf2Ceil(Size));
214109467b48Spatrick }
214209467b48Spatrick }
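// Editor's illustrative sketch (not part of the selector, kept un-built): a
// few concrete values showing the rounding behavior of sizeToSubRegIndex.
#if 0
static void sizeToSubRegIndexExamples() {
  assert(sizeToSubRegIndex(96) == AMDGPU::sub0_sub1_sub2);
  assert(sizeToSubRegIndex(48) == AMDGPU::sub0_sub1); // PowerOf2Ceil -> 64
  assert(sizeToSubRegIndex(16) == AMDGPU::sub0);      // sub-dword -> sub0
  assert(sizeToSubRegIndex(512) == -1);               // wider than 256 bits
}
#endif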
214309467b48Spatrick
214409467b48Spatrick bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
214509467b48Spatrick Register DstReg = I.getOperand(0).getReg();
214609467b48Spatrick Register SrcReg = I.getOperand(1).getReg();
214709467b48Spatrick const LLT DstTy = MRI->getType(DstReg);
214809467b48Spatrick const LLT SrcTy = MRI->getType(SrcReg);
214909467b48Spatrick const LLT S1 = LLT::scalar(1);
215009467b48Spatrick
215109467b48Spatrick const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
215209467b48Spatrick const RegisterBank *DstRB;
215309467b48Spatrick if (DstTy == S1) {
215409467b48Spatrick     // This is a special case. We don't treat s1 legalization artifacts as
215509467b48Spatrick     // vcc booleans.
215609467b48Spatrick DstRB = SrcRB;
215709467b48Spatrick } else {
215809467b48Spatrick DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
215909467b48Spatrick if (SrcRB != DstRB)
216009467b48Spatrick return false;
216109467b48Spatrick }
216209467b48Spatrick
2163097a140dSpatrick const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2164097a140dSpatrick
216509467b48Spatrick unsigned DstSize = DstTy.getSizeInBits();
216609467b48Spatrick unsigned SrcSize = SrcTy.getSizeInBits();
216709467b48Spatrick
2168*d415bd75Srobert const TargetRegisterClass *SrcRC =
2169*d415bd75Srobert TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2170*d415bd75Srobert const TargetRegisterClass *DstRC =
2171*d415bd75Srobert TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2172097a140dSpatrick if (!SrcRC || !DstRC)
2173097a140dSpatrick return false;
2174097a140dSpatrick
2175097a140dSpatrick if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2176097a140dSpatrick !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2177097a140dSpatrick LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2178097a140dSpatrick return false;
2179097a140dSpatrick }
2180097a140dSpatrick
218173471bf0Spatrick if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2182097a140dSpatrick MachineBasicBlock *MBB = I.getParent();
2183097a140dSpatrick const DebugLoc &DL = I.getDebugLoc();
2184097a140dSpatrick
2185097a140dSpatrick Register LoReg = MRI->createVirtualRegister(DstRC);
2186097a140dSpatrick Register HiReg = MRI->createVirtualRegister(DstRC);
2187097a140dSpatrick BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2188097a140dSpatrick .addReg(SrcReg, 0, AMDGPU::sub0);
2189097a140dSpatrick BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2190097a140dSpatrick .addReg(SrcReg, 0, AMDGPU::sub1);
2191097a140dSpatrick
2192097a140dSpatrick if (IsVALU && STI.hasSDWA()) {
2193097a140dSpatrick       // Write the low 16 bits of the high element into the high 16 bits of
2194097a140dSpatrick       // the low element.
2195097a140dSpatrick MachineInstr *MovSDWA =
2196097a140dSpatrick BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2197097a140dSpatrick .addImm(0) // $src0_modifiers
2198097a140dSpatrick .addReg(HiReg) // $src0
2199097a140dSpatrick .addImm(0) // $clamp
2200097a140dSpatrick .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2201097a140dSpatrick .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2202097a140dSpatrick .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2203097a140dSpatrick .addReg(LoReg, RegState::Implicit);
2204097a140dSpatrick MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
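      // Tying the def to the implicit LoReg use records that the preserved
      // (unwritten) low 16 bits of DstReg come from LoReg.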
2205097a140dSpatrick } else {
2206097a140dSpatrick Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2207097a140dSpatrick Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2208097a140dSpatrick Register ImmReg = MRI->createVirtualRegister(DstRC);
2209097a140dSpatrick if (IsVALU) {
2210097a140dSpatrick BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2211097a140dSpatrick .addImm(16)
2212097a140dSpatrick .addReg(HiReg);
2213097a140dSpatrick } else {
2214097a140dSpatrick BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2215097a140dSpatrick .addReg(HiReg)
2216097a140dSpatrick .addImm(16);
2217097a140dSpatrick }
2218097a140dSpatrick
2219097a140dSpatrick unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2220097a140dSpatrick unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2221097a140dSpatrick unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2222097a140dSpatrick
2223097a140dSpatrick BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2224097a140dSpatrick .addImm(0xffff);
2225097a140dSpatrick BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2226097a140dSpatrick .addReg(LoReg)
2227097a140dSpatrick .addReg(ImmReg);
2228097a140dSpatrick BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2229097a140dSpatrick .addReg(TmpReg0)
2230097a140dSpatrick .addReg(TmpReg1);
2231097a140dSpatrick }
2232097a140dSpatrick
2233097a140dSpatrick I.eraseFromParent();
2234097a140dSpatrick return true;
2235097a140dSpatrick }
2236097a140dSpatrick
2237097a140dSpatrick if (!DstTy.isScalar())
2238097a140dSpatrick return false;
223909467b48Spatrick
224009467b48Spatrick if (SrcSize > 32) {
224109467b48Spatrick int SubRegIdx = sizeToSubRegIndex(DstSize);
224209467b48Spatrick if (SubRegIdx == -1)
224309467b48Spatrick return false;
224409467b48Spatrick
224509467b48Spatrick // Deal with weird cases where the class only partially supports the subreg
224609467b48Spatrick // index.
2247097a140dSpatrick const TargetRegisterClass *SrcWithSubRC
2248097a140dSpatrick = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2249097a140dSpatrick if (!SrcWithSubRC)
225009467b48Spatrick return false;
225109467b48Spatrick
2252097a140dSpatrick if (SrcWithSubRC != SrcRC) {
2253097a140dSpatrick if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2254097a140dSpatrick return false;
225509467b48Spatrick }
225609467b48Spatrick
2257097a140dSpatrick I.getOperand(1).setSubReg(SubRegIdx);
225809467b48Spatrick }
225909467b48Spatrick
226009467b48Spatrick I.setDesc(TII.get(TargetOpcode::COPY));
226109467b48Spatrick return true;
226209467b48Spatrick }
226309467b48Spatrick
226409467b48Spatrick /// \returns true if a bitmask for \p Size bits will be an inline immediate.
226509467b48Spatrick static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
226609467b48Spatrick Mask = maskTrailingOnes<unsigned>(Size);
226709467b48Spatrick int SignedMask = static_cast<int>(Mask);
226809467b48Spatrick return SignedMask >= -16 && SignedMask <= 64;
226909467b48Spatrick }
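// Editor's illustrative sketch (not part of the selector, kept un-built):
// masks for narrow widths are inline immediates, wider ones are not.
#if 0
static void shouldUseAndMaskExamples() {
  unsigned Mask;
  assert(shouldUseAndMask(4, Mask) && Mask == 0xf);      // 15 is inline
  assert(shouldUseAndMask(6, Mask) && Mask == 0x3f);     // 63 is inline
  assert(!shouldUseAndMask(16, Mask) && Mask == 0xffff); // 65535 is not
}
#endif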
227009467b48Spatrick
227109467b48Spatrick // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
227209467b48Spatrick const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
227309467b48Spatrick Register Reg, const MachineRegisterInfo &MRI,
227409467b48Spatrick const TargetRegisterInfo &TRI) const {
227509467b48Spatrick const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
227609467b48Spatrick if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
227709467b48Spatrick return RB;
227809467b48Spatrick
227909467b48Spatrick // Ignore the type, since we don't use vcc in artifacts.
228009467b48Spatrick if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
228109467b48Spatrick return &RBI.getRegBankFromRegClass(*RC, LLT());
228209467b48Spatrick return nullptr;
228309467b48Spatrick }
228409467b48Spatrick
228509467b48Spatrick bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2286097a140dSpatrick bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2287097a140dSpatrick bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
228809467b48Spatrick const DebugLoc &DL = I.getDebugLoc();
228909467b48Spatrick MachineBasicBlock &MBB = *I.getParent();
229009467b48Spatrick const Register DstReg = I.getOperand(0).getReg();
229109467b48Spatrick const Register SrcReg = I.getOperand(1).getReg();
229209467b48Spatrick
229309467b48Spatrick const LLT DstTy = MRI->getType(DstReg);
229409467b48Spatrick const LLT SrcTy = MRI->getType(SrcReg);
2295097a140dSpatrick const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2296097a140dSpatrick I.getOperand(2).getImm() : SrcTy.getSizeInBits();
229709467b48Spatrick const unsigned DstSize = DstTy.getSizeInBits();
229809467b48Spatrick if (!DstTy.isScalar())
229909467b48Spatrick return false;
230009467b48Spatrick
230109467b48Spatrick // Artifact casts should never use vcc.
230209467b48Spatrick const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
230309467b48Spatrick
230473471bf0Spatrick // FIXME: This should probably be illegal and split earlier.
230573471bf0Spatrick if (I.getOpcode() == AMDGPU::G_ANYEXT) {
230673471bf0Spatrick if (DstSize <= 32)
230773471bf0Spatrick return selectCOPY(I);
230873471bf0Spatrick
230973471bf0Spatrick const TargetRegisterClass *SrcRC =
2310*d415bd75Srobert TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
231173471bf0Spatrick const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
231273471bf0Spatrick const TargetRegisterClass *DstRC =
2313*d415bd75Srobert TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
231473471bf0Spatrick
231573471bf0Spatrick Register UndefReg = MRI->createVirtualRegister(SrcRC);
231673471bf0Spatrick BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
231773471bf0Spatrick BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
231873471bf0Spatrick .addReg(SrcReg)
231973471bf0Spatrick .addImm(AMDGPU::sub0)
232073471bf0Spatrick .addReg(UndefReg)
232173471bf0Spatrick .addImm(AMDGPU::sub1);
232273471bf0Spatrick I.eraseFromParent();
232373471bf0Spatrick
232473471bf0Spatrick return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
232573471bf0Spatrick RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
232673471bf0Spatrick }
232773471bf0Spatrick
232809467b48Spatrick if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
232909467b48Spatrick // 64-bit should have been split up in RegBankSelect
233009467b48Spatrick
233109467b48Spatrick // Try to use an and with a mask if it will save code size.
233209467b48Spatrick unsigned Mask;
233309467b48Spatrick if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
233409467b48Spatrick MachineInstr *ExtI =
233509467b48Spatrick BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
233609467b48Spatrick .addImm(Mask)
233709467b48Spatrick .addReg(SrcReg);
233809467b48Spatrick I.eraseFromParent();
233909467b48Spatrick return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
234009467b48Spatrick }
234109467b48Spatrick
234273471bf0Spatrick const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
234309467b48Spatrick MachineInstr *ExtI =
234409467b48Spatrick BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
234509467b48Spatrick .addReg(SrcReg)
234609467b48Spatrick .addImm(0) // Offset
234709467b48Spatrick .addImm(SrcSize); // Width
234809467b48Spatrick I.eraseFromParent();
234909467b48Spatrick return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
235009467b48Spatrick }
235109467b48Spatrick
235209467b48Spatrick if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2353097a140dSpatrick const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2354097a140dSpatrick AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2355097a140dSpatrick if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
235609467b48Spatrick return false;
235709467b48Spatrick
235809467b48Spatrick if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
235909467b48Spatrick const unsigned SextOpc = SrcSize == 8 ?
236009467b48Spatrick AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
236109467b48Spatrick BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
236209467b48Spatrick .addReg(SrcReg);
236309467b48Spatrick I.eraseFromParent();
236409467b48Spatrick return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
236509467b48Spatrick }
236609467b48Spatrick
2367*d415bd75Srobert     // Using a single 32-bit SALU instruction to compute the high half is
2368*d415bd75Srobert     // smaller than using S_BFE with a literal constant operand.
2369*d415bd75Srobert if (DstSize > 32 && SrcSize == 32) {
2370*d415bd75Srobert Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2371*d415bd75Srobert unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2372*d415bd75Srobert if (Signed) {
2373*d415bd75Srobert BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2374*d415bd75Srobert .addReg(SrcReg, 0, SubReg)
2375*d415bd75Srobert .addImm(31);
2376*d415bd75Srobert } else {
2377*d415bd75Srobert BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2378*d415bd75Srobert .addImm(0);
2379*d415bd75Srobert }
2380*d415bd75Srobert BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2381*d415bd75Srobert .addReg(SrcReg, 0, SubReg)
2382*d415bd75Srobert .addImm(AMDGPU::sub0)
2383*d415bd75Srobert .addReg(HiReg)
2384*d415bd75Srobert .addImm(AMDGPU::sub1);
2385*d415bd75Srobert I.eraseFromParent();
2386*d415bd75Srobert return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2387*d415bd75Srobert *MRI);
2388*d415bd75Srobert }
2389*d415bd75Srobert
239009467b48Spatrick const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
239109467b48Spatrick const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
239209467b48Spatrick
239309467b48Spatrick     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
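    // e.g. SrcSize == 16 encodes as 16 << 16 == 0x100000: offset 0 in bits
    // [5:0], width 16 in bits [22:16].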
2394097a140dSpatrick if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
239509467b48Spatrick // We need a 64-bit register source, but the high bits don't matter.
239609467b48Spatrick Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
239709467b48Spatrick Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2398*d415bd75Srobert unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2399097a140dSpatrick
240009467b48Spatrick BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
240109467b48Spatrick BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2402097a140dSpatrick .addReg(SrcReg, 0, SubReg)
240309467b48Spatrick .addImm(AMDGPU::sub0)
240409467b48Spatrick .addReg(UndefReg)
240509467b48Spatrick .addImm(AMDGPU::sub1);
240609467b48Spatrick
240709467b48Spatrick BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
240809467b48Spatrick .addReg(ExtReg)
240909467b48Spatrick .addImm(SrcSize << 16);
241009467b48Spatrick
241109467b48Spatrick I.eraseFromParent();
241209467b48Spatrick return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
241309467b48Spatrick }
241409467b48Spatrick
241509467b48Spatrick unsigned Mask;
241609467b48Spatrick if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
241709467b48Spatrick BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
241809467b48Spatrick .addReg(SrcReg)
241909467b48Spatrick .addImm(Mask);
242009467b48Spatrick } else {
242109467b48Spatrick BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
242209467b48Spatrick .addReg(SrcReg)
242309467b48Spatrick .addImm(SrcSize << 16);
242409467b48Spatrick }
242509467b48Spatrick
242609467b48Spatrick I.eraseFromParent();
242709467b48Spatrick return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
242809467b48Spatrick }
242909467b48Spatrick
243009467b48Spatrick return false;
243109467b48Spatrick }
243209467b48Spatrick
243309467b48Spatrick bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
243409467b48Spatrick MachineBasicBlock *BB = I.getParent();
243509467b48Spatrick MachineOperand &ImmOp = I.getOperand(1);
243673471bf0Spatrick Register DstReg = I.getOperand(0).getReg();
243773471bf0Spatrick unsigned Size = MRI->getType(DstReg).getSizeInBits();
243809467b48Spatrick
243909467b48Spatrick // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
244009467b48Spatrick if (ImmOp.isFPImm()) {
244109467b48Spatrick const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
244209467b48Spatrick ImmOp.ChangeToImmediate(Imm.getZExtValue());
244309467b48Spatrick } else if (ImmOp.isCImm()) {
244473471bf0Spatrick ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
244509467b48Spatrick } else {
244673471bf0Spatrick llvm_unreachable("Not supported by g_constants");
244709467b48Spatrick }
244809467b48Spatrick
244973471bf0Spatrick const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
245073471bf0Spatrick const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
245109467b48Spatrick
245273471bf0Spatrick unsigned Opcode;
245373471bf0Spatrick if (DstRB->getID() == AMDGPU::VCCRegBankID) {
245473471bf0Spatrick Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
245573471bf0Spatrick } else {
245673471bf0Spatrick Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
245773471bf0Spatrick
245873471bf0Spatrick // We should never produce s1 values on banks other than VCC. If the user of
245973471bf0Spatrick // this already constrained the register, we may incorrectly think it's VCC
246073471bf0Spatrick // if it wasn't originally.
246173471bf0Spatrick if (Size == 1)
246273471bf0Spatrick return false;
246373471bf0Spatrick }
246473471bf0Spatrick
246573471bf0Spatrick if (Size != 64) {
246609467b48Spatrick I.setDesc(TII.get(Opcode));
246709467b48Spatrick I.addImplicitDefUseOperands(*MF);
246809467b48Spatrick return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
246909467b48Spatrick }
247009467b48Spatrick
247109467b48Spatrick const DebugLoc &DL = I.getDebugLoc();
247209467b48Spatrick
247309467b48Spatrick APInt Imm(Size, I.getOperand(1).getImm());
247409467b48Spatrick
247509467b48Spatrick MachineInstr *ResInst;
247609467b48Spatrick if (IsSgpr && TII.isInlineConstant(Imm)) {
247709467b48Spatrick ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
247809467b48Spatrick .addImm(I.getOperand(1).getImm());
247909467b48Spatrick } else {
248009467b48Spatrick const TargetRegisterClass *RC = IsSgpr ?
248109467b48Spatrick &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
248209467b48Spatrick Register LoReg = MRI->createVirtualRegister(RC);
248309467b48Spatrick Register HiReg = MRI->createVirtualRegister(RC);
248409467b48Spatrick
248509467b48Spatrick BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
248609467b48Spatrick .addImm(Imm.trunc(32).getZExtValue());
248709467b48Spatrick
248809467b48Spatrick BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
248909467b48Spatrick .addImm(Imm.ashr(32).getZExtValue());
249009467b48Spatrick
249109467b48Spatrick ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
249209467b48Spatrick .addReg(LoReg)
249309467b48Spatrick .addImm(AMDGPU::sub0)
249409467b48Spatrick .addReg(HiReg)
249509467b48Spatrick .addImm(AMDGPU::sub1);
249609467b48Spatrick }
249709467b48Spatrick
249809467b48Spatrick   // We can't call constrainSelectedInstRegOperands here, because it doesn't
249909467b48Spatrick   // work for target-independent opcodes.
250009467b48Spatrick I.eraseFromParent();
250109467b48Spatrick const TargetRegisterClass *DstRC =
250209467b48Spatrick TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
250309467b48Spatrick if (!DstRC)
250409467b48Spatrick return true;
250509467b48Spatrick return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
250609467b48Spatrick }
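// Editor's illustrative sketch (not part of the selector, kept un-built):
// how the 64-bit immediate above is split into the two 32-bit moves.
#if 0
static void splitImm64Example() {
  APInt Imm(64, 0x123456789ABCDEF0ULL);
  assert(Imm.trunc(32).getZExtValue() == 0x9ABCDEF0u); // LoReg
  assert(Imm.ashr(32).getZExtValue() == 0x12345678u);  // HiReg
}
#endif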
250709467b48Spatrick
2508097a140dSpatrick bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2509097a140dSpatrick // Only manually handle the f64 SGPR case.
2510097a140dSpatrick //
2511097a140dSpatrick // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2512097a140dSpatrick // the bit ops theoretically have a second result due to the implicit def of
2513097a140dSpatrick // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2514097a140dSpatrick // that is easy by disabling the check. The result works, but uses a
2515097a140dSpatrick // nonsensical sreg32orlds_and_sreg_1 regclass.
2516097a140dSpatrick //
2517097a140dSpatrick   // The DAG emitter is more problematic: it incorrectly adds both results of
2518097a140dSpatrick   // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2519097a140dSpatrick
2520097a140dSpatrick Register Dst = MI.getOperand(0).getReg();
2521097a140dSpatrick const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2522097a140dSpatrick if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2523097a140dSpatrick MRI->getType(Dst) != LLT::scalar(64))
2524097a140dSpatrick return false;
2525097a140dSpatrick
2526097a140dSpatrick Register Src = MI.getOperand(1).getReg();
2527097a140dSpatrick MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2528097a140dSpatrick if (Fabs)
2529097a140dSpatrick Src = Fabs->getOperand(1).getReg();
2530097a140dSpatrick
2531097a140dSpatrick if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2532097a140dSpatrick !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2533097a140dSpatrick return false;
2534097a140dSpatrick
2535097a140dSpatrick MachineBasicBlock *BB = MI.getParent();
2536097a140dSpatrick const DebugLoc &DL = MI.getDebugLoc();
2537097a140dSpatrick Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2538097a140dSpatrick Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2539097a140dSpatrick Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2540097a140dSpatrick Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2541097a140dSpatrick
2542097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2543097a140dSpatrick .addReg(Src, 0, AMDGPU::sub0);
2544097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2545097a140dSpatrick .addReg(Src, 0, AMDGPU::sub1);
2546097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2547097a140dSpatrick .addImm(0x80000000);
2548097a140dSpatrick
2549097a140dSpatrick // Set or toggle sign bit.
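  // e.g. fneg(1.0): high dword 0x3FF00000 ^ 0x80000000 == 0xBFF00000, the
  // high dword of -1.0. With a folded fabs the OR form sets the sign bit
  // unconditionally, computing fneg(fabs(x)).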
2550097a140dSpatrick unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2551097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2552097a140dSpatrick .addReg(HiReg)
2553097a140dSpatrick .addReg(ConstReg);
2554097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2555097a140dSpatrick .addReg(LoReg)
2556097a140dSpatrick .addImm(AMDGPU::sub0)
2557097a140dSpatrick .addReg(OpReg)
2558097a140dSpatrick .addImm(AMDGPU::sub1);
2559097a140dSpatrick MI.eraseFromParent();
2560097a140dSpatrick return true;
2561097a140dSpatrick }
2562097a140dSpatrick
2563097a140dSpatrick // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2564097a140dSpatrick bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2565097a140dSpatrick Register Dst = MI.getOperand(0).getReg();
2566097a140dSpatrick const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2567097a140dSpatrick if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2568097a140dSpatrick MRI->getType(Dst) != LLT::scalar(64))
2569097a140dSpatrick return false;
2570097a140dSpatrick
2571097a140dSpatrick Register Src = MI.getOperand(1).getReg();
2572097a140dSpatrick MachineBasicBlock *BB = MI.getParent();
2573097a140dSpatrick const DebugLoc &DL = MI.getDebugLoc();
2574097a140dSpatrick Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2575097a140dSpatrick Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2576097a140dSpatrick Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2577097a140dSpatrick Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2578097a140dSpatrick
2579097a140dSpatrick if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2580097a140dSpatrick !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2581097a140dSpatrick return false;
2582097a140dSpatrick
2583097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2584097a140dSpatrick .addReg(Src, 0, AMDGPU::sub0);
2585097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2586097a140dSpatrick .addReg(Src, 0, AMDGPU::sub1);
2587097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2588097a140dSpatrick .addImm(0x7fffffff);
2589097a140dSpatrick
2590097a140dSpatrick // Clear sign bit.
2591097a140dSpatrick   // TODO: Should this use S_BITSET0_*?
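  // e.g. fabs(-1.0): high dword 0xBFF00000 & 0x7FFFFFFF == 0x3FF00000, the
  // high dword of +1.0.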
2592097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2593097a140dSpatrick .addReg(HiReg)
2594097a140dSpatrick .addReg(ConstReg);
2595097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2596097a140dSpatrick .addReg(LoReg)
2597097a140dSpatrick .addImm(AMDGPU::sub0)
2598097a140dSpatrick .addReg(OpReg)
2599097a140dSpatrick .addImm(AMDGPU::sub1);
2600097a140dSpatrick
2601097a140dSpatrick MI.eraseFromParent();
2602097a140dSpatrick return true;
2603097a140dSpatrick }
2604097a140dSpatrick
260509467b48Spatrick static bool isConstant(const MachineInstr &MI) {
260609467b48Spatrick return MI.getOpcode() == TargetOpcode::G_CONSTANT;
260709467b48Spatrick }
260809467b48Spatrick
260909467b48Spatrick void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
261009467b48Spatrick const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
261109467b48Spatrick
261209467b48Spatrick const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
261309467b48Spatrick
261409467b48Spatrick assert(PtrMI);
261509467b48Spatrick
261609467b48Spatrick if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
261709467b48Spatrick return;
261809467b48Spatrick
2619*d415bd75Srobert GEPInfo GEPInfo;
262009467b48Spatrick
262109467b48Spatrick for (unsigned i = 1; i != 3; ++i) {
262209467b48Spatrick const MachineOperand &GEPOp = PtrMI->getOperand(i);
262309467b48Spatrick const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
262409467b48Spatrick assert(OpDef);
262509467b48Spatrick if (i == 2 && isConstant(*OpDef)) {
262609467b48Spatrick // TODO: Could handle constant base + variable offset, but a combine
262709467b48Spatrick // probably should have commuted it.
262809467b48Spatrick assert(GEPInfo.Imm == 0);
262909467b48Spatrick GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
263009467b48Spatrick continue;
263109467b48Spatrick }
263209467b48Spatrick const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
263309467b48Spatrick if (OpBank->getID() == AMDGPU::SGPRRegBankID)
263409467b48Spatrick GEPInfo.SgprParts.push_back(GEPOp.getReg());
263509467b48Spatrick else
263609467b48Spatrick GEPInfo.VgprParts.push_back(GEPOp.getReg());
263709467b48Spatrick }
263809467b48Spatrick
263909467b48Spatrick AddrInfo.push_back(GEPInfo);
264009467b48Spatrick getAddrModeInfo(*PtrMI, MRI, AddrInfo);
264109467b48Spatrick }
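// Illustrative note (editor's addition): a chain like
//   %p = G_PTR_ADD %base, %c   with %c = G_CONSTANT 16
// appends a GEPInfo with Imm = 16 and %base recorded in SgprParts or
// VgprParts by bank; the recursion then records one entry per G_PTR_ADD
// feeding the pointer operand.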
264209467b48Spatrick
264373471bf0Spatrick bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
264473471bf0Spatrick return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
264573471bf0Spatrick }
264673471bf0Spatrick
264709467b48Spatrick bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
264809467b48Spatrick if (!MI.hasOneMemOperand())
264909467b48Spatrick return false;
265009467b48Spatrick
265109467b48Spatrick const MachineMemOperand *MMO = *MI.memoperands_begin();
265209467b48Spatrick const Value *Ptr = MMO->getValue();
265309467b48Spatrick
265409467b48Spatrick // UndefValue means this is a load of a kernel input. These are uniform.
265509467b48Spatrick // Sometimes LDS instructions have constant pointers.
265609467b48Spatrick // If Ptr is null, then that means this mem operand contains a
265709467b48Spatrick // PseudoSourceValue like GOT.
265809467b48Spatrick if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
265909467b48Spatrick isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
266009467b48Spatrick return true;
266109467b48Spatrick
266209467b48Spatrick if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
266309467b48Spatrick return true;
266409467b48Spatrick
266509467b48Spatrick const Instruction *I = dyn_cast<Instruction>(Ptr);
266609467b48Spatrick return I && I->getMetadata("amdgpu.uniform");
266709467b48Spatrick }
266809467b48Spatrick
266909467b48Spatrick bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
267009467b48Spatrick for (const GEPInfo &GEPInfo : AddrInfo) {
267109467b48Spatrick if (!GEPInfo.VgprParts.empty())
267209467b48Spatrick return true;
267309467b48Spatrick }
267409467b48Spatrick return false;
267509467b48Spatrick }
267609467b48Spatrick
267709467b48Spatrick void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
267809467b48Spatrick const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
267909467b48Spatrick unsigned AS = PtrTy.getAddressSpace();
268009467b48Spatrick if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
268109467b48Spatrick STI.ldsRequiresM0Init()) {
268273471bf0Spatrick MachineBasicBlock *BB = I.getParent();
268373471bf0Spatrick
2684*d415bd75Srobert // If DS instructions require M0 initialization, insert it before selecting.
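    // Editor's note: on these subtargets M0 acts as the LDS address clamp;
    // writing all ones sets the limit to the maximum so no in-bounds access
    // is clamped.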
268509467b48Spatrick BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
268609467b48Spatrick .addImm(-1);
268709467b48Spatrick }
268809467b48Spatrick }
268909467b48Spatrick
269073471bf0Spatrick bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
269173471bf0Spatrick MachineInstr &I) const {
269209467b48Spatrick initM0(I);
269309467b48Spatrick return selectImpl(I, *CoverageInfo);
269409467b48Spatrick }
269509467b48Spatrick
2696*d415bd75Srobert static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2697*d415bd75Srobert if (Reg.isPhysical())
2698*d415bd75Srobert return false;
2699097a140dSpatrick
2700*d415bd75Srobert MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2701*d415bd75Srobert const unsigned Opcode = MI.getOpcode();
2702097a140dSpatrick
2703*d415bd75Srobert if (Opcode == AMDGPU::COPY)
2704*d415bd75Srobert return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2705097a140dSpatrick
2706*d415bd75Srobert if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2707*d415bd75Srobert Opcode == AMDGPU::G_XOR)
2708*d415bd75Srobert return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2709*d415bd75Srobert isVCmpResult(MI.getOperand(2).getReg(), MRI);
2710097a140dSpatrick
2711*d415bd75Srobert if (Opcode == TargetOpcode::G_INTRINSIC)
2712*d415bd75Srobert return MI.getIntrinsicID() == Intrinsic::amdgcn_class;
2713097a140dSpatrick
2714*d415bd75Srobert return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2715097a140dSpatrick }
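// Illustrative note (editor's addition): a condition such as
//   %a = G_ICMP ..., %b = G_FCMP ..., %c = G_AND %a, %b
// is recognized here, since V_CMP results already have inactive lanes
// zeroed; selectG_BRCOND below can then skip the S_AND with exec.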
2716097a140dSpatrick
271709467b48Spatrick bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
271809467b48Spatrick MachineBasicBlock *BB = I.getParent();
271909467b48Spatrick MachineOperand &CondOp = I.getOperand(0);
272009467b48Spatrick Register CondReg = CondOp.getReg();
272109467b48Spatrick const DebugLoc &DL = I.getDebugLoc();
272209467b48Spatrick
272309467b48Spatrick unsigned BrOpcode;
272409467b48Spatrick Register CondPhysReg;
272509467b48Spatrick const TargetRegisterClass *ConstrainRC;
272609467b48Spatrick
272709467b48Spatrick // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
272809467b48Spatrick // whether the branch is uniform when selecting the instruction. In
272909467b48Spatrick // GlobalISel, we should push that decision into RegBankSelect. Assume for now
273009467b48Spatrick // RegBankSelect knows what it's doing if the branch condition is scc, even
273109467b48Spatrick // though it currently does not.
273209467b48Spatrick if (!isVCC(CondReg, *MRI)) {
273309467b48Spatrick if (MRI->getType(CondReg) != LLT::scalar(32))
273409467b48Spatrick return false;
273509467b48Spatrick
273609467b48Spatrick CondPhysReg = AMDGPU::SCC;
273709467b48Spatrick BrOpcode = AMDGPU::S_CBRANCH_SCC1;
273873471bf0Spatrick ConstrainRC = &AMDGPU::SReg_32RegClass;
273909467b48Spatrick } else {
274009467b48Spatrick     // FIXME: Should scc->vcc copies be ANDed with exec?
2741*d415bd75Srobert
2742*d415bd75Srobert     // Unless the value of CondReg is the result of a V_CMP* instruction, we
2743*d415bd75Srobert     // need to insert an AND with exec.
2744*d415bd75Srobert if (!isVCmpResult(CondReg, *MRI)) {
2745*d415bd75Srobert const bool Is64 = STI.isWave64();
2746*d415bd75Srobert const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2747*d415bd75Srobert const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2748*d415bd75Srobert
2749*d415bd75Srobert Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2750*d415bd75Srobert BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2751*d415bd75Srobert .addReg(CondReg)
2752*d415bd75Srobert .addReg(Exec);
2753*d415bd75Srobert CondReg = TmpReg;
2754*d415bd75Srobert }
2755*d415bd75Srobert
275609467b48Spatrick CondPhysReg = TRI.getVCC();
275709467b48Spatrick BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
275809467b48Spatrick ConstrainRC = TRI.getBoolRC();
275909467b48Spatrick }
276009467b48Spatrick
276109467b48Spatrick if (!MRI->getRegClassOrNull(CondReg))
276209467b48Spatrick MRI->setRegClass(CondReg, ConstrainRC);
276309467b48Spatrick
276409467b48Spatrick BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
276509467b48Spatrick .addReg(CondReg);
276609467b48Spatrick BuildMI(*BB, &I, DL, TII.get(BrOpcode))
276709467b48Spatrick .addMBB(I.getOperand(1).getMBB());
276809467b48Spatrick
276909467b48Spatrick I.eraseFromParent();
277009467b48Spatrick return true;
277109467b48Spatrick }
277209467b48Spatrick
277373471bf0Spatrick bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2774097a140dSpatrick MachineInstr &I) const {
277509467b48Spatrick Register DstReg = I.getOperand(0).getReg();
277609467b48Spatrick const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
277709467b48Spatrick const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
277809467b48Spatrick I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
277909467b48Spatrick if (IsVGPR)
278009467b48Spatrick I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
278109467b48Spatrick
278209467b48Spatrick return RBI.constrainGenericRegister(
278309467b48Spatrick DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
278409467b48Spatrick }
278509467b48Spatrick
2786097a140dSpatrick bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
278709467b48Spatrick Register DstReg = I.getOperand(0).getReg();
278809467b48Spatrick Register SrcReg = I.getOperand(1).getReg();
2789097a140dSpatrick Register MaskReg = I.getOperand(2).getReg();
2790097a140dSpatrick LLT Ty = MRI->getType(DstReg);
2791097a140dSpatrick LLT MaskTy = MRI->getType(MaskReg);
2792*d415bd75Srobert MachineBasicBlock *BB = I.getParent();
2793*d415bd75Srobert const DebugLoc &DL = I.getDebugLoc();
279409467b48Spatrick
279509467b48Spatrick const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
279609467b48Spatrick const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2797097a140dSpatrick const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
279809467b48Spatrick const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2799097a140dSpatrick   if (DstRB != SrcRB) // Should only happen for hand-written MIR.
2800097a140dSpatrick return false;
2801097a140dSpatrick
2802*d415bd75Srobert // Try to avoid emitting a bit operation when we only need to touch half of
2803*d415bd75Srobert // the 64-bit pointer.
2804*d415bd75Srobert APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zext(64);
2805*d415bd75Srobert const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2806*d415bd75Srobert const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2807*d415bd75Srobert
2808*d415bd75Srobert const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2809*d415bd75Srobert const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
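  // e.g. aligning a pointer with mask 0xFFFFFFFF'FFFFF000: the high half is
  // all ones, so only the low dword needs the AND; the high dword is copied.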
2810*d415bd75Srobert
2811*d415bd75Srobert if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2812*d415bd75Srobert !CanCopyLow32 && !CanCopyHi32) {
2813*d415bd75Srobert auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2814*d415bd75Srobert .addReg(SrcReg)
2815*d415bd75Srobert .addReg(MaskReg);
2816*d415bd75Srobert I.eraseFromParent();
2817*d415bd75Srobert return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2818*d415bd75Srobert }
2819*d415bd75Srobert
282009467b48Spatrick unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
282109467b48Spatrick const TargetRegisterClass &RegRC
282209467b48Spatrick = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
282309467b48Spatrick
2824*d415bd75Srobert const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2825*d415bd75Srobert const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2826097a140dSpatrick const TargetRegisterClass *MaskRC =
2827*d415bd75Srobert TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2828097a140dSpatrick
282909467b48Spatrick if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2830097a140dSpatrick !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2831097a140dSpatrick !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
283209467b48Spatrick return false;
283309467b48Spatrick
283409467b48Spatrick if (Ty.getSizeInBits() == 32) {
2835097a140dSpatrick assert(MaskTy.getSizeInBits() == 32 &&
2836097a140dSpatrick "ptrmask should have been narrowed during legalize");
2837097a140dSpatrick
283809467b48Spatrick BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
283909467b48Spatrick .addReg(SrcReg)
2840097a140dSpatrick .addReg(MaskReg);
284109467b48Spatrick I.eraseFromParent();
284209467b48Spatrick return true;
284309467b48Spatrick }
284409467b48Spatrick
284509467b48Spatrick Register HiReg = MRI->createVirtualRegister(&RegRC);
284609467b48Spatrick Register LoReg = MRI->createVirtualRegister(&RegRC);
284709467b48Spatrick
2848097a140dSpatrick // Extract the subregisters from the source pointer.
284909467b48Spatrick BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
285009467b48Spatrick .addReg(SrcReg, 0, AMDGPU::sub0);
285109467b48Spatrick BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
285209467b48Spatrick .addReg(SrcReg, 0, AMDGPU::sub1);
285309467b48Spatrick
2854097a140dSpatrick Register MaskedLo, MaskedHi;
2855097a140dSpatrick
2856*d415bd75Srobert if (CanCopyLow32) {
2857097a140dSpatrick // If all the bits in the low half are 1, we only need a copy for it.
2858097a140dSpatrick MaskedLo = LoReg;
2859097a140dSpatrick } else {
2860097a140dSpatrick // Extract the mask subregister and apply the and.
2861097a140dSpatrick Register MaskLo = MRI->createVirtualRegister(&RegRC);
2862097a140dSpatrick MaskedLo = MRI->createVirtualRegister(&RegRC);
2863097a140dSpatrick
2864097a140dSpatrick BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2865097a140dSpatrick .addReg(MaskReg, 0, AMDGPU::sub0);
2866097a140dSpatrick BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
286709467b48Spatrick .addReg(LoReg)
2868097a140dSpatrick .addReg(MaskLo);
2869097a140dSpatrick }
2870097a140dSpatrick
2871*d415bd75Srobert if (CanCopyHi32) {
2872097a140dSpatrick // If all the bits in the high half are 1, we only need a copy for it.
2873097a140dSpatrick MaskedHi = HiReg;
2874097a140dSpatrick } else {
2875097a140dSpatrick Register MaskHi = MRI->createVirtualRegister(&RegRC);
2876097a140dSpatrick MaskedHi = MRI->createVirtualRegister(&RegRC);
2877097a140dSpatrick
2878097a140dSpatrick BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2879097a140dSpatrick .addReg(MaskReg, 0, AMDGPU::sub1);
2880097a140dSpatrick BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
288109467b48Spatrick .addReg(HiReg)
2882097a140dSpatrick .addReg(MaskHi);
2883097a140dSpatrick }
2884097a140dSpatrick
2885097a140dSpatrick BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2886097a140dSpatrick .addReg(MaskedLo)
2887097a140dSpatrick .addImm(AMDGPU::sub0)
2888097a140dSpatrick .addReg(MaskedHi)
288909467b48Spatrick .addImm(AMDGPU::sub1);
289009467b48Spatrick I.eraseFromParent();
289109467b48Spatrick return true;
289209467b48Spatrick }
289309467b48Spatrick
2894097a140dSpatrick /// Return the register to use for the index value, and the subregister to use
2895097a140dSpatrick /// for the indirectly accessed register.
2896097a140dSpatrick static std::pair<Register, unsigned>
2897*d415bd75Srobert computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
2898*d415bd75Srobert const TargetRegisterClass *SuperRC, Register IdxReg,
2899*d415bd75Srobert unsigned EltSize, GISelKnownBits &KnownBits) {
2900097a140dSpatrick Register IdxBaseReg;
2901097a140dSpatrick int Offset;
2902097a140dSpatrick
2903*d415bd75Srobert std::tie(IdxBaseReg, Offset) =
2904*d415bd75Srobert AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
2905097a140dSpatrick if (IdxBaseReg == AMDGPU::NoRegister) {
2906097a140dSpatrick // This will happen if the index is a known constant. This should ordinarily
2907097a140dSpatrick // be legalized out, but handle it as a register just in case.
2908097a140dSpatrick assert(Offset == 0);
2909097a140dSpatrick IdxBaseReg = IdxReg;
2910097a140dSpatrick }
2911097a140dSpatrick
2912097a140dSpatrick ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2913097a140dSpatrick
2914097a140dSpatrick // Skip out of bounds offsets, or else we would end up using an undefined
2915097a140dSpatrick // register.
2916097a140dSpatrick if (static_cast<unsigned>(Offset) >= SubRegs.size())
2917*d415bd75Srobert return std::pair(IdxReg, SubRegs[0]);
2918*d415bd75Srobert return std::pair(IdxBaseReg, SubRegs[Offset]);
2919097a140dSpatrick }
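// Illustrative note (editor's addition): for a 128-bit register indexed with
// EltSize == 4 and IdxReg defined as %base + 2, this returns {%base, sub2};
// a constant offset past the last element falls back to {IdxReg, sub0} so we
// never name a nonexistent subregister.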
2920097a140dSpatrick
292109467b48Spatrick bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
292209467b48Spatrick MachineInstr &MI) const {
292309467b48Spatrick Register DstReg = MI.getOperand(0).getReg();
292409467b48Spatrick Register SrcReg = MI.getOperand(1).getReg();
292509467b48Spatrick Register IdxReg = MI.getOperand(2).getReg();
292609467b48Spatrick
292709467b48Spatrick LLT DstTy = MRI->getType(DstReg);
292809467b48Spatrick LLT SrcTy = MRI->getType(SrcReg);
292909467b48Spatrick
293009467b48Spatrick const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
293109467b48Spatrick const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
293209467b48Spatrick const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
293309467b48Spatrick
293409467b48Spatrick   // The index must be scalar. If it wasn't, RegBankSelect should have moved
293509467b48Spatrick   // this into a waterfall loop.
293609467b48Spatrick if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
293709467b48Spatrick return false;
293809467b48Spatrick
2939*d415bd75Srobert const TargetRegisterClass *SrcRC =
2940*d415bd75Srobert TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
2941*d415bd75Srobert const TargetRegisterClass *DstRC =
2942*d415bd75Srobert TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
2943097a140dSpatrick if (!SrcRC || !DstRC)
2944097a140dSpatrick return false;
294509467b48Spatrick if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
294609467b48Spatrick !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
294709467b48Spatrick !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
294809467b48Spatrick return false;
294909467b48Spatrick
295009467b48Spatrick MachineBasicBlock *BB = MI.getParent();
295109467b48Spatrick const DebugLoc &DL = MI.getDebugLoc();
295209467b48Spatrick const bool Is64 = DstTy.getSizeInBits() == 64;
295309467b48Spatrick
2954097a140dSpatrick unsigned SubReg;
2955*d415bd75Srobert std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
2956*d415bd75Srobert *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KnownBits);
295709467b48Spatrick
295809467b48Spatrick if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
295909467b48Spatrick if (DstTy.getSizeInBits() != 32 && !Is64)
296009467b48Spatrick return false;
296109467b48Spatrick
296209467b48Spatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
296309467b48Spatrick .addReg(IdxReg);
296409467b48Spatrick
296509467b48Spatrick unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
296609467b48Spatrick BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
296709467b48Spatrick .addReg(SrcReg, 0, SubReg)
296809467b48Spatrick .addReg(SrcReg, RegState::Implicit);
296909467b48Spatrick MI.eraseFromParent();
297009467b48Spatrick return true;
297109467b48Spatrick }
297209467b48Spatrick
297309467b48Spatrick if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
297409467b48Spatrick return false;
297509467b48Spatrick
297609467b48Spatrick if (!STI.useVGPRIndexMode()) {
297709467b48Spatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
297809467b48Spatrick .addReg(IdxReg);
297909467b48Spatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
298073471bf0Spatrick .addReg(SrcReg, 0, SubReg)
298109467b48Spatrick .addReg(SrcReg, RegState::Implicit);
298209467b48Spatrick MI.eraseFromParent();
298309467b48Spatrick return true;
298409467b48Spatrick }
298509467b48Spatrick
298673471bf0Spatrick const MCInstrDesc &GPRIDXDesc =
298773471bf0Spatrick TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
298873471bf0Spatrick BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
298973471bf0Spatrick .addReg(SrcReg)
299009467b48Spatrick .addReg(IdxReg)
299173471bf0Spatrick .addImm(SubReg);
299209467b48Spatrick
299309467b48Spatrick MI.eraseFromParent();
299409467b48Spatrick return true;
299509467b48Spatrick }
299609467b48Spatrick
2997097a140dSpatrick // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2998097a140dSpatrick bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2999097a140dSpatrick MachineInstr &MI) const {
3000097a140dSpatrick Register DstReg = MI.getOperand(0).getReg();
3001097a140dSpatrick Register VecReg = MI.getOperand(1).getReg();
3002097a140dSpatrick Register ValReg = MI.getOperand(2).getReg();
3003097a140dSpatrick Register IdxReg = MI.getOperand(3).getReg();
3004097a140dSpatrick
3005097a140dSpatrick LLT VecTy = MRI->getType(DstReg);
3006097a140dSpatrick LLT ValTy = MRI->getType(ValReg);
3007097a140dSpatrick unsigned VecSize = VecTy.getSizeInBits();
3008097a140dSpatrick unsigned ValSize = ValTy.getSizeInBits();
3009097a140dSpatrick
3010097a140dSpatrick const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3011097a140dSpatrick const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3012097a140dSpatrick const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3013097a140dSpatrick
3014097a140dSpatrick assert(VecTy.getElementType() == ValTy);
3015097a140dSpatrick
3016097a140dSpatrick   // The index must be scalar. If it wasn't, RegBankSelect should have moved
3017097a140dSpatrick   // this into a waterfall loop.
3018097a140dSpatrick if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3019097a140dSpatrick return false;
3020097a140dSpatrick
3021*d415bd75Srobert const TargetRegisterClass *VecRC =
3022*d415bd75Srobert TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3023*d415bd75Srobert const TargetRegisterClass *ValRC =
3024*d415bd75Srobert TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3025097a140dSpatrick
3026097a140dSpatrick if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3027097a140dSpatrick !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3028097a140dSpatrick !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3029097a140dSpatrick !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3030097a140dSpatrick return false;
3031097a140dSpatrick
3032097a140dSpatrick if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3033097a140dSpatrick return false;
3034097a140dSpatrick
3035097a140dSpatrick unsigned SubReg;
3036097a140dSpatrick std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
3037*d415bd75Srobert ValSize / 8, *KnownBits);
3038097a140dSpatrick
3039097a140dSpatrick const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3040097a140dSpatrick STI.useVGPRIndexMode();
3041097a140dSpatrick
3042097a140dSpatrick MachineBasicBlock *BB = MI.getParent();
3043097a140dSpatrick const DebugLoc &DL = MI.getDebugLoc();
3044097a140dSpatrick
304573471bf0Spatrick if (!IndexMode) {
3046097a140dSpatrick BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3047097a140dSpatrick .addReg(IdxReg);
3048097a140dSpatrick
304973471bf0Spatrick const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
305073471bf0Spatrick VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3051097a140dSpatrick BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3052097a140dSpatrick .addReg(VecReg)
3053097a140dSpatrick .addReg(ValReg)
3054097a140dSpatrick .addImm(SubReg);
305573471bf0Spatrick MI.eraseFromParent();
305673471bf0Spatrick return true;
305773471bf0Spatrick }
3058097a140dSpatrick
305973471bf0Spatrick const MCInstrDesc &GPRIDXDesc =
306073471bf0Spatrick TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
306173471bf0Spatrick BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
306273471bf0Spatrick .addReg(VecReg)
306373471bf0Spatrick .addReg(ValReg)
306473471bf0Spatrick .addReg(IdxReg)
306573471bf0Spatrick .addImm(SubReg);
3066097a140dSpatrick
3067097a140dSpatrick MI.eraseFromParent();
3068097a140dSpatrick return true;
3069097a140dSpatrick }
3070097a140dSpatrick
3071*d415bd75Srobert bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3072*d415bd75Srobert unsigned Opc;
3073*d415bd75Srobert unsigned Size = MI.getOperand(3).getImm();
3074*d415bd75Srobert
3075*d415bd75Srobert // The struct intrinsic variants add one additional operand over raw.
3076*d415bd75Srobert const bool HasVIndex = MI.getNumOperands() == 9;
3077*d415bd75Srobert Register VIndex;
3078*d415bd75Srobert int OpOffset = 0;
3079*d415bd75Srobert if (HasVIndex) {
3080*d415bd75Srobert VIndex = MI.getOperand(4).getReg();
3081*d415bd75Srobert OpOffset = 1;
3082097a140dSpatrick }
3083097a140dSpatrick
3084*d415bd75Srobert Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3085*d415bd75Srobert std::optional<ValueAndVReg> MaybeVOffset =
3086*d415bd75Srobert getIConstantVRegValWithLookThrough(VOffset, *MRI);
3087*d415bd75Srobert const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
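  // The offset counts as present unless it folds to a constant zero; a zero
  // voffset lets the _OFFSET/_IDXEN forms drop the VGPR operand entirely.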
3088097a140dSpatrick
3089*d415bd75Srobert switch (Size) {
3090*d415bd75Srobert default:
3091097a140dSpatrick return false;
3092*d415bd75Srobert case 1:
3093*d415bd75Srobert Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3094*d415bd75Srobert : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3095*d415bd75Srobert : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3096*d415bd75Srobert : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3097*d415bd75Srobert break;
3098*d415bd75Srobert case 2:
3099*d415bd75Srobert Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3100*d415bd75Srobert : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3101*d415bd75Srobert : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3102*d415bd75Srobert : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3103*d415bd75Srobert break;
3104*d415bd75Srobert case 4:
3105*d415bd75Srobert Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3106*d415bd75Srobert : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3107*d415bd75Srobert : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3108*d415bd75Srobert : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3109*d415bd75Srobert break;
3110*d415bd75Srobert }
3111097a140dSpatrick
3112097a140dSpatrick MachineBasicBlock *MBB = MI.getParent();
3113097a140dSpatrick const DebugLoc &DL = MI.getDebugLoc();
3114*d415bd75Srobert BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3115*d415bd75Srobert .add(MI.getOperand(2));
3116097a140dSpatrick
3117*d415bd75Srobert auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3118097a140dSpatrick
3119*d415bd75Srobert if (HasVIndex && HasVOffset) {
312073471bf0Spatrick Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3121*d415bd75Srobert BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3122*d415bd75Srobert .addReg(VIndex)
312373471bf0Spatrick .addImm(AMDGPU::sub0)
3124*d415bd75Srobert .addReg(VOffset)
312573471bf0Spatrick .addImm(AMDGPU::sub1);
312673471bf0Spatrick
3127*d415bd75Srobert MIB.addReg(IdxReg);
312873471bf0Spatrick } else if (HasVIndex) {
3129*d415bd75Srobert MIB.addReg(VIndex);
313073471bf0Spatrick } else if (HasVOffset) {
3131*d415bd75Srobert MIB.addReg(VOffset);
313273471bf0Spatrick }
313373471bf0Spatrick
3134*d415bd75Srobert MIB.add(MI.getOperand(1)); // rsrc
3135*d415bd75Srobert MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3136*d415bd75Srobert MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3137*d415bd75Srobert unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3138*d415bd75Srobert MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3139*d415bd75Srobert MIB.addImm((Aux >> 3) & 1); // swz
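  // The aux immediate packs the cache-policy (cpol) bits in its low bits and
  // the buffer-swizzle (swz) flag at bit 3, as decoded above.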
3140*d415bd75Srobert
3141*d415bd75Srobert MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3142*d415bd75Srobert MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3143*d415bd75Srobert LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3144*d415bd75Srobert MachinePointerInfo StorePtrI = LoadPtrI;
3145*d415bd75Srobert StorePtrI.V = nullptr;
3146*d415bd75Srobert StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3147*d415bd75Srobert
3148*d415bd75Srobert auto F = LoadMMO->getFlags() &
3149*d415bd75Srobert ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3150*d415bd75Srobert LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3151*d415bd75Srobert Size, LoadMMO->getBaseAlign());
3152*d415bd75Srobert
3153*d415bd75Srobert MachineMemOperand *StoreMMO =
3154*d415bd75Srobert MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3155*d415bd75Srobert sizeof(int32_t), LoadMMO->getBaseAlign());
3156*d415bd75Srobert
3157*d415bd75Srobert MIB.setMemRefs({LoadMMO, StoreMMO});
315873471bf0Spatrick
315973471bf0Spatrick MI.eraseFromParent();
3160*d415bd75Srobert return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
316173471bf0Spatrick }
316273471bf0Spatrick
3163*d415bd75Srobert /// Match a zero extend from a 32-bit value to 64 bits.
3164*d415bd75Srobert static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3165*d415bd75Srobert Register ZExtSrc;
3166*d415bd75Srobert if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3167*d415bd75Srobert return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
316873471bf0Spatrick
3169*d415bd75Srobert // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3170*d415bd75Srobert const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3171*d415bd75Srobert if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3172*d415bd75Srobert return Register();
3173*d415bd75Srobert
3174*d415bd75Srobert assert(Def->getNumOperands() == 3 &&
3175*d415bd75Srobert MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3176*d415bd75Srobert if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3177*d415bd75Srobert return Def->getOperand(1).getReg();
3178*d415bd75Srobert }
3179*d415bd75Srobert
3180*d415bd75Srobert return Register();
3181*d415bd75Srobert }
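// Illustrative note (editor's addition): both of these forms match and
// return %x:
//   %z:_(s64) = G_ZEXT %x:_(s32)
//   %z:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32) ; %zero = G_CONSTANT 0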
3182*d415bd75Srobert
3183*d415bd75Srobert bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3184*d415bd75Srobert unsigned Opc;
3185*d415bd75Srobert unsigned Size = MI.getOperand(3).getImm();
3186*d415bd75Srobert
3187*d415bd75Srobert switch (Size) {
3188*d415bd75Srobert default:
3189*d415bd75Srobert return false;
3190*d415bd75Srobert case 1:
3191*d415bd75Srobert Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3192*d415bd75Srobert break;
3193*d415bd75Srobert case 2:
3194*d415bd75Srobert Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3195*d415bd75Srobert break;
3196*d415bd75Srobert case 4:
3197*d415bd75Srobert Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3198*d415bd75Srobert break;
319973471bf0Spatrick }
320073471bf0Spatrick
320173471bf0Spatrick MachineBasicBlock *MBB = MI.getParent();
320273471bf0Spatrick const DebugLoc &DL = MI.getDebugLoc();
3203*d415bd75Srobert BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3204*d415bd75Srobert .add(MI.getOperand(2));
320573471bf0Spatrick
3206*d415bd75Srobert Register Addr = MI.getOperand(1).getReg();
3207*d415bd75Srobert Register VOffset;
3208*d415bd75Srobert // Try to split SAddr and VOffset. Global and LDS pointers share the same
3209*d415bd75Srobert // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3210*d415bd75Srobert if (!isSGPR(Addr)) {
3211*d415bd75Srobert auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3212*d415bd75Srobert if (isSGPR(AddrDef->Reg)) {
3213*d415bd75Srobert Addr = AddrDef->Reg;
3214*d415bd75Srobert } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3215*d415bd75Srobert Register SAddr =
3216*d415bd75Srobert getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3217*d415bd75Srobert if (isSGPR(SAddr)) {
3218*d415bd75Srobert Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3219*d415bd75Srobert if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3220*d415bd75Srobert Addr = SAddr;
3221*d415bd75Srobert VOffset = Off;
3222*d415bd75Srobert }
3223*d415bd75Srobert }
3224*d415bd75Srobert }
322573471bf0Spatrick }
322673471bf0Spatrick
3227*d415bd75Srobert if (isSGPR(Addr)) {
3228*d415bd75Srobert Opc = AMDGPU::getGlobalSaddrOp(Opc);
3229*d415bd75Srobert if (!VOffset) {
3230*d415bd75Srobert VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3231*d415bd75Srobert BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3232*d415bd75Srobert .addImm(0);
3233*d415bd75Srobert }
3234*d415bd75Srobert }
323573471bf0Spatrick
323673471bf0Spatrick auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3237*d415bd75Srobert .addReg(Addr);
3238*d415bd75Srobert
3239*d415bd75Srobert if (isSGPR(Addr))
3240*d415bd75Srobert MIB.addReg(VOffset);
3241*d415bd75Srobert
3242*d415bd75Srobert MIB.add(MI.getOperand(4)) // offset
3243*d415bd75Srobert .add(MI.getOperand(5)); // cpol
3244*d415bd75Srobert
3245*d415bd75Srobert MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3246*d415bd75Srobert MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3247*d415bd75Srobert LoadPtrI.Offset = MI.getOperand(4).getImm();
3248*d415bd75Srobert MachinePointerInfo StorePtrI = LoadPtrI;
3249*d415bd75Srobert LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3250*d415bd75Srobert StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3251*d415bd75Srobert auto F = LoadMMO->getFlags() &
3252*d415bd75Srobert ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3253*d415bd75Srobert LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3254*d415bd75Srobert Size, LoadMMO->getBaseAlign());
3255*d415bd75Srobert MachineMemOperand *StoreMMO =
3256*d415bd75Srobert MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3257*d415bd75Srobert sizeof(int32_t), Align(4));
3258*d415bd75Srobert
3259*d415bd75Srobert MIB.setMemRefs({LoadMMO, StoreMMO});
326073471bf0Spatrick
326173471bf0Spatrick MI.eraseFromParent();
326273471bf0Spatrick return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
326373471bf0Spatrick }
326473471bf0Spatrick
326573471bf0Spatrick bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
326673471bf0Spatrick MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3267*d415bd75Srobert MI.removeOperand(1);
326873471bf0Spatrick MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
326973471bf0Spatrick return true;
327073471bf0Spatrick }
327173471bf0Spatrick
3272*d415bd75Srobert bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3273*d415bd75Srobert unsigned Opc;
3274*d415bd75Srobert switch (MI.getIntrinsicID()) {
3275*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3276*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3277*d415bd75Srobert break;
3278*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3279*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3280*d415bd75Srobert break;
3281*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3282*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3283*d415bd75Srobert break;
3284*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3285*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3286*d415bd75Srobert break;
3287*d415bd75Srobert case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3288*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3289*d415bd75Srobert break;
3290*d415bd75Srobert case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3291*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3292*d415bd75Srobert break;
3293*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3294*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3295*d415bd75Srobert break;
3296*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3297*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3298*d415bd75Srobert break;
3299*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3300*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3301*d415bd75Srobert break;
3302*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3303*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3304*d415bd75Srobert break;
3305*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3306*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3307*d415bd75Srobert break;
3308*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3309*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3310*d415bd75Srobert break;
3311*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3312*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3313*d415bd75Srobert break;
3314*d415bd75Srobert case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3315*d415bd75Srobert Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3316*d415bd75Srobert break;
3317*d415bd75Srobert default:
3318*d415bd75Srobert llvm_unreachable("unhandled smfmac intrinsic");
3319*d415bd75Srobert }
3320*d415bd75Srobert
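// The intrinsic carries VDst_In ahead of the sources, but the selected MC
// instruction expects it as the trailing operand: save it, drop it along
// with the intrinsic ID, then re-append it.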
3321*d415bd75Srobert auto VDst_In = MI.getOperand(4);
3322*d415bd75Srobert
3323*d415bd75Srobert MI.setDesc(TII.get(Opc));
3324*d415bd75Srobert MI.removeOperand(4); // VDst_In
3325*d415bd75Srobert MI.removeOperand(1); // Intrinsic ID
3326*d415bd75Srobert MI.addOperand(VDst_In); // Re-add VDst_In to the end
3327*d415bd75Srobert MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3328*d415bd75Srobert return true;
3329*d415bd75Srobert }
3330*d415bd75Srobert
3331*d415bd75Srobert bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3332*d415bd75Srobert Register DstReg = MI.getOperand(0).getReg();
3333*d415bd75Srobert Register SrcReg = MI.getOperand(1).getReg();
3334*d415bd75Srobert const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3335*d415bd75Srobert const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3336*d415bd75Srobert MachineBasicBlock *MBB = MI.getParent();
3337*d415bd75Srobert const DebugLoc &DL = MI.getDebugLoc();
3338*d415bd75Srobert
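// The wave address is recovered by shifting the scratch byte address right
// by log2(wavefront size); the VALU or SALU shift is picked to match the
// destination register bank.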
3339*d415bd75Srobert if (IsVALU) {
3340*d415bd75Srobert BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3341*d415bd75Srobert .addImm(Subtarget->getWavefrontSizeLog2())
3342*d415bd75Srobert .addReg(SrcReg);
3343*d415bd75Srobert } else {
3344*d415bd75Srobert BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3345*d415bd75Srobert .addReg(SrcReg)
3346*d415bd75Srobert .addImm(Subtarget->getWavefrontSizeLog2());
3347*d415bd75Srobert }
3348*d415bd75Srobert
3349*d415bd75Srobert const TargetRegisterClass &RC =
3350*d415bd75Srobert IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3351*d415bd75Srobert if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3352*d415bd75Srobert return false;
3353*d415bd75Srobert
3354*d415bd75Srobert MI.eraseFromParent();
3355*d415bd75Srobert return true;
3356*d415bd75Srobert }
3357*d415bd75Srobert
335809467b48Spatrick bool AMDGPUInstructionSelector::select(MachineInstr &I) {
335909467b48Spatrick if (I.isPHI())
336009467b48Spatrick return selectPHI(I);
336109467b48Spatrick
336209467b48Spatrick if (!I.isPreISelOpcode()) {
336309467b48Spatrick if (I.isCopy())
336409467b48Spatrick return selectCOPY(I);
336509467b48Spatrick return true;
336609467b48Spatrick }
336709467b48Spatrick
336809467b48Spatrick switch (I.getOpcode()) {
336909467b48Spatrick case TargetOpcode::G_AND:
337009467b48Spatrick case TargetOpcode::G_OR:
337109467b48Spatrick case TargetOpcode::G_XOR:
3372097a140dSpatrick if (selectImpl(I, *CoverageInfo))
337309467b48Spatrick return true;
3374097a140dSpatrick return selectG_AND_OR_XOR(I);
337509467b48Spatrick case TargetOpcode::G_ADD:
337609467b48Spatrick case TargetOpcode::G_SUB:
337709467b48Spatrick if (selectImpl(I, *CoverageInfo))
337809467b48Spatrick return true;
337909467b48Spatrick return selectG_ADD_SUB(I);
338009467b48Spatrick case TargetOpcode::G_UADDO:
338109467b48Spatrick case TargetOpcode::G_USUBO:
338209467b48Spatrick case TargetOpcode::G_UADDE:
338309467b48Spatrick case TargetOpcode::G_USUBE:
338409467b48Spatrick return selectG_UADDO_USUBO_UADDE_USUBE(I);
3385*d415bd75Srobert case AMDGPU::G_AMDGPU_MAD_U64_U32:
3386*d415bd75Srobert case AMDGPU::G_AMDGPU_MAD_I64_I32:
3387*d415bd75Srobert return selectG_AMDGPU_MAD_64_32(I);
338809467b48Spatrick case TargetOpcode::G_INTTOPTR:
338909467b48Spatrick case TargetOpcode::G_BITCAST:
339009467b48Spatrick case TargetOpcode::G_PTRTOINT:
339109467b48Spatrick return selectCOPY(I);
339209467b48Spatrick case TargetOpcode::G_CONSTANT:
339309467b48Spatrick case TargetOpcode::G_FCONSTANT:
339409467b48Spatrick return selectG_CONSTANT(I);
3395097a140dSpatrick case TargetOpcode::G_FNEG:
3396097a140dSpatrick if (selectImpl(I, *CoverageInfo))
3397097a140dSpatrick return true;
3398097a140dSpatrick return selectG_FNEG(I);
3399097a140dSpatrick case TargetOpcode::G_FABS:
3400097a140dSpatrick if (selectImpl(I, *CoverageInfo))
3401097a140dSpatrick return true;
3402097a140dSpatrick return selectG_FABS(I);
340309467b48Spatrick case TargetOpcode::G_EXTRACT:
340409467b48Spatrick return selectG_EXTRACT(I);
3405*d415bd75Srobert case TargetOpcode::G_FMA:
3406*d415bd75Srobert case TargetOpcode::G_FMAD:
3407*d415bd75Srobert if (selectG_FMA_FMAD(I))
3408*d415bd75Srobert return true;
3409*d415bd75Srobert return selectImpl(I, *CoverageInfo);
341009467b48Spatrick case TargetOpcode::G_MERGE_VALUES:
341109467b48Spatrick case TargetOpcode::G_CONCAT_VECTORS:
341209467b48Spatrick return selectG_MERGE_VALUES(I);
341309467b48Spatrick case TargetOpcode::G_UNMERGE_VALUES:
341409467b48Spatrick return selectG_UNMERGE_VALUES(I);
3415*d415bd75Srobert case TargetOpcode::G_BUILD_VECTOR:
3416097a140dSpatrick case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3417*d415bd75Srobert return selectG_BUILD_VECTOR(I);
341809467b48Spatrick case TargetOpcode::G_PTR_ADD:
3419*d415bd75Srobert if (selectImpl(I, *CoverageInfo))
3420*d415bd75Srobert return true;
342109467b48Spatrick return selectG_PTR_ADD(I);
342209467b48Spatrick case TargetOpcode::G_IMPLICIT_DEF:
342309467b48Spatrick return selectG_IMPLICIT_DEF(I);
342473471bf0Spatrick case TargetOpcode::G_FREEZE:
342573471bf0Spatrick return selectCOPY(I);
342609467b48Spatrick case TargetOpcode::G_INSERT:
342709467b48Spatrick return selectG_INSERT(I);
342809467b48Spatrick case TargetOpcode::G_INTRINSIC:
342909467b48Spatrick return selectG_INTRINSIC(I);
343009467b48Spatrick case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
343109467b48Spatrick return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
343209467b48Spatrick case TargetOpcode::G_ICMP:
343309467b48Spatrick if (selectG_ICMP(I))
343409467b48Spatrick return true;
343509467b48Spatrick return selectImpl(I, *CoverageInfo);
343609467b48Spatrick case TargetOpcode::G_LOAD:
343773471bf0Spatrick case TargetOpcode::G_STORE:
343809467b48Spatrick case TargetOpcode::G_ATOMIC_CMPXCHG:
343909467b48Spatrick case TargetOpcode::G_ATOMICRMW_XCHG:
344009467b48Spatrick case TargetOpcode::G_ATOMICRMW_ADD:
344109467b48Spatrick case TargetOpcode::G_ATOMICRMW_SUB:
344209467b48Spatrick case TargetOpcode::G_ATOMICRMW_AND:
344309467b48Spatrick case TargetOpcode::G_ATOMICRMW_OR:
344409467b48Spatrick case TargetOpcode::G_ATOMICRMW_XOR:
344509467b48Spatrick case TargetOpcode::G_ATOMICRMW_MIN:
344609467b48Spatrick case TargetOpcode::G_ATOMICRMW_MAX:
344709467b48Spatrick case TargetOpcode::G_ATOMICRMW_UMIN:
344809467b48Spatrick case TargetOpcode::G_ATOMICRMW_UMAX:
344909467b48Spatrick case TargetOpcode::G_ATOMICRMW_FADD:
345073471bf0Spatrick case AMDGPU::G_AMDGPU_ATOMIC_INC:
345173471bf0Spatrick case AMDGPU::G_AMDGPU_ATOMIC_DEC:
345273471bf0Spatrick case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
345373471bf0Spatrick case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
345473471bf0Spatrick return selectG_LOAD_STORE_ATOMICRMW(I);
345509467b48Spatrick case TargetOpcode::G_SELECT:
345609467b48Spatrick return selectG_SELECT(I);
345709467b48Spatrick case TargetOpcode::G_TRUNC:
345809467b48Spatrick return selectG_TRUNC(I);
345909467b48Spatrick case TargetOpcode::G_SEXT:
346009467b48Spatrick case TargetOpcode::G_ZEXT:
346109467b48Spatrick case TargetOpcode::G_ANYEXT:
3462097a140dSpatrick case TargetOpcode::G_SEXT_INREG:
346309467b48Spatrick if (selectImpl(I, *CoverageInfo))
346409467b48Spatrick return true;
346509467b48Spatrick return selectG_SZA_EXT(I);
346609467b48Spatrick case TargetOpcode::G_BRCOND:
346709467b48Spatrick return selectG_BRCOND(I);
3468097a140dSpatrick case TargetOpcode::G_GLOBAL_VALUE:
346973471bf0Spatrick return selectG_GLOBAL_VALUE(I);
3470097a140dSpatrick case TargetOpcode::G_PTRMASK:
3471097a140dSpatrick return selectG_PTRMASK(I);
347209467b48Spatrick case TargetOpcode::G_EXTRACT_VECTOR_ELT:
347309467b48Spatrick return selectG_EXTRACT_VECTOR_ELT(I);
3474097a140dSpatrick case TargetOpcode::G_INSERT_VECTOR_ELT:
3475097a140dSpatrick return selectG_INSERT_VECTOR_ELT(I);
3476097a140dSpatrick case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3477*d415bd75Srobert case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3478*d415bd75Srobert case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3479*d415bd75Srobert case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3480097a140dSpatrick const AMDGPU::ImageDimIntrinsicInfo *Intr
3481097a140dSpatrick = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3482097a140dSpatrick assert(Intr && "not an image intrinsic with image pseudo");
3483097a140dSpatrick return selectImageIntrinsic(I, Intr);
3484097a140dSpatrick }
348573471bf0Spatrick case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
348673471bf0Spatrick return selectBVHIntrinsic(I);
348773471bf0Spatrick case AMDGPU::G_SBFX:
348873471bf0Spatrick case AMDGPU::G_UBFX:
348973471bf0Spatrick return selectG_SBFX_UBFX(I);
3490*d415bd75Srobert case AMDGPU::G_SI_CALL:
3491*d415bd75Srobert I.setDesc(TII.get(AMDGPU::SI_CALL));
3492*d415bd75Srobert return true;
3493*d415bd75Srobert case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3494*d415bd75Srobert return selectWaveAddress(I);
349509467b48Spatrick default:
349609467b48Spatrick return selectImpl(I, *CoverageInfo);
349709467b48Spatrick }
349809467b48Spatrick return false;
349909467b48Spatrick }
350009467b48Spatrick
350109467b48Spatrick InstructionSelector::ComplexRendererFns
350209467b48Spatrick AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
350309467b48Spatrick return {{
350409467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
350509467b48Spatrick }};
350609467b48Spatrick
350709467b48Spatrick }
350809467b48Spatrick
3509*d415bd75Srobert std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
3510*d415bd75Srobert MachineOperand &Root, bool AllowAbs, bool OpSel) const {
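// Walk the def chain: a G_FNEG folds into the NEG modifier and, when
// AllowAbs is set, a G_FABS folds into ABS; the stripped source register is
// returned together with the accumulated modifier bits.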
3511097a140dSpatrick Register Src = Root.getReg();
351209467b48Spatrick unsigned Mods = 0;
3513097a140dSpatrick MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
351409467b48Spatrick
3515*d415bd75Srobert if (MI->getOpcode() == AMDGPU::G_FNEG) {
351609467b48Spatrick Src = MI->getOperand(1).getReg();
351709467b48Spatrick Mods |= SISrcMods::NEG;
3518097a140dSpatrick MI = getDefIgnoringCopies(Src, *MRI);
351909467b48Spatrick }
352009467b48Spatrick
3521*d415bd75Srobert if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
352209467b48Spatrick Src = MI->getOperand(1).getReg();
352309467b48Spatrick Mods |= SISrcMods::ABS;
352409467b48Spatrick }
352509467b48Spatrick
3526*d415bd75Srobert if (OpSel)
3527*d415bd75Srobert Mods |= SISrcMods::OP_SEL_0;
3528*d415bd75Srobert
3529*d415bd75Srobert return std::pair(Src, Mods);
3530*d415bd75Srobert }
3531*d415bd75Srobert
3532*d415bd75Srobert Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3533*d415bd75Srobert Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3534*d415bd75Srobert bool ForceVGPR) const {
3535*d415bd75Srobert if ((Mods != 0 || ForceVGPR) &&
3536097a140dSpatrick RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3537097a140dSpatrick
3538097a140dSpatrick // If we looked through copies to find source modifiers on an SGPR operand,
3539097a140dSpatrick // we now have an SGPR register source. To avoid potentially violating the
3540097a140dSpatrick // constant bus restriction, we need to insert a copy to a VGPR.
3541*d415bd75Srobert Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3542*d415bd75Srobert BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3543097a140dSpatrick TII.get(AMDGPU::COPY), VGPRSrc)
3544097a140dSpatrick .addReg(Src);
3545097a140dSpatrick Src = VGPRSrc;
3546097a140dSpatrick }
3547097a140dSpatrick
3548*d415bd75Srobert return Src;
354909467b48Spatrick }
355009467b48Spatrick
355109467b48Spatrick ///
355209467b48Spatrick /// This will select either an SGPR or VGPR operand and will save us from
355309467b48Spatrick /// having to write an extra tablegen pattern.
355409467b48Spatrick InstructionSelector::ComplexRendererFns
355509467b48Spatrick AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
355609467b48Spatrick return {{
355709467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
355809467b48Spatrick }};
355909467b48Spatrick }
356009467b48Spatrick
356109467b48Spatrick InstructionSelector::ComplexRendererFns
356209467b48Spatrick AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
356309467b48Spatrick Register Src;
356409467b48Spatrick unsigned Mods;
3565097a140dSpatrick std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
356609467b48Spatrick
356709467b48Spatrick return {{
3568*d415bd75Srobert [=](MachineInstrBuilder &MIB) {
3569*d415bd75Srobert MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3570*d415bd75Srobert },
357109467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
357209467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
357309467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
357409467b48Spatrick }};
357509467b48Spatrick }
357609467b48Spatrick
357709467b48Spatrick InstructionSelector::ComplexRendererFns
357873471bf0Spatrick AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
357973471bf0Spatrick Register Src;
358073471bf0Spatrick unsigned Mods;
358173471bf0Spatrick std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
358273471bf0Spatrick
358373471bf0Spatrick return {{
3584*d415bd75Srobert [=](MachineInstrBuilder &MIB) {
3585*d415bd75Srobert MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3586*d415bd75Srobert },
358773471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
358873471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
358973471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
359073471bf0Spatrick }};
359173471bf0Spatrick }
359273471bf0Spatrick
359373471bf0Spatrick InstructionSelector::ComplexRendererFns
359409467b48Spatrick AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
359509467b48Spatrick return {{
359609467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
359709467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
359809467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
359909467b48Spatrick }};
360009467b48Spatrick }
360109467b48Spatrick
360209467b48Spatrick InstructionSelector::ComplexRendererFns
360309467b48Spatrick AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
360409467b48Spatrick Register Src;
360509467b48Spatrick unsigned Mods;
3606097a140dSpatrick std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3607097a140dSpatrick
3608097a140dSpatrick return {{
3609*d415bd75Srobert [=](MachineInstrBuilder &MIB) {
3610*d415bd75Srobert MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3611*d415bd75Srobert },
3612097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3613097a140dSpatrick }};
3614097a140dSpatrick }
3615097a140dSpatrick
3616097a140dSpatrick InstructionSelector::ComplexRendererFns
361773471bf0Spatrick AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
361873471bf0Spatrick Register Src;
361973471bf0Spatrick unsigned Mods;
362073471bf0Spatrick std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
362173471bf0Spatrick
362273471bf0Spatrick return {{
3623*d415bd75Srobert [=](MachineInstrBuilder &MIB) {
3624*d415bd75Srobert MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3625*d415bd75Srobert },
362673471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
362773471bf0Spatrick }};
362873471bf0Spatrick }
362973471bf0Spatrick
363073471bf0Spatrick InstructionSelector::ComplexRendererFns
3631097a140dSpatrick AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3632097a140dSpatrick Register Reg = Root.getReg();
3633097a140dSpatrick const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3634*d415bd75Srobert if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3635097a140dSpatrick return {};
3636097a140dSpatrick return {{
3637097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3638097a140dSpatrick }};
3639097a140dSpatrick }
3640097a140dSpatrick
3641097a140dSpatrick std::pair<Register, unsigned>
3642097a140dSpatrick AMDGPUInstructionSelector::selectVOP3PModsImpl(
3643*d415bd75Srobert Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3644097a140dSpatrick unsigned Mods = 0;
3645097a140dSpatrick MachineInstr *MI = MRI.getVRegDef(Src);
3646097a140dSpatrick
3647097a140dSpatrick if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3648097a140dSpatrick // It's possible to see an f32 fneg here, but unlikely.
3649097a140dSpatrick // TODO: Treat f32 fneg as only high bit.
365073471bf0Spatrick MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3651097a140dSpatrick Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3652097a140dSpatrick Src = MI->getOperand(1).getReg();
3653097a140dSpatrick MI = MRI.getVRegDef(Src);
3654097a140dSpatrick }
3655097a140dSpatrick
3656097a140dSpatrick // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3657*d415bd75Srobert (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3658097a140dSpatrick
3659097a140dSpatrick // Packed instructions do not have abs modifiers.
3660097a140dSpatrick Mods |= SISrcMods::OP_SEL_1;
3661097a140dSpatrick
3662*d415bd75Srobert return std::pair(Src, Mods);
3663097a140dSpatrick }
3664097a140dSpatrick
3665097a140dSpatrick InstructionSelector::ComplexRendererFns
3666097a140dSpatrick AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3667097a140dSpatrick MachineRegisterInfo &MRI
3668097a140dSpatrick = Root.getParent()->getParent()->getParent()->getRegInfo();
3669097a140dSpatrick
3670097a140dSpatrick Register Src;
3671097a140dSpatrick unsigned Mods;
3672097a140dSpatrick std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
367309467b48Spatrick
367409467b48Spatrick return {{
367509467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
367609467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
367709467b48Spatrick }};
367809467b48Spatrick }
367909467b48Spatrick
368009467b48Spatrick InstructionSelector::ComplexRendererFns
3681*d415bd75Srobert AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3682*d415bd75Srobert MachineRegisterInfo &MRI
3683*d415bd75Srobert = Root.getParent()->getParent()->getParent()->getRegInfo();
3684*d415bd75Srobert
368509467b48Spatrick Register Src;
368609467b48Spatrick unsigned Mods;
3687*d415bd75Srobert std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
368809467b48Spatrick
368909467b48Spatrick return {{
369009467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
369109467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
369209467b48Spatrick }};
369309467b48Spatrick }
369409467b48Spatrick
369509467b48Spatrick InstructionSelector::ComplexRendererFns
3696*d415bd75Srobert AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
3697*d415bd75Srobert // A literal i1 value set in the intrinsic represents SrcMods for the next
3698*d415bd75Srobert // operand; it arrives in the Imm operand as an i1 sign-extended to int64_t.
3699*d415bd75Srobert // -1 promotes packed values to signed, 0 treats them as unsigned.
3700*d415bd75Srobert assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3701*d415bd75Srobert "expected i1 value");
3702*d415bd75Srobert unsigned Mods = SISrcMods::OP_SEL_1;
3703*d415bd75Srobert if (Root.getImm() == -1)
3704*d415bd75Srobert Mods ^= SISrcMods::NEG;
370509467b48Spatrick return {{
3706*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
370709467b48Spatrick }};
370809467b48Spatrick }
370909467b48Spatrick
371009467b48Spatrick InstructionSelector::ComplexRendererFns
3711*d415bd75Srobert AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3712*d415bd75Srobert MachineOperand &Root) const {
3713*d415bd75Srobert assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3714*d415bd75Srobert "expected i1 value");
3715*d415bd75Srobert unsigned Mods = SISrcMods::OP_SEL_1;
3716*d415bd75Srobert if (Root.getImm() != 0)
3717*d415bd75Srobert Mods |= SISrcMods::OP_SEL_0;
371809467b48Spatrick
371909467b48Spatrick return {{
3720*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
372109467b48Spatrick }};
372209467b48Spatrick }
372309467b48Spatrick
372409467b48Spatrick InstructionSelector::ComplexRendererFns
3725*d415bd75Srobert AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3726*d415bd75Srobert Register Src;
3727*d415bd75Srobert unsigned Mods;
3728*d415bd75Srobert std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3729*d415bd75Srobert
3730*d415bd75Srobert // FIXME: Handle op_sel
3731*d415bd75Srobert return {{
3732*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3733*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3734*d415bd75Srobert }};
3735*d415bd75Srobert }
3736*d415bd75Srobert
3737*d415bd75Srobert InstructionSelector::ComplexRendererFns
3738*d415bd75Srobert AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
3739*d415bd75Srobert Register Src;
3740*d415bd75Srobert unsigned Mods;
3741*d415bd75Srobert std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3742*d415bd75Srobert /* AllowAbs */ false,
3743*d415bd75Srobert /* OpSel */ false);
3744*d415bd75Srobert
3745*d415bd75Srobert return {{
3746*d415bd75Srobert [=](MachineInstrBuilder &MIB) {
3747*d415bd75Srobert MIB.addReg(
3748*d415bd75Srobert copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
3749*d415bd75Srobert },
3750*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3751*d415bd75Srobert }};
3752*d415bd75Srobert }
3753*d415bd75Srobert
3754*d415bd75Srobert InstructionSelector::ComplexRendererFns
3755*d415bd75Srobert AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
3756*d415bd75Srobert Register Src;
3757*d415bd75Srobert unsigned Mods;
3758*d415bd75Srobert std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3759*d415bd75Srobert /* AllowAbs */ false,
3760*d415bd75Srobert /* OpSel */ true);
3761*d415bd75Srobert
3762*d415bd75Srobert return {{
3763*d415bd75Srobert [=](MachineInstrBuilder &MIB) {
3764*d415bd75Srobert MIB.addReg(
3765*d415bd75Srobert copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
3766*d415bd75Srobert },
3767*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3768*d415bd75Srobert }};
3769*d415bd75Srobert }
3770*d415bd75Srobert
3771*d415bd75Srobert bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
3772*d415bd75Srobert Register &Base,
3773*d415bd75Srobert Register *SOffset,
3774*d415bd75Srobert int64_t *Offset) const {
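// One helper serves three SMRD addressing forms, keyed off which out
// parameters are non-null: SOffset and Offset together select SGPR+IMM,
// Offset alone the IMM form, and SOffset alone the SGPR form (see the
// selectSmrdImm/selectSmrdSgpr/selectSmrdSgprImm callers below).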
3775*d415bd75Srobert MachineInstr *MI = Root.getParent();
3776*d415bd75Srobert MachineBasicBlock *MBB = MI->getParent();
3777*d415bd75Srobert
3778*d415bd75Srobert // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits,
3779*d415bd75Srobert // then we can select all ptr + 32-bit offsets.
3780*d415bd75Srobert SmallVector<GEPInfo, 4> AddrInfo;
3781*d415bd75Srobert getAddrModeInfo(*MI, *MRI, AddrInfo);
3782*d415bd75Srobert
3783*d415bd75Srobert if (AddrInfo.empty())
3784*d415bd75Srobert return false;
3785*d415bd75Srobert
3786*d415bd75Srobert const GEPInfo &GEPI = AddrInfo[0];
3787*d415bd75Srobert std::optional<int64_t> EncodedImm =
3788*d415bd75Srobert AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
3789*d415bd75Srobert
3790*d415bd75Srobert if (SOffset && Offset) {
3791*d415bd75Srobert if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
3792*d415bd75Srobert AddrInfo.size() > 1) {
3793*d415bd75Srobert const GEPInfo &GEPI2 = AddrInfo[1];
3794*d415bd75Srobert if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
3795*d415bd75Srobert if (Register OffsetReg =
3796*d415bd75Srobert matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
3797*d415bd75Srobert Base = GEPI2.SgprParts[0];
3798*d415bd75Srobert *SOffset = OffsetReg;
3799*d415bd75Srobert *Offset = *EncodedImm;
3800*d415bd75Srobert return true;
3801*d415bd75Srobert }
3802*d415bd75Srobert }
3803*d415bd75Srobert }
3804*d415bd75Srobert return false;
3805*d415bd75Srobert }
3806*d415bd75Srobert
3807*d415bd75Srobert if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
3808*d415bd75Srobert Base = GEPI.SgprParts[0];
3809*d415bd75Srobert *Offset = *EncodedImm;
3810*d415bd75Srobert return true;
3811*d415bd75Srobert }
3812*d415bd75Srobert
3813*d415bd75Srobert // SGPR offset is unsigned.
3814*d415bd75Srobert if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
3815*d415bd75Srobert GEPI.Imm != 0) {
3816*d415bd75Srobert // If we make it this far we have a load with a 32-bit immediate offset.
3817*d415bd75Srobert // It is OK to select this using a sgpr offset, because we have already
3818*d415bd75Srobert // failed trying to select this load into one of the _IMM variants since
3819*d415bd75Srobert // the _IMM Patterns are considered before the _SGPR patterns.
3820*d415bd75Srobert Base = GEPI.SgprParts[0];
3821*d415bd75Srobert *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3822*d415bd75Srobert BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
3823*d415bd75Srobert .addImm(GEPI.Imm);
3824*d415bd75Srobert return true;
3825*d415bd75Srobert }
3826*d415bd75Srobert
3827*d415bd75Srobert if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
3828*d415bd75Srobert if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
3829*d415bd75Srobert Base = GEPI.SgprParts[0];
3830*d415bd75Srobert *SOffset = OffsetReg;
3831*d415bd75Srobert return true;
3832*d415bd75Srobert }
3833*d415bd75Srobert }
3834*d415bd75Srobert
3835*d415bd75Srobert return false;
3836*d415bd75Srobert }
3837*d415bd75Srobert
3838*d415bd75Srobert InstructionSelector::ComplexRendererFns
3839*d415bd75Srobert AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3840*d415bd75Srobert Register Base;
3841*d415bd75Srobert int64_t Offset;
3842*d415bd75Srobert if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
3843*d415bd75Srobert return std::nullopt;
3844*d415bd75Srobert
3845*d415bd75Srobert return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
3846*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
3847*d415bd75Srobert }
3848*d415bd75Srobert
3849*d415bd75Srobert InstructionSelector::ComplexRendererFns
385009467b48Spatrick AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
385109467b48Spatrick SmallVector<GEPInfo, 4> AddrInfo;
385209467b48Spatrick getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
385309467b48Spatrick
385409467b48Spatrick if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3855*d415bd75Srobert return std::nullopt;
385609467b48Spatrick
385709467b48Spatrick const GEPInfo &GEPInfo = AddrInfo[0];
3858097a140dSpatrick Register PtrReg = GEPInfo.SgprParts[0];
3859*d415bd75Srobert std::optional<int64_t> EncodedImm =
3860097a140dSpatrick AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3861097a140dSpatrick if (!EncodedImm)
3862*d415bd75Srobert return std::nullopt;
386309467b48Spatrick
386409467b48Spatrick return {{
386509467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3866097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
386709467b48Spatrick }};
386809467b48Spatrick }
386909467b48Spatrick
387009467b48Spatrick InstructionSelector::ComplexRendererFns
387109467b48Spatrick AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3872*d415bd75Srobert Register Base, SOffset;
3873*d415bd75Srobert if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
3874*d415bd75Srobert return std::nullopt;
387509467b48Spatrick
3876*d415bd75Srobert return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
3877*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
3878*d415bd75Srobert }
387909467b48Spatrick
3880*d415bd75Srobert InstructionSelector::ComplexRendererFns
3881*d415bd75Srobert AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
3882*d415bd75Srobert Register Base, SOffset;
3883*d415bd75Srobert int64_t Offset;
3884*d415bd75Srobert if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
3885*d415bd75Srobert return std::nullopt;
388609467b48Spatrick
3887*d415bd75Srobert return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
3888*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
3889*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
389009467b48Spatrick }
389109467b48Spatrick
389273471bf0Spatrick std::pair<Register, int>
389373471bf0Spatrick AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
389473471bf0Spatrick uint64_t FlatVariant) const {
389509467b48Spatrick MachineInstr *MI = Root.getParent();
389609467b48Spatrick
3897*d415bd75Srobert auto Default = std::pair(Root.getReg(), 0);
389809467b48Spatrick
389909467b48Spatrick if (!STI.hasFlatInstOffsets())
390009467b48Spatrick return Default;
390109467b48Spatrick
390273471bf0Spatrick Register PtrBase;
390373471bf0Spatrick int64_t ConstOffset;
390473471bf0Spatrick std::tie(PtrBase, ConstOffset) =
390573471bf0Spatrick getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
390673471bf0Spatrick if (ConstOffset == 0)
390709467b48Spatrick return Default;
390809467b48Spatrick
390909467b48Spatrick unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
391073471bf0Spatrick if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
391109467b48Spatrick return Default;
391209467b48Spatrick
3913*d415bd75Srobert return std::pair(PtrBase, ConstOffset);
391409467b48Spatrick }
391509467b48Spatrick
391609467b48Spatrick InstructionSelector::ComplexRendererFns
391709467b48Spatrick AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
391873471bf0Spatrick auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
391973471bf0Spatrick
392073471bf0Spatrick return {{
392173471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
392273471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
392373471bf0Spatrick }};
392409467b48Spatrick }
392509467b48Spatrick
392609467b48Spatrick InstructionSelector::ComplexRendererFns
392773471bf0Spatrick AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
392873471bf0Spatrick auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
392973471bf0Spatrick
393073471bf0Spatrick return {{
393173471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
393273471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
393373471bf0Spatrick }};
393409467b48Spatrick }
393509467b48Spatrick
393673471bf0Spatrick InstructionSelector::ComplexRendererFns
393773471bf0Spatrick AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
393873471bf0Spatrick auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
393973471bf0Spatrick
394073471bf0Spatrick return {{
394173471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
394273471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
394373471bf0Spatrick }};
394473471bf0Spatrick }
394573471bf0Spatrick
394673471bf0Spatrick // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
394773471bf0Spatrick InstructionSelector::ComplexRendererFns
394873471bf0Spatrick AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
394973471bf0Spatrick Register Addr = Root.getReg();
395073471bf0Spatrick Register PtrBase;
395173471bf0Spatrick int64_t ConstOffset;
395273471bf0Spatrick int64_t ImmOffset = 0;
395373471bf0Spatrick
395473471bf0Spatrick // Match the immediate offset first, which canonically is moved as low as
395573471bf0Spatrick // possible.
395673471bf0Spatrick std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
395773471bf0Spatrick
395873471bf0Spatrick if (ConstOffset != 0) {
395973471bf0Spatrick if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
396073471bf0Spatrick SIInstrFlags::FlatGlobal)) {
396173471bf0Spatrick Addr = PtrBase;
396273471bf0Spatrick ImmOffset = ConstOffset;
396373471bf0Spatrick } else {
396473471bf0Spatrick auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
396573471bf0Spatrick if (isSGPR(PtrBaseDef->Reg)) {
396673471bf0Spatrick if (ConstOffset > 0) {
396773471bf0Spatrick // Offset is too large.
396873471bf0Spatrick //
396973471bf0Spatrick // saddr + large_offset -> saddr +
397073471bf0Spatrick // (voffset = large_offset & ~MaxOffset) +
397173471bf0Spatrick // (large_offset & MaxOffset);
397273471bf0Spatrick int64_t SplitImmOffset, RemainderOffset;
397373471bf0Spatrick std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
397473471bf0Spatrick ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
397573471bf0Spatrick
397673471bf0Spatrick if (isUInt<32>(RemainderOffset)) {
397773471bf0Spatrick MachineInstr *MI = Root.getParent();
397873471bf0Spatrick MachineBasicBlock *MBB = MI->getParent();
397973471bf0Spatrick Register HighBits =
398073471bf0Spatrick MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
398173471bf0Spatrick
398273471bf0Spatrick BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
398373471bf0Spatrick HighBits)
398473471bf0Spatrick .addImm(RemainderOffset);
398573471bf0Spatrick
398673471bf0Spatrick return {{
398773471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
398873471bf0Spatrick [=](MachineInstrBuilder &MIB) {
398973471bf0Spatrick MIB.addReg(HighBits);
399073471bf0Spatrick }, // voffset
399173471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
399273471bf0Spatrick }};
399373471bf0Spatrick }
399473471bf0Spatrick }
399573471bf0Spatrick
399673471bf0Spatrick // We are adding a 64-bit SGPR and a constant. If the constant bus limit
399773471bf0Spatrick // is 1 we would need to perform 1 or 2 extra moves for each half of
399873471bf0Spatrick // the constant and it is better to do a scalar add and then issue a
399973471bf0Spatrick // single VALU instruction to materialize zero. Otherwise it takes fewer
400073471bf0Spatrick // instructions to perform VALU adds with immediates or inline literals.
400173471bf0Spatrick unsigned NumLiterals =
400273471bf0Spatrick !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
400373471bf0Spatrick !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
400473471bf0Spatrick if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4005*d415bd75Srobert return std::nullopt;
400673471bf0Spatrick }
400773471bf0Spatrick }
400873471bf0Spatrick }
400973471bf0Spatrick
401073471bf0Spatrick // Match the variable offset.
4011*d415bd75Srobert auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
401273471bf0Spatrick if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
401373471bf0Spatrick // Look through the SGPR->VGPR copy.
401473471bf0Spatrick Register SAddr =
401573471bf0Spatrick getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
401673471bf0Spatrick
4017*d415bd75Srobert if (isSGPR(SAddr)) {
401873471bf0Spatrick Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
401973471bf0Spatrick
402073471bf0Spatrick // It's possible voffset is an SGPR here, but the copy to VGPR will be
402173471bf0Spatrick // inserted later.
402273471bf0Spatrick if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
402373471bf0Spatrick return {{[=](MachineInstrBuilder &MIB) { // saddr
402473471bf0Spatrick MIB.addReg(SAddr);
402573471bf0Spatrick },
402673471bf0Spatrick [=](MachineInstrBuilder &MIB) { // voffset
402773471bf0Spatrick MIB.addReg(VOffset);
402873471bf0Spatrick },
402973471bf0Spatrick [=](MachineInstrBuilder &MIB) { // offset
403073471bf0Spatrick MIB.addImm(ImmOffset);
403173471bf0Spatrick }}};
403273471bf0Spatrick }
403373471bf0Spatrick }
403473471bf0Spatrick }
403573471bf0Spatrick
403673471bf0Spatrick // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
403773471bf0Spatrick // drop this.
403873471bf0Spatrick if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
403973471bf0Spatrick AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4040*d415bd75Srobert return std::nullopt;
404173471bf0Spatrick
404273471bf0Spatrick // It's cheaper to materialize a single 32-bit zero for vaddr than the two
404373471bf0Spatrick // moves required to copy a 64-bit SGPR to VGPR.
404473471bf0Spatrick MachineInstr *MI = Root.getParent();
404573471bf0Spatrick MachineBasicBlock *MBB = MI->getParent();
404673471bf0Spatrick Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
404773471bf0Spatrick
404873471bf0Spatrick BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
404973471bf0Spatrick .addImm(0);
405073471bf0Spatrick
405173471bf0Spatrick return {{
405273471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
405373471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
405473471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
405573471bf0Spatrick }};
405673471bf0Spatrick }
405773471bf0Spatrick
405873471bf0Spatrick InstructionSelector::ComplexRendererFns
405973471bf0Spatrick AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
406073471bf0Spatrick Register Addr = Root.getReg();
406173471bf0Spatrick Register PtrBase;
406273471bf0Spatrick int64_t ConstOffset;
406373471bf0Spatrick int64_t ImmOffset = 0;
406473471bf0Spatrick
406573471bf0Spatrick // Match the immediate offset first, which canonically is moved as low as
406673471bf0Spatrick // possible.
406773471bf0Spatrick std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
406873471bf0Spatrick
406973471bf0Spatrick if (ConstOffset != 0 &&
407073471bf0Spatrick TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
407173471bf0Spatrick SIInstrFlags::FlatScratch)) {
407273471bf0Spatrick Addr = PtrBase;
407373471bf0Spatrick ImmOffset = ConstOffset;
407473471bf0Spatrick }
407573471bf0Spatrick
407673471bf0Spatrick auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
407773471bf0Spatrick if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
407873471bf0Spatrick int FI = AddrDef->MI->getOperand(1).getIndex();
407973471bf0Spatrick return {{
408073471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
408173471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
408273471bf0Spatrick }};
408373471bf0Spatrick }
408473471bf0Spatrick
408573471bf0Spatrick Register SAddr = AddrDef->Reg;
408673471bf0Spatrick
408773471bf0Spatrick if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
408873471bf0Spatrick Register LHS = AddrDef->MI->getOperand(1).getReg();
408973471bf0Spatrick Register RHS = AddrDef->MI->getOperand(2).getReg();
409073471bf0Spatrick auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
409173471bf0Spatrick auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
409273471bf0Spatrick
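// Fold (frame index + SGPR offset) into a single scalar add so the result
// can be used directly as the saddr operand.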
4093*d415bd75Srobert if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
409473471bf0Spatrick isSGPR(RHSDef->Reg)) {
409573471bf0Spatrick int FI = LHSDef->MI->getOperand(1).getIndex();
409673471bf0Spatrick MachineInstr &I = *Root.getParent();
409773471bf0Spatrick MachineBasicBlock *BB = I.getParent();
409873471bf0Spatrick const DebugLoc &DL = I.getDebugLoc();
409973471bf0Spatrick SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
410073471bf0Spatrick
410173471bf0Spatrick BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
410273471bf0Spatrick .addFrameIndex(FI)
410373471bf0Spatrick .addReg(RHSDef->Reg);
410473471bf0Spatrick }
410573471bf0Spatrick }
410673471bf0Spatrick
410773471bf0Spatrick if (!isSGPR(SAddr))
4108*d415bd75Srobert return std::nullopt;
410973471bf0Spatrick
411073471bf0Spatrick return {{
411173471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
411273471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
411373471bf0Spatrick }};
411409467b48Spatrick }
411509467b48Spatrick
4116*d415bd75Srobert // Check whether the flat scratch SVS swizzle bug affects this access.
4117*d415bd75Srobert bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4118*d415bd75Srobert Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4119*d415bd75Srobert if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4120*d415bd75Srobert return false;
4121*d415bd75Srobert
4122*d415bd75Srobert // The bug affects the swizzling of SVS accesses if there is any carry out
4123*d415bd75Srobert // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4124*d415bd75Srobert // voffset to (soffset + inst_offset).
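// Illustrative case: if the low two bits of VAddr can be 3 and those of
// SAddr + ImmOffset can be 2, then 3 + 2 >= 4 carries into bit 2, so the
// SVS form must be rejected.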
4125*d415bd75Srobert auto VKnown = KnownBits->getKnownBits(VAddr);
4126*d415bd75Srobert auto SKnown = KnownBits::computeForAddSub(
4127*d415bd75Srobert true, false, KnownBits->getKnownBits(SAddr),
4128*d415bd75Srobert KnownBits::makeConstant(APInt(32, ImmOffset)));
4129*d415bd75Srobert uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4130*d415bd75Srobert uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4131*d415bd75Srobert return (VMax & 3) + (SMax & 3) >= 4;
4132*d415bd75Srobert }
4133*d415bd75Srobert
4134*d415bd75Srobert InstructionSelector::ComplexRendererFns
4135*d415bd75Srobert AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4136*d415bd75Srobert Register Addr = Root.getReg();
4137*d415bd75Srobert Register PtrBase;
4138*d415bd75Srobert int64_t ConstOffset;
4139*d415bd75Srobert int64_t ImmOffset = 0;
4140*d415bd75Srobert
4141*d415bd75Srobert // Match the immediate offset first, which canonically is moved as low as
4142*d415bd75Srobert // possible.
4143*d415bd75Srobert std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4144*d415bd75Srobert
4145*d415bd75Srobert if (ConstOffset != 0 &&
4146*d415bd75Srobert TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) {
4147*d415bd75Srobert Addr = PtrBase;
4148*d415bd75Srobert ImmOffset = ConstOffset;
4149*d415bd75Srobert }
4150*d415bd75Srobert
4151*d415bd75Srobert auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4152*d415bd75Srobert if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4153*d415bd75Srobert return std::nullopt;
4154*d415bd75Srobert
4155*d415bd75Srobert Register RHS = AddrDef->MI->getOperand(2).getReg();
4156*d415bd75Srobert if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4157*d415bd75Srobert return std::nullopt;
4158*d415bd75Srobert
4159*d415bd75Srobert Register LHS = AddrDef->MI->getOperand(1).getReg();
4160*d415bd75Srobert auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4161*d415bd75Srobert
4162*d415bd75Srobert if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4163*d415bd75Srobert return std::nullopt;
4164*d415bd75Srobert
4165*d415bd75Srobert if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4166*d415bd75Srobert int FI = LHSDef->MI->getOperand(1).getIndex();
4167*d415bd75Srobert return {{
4168*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4169*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4170*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4171*d415bd75Srobert }};
4172*d415bd75Srobert }
4173*d415bd75Srobert
4174*d415bd75Srobert if (!isSGPR(LHS))
4175*d415bd75Srobert return std::nullopt;
4176*d415bd75Srobert
4177*d415bd75Srobert return {{
4178*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4179*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4180*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4181*d415bd75Srobert }};
4182*d415bd75Srobert }
4183*d415bd75Srobert
418409467b48Spatrick InstructionSelector::ComplexRendererFns
418509467b48Spatrick AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
418609467b48Spatrick MachineInstr *MI = Root.getParent();
418709467b48Spatrick MachineBasicBlock *MBB = MI->getParent();
418809467b48Spatrick MachineFunction *MF = MBB->getParent();
418909467b48Spatrick const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
419009467b48Spatrick
419109467b48Spatrick int64_t Offset = 0;
4192097a140dSpatrick if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4193097a140dSpatrick Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
419409467b48Spatrick Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
419509467b48Spatrick
419609467b48Spatrick // TODO: Should this be inside the render function? The iterator seems to
419709467b48Spatrick // move.
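// The MUBUF immediate offset field holds 12 bits, so the bits above 4095
// are materialized in a VGPR and only the low 12 bits stay immediate.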
419809467b48Spatrick BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
419909467b48Spatrick HighBits)
420009467b48Spatrick .addImm(Offset & ~4095);
420109467b48Spatrick
420209467b48Spatrick return {{[=](MachineInstrBuilder &MIB) { // rsrc
420309467b48Spatrick MIB.addReg(Info->getScratchRSrcReg());
420409467b48Spatrick },
420509467b48Spatrick [=](MachineInstrBuilder &MIB) { // vaddr
420609467b48Spatrick MIB.addReg(HighBits);
420709467b48Spatrick },
420809467b48Spatrick [=](MachineInstrBuilder &MIB) { // soffset
420973471bf0Spatrick // Use constant zero for soffset and rely on eliminateFrameIndex
421073471bf0Spatrick // to choose the appropriate frame register if need be.
4211097a140dSpatrick MIB.addImm(0);
421209467b48Spatrick },
421309467b48Spatrick [=](MachineInstrBuilder &MIB) { // offset
421409467b48Spatrick MIB.addImm(Offset & 4095);
421509467b48Spatrick }}};
421609467b48Spatrick }
421709467b48Spatrick
4218097a140dSpatrick assert(Offset == 0 || Offset == -1);
421909467b48Spatrick
422009467b48Spatrick // Try to fold a frame index directly into the MUBUF vaddr field, and any
422109467b48Spatrick // offsets.
4222*d415bd75Srobert std::optional<int> FI;
422309467b48Spatrick Register VAddr = Root.getReg();
422409467b48Spatrick if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
422573471bf0Spatrick Register PtrBase;
422673471bf0Spatrick int64_t ConstOffset;
422773471bf0Spatrick std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
422873471bf0Spatrick if (ConstOffset != 0) {
422973471bf0Spatrick if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
423009467b48Spatrick (!STI.privateMemoryResourceIsRangeChecked() ||
423173471bf0Spatrick KnownBits->signBitIsZero(PtrBase))) {
423273471bf0Spatrick const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
423373471bf0Spatrick if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
423473471bf0Spatrick FI = PtrBaseDef->getOperand(1).getIndex();
423509467b48Spatrick else
423673471bf0Spatrick VAddr = PtrBase;
423773471bf0Spatrick Offset = ConstOffset;
423809467b48Spatrick }
423909467b48Spatrick } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
424009467b48Spatrick FI = RootDef->getOperand(1).getIndex();
424109467b48Spatrick }
424209467b48Spatrick }
424309467b48Spatrick
424409467b48Spatrick return {{[=](MachineInstrBuilder &MIB) { // rsrc
424509467b48Spatrick MIB.addReg(Info->getScratchRSrcReg());
424609467b48Spatrick },
424709467b48Spatrick [=](MachineInstrBuilder &MIB) { // vaddr
4248*d415bd75Srobert if (FI)
4249*d415bd75Srobert MIB.addFrameIndex(*FI);
425009467b48Spatrick else
425109467b48Spatrick MIB.addReg(VAddr);
425209467b48Spatrick },
425309467b48Spatrick [=](MachineInstrBuilder &MIB) { // soffset
425473471bf0Spatrick // Use constant zero for soffset and rely on eliminateFrameIndex
425573471bf0Spatrick // to choose the appropriate frame register if need be.
4256097a140dSpatrick MIB.addImm(0);
425709467b48Spatrick },
425809467b48Spatrick [=](MachineInstrBuilder &MIB) { // offset
425909467b48Spatrick MIB.addImm(Offset);
426009467b48Spatrick }}};
426109467b48Spatrick }
426209467b48Spatrick
4263097a140dSpatrick bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
426473471bf0Spatrick int64_t Offset) const {
426573471bf0Spatrick if (!isUInt<16>(Offset))
426673471bf0Spatrick return false;
426773471bf0Spatrick
426873471bf0Spatrick if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
426973471bf0Spatrick return true;
427073471bf0Spatrick
427173471bf0Spatrick   // On Southern Islands, instructions with a negative base value and an
427273471bf0Spatrick   // offset don't seem to work.
427373471bf0Spatrick return KnownBits->signBitIsZero(Base);
427473471bf0Spatrick }
427573471bf0Spatrick
427673471bf0Spatrick bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
427773471bf0Spatrick int64_t Offset1,
427873471bf0Spatrick unsigned Size) const {
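  // Illustrative: ds_read2/ds_write2 encode each offset as an 8-bit count of
  // Size-byte elements, so both byte offsets must be Size-aligned and, once
  // divided by Size, must fit in 8 bits (e.g. for Size == 4, multiples of 4
  // up to 1020).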
427973471bf0Spatrick if (Offset0 % Size != 0 || Offset1 % Size != 0)
428073471bf0Spatrick return false;
428173471bf0Spatrick if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
428209467b48Spatrick return false;
428309467b48Spatrick
428409467b48Spatrick if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
428509467b48Spatrick return true;
428609467b48Spatrick
428709467b48Spatrick   // On Southern Islands, instructions with a negative base value and an
428809467b48Spatrick   // offset don't seem to work.
4289097a140dSpatrick return KnownBits->signBitIsZero(Base);
429009467b48Spatrick }
429109467b48Spatrick
4292*d415bd75Srobert bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4293*d415bd75Srobert unsigned ShAmtBits) const {
4294*d415bd75Srobert assert(MI.getOpcode() == TargetOpcode::G_AND);
4295*d415bd75Srobert
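  // Illustrative: for a 32-bit shift only the low 5 bits of the amount are
  // used, so an AND with a mask whose low 5 bits are all ones (e.g. x & 31)
  // does not change the shifted result and can be dropped.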
4296*d415bd75Srobert std::optional<APInt> RHS =
4297*d415bd75Srobert getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4298*d415bd75Srobert if (!RHS)
4299*d415bd75Srobert return false;
4300*d415bd75Srobert
4301*d415bd75Srobert if (RHS->countTrailingOnes() >= ShAmtBits)
4302*d415bd75Srobert return true;
4303*d415bd75Srobert
4304*d415bd75Srobert const APInt &LHSKnownZeros =
4305*d415bd75Srobert KnownBits->getKnownZeroes(MI.getOperand(1).getReg());
4306*d415bd75Srobert return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
4307*d415bd75Srobert }
4308*d415bd75Srobert
4309*d415bd75Srobert // Return the wave-level SGPR base address if this is a wave address.
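// (Editor's sketch: for scratch access the per-wave base is held in an SGPR,
// and G_AMDGPU_WAVE_ADDRESS wraps that SGPR so the selectors below can feed
// it straight into the soffset operand.)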
4310*d415bd75Srobert static Register getWaveAddress(const MachineInstr *Def) {
4311*d415bd75Srobert return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
4312*d415bd75Srobert ? Def->getOperand(1).getReg()
4313*d415bd75Srobert : Register();
4314*d415bd75Srobert }
4315*d415bd75Srobert
431609467b48Spatrick InstructionSelector::ComplexRendererFns
431709467b48Spatrick AMDGPUInstructionSelector::selectMUBUFScratchOffset(
431809467b48Spatrick MachineOperand &Root) const {
4319*d415bd75Srobert Register Reg = Root.getReg();
4320*d415bd75Srobert const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4321*d415bd75Srobert
4322*d415bd75Srobert const MachineInstr *Def = MRI->getVRegDef(Reg);
4323*d415bd75Srobert if (Register WaveBase = getWaveAddress(Def)) {
4324*d415bd75Srobert return {{
4325*d415bd75Srobert [=](MachineInstrBuilder &MIB) { // rsrc
4326*d415bd75Srobert MIB.addReg(Info->getScratchRSrcReg());
4327*d415bd75Srobert },
4328*d415bd75Srobert [=](MachineInstrBuilder &MIB) { // soffset
4329*d415bd75Srobert MIB.addReg(WaveBase);
4330*d415bd75Srobert },
4331*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4332*d415bd75Srobert }};
4333*d415bd75Srobert }
433409467b48Spatrick
433509467b48Spatrick int64_t Offset = 0;
4336*d415bd75Srobert
4337*d415bd75Srobert // FIXME: Copy check is a hack
4338*d415bd75Srobert Register BasePtr;
4339*d415bd75Srobert if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
4340*d415bd75Srobert if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
4341*d415bd75Srobert return {};
4342*d415bd75Srobert const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
4343*d415bd75Srobert Register WaveBase = getWaveAddress(BasePtrDef);
4344*d415bd75Srobert if (!WaveBase)
4345*d415bd75Srobert return {};
4346*d415bd75Srobert
4347*d415bd75Srobert return {{
4348*d415bd75Srobert [=](MachineInstrBuilder &MIB) { // rsrc
4349*d415bd75Srobert MIB.addReg(Info->getScratchRSrcReg());
4350*d415bd75Srobert },
4351*d415bd75Srobert [=](MachineInstrBuilder &MIB) { // soffset
4352*d415bd75Srobert MIB.addReg(WaveBase);
4353*d415bd75Srobert },
4354*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4355*d415bd75Srobert }};
4356*d415bd75Srobert }
4357*d415bd75Srobert
435809467b48Spatrick if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
435909467b48Spatrick !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
436009467b48Spatrick return {};
436109467b48Spatrick
436209467b48Spatrick return {{
4363097a140dSpatrick [=](MachineInstrBuilder &MIB) { // rsrc
436409467b48Spatrick MIB.addReg(Info->getScratchRSrcReg());
4365097a140dSpatrick },
4366097a140dSpatrick [=](MachineInstrBuilder &MIB) { // soffset
4367097a140dSpatrick MIB.addImm(0);
4368097a140dSpatrick },
436909467b48Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
437009467b48Spatrick }};
437109467b48Spatrick }
437209467b48Spatrick
4373097a140dSpatrick std::pair<Register, unsigned>
4374097a140dSpatrick AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
437509467b48Spatrick const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4376097a140dSpatrick if (!RootDef)
4377*d415bd75Srobert return std::pair(Root.getReg(), 0);
437809467b48Spatrick
437909467b48Spatrick int64_t ConstAddr = 0;
4380097a140dSpatrick
4381097a140dSpatrick Register PtrBase;
4382097a140dSpatrick int64_t Offset;
4383097a140dSpatrick std::tie(PtrBase, Offset) =
4384097a140dSpatrick getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4385097a140dSpatrick
4386097a140dSpatrick if (Offset) {
438773471bf0Spatrick if (isDSOffsetLegal(PtrBase, Offset)) {
438809467b48Spatrick // (add n0, c0)
4389*d415bd75Srobert return std::pair(PtrBase, Offset);
439009467b48Spatrick }
439109467b48Spatrick } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4392097a140dSpatrick // TODO
439309467b48Spatrick
439409467b48Spatrick
439509467b48Spatrick } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4396097a140dSpatrick // TODO
439709467b48Spatrick
439809467b48Spatrick }
439909467b48Spatrick
4400*d415bd75Srobert return std::pair(Root.getReg(), 0);
4401097a140dSpatrick }
4402097a140dSpatrick
4403097a140dSpatrick InstructionSelector::ComplexRendererFns
4404097a140dSpatrick AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4405097a140dSpatrick Register Reg;
4406097a140dSpatrick unsigned Offset;
4407097a140dSpatrick std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
440809467b48Spatrick return {{
4409097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4410097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
441109467b48Spatrick }};
441209467b48Spatrick }
441309467b48Spatrick
4414097a140dSpatrick InstructionSelector::ComplexRendererFns
4415097a140dSpatrick AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
441673471bf0Spatrick return selectDSReadWrite2(Root, 4);
441773471bf0Spatrick }
441873471bf0Spatrick
441973471bf0Spatrick InstructionSelector::ComplexRendererFns
442073471bf0Spatrick AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
442173471bf0Spatrick return selectDSReadWrite2(Root, 8);
442273471bf0Spatrick }
442373471bf0Spatrick
442473471bf0Spatrick InstructionSelector::ComplexRendererFns
442573471bf0Spatrick AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
442673471bf0Spatrick unsigned Size) const {
4427097a140dSpatrick Register Reg;
4428097a140dSpatrick unsigned Offset;
442973471bf0Spatrick std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4430097a140dSpatrick return {{
4431097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4432097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4433097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4434097a140dSpatrick }};
4435097a140dSpatrick }
4436097a140dSpatrick
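// Worked example (editor's sketch): with Size == 4 and a byte offset of 40,
// selectDSReadWrite2Impl returns offset0 = 10 (40 / 4), and the caller above
// renders offset1 = 11, i.e. two consecutive 4-byte elements.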
4437097a140dSpatrick std::pair<Register, unsigned>
443873471bf0Spatrick AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
443973471bf0Spatrick unsigned Size) const {
4440097a140dSpatrick const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4441097a140dSpatrick if (!RootDef)
4442*d415bd75Srobert return std::pair(Root.getReg(), 0);
4443097a140dSpatrick
4444097a140dSpatrick int64_t ConstAddr = 0;
4445097a140dSpatrick
4446097a140dSpatrick Register PtrBase;
4447097a140dSpatrick int64_t Offset;
4448097a140dSpatrick std::tie(PtrBase, Offset) =
4449097a140dSpatrick getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4450097a140dSpatrick
4451097a140dSpatrick if (Offset) {
445273471bf0Spatrick int64_t OffsetValue0 = Offset;
445373471bf0Spatrick int64_t OffsetValue1 = Offset + Size;
445473471bf0Spatrick if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4455097a140dSpatrick // (add n0, c0)
4456*d415bd75Srobert return std::pair(PtrBase, OffsetValue0 / Size);
4457097a140dSpatrick }
4458097a140dSpatrick } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4459097a140dSpatrick // TODO
4460097a140dSpatrick
4461097a140dSpatrick } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4462097a140dSpatrick // TODO
4463097a140dSpatrick
4464097a140dSpatrick }
4465097a140dSpatrick
4466*d415bd75Srobert return std::pair(Root.getReg(), 0);
4467097a140dSpatrick }
4468097a140dSpatrick
4469097a140dSpatrick /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4470097a140dSpatrick /// the base value with the constant offset. There may be intervening copies
4471097a140dSpatrick /// between \p Root and the identified constant. Returns \p Root, 0 if this does
4472097a140dSpatrick /// not match the pattern.
4473097a140dSpatrick std::pair<Register, int64_t>
4474097a140dSpatrick AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4475097a140dSpatrick Register Root, const MachineRegisterInfo &MRI) const {
447673471bf0Spatrick MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4477097a140dSpatrick if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4478097a140dSpatrick return {Root, 0};
4479097a140dSpatrick
4480097a140dSpatrick MachineOperand &RHS = RootI->getOperand(2);
4481*d415bd75Srobert std::optional<ValueAndVReg> MaybeOffset =
4482*d415bd75Srobert getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4483097a140dSpatrick if (!MaybeOffset)
4484097a140dSpatrick return {Root, 0};
448573471bf0Spatrick return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4486097a140dSpatrick }
4487097a140dSpatrick
4488097a140dSpatrick static void addZeroImm(MachineInstrBuilder &MIB) {
4489097a140dSpatrick MIB.addImm(0);
4490097a140dSpatrick }
4491097a140dSpatrick
4492097a140dSpatrick /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
4493097a140dSpatrick /// BasePtr is not valid, a null base pointer will be used.
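/// (Editor's sketch, assuming the usual SRD layout: words 0-1 carry the base
/// pointer, and FormatLo/FormatHi land in words 2-3, e.g. num_records and the
/// data-format bits for the default descriptors built below.)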
4494097a140dSpatrick static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4495097a140dSpatrick uint32_t FormatLo, uint32_t FormatHi,
4496097a140dSpatrick Register BasePtr) {
4497097a140dSpatrick Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4498097a140dSpatrick Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4499097a140dSpatrick Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4500097a140dSpatrick Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
4501097a140dSpatrick
4502097a140dSpatrick B.buildInstr(AMDGPU::S_MOV_B32)
4503097a140dSpatrick .addDef(RSrc2)
4504097a140dSpatrick .addImm(FormatLo);
4505097a140dSpatrick B.buildInstr(AMDGPU::S_MOV_B32)
4506097a140dSpatrick .addDef(RSrc3)
4507097a140dSpatrick .addImm(FormatHi);
4508097a140dSpatrick
4509097a140dSpatrick   // Build the 64-bit half holding the constants before building the
4510097a140dSpatrick   // full 128-bit register. If we are building multiple resource descriptors,
4511097a140dSpatrick   // this allows CSEing of the 2-component constant register.
4512097a140dSpatrick B.buildInstr(AMDGPU::REG_SEQUENCE)
4513097a140dSpatrick .addDef(RSrcHi)
4514097a140dSpatrick .addReg(RSrc2)
4515097a140dSpatrick .addImm(AMDGPU::sub0)
4516097a140dSpatrick .addReg(RSrc3)
4517097a140dSpatrick .addImm(AMDGPU::sub1);
4518097a140dSpatrick
4519097a140dSpatrick Register RSrcLo = BasePtr;
4520097a140dSpatrick if (!BasePtr) {
4521097a140dSpatrick RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4522097a140dSpatrick B.buildInstr(AMDGPU::S_MOV_B64)
4523097a140dSpatrick .addDef(RSrcLo)
4524097a140dSpatrick .addImm(0);
4525097a140dSpatrick }
4526097a140dSpatrick
4527097a140dSpatrick B.buildInstr(AMDGPU::REG_SEQUENCE)
4528097a140dSpatrick .addDef(RSrc)
4529097a140dSpatrick .addReg(RSrcLo)
4530097a140dSpatrick .addImm(AMDGPU::sub0_sub1)
4531097a140dSpatrick .addReg(RSrcHi)
4532097a140dSpatrick .addImm(AMDGPU::sub2_sub3);
4533097a140dSpatrick
4534097a140dSpatrick return RSrc;
4535097a140dSpatrick }
4536097a140dSpatrick
4537097a140dSpatrick static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4538097a140dSpatrick const SIInstrInfo &TII, Register BasePtr) {
4539097a140dSpatrick uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4540097a140dSpatrick
4541097a140dSpatrick // FIXME: Why are half the "default" bits ignored based on the addressing
4542097a140dSpatrick // mode?
4543097a140dSpatrick return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
4544097a140dSpatrick }
4545097a140dSpatrick
4546097a140dSpatrick static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4547097a140dSpatrick const SIInstrInfo &TII, Register BasePtr) {
4548097a140dSpatrick uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4549097a140dSpatrick
4550097a140dSpatrick // FIXME: Why are half the "default" bits ignored based on the addressing
4551097a140dSpatrick // mode?
4552097a140dSpatrick return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
4553097a140dSpatrick }
4554097a140dSpatrick
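// Shapes recognized below (a sketch): plain N0; (N0 + C1) with the 32-bit C1
// folded into Offset; and an inner (N2 + N3) ptr_add, with or without the
// constant, which feeds the addr64 selection.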
4555097a140dSpatrick AMDGPUInstructionSelector::MUBUFAddressData
4556097a140dSpatrick AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
4557097a140dSpatrick MUBUFAddressData Data;
4558097a140dSpatrick Data.N0 = Src;
4559097a140dSpatrick
4560097a140dSpatrick Register PtrBase;
4561097a140dSpatrick int64_t Offset;
4562097a140dSpatrick
4563097a140dSpatrick std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
4564097a140dSpatrick if (isUInt<32>(Offset)) {
4565097a140dSpatrick Data.N0 = PtrBase;
4566097a140dSpatrick Data.Offset = Offset;
4567097a140dSpatrick }
4568097a140dSpatrick
4569097a140dSpatrick if (MachineInstr *InputAdd
4570097a140dSpatrick = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
4571097a140dSpatrick Data.N2 = InputAdd->getOperand(1).getReg();
4572097a140dSpatrick Data.N3 = InputAdd->getOperand(2).getReg();
4573097a140dSpatrick
4574097a140dSpatrick     // FIXME: Need to fix extra SGPR->VGPR copies inserted
4575097a140dSpatrick     // FIXME: Don't know that this was defined by operand 0
4576097a140dSpatrick //
4577097a140dSpatrick // TODO: Remove this when we have copy folding optimizations after
4578097a140dSpatrick // RegBankSelect.
4579097a140dSpatrick Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4580097a140dSpatrick Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4581097a140dSpatrick }
4582097a140dSpatrick
4583097a140dSpatrick return Data;
4584097a140dSpatrick }
4585097a140dSpatrick
4586097a140dSpatrick /// Return whether the addr64 MUBUF mode should be used for the given address.
4587097a140dSpatrick bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4588097a140dSpatrick // (ptr_add N2, N3) -> addr64, or
4589097a140dSpatrick // (ptr_add (ptr_add N2, N3), C1) -> addr64
4590097a140dSpatrick if (Addr.N2)
4591097a140dSpatrick return true;
4592097a140dSpatrick
4593097a140dSpatrick const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4594097a140dSpatrick return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4595097a140dSpatrick }
4596097a140dSpatrick
4597097a140dSpatrick /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4598097a140dSpatrick /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4599097a140dSpatrick /// component.
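/// (Illustrative: an ImmOffset of 8192 does not fit the 12-bit MUBUF
/// immediate, so it is moved whole into a fresh SGPR with S_MOV_B32 and the
/// immediate becomes 0.)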
4600097a140dSpatrick void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4601097a140dSpatrick MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4602097a140dSpatrick if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4603097a140dSpatrick return;
4604097a140dSpatrick
4605097a140dSpatrick // Illegal offset, store it in soffset.
4606097a140dSpatrick SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4607097a140dSpatrick B.buildInstr(AMDGPU::S_MOV_B32)
4608097a140dSpatrick .addDef(SOffset)
4609097a140dSpatrick .addImm(ImmOffset);
4610097a140dSpatrick ImmOffset = 0;
4611097a140dSpatrick }
4612097a140dSpatrick
4613097a140dSpatrick bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4614097a140dSpatrick MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4615097a140dSpatrick Register &SOffset, int64_t &Offset) const {
4616097a140dSpatrick // FIXME: Predicates should stop this from reaching here.
4617097a140dSpatrick   // The addr64 bit was removed for Volcanic Islands.
4618097a140dSpatrick if (!STI.hasAddr64() || STI.useFlatForGlobal())
4619097a140dSpatrick return false;
4620097a140dSpatrick
4621097a140dSpatrick MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4622097a140dSpatrick if (!shouldUseAddr64(AddrData))
4623097a140dSpatrick return false;
4624097a140dSpatrick
4625097a140dSpatrick Register N0 = AddrData.N0;
4626097a140dSpatrick Register N2 = AddrData.N2;
4627097a140dSpatrick Register N3 = AddrData.N3;
4628097a140dSpatrick Offset = AddrData.Offset;
4629097a140dSpatrick
4630097a140dSpatrick // Base pointer for the SRD.
4631097a140dSpatrick Register SRDPtr;
4632097a140dSpatrick
4633097a140dSpatrick if (N2) {
4634097a140dSpatrick if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4635097a140dSpatrick assert(N3);
4636097a140dSpatrick if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4637097a140dSpatrick // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4638097a140dSpatrick // addr64, and construct the default resource from a 0 address.
4639097a140dSpatrick VAddr = N0;
4640097a140dSpatrick } else {
4641097a140dSpatrick SRDPtr = N3;
4642097a140dSpatrick VAddr = N2;
4643097a140dSpatrick }
4644097a140dSpatrick } else {
4645097a140dSpatrick // N2 is not divergent.
4646097a140dSpatrick SRDPtr = N2;
4647097a140dSpatrick VAddr = N3;
4648097a140dSpatrick }
4649097a140dSpatrick } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4650097a140dSpatrick     // Use the default null pointer in the resource.
4651097a140dSpatrick VAddr = N0;
4652097a140dSpatrick } else {
4653097a140dSpatrick // N0 -> offset, or
4654097a140dSpatrick // (N0 + C1) -> offset
4655097a140dSpatrick SRDPtr = N0;
4656097a140dSpatrick }
4657097a140dSpatrick
4658097a140dSpatrick MachineIRBuilder B(*Root.getParent());
4659097a140dSpatrick RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4660097a140dSpatrick splitIllegalMUBUFOffset(B, SOffset, Offset);
4661097a140dSpatrick return true;
4662097a140dSpatrick }
4663097a140dSpatrick
4664097a140dSpatrick bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4665097a140dSpatrick MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4666097a140dSpatrick int64_t &Offset) const {
466773471bf0Spatrick
466873471bf0Spatrick // FIXME: Pattern should not reach here.
466973471bf0Spatrick if (STI.useFlatForGlobal())
467073471bf0Spatrick return false;
467173471bf0Spatrick
4672097a140dSpatrick MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4673097a140dSpatrick if (shouldUseAddr64(AddrData))
4674097a140dSpatrick return false;
4675097a140dSpatrick
4676097a140dSpatrick // N0 -> offset, or
4677097a140dSpatrick // (N0 + C1) -> offset
4678097a140dSpatrick Register SRDPtr = AddrData.N0;
4679097a140dSpatrick Offset = AddrData.Offset;
4680097a140dSpatrick
4681097a140dSpatrick // TODO: Look through extensions for 32-bit soffset.
4682097a140dSpatrick MachineIRBuilder B(*Root.getParent());
4683097a140dSpatrick
4684097a140dSpatrick RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4685097a140dSpatrick splitIllegalMUBUFOffset(B, SOffset, Offset);
4686097a140dSpatrick return true;
4687097a140dSpatrick }
4688097a140dSpatrick
4689097a140dSpatrick InstructionSelector::ComplexRendererFns
4690097a140dSpatrick AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4691097a140dSpatrick Register VAddr;
4692097a140dSpatrick Register RSrcReg;
4693097a140dSpatrick Register SOffset;
4694097a140dSpatrick int64_t Offset = 0;
4695097a140dSpatrick
4696097a140dSpatrick if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4697097a140dSpatrick return {};
4698097a140dSpatrick
4699097a140dSpatrick // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4700097a140dSpatrick // pattern.
4701097a140dSpatrick return {{
4702097a140dSpatrick [=](MachineInstrBuilder &MIB) { // rsrc
4703097a140dSpatrick MIB.addReg(RSrcReg);
4704097a140dSpatrick },
4705097a140dSpatrick [=](MachineInstrBuilder &MIB) { // vaddr
4706097a140dSpatrick MIB.addReg(VAddr);
4707097a140dSpatrick },
4708097a140dSpatrick [=](MachineInstrBuilder &MIB) { // soffset
4709097a140dSpatrick if (SOffset)
4710097a140dSpatrick MIB.addReg(SOffset);
4711097a140dSpatrick else
4712097a140dSpatrick MIB.addImm(0);
4713097a140dSpatrick },
4714097a140dSpatrick [=](MachineInstrBuilder &MIB) { // offset
4715097a140dSpatrick MIB.addImm(Offset);
4716097a140dSpatrick },
471773471bf0Spatrick addZeroImm, // cpol
4718097a140dSpatrick addZeroImm, // tfe
4719097a140dSpatrick addZeroImm // swz
4720097a140dSpatrick }};
4721097a140dSpatrick }
4722097a140dSpatrick
4723097a140dSpatrick InstructionSelector::ComplexRendererFns
4724097a140dSpatrick AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4725097a140dSpatrick Register RSrcReg;
4726097a140dSpatrick Register SOffset;
4727097a140dSpatrick int64_t Offset = 0;
4728097a140dSpatrick
4729097a140dSpatrick if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4730097a140dSpatrick return {};
4731097a140dSpatrick
4732097a140dSpatrick return {{
4733097a140dSpatrick [=](MachineInstrBuilder &MIB) { // rsrc
4734097a140dSpatrick MIB.addReg(RSrcReg);
4735097a140dSpatrick },
4736097a140dSpatrick [=](MachineInstrBuilder &MIB) { // soffset
4737097a140dSpatrick if (SOffset)
4738097a140dSpatrick MIB.addReg(SOffset);
4739097a140dSpatrick else
4740097a140dSpatrick MIB.addImm(0);
4741097a140dSpatrick },
4742097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
474373471bf0Spatrick addZeroImm, // cpol
4744097a140dSpatrick addZeroImm, // tfe
474573471bf0Spatrick addZeroImm, // swz
4746097a140dSpatrick }};
4747097a140dSpatrick }
4748097a140dSpatrick
4749097a140dSpatrick InstructionSelector::ComplexRendererFns
4750097a140dSpatrick AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4751097a140dSpatrick Register VAddr;
4752097a140dSpatrick Register RSrcReg;
4753097a140dSpatrick Register SOffset;
4754097a140dSpatrick int64_t Offset = 0;
4755097a140dSpatrick
4756097a140dSpatrick if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4757097a140dSpatrick return {};
4758097a140dSpatrick
4759097a140dSpatrick // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4760097a140dSpatrick // pattern.
4761097a140dSpatrick return {{
4762097a140dSpatrick [=](MachineInstrBuilder &MIB) { // rsrc
4763097a140dSpatrick MIB.addReg(RSrcReg);
4764097a140dSpatrick },
4765097a140dSpatrick [=](MachineInstrBuilder &MIB) { // vaddr
4766097a140dSpatrick MIB.addReg(VAddr);
4767097a140dSpatrick },
4768097a140dSpatrick [=](MachineInstrBuilder &MIB) { // soffset
4769097a140dSpatrick if (SOffset)
4770097a140dSpatrick MIB.addReg(SOffset);
4771097a140dSpatrick else
4772097a140dSpatrick MIB.addImm(0);
4773097a140dSpatrick },
4774097a140dSpatrick [=](MachineInstrBuilder &MIB) { // offset
4775097a140dSpatrick MIB.addImm(Offset);
4776097a140dSpatrick },
477773471bf0Spatrick [=](MachineInstrBuilder &MIB) {
477873471bf0Spatrick MIB.addImm(AMDGPU::CPol::GLC); // cpol
477973471bf0Spatrick }
4780097a140dSpatrick }};
4781097a140dSpatrick }
4782097a140dSpatrick
4783097a140dSpatrick InstructionSelector::ComplexRendererFns
4784097a140dSpatrick AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4785097a140dSpatrick Register RSrcReg;
4786097a140dSpatrick Register SOffset;
4787097a140dSpatrick int64_t Offset = 0;
4788097a140dSpatrick
4789097a140dSpatrick if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4790097a140dSpatrick return {};
4791097a140dSpatrick
4792097a140dSpatrick return {{
4793097a140dSpatrick [=](MachineInstrBuilder &MIB) { // rsrc
4794097a140dSpatrick MIB.addReg(RSrcReg);
4795097a140dSpatrick },
4796097a140dSpatrick [=](MachineInstrBuilder &MIB) { // soffset
4797097a140dSpatrick if (SOffset)
4798097a140dSpatrick MIB.addReg(SOffset);
4799097a140dSpatrick else
4800097a140dSpatrick MIB.addImm(0);
4801097a140dSpatrick },
4802097a140dSpatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
480373471bf0Spatrick [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
4804097a140dSpatrick }};
4805097a140dSpatrick }
4806097a140dSpatrick
4807097a140dSpatrick /// Get an immediate that must be 32-bits, and treated as zero extended.
4808*d415bd75Srobert static std::optional<uint64_t>
4809*d415bd75Srobert getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
4810*d415bd75Srobert   // getIConstantVRegSExtVal sign extends the value, so check whether that matters.
4811*d415bd75Srobert std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
4812097a140dSpatrick if (!OffsetVal || !isInt<32>(*OffsetVal))
4813*d415bd75Srobert return std::nullopt;
4814097a140dSpatrick return Lo_32(*OffsetVal);
4815097a140dSpatrick }
4816097a140dSpatrick
4817097a140dSpatrick InstructionSelector::ComplexRendererFns
4818097a140dSpatrick AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4819*d415bd75Srobert std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4820097a140dSpatrick if (!OffsetVal)
4821097a140dSpatrick return {};
4822097a140dSpatrick
4823*d415bd75Srobert std::optional<int64_t> EncodedImm =
4824097a140dSpatrick AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4825097a140dSpatrick if (!EncodedImm)
4826097a140dSpatrick return {};
4827097a140dSpatrick
4828097a140dSpatrick return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4829097a140dSpatrick }
4830097a140dSpatrick
4831097a140dSpatrick InstructionSelector::ComplexRendererFns
4832097a140dSpatrick AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4833097a140dSpatrick assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4834097a140dSpatrick
4835*d415bd75Srobert std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4836097a140dSpatrick if (!OffsetVal)
4837097a140dSpatrick return {};
4838097a140dSpatrick
4839*d415bd75Srobert std::optional<int64_t> EncodedImm =
4840*d415bd75Srobert AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4841097a140dSpatrick if (!EncodedImm)
4842097a140dSpatrick return {};
4843097a140dSpatrick
4844097a140dSpatrick return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4845097a140dSpatrick }
4846097a140dSpatrick
4847*d415bd75Srobert InstructionSelector::ComplexRendererFns
4848*d415bd75Srobert AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
4849*d415bd75Srobert // Match the (soffset + offset) pair as a 32-bit register base and
4850*d415bd75Srobert // an immediate offset.
4851*d415bd75Srobert Register SOffset;
4852*d415bd75Srobert unsigned Offset;
4853*d415bd75Srobert std::tie(SOffset, Offset) =
4854*d415bd75Srobert AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KnownBits);
4855*d415bd75Srobert if (!SOffset)
4856*d415bd75Srobert return std::nullopt;
4857*d415bd75Srobert
4858*d415bd75Srobert std::optional<int64_t> EncodedOffset =
4859*d415bd75Srobert AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
4860*d415bd75Srobert if (!EncodedOffset)
4861*d415bd75Srobert return std::nullopt;
4862*d415bd75Srobert
4863*d415bd75Srobert assert(MRI->getType(SOffset) == LLT::scalar(32));
4864*d415bd75Srobert return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4865*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
4866*d415bd75Srobert }
4867*d415bd75Srobert
4868*d415bd75Srobert // Variant of stripBitCast that returns the instruction instead of a
4869*d415bd75Srobert // MachineOperand.
4870*d415bd75Srobert static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
4871*d415bd75Srobert if (MI->getOpcode() == AMDGPU::G_BITCAST)
4872*d415bd75Srobert return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4873*d415bd75Srobert return MI;
4874*d415bd75Srobert }
4875*d415bd75Srobert
4876*d415bd75Srobert // Figure out if this is really an extract of the high 16 bits of a dword;
4877*d415bd75Srobert // returns nullptr if it isn't.
4878*d415bd75Srobert static MachineInstr *isExtractHiElt(MachineInstr *Inst,
4879*d415bd75Srobert MachineRegisterInfo &MRI) {
4880*d415bd75Srobert Inst = stripBitCast(Inst, MRI);
4881*d415bd75Srobert
4882*d415bd75Srobert if (Inst->getOpcode() != AMDGPU::G_TRUNC)
4883*d415bd75Srobert return nullptr;
4884*d415bd75Srobert
4885*d415bd75Srobert MachineInstr *TruncOp =
4886*d415bd75Srobert getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
4887*d415bd75Srobert TruncOp = stripBitCast(TruncOp, MRI);
4888*d415bd75Srobert
4889*d415bd75Srobert // G_LSHR x, (G_CONSTANT i32 16)
4890*d415bd75Srobert if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
4891*d415bd75Srobert auto SrlAmount = getIConstantVRegValWithLookThrough(
4892*d415bd75Srobert TruncOp->getOperand(2).getReg(), MRI);
4893*d415bd75Srobert if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
4894*d415bd75Srobert MachineInstr *SrlOp =
4895*d415bd75Srobert getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
4896*d415bd75Srobert return stripBitCast(SrlOp, MRI);
4897*d415bd75Srobert }
4898*d415bd75Srobert }
4899*d415bd75Srobert
4900*d415bd75Srobert // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
4901*d415bd75Srobert // 1, 0 swaps the low/high 16 bits.
4902*d415bd75Srobert // 1, 1 sets the high 16 bits to be the same as the low 16.
4903*d415bd75Srobert // In either case, it selects the high elements.
4904*d415bd75Srobert if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
4905*d415bd75Srobert assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
4906*d415bd75Srobert LLT::fixed_vector(2, 16));
4907*d415bd75Srobert
4908*d415bd75Srobert ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
4909*d415bd75Srobert assert(Mask.size() == 2);
4910*d415bd75Srobert
4911*d415bd75Srobert if (Mask[0] == 1 && Mask[1] <= 1) {
4912*d415bd75Srobert MachineInstr *LHS =
4913*d415bd75Srobert getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
4914*d415bd75Srobert return stripBitCast(LHS, MRI);
4915*d415bd75Srobert }
4916*d415bd75Srobert }
4917*d415bd75Srobert
4918*d415bd75Srobert return nullptr;
4919*d415bd75Srobert }
4920*d415bd75Srobert
4921*d415bd75Srobert std::pair<Register, unsigned>
4922*d415bd75Srobert AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
4923*d415bd75Srobert bool &Matched) const {
4924*d415bd75Srobert Matched = false;
4925*d415bd75Srobert
4926*d415bd75Srobert Register Src;
4927*d415bd75Srobert unsigned Mods;
4928*d415bd75Srobert std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4929*d415bd75Srobert
4930*d415bd75Srobert MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4931*d415bd75Srobert if (MI->getOpcode() == AMDGPU::G_FPEXT) {
4932*d415bd75Srobert MachineOperand *MO = &MI->getOperand(1);
4933*d415bd75Srobert Src = MO->getReg();
4934*d415bd75Srobert MI = getDefIgnoringCopies(Src, *MRI);
4935*d415bd75Srobert
4936*d415bd75Srobert assert(MRI->getType(Src) == LLT::scalar(16));
4937*d415bd75Srobert
4938*d415bd75Srobert // See through bitcasts.
4939*d415bd75Srobert // FIXME: Would be nice to use stripBitCast here.
4940*d415bd75Srobert if (MI->getOpcode() == AMDGPU::G_BITCAST) {
4941*d415bd75Srobert MO = &MI->getOperand(1);
4942*d415bd75Srobert Src = MO->getReg();
4943*d415bd75Srobert MI = getDefIgnoringCopies(Src, *MRI);
4944*d415bd75Srobert }
4945*d415bd75Srobert
4946*d415bd75Srobert const auto CheckAbsNeg = [&]() {
4947*d415bd75Srobert // Be careful about folding modifiers if we already have an abs. fneg is
4948*d415bd75Srobert // applied last, so we don't want to apply an earlier fneg.
4949*d415bd75Srobert if ((Mods & SISrcMods::ABS) == 0) {
4950*d415bd75Srobert unsigned ModsTmp;
4951*d415bd75Srobert std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
4952*d415bd75Srobert MI = getDefIgnoringCopies(Src, *MRI);
4953*d415bd75Srobert
4954*d415bd75Srobert if ((ModsTmp & SISrcMods::NEG) != 0)
4955*d415bd75Srobert Mods ^= SISrcMods::NEG;
4956*d415bd75Srobert
4957*d415bd75Srobert if ((ModsTmp & SISrcMods::ABS) != 0)
4958*d415bd75Srobert Mods |= SISrcMods::ABS;
4959*d415bd75Srobert }
4960*d415bd75Srobert };
4961*d415bd75Srobert
4962*d415bd75Srobert CheckAbsNeg();
4963*d415bd75Srobert
4964*d415bd75Srobert // op_sel/op_sel_hi decide the source type and source.
4965*d415bd75Srobert // If the source's op_sel_hi is set, it indicates to do a conversion from
4966*d415bd75Srobert   // fp16. If the source's op_sel is set, it picks the high half of the
4967*d415bd75Srobert // source register.
4968*d415bd75Srobert
4969*d415bd75Srobert Mods |= SISrcMods::OP_SEL_1;
4970*d415bd75Srobert
4971*d415bd75Srobert if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
4972*d415bd75Srobert Mods |= SISrcMods::OP_SEL_0;
4973*d415bd75Srobert MI = ExtractHiEltMI;
4974*d415bd75Srobert MO = &MI->getOperand(0);
4975*d415bd75Srobert Src = MO->getReg();
4976*d415bd75Srobert
4977*d415bd75Srobert CheckAbsNeg();
4978*d415bd75Srobert }
4979*d415bd75Srobert
4980*d415bd75Srobert Matched = true;
4981*d415bd75Srobert }
4982*d415bd75Srobert
4983*d415bd75Srobert return {Src, Mods};
4984*d415bd75Srobert }
4985*d415bd75Srobert
4986*d415bd75Srobert InstructionSelector::ComplexRendererFns
4987*d415bd75Srobert AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
4988*d415bd75Srobert Register Src;
4989*d415bd75Srobert unsigned Mods;
4990*d415bd75Srobert bool Matched;
4991*d415bd75Srobert std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
4992*d415bd75Srobert
4993*d415bd75Srobert return {{
4994*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4995*d415bd75Srobert [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4996*d415bd75Srobert }};
4997*d415bd75Srobert }
4998*d415bd75Srobert
499909467b48Spatrick void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
500009467b48Spatrick const MachineInstr &MI,
500109467b48Spatrick int OpIdx) const {
500209467b48Spatrick assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
500309467b48Spatrick "Expected G_CONSTANT");
5004097a140dSpatrick MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
500509467b48Spatrick }
500609467b48Spatrick
500709467b48Spatrick void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
500809467b48Spatrick const MachineInstr &MI,
500909467b48Spatrick int OpIdx) const {
501009467b48Spatrick assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
501109467b48Spatrick "Expected G_CONSTANT");
501209467b48Spatrick MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
501309467b48Spatrick }
501409467b48Spatrick
501509467b48Spatrick void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
501609467b48Spatrick const MachineInstr &MI,
501709467b48Spatrick int OpIdx) const {
501809467b48Spatrick assert(OpIdx == -1);
501909467b48Spatrick
502009467b48Spatrick const MachineOperand &Op = MI.getOperand(1);
502109467b48Spatrick if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
502209467b48Spatrick MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
502309467b48Spatrick else {
502409467b48Spatrick assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
502509467b48Spatrick MIB.addImm(Op.getCImm()->getSExtValue());
502609467b48Spatrick }
502709467b48Spatrick }
502809467b48Spatrick
502909467b48Spatrick void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
503009467b48Spatrick const MachineInstr &MI,
503109467b48Spatrick int OpIdx) const {
503209467b48Spatrick assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
503309467b48Spatrick "Expected G_CONSTANT");
503409467b48Spatrick MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
503509467b48Spatrick }
503609467b48Spatrick
503709467b48Spatrick /// This only really exists to satisfy DAG type checking machinery, so is a
503809467b48Spatrick /// no-op here.
503909467b48Spatrick void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
504009467b48Spatrick const MachineInstr &MI,
504109467b48Spatrick int OpIdx) const {
504209467b48Spatrick MIB.addImm(MI.getOperand(OpIdx).getImm());
504309467b48Spatrick }
504409467b48Spatrick
504573471bf0Spatrick void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5046097a140dSpatrick const MachineInstr &MI,
5047097a140dSpatrick int OpIdx) const {
5048097a140dSpatrick assert(OpIdx >= 0 && "expected to match an immediate operand");
504973471bf0Spatrick MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
5050097a140dSpatrick }
5051097a140dSpatrick
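// A hedged note: on the buffer intrinsics the aux/cachepolicy immediate
// appears to keep the swizzle flag in bit 3, which the shift below extracts.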
5052097a140dSpatrick void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5053097a140dSpatrick const MachineInstr &MI,
5054097a140dSpatrick int OpIdx) const {
5055097a140dSpatrick assert(OpIdx >= 0 && "expected to match an immediate operand");
5056097a140dSpatrick MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
5057097a140dSpatrick }
5058097a140dSpatrick
505973471bf0Spatrick void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
506073471bf0Spatrick const MachineInstr &MI,
506173471bf0Spatrick int OpIdx) const {
506273471bf0Spatrick assert(OpIdx >= 0 && "expected to match an immediate operand");
506373471bf0Spatrick MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
506473471bf0Spatrick }
506573471bf0Spatrick
506673471bf0Spatrick void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
506773471bf0Spatrick const MachineInstr &MI,
506873471bf0Spatrick int OpIdx) const {
506973471bf0Spatrick MIB.addFrameIndex((MI.getOperand(1).getIndex()));
507073471bf0Spatrick }
507173471bf0Spatrick
507209467b48Spatrick bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
507309467b48Spatrick return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
507409467b48Spatrick }
507509467b48Spatrick
507609467b48Spatrick bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
507709467b48Spatrick return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
507809467b48Spatrick }
507909467b48Spatrick
508009467b48Spatrick bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
508109467b48Spatrick return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
508209467b48Spatrick }
508309467b48Spatrick
508409467b48Spatrick bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
508509467b48Spatrick return TII.isInlineConstant(Imm);
508609467b48Spatrick }
5087