//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
/// Handles all cases of temporal divergence.
/// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass
/// currently depends on LCSSA to insert phis with one incoming.
155f757f3fSDimitry Andric // 165f757f3fSDimitry Andric //===----------------------------------------------------------------------===// 175f757f3fSDimitry Andric 185f757f3fSDimitry Andric #include "AMDGPU.h" 19*0fca6ea1SDimitry Andric #include "SILowerI1Copies.h" 20*0fca6ea1SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 215f757f3fSDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h" 22*0fca6ea1SDimitry Andric #include "llvm/CodeGen/MachineUniformityAnalysis.h" 23*0fca6ea1SDimitry Andric #include "llvm/InitializePasses.h" 245f757f3fSDimitry Andric 255f757f3fSDimitry Andric #define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering" 265f757f3fSDimitry Andric 275f757f3fSDimitry Andric using namespace llvm; 285f757f3fSDimitry Andric 295f757f3fSDimitry Andric namespace { 305f757f3fSDimitry Andric 315f757f3fSDimitry Andric class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass { 325f757f3fSDimitry Andric public: 335f757f3fSDimitry Andric static char ID; 345f757f3fSDimitry Andric 355f757f3fSDimitry Andric public: 365f757f3fSDimitry Andric AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) { 375f757f3fSDimitry Andric initializeAMDGPUGlobalISelDivergenceLoweringPass( 385f757f3fSDimitry Andric *PassRegistry::getPassRegistry()); 395f757f3fSDimitry Andric } 405f757f3fSDimitry Andric 415f757f3fSDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 425f757f3fSDimitry Andric 435f757f3fSDimitry Andric StringRef getPassName() const override { 445f757f3fSDimitry Andric return "AMDGPU GlobalISel divergence lowering"; 455f757f3fSDimitry Andric } 465f757f3fSDimitry Andric 475f757f3fSDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override { 485f757f3fSDimitry Andric AU.setPreservesCFG(); 49*0fca6ea1SDimitry Andric AU.addRequired<MachineDominatorTreeWrapperPass>(); 50*0fca6ea1SDimitry Andric AU.addRequired<MachinePostDominatorTreeWrapperPass>(); 51*0fca6ea1SDimitry Andric 
AU.addRequired<MachineUniformityAnalysisPass>(); 525f757f3fSDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 535f757f3fSDimitry Andric } 545f757f3fSDimitry Andric }; 555f757f3fSDimitry Andric 56*0fca6ea1SDimitry Andric class DivergenceLoweringHelper : public PhiLoweringHelper { 57*0fca6ea1SDimitry Andric public: 58*0fca6ea1SDimitry Andric DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, 59*0fca6ea1SDimitry Andric MachinePostDominatorTree *PDT, 60*0fca6ea1SDimitry Andric MachineUniformityInfo *MUI); 61*0fca6ea1SDimitry Andric 62*0fca6ea1SDimitry Andric private: 63*0fca6ea1SDimitry Andric MachineUniformityInfo *MUI = nullptr; 64*0fca6ea1SDimitry Andric MachineIRBuilder B; 65*0fca6ea1SDimitry Andric Register buildRegCopyToLaneMask(Register Reg); 66*0fca6ea1SDimitry Andric 67*0fca6ea1SDimitry Andric public: 68*0fca6ea1SDimitry Andric void markAsLaneMask(Register DstReg) const override; 69*0fca6ea1SDimitry Andric void getCandidatesForLowering( 70*0fca6ea1SDimitry Andric SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override; 71*0fca6ea1SDimitry Andric void collectIncomingValuesFromPhi( 72*0fca6ea1SDimitry Andric const MachineInstr *MI, 73*0fca6ea1SDimitry Andric SmallVectorImpl<Incoming> &Incomings) const override; 74*0fca6ea1SDimitry Andric void replaceDstReg(Register NewReg, Register OldReg, 75*0fca6ea1SDimitry Andric MachineBasicBlock *MBB) override; 76*0fca6ea1SDimitry Andric void buildMergeLaneMasks(MachineBasicBlock &MBB, 77*0fca6ea1SDimitry Andric MachineBasicBlock::iterator I, const DebugLoc &DL, 78*0fca6ea1SDimitry Andric Register DstReg, Register PrevReg, 79*0fca6ea1SDimitry Andric Register CurReg) override; 80*0fca6ea1SDimitry Andric void constrainAsLaneMask(Incoming &In) override; 81*0fca6ea1SDimitry Andric }; 82*0fca6ea1SDimitry Andric 83*0fca6ea1SDimitry Andric DivergenceLoweringHelper::DivergenceLoweringHelper( 84*0fca6ea1SDimitry Andric MachineFunction *MF, MachineDominatorTree *DT, 85*0fca6ea1SDimitry Andric 
MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI) 86*0fca6ea1SDimitry Andric : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {} 87*0fca6ea1SDimitry Andric 88*0fca6ea1SDimitry Andric // _(s1) -> SReg_32/64(s1) 89*0fca6ea1SDimitry Andric void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const { 90*0fca6ea1SDimitry Andric assert(MRI->getType(DstReg) == LLT::scalar(1)); 91*0fca6ea1SDimitry Andric 92*0fca6ea1SDimitry Andric if (MRI->getRegClassOrNull(DstReg)) { 93*0fca6ea1SDimitry Andric if (MRI->constrainRegClass(DstReg, ST->getBoolRC())) 94*0fca6ea1SDimitry Andric return; 95*0fca6ea1SDimitry Andric llvm_unreachable("Failed to constrain register class"); 96*0fca6ea1SDimitry Andric } 97*0fca6ea1SDimitry Andric 98*0fca6ea1SDimitry Andric MRI->setRegClass(DstReg, ST->getBoolRC()); 99*0fca6ea1SDimitry Andric } 100*0fca6ea1SDimitry Andric 101*0fca6ea1SDimitry Andric void DivergenceLoweringHelper::getCandidatesForLowering( 102*0fca6ea1SDimitry Andric SmallVectorImpl<MachineInstr *> &Vreg1Phis) const { 103*0fca6ea1SDimitry Andric LLT S1 = LLT::scalar(1); 104*0fca6ea1SDimitry Andric 105*0fca6ea1SDimitry Andric // Add divergent i1 phis to the list 106*0fca6ea1SDimitry Andric for (MachineBasicBlock &MBB : *MF) { 107*0fca6ea1SDimitry Andric for (MachineInstr &MI : MBB.phis()) { 108*0fca6ea1SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 109*0fca6ea1SDimitry Andric if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst)) 110*0fca6ea1SDimitry Andric Vreg1Phis.push_back(&MI); 111*0fca6ea1SDimitry Andric } 112*0fca6ea1SDimitry Andric } 113*0fca6ea1SDimitry Andric } 114*0fca6ea1SDimitry Andric 115*0fca6ea1SDimitry Andric void DivergenceLoweringHelper::collectIncomingValuesFromPhi( 116*0fca6ea1SDimitry Andric const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const { 117*0fca6ea1SDimitry Andric for (unsigned i = 1; i < MI->getNumOperands(); i += 2) { 118*0fca6ea1SDimitry Andric Incomings.emplace_back(MI->getOperand(i).getReg(), 
119*0fca6ea1SDimitry Andric MI->getOperand(i + 1).getMBB(), Register()); 120*0fca6ea1SDimitry Andric } 121*0fca6ea1SDimitry Andric } 122*0fca6ea1SDimitry Andric 123*0fca6ea1SDimitry Andric void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg, 124*0fca6ea1SDimitry Andric MachineBasicBlock *MBB) { 125*0fca6ea1SDimitry Andric BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg) 126*0fca6ea1SDimitry Andric .addReg(NewReg); 127*0fca6ea1SDimitry Andric } 128*0fca6ea1SDimitry Andric 129*0fca6ea1SDimitry Andric // Copy Reg to new lane mask register, insert a copy after instruction that 130*0fca6ea1SDimitry Andric // defines Reg while skipping phis if needed. 131*0fca6ea1SDimitry Andric Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) { 132*0fca6ea1SDimitry Andric Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs); 133*0fca6ea1SDimitry Andric MachineInstr *Instr = MRI->getVRegDef(Reg); 134*0fca6ea1SDimitry Andric MachineBasicBlock *MBB = Instr->getParent(); 135*0fca6ea1SDimitry Andric B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator()))); 136*0fca6ea1SDimitry Andric B.buildCopy(LaneMask, Reg); 137*0fca6ea1SDimitry Andric return LaneMask; 138*0fca6ea1SDimitry Andric } 139*0fca6ea1SDimitry Andric 140*0fca6ea1SDimitry Andric // bb.previous 141*0fca6ea1SDimitry Andric // %PrevReg = ... 142*0fca6ea1SDimitry Andric // 143*0fca6ea1SDimitry Andric // bb.current 144*0fca6ea1SDimitry Andric // %CurReg = ... 145*0fca6ea1SDimitry Andric // 146*0fca6ea1SDimitry Andric // %DstReg - not defined 147*0fca6ea1SDimitry Andric // 148*0fca6ea1SDimitry Andric // -> (wave32 example, new registers have sreg_32 reg class and S1 LLT) 149*0fca6ea1SDimitry Andric // 150*0fca6ea1SDimitry Andric // bb.previous 151*0fca6ea1SDimitry Andric // %PrevReg = ... 
152*0fca6ea1SDimitry Andric // %PrevRegCopy:sreg_32(s1) = COPY %PrevReg 153*0fca6ea1SDimitry Andric // 154*0fca6ea1SDimitry Andric // bb.current 155*0fca6ea1SDimitry Andric // %CurReg = ... 156*0fca6ea1SDimitry Andric // %CurRegCopy:sreg_32(s1) = COPY %CurReg 157*0fca6ea1SDimitry Andric // ... 158*0fca6ea1SDimitry Andric // %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0 159*0fca6ea1SDimitry Andric // %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0 160*0fca6ea1SDimitry Andric // %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg 161*0fca6ea1SDimitry Andric // 162*0fca6ea1SDimitry Andric // DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg 163*0fca6ea1SDimitry Andric void DivergenceLoweringHelper::buildMergeLaneMasks( 164*0fca6ea1SDimitry Andric MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, 165*0fca6ea1SDimitry Andric Register DstReg, Register PrevReg, Register CurReg) { 166*0fca6ea1SDimitry Andric // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC) 167*0fca6ea1SDimitry Andric // TODO: check if inputs are constants or results of a compare. 
168*0fca6ea1SDimitry Andric 169*0fca6ea1SDimitry Andric Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg); 170*0fca6ea1SDimitry Andric Register CurRegCopy = buildRegCopyToLaneMask(CurReg); 171*0fca6ea1SDimitry Andric Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); 172*0fca6ea1SDimitry Andric Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); 173*0fca6ea1SDimitry Andric 174*0fca6ea1SDimitry Andric B.setInsertPt(MBB, I); 175*0fca6ea1SDimitry Andric B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg}); 176*0fca6ea1SDimitry Andric B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy}); 177*0fca6ea1SDimitry Andric B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg}); 178*0fca6ea1SDimitry Andric } 179*0fca6ea1SDimitry Andric 180*0fca6ea1SDimitry Andric // GlobalISel has to constrain S1 incoming taken as-is with lane mask register 181*0fca6ea1SDimitry Andric // class. Insert a copy of Incoming.Reg to new lane mask inside Incoming.Block, 182*0fca6ea1SDimitry Andric // Incoming.Reg becomes that new lane mask. 183*0fca6ea1SDimitry Andric void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { 184*0fca6ea1SDimitry Andric B.setInsertPt(*In.Block, In.Block->getFirstTerminator()); 185*0fca6ea1SDimitry Andric 186*0fca6ea1SDimitry Andric auto Copy = B.buildCopy(LLT::scalar(1), In.Reg); 187*0fca6ea1SDimitry Andric MRI->setRegClass(Copy.getReg(0), ST->getBoolRC()); 188*0fca6ea1SDimitry Andric In.Reg = Copy.getReg(0); 189*0fca6ea1SDimitry Andric } 190*0fca6ea1SDimitry Andric 1915f757f3fSDimitry Andric } // End anonymous namespace. 
1925f757f3fSDimitry Andric 1935f757f3fSDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, 1945f757f3fSDimitry Andric "AMDGPU GlobalISel divergence lowering", false, false) 195*0fca6ea1SDimitry Andric INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) 196*0fca6ea1SDimitry Andric INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) 197*0fca6ea1SDimitry Andric INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) 1985f757f3fSDimitry Andric INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, 1995f757f3fSDimitry Andric "AMDGPU GlobalISel divergence lowering", false, false) 2005f757f3fSDimitry Andric 2015f757f3fSDimitry Andric char AMDGPUGlobalISelDivergenceLowering::ID = 0; 2025f757f3fSDimitry Andric 2035f757f3fSDimitry Andric char &llvm::AMDGPUGlobalISelDivergenceLoweringID = 2045f757f3fSDimitry Andric AMDGPUGlobalISelDivergenceLowering::ID; 2055f757f3fSDimitry Andric 2065f757f3fSDimitry Andric FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() { 2075f757f3fSDimitry Andric return new AMDGPUGlobalISelDivergenceLowering(); 2085f757f3fSDimitry Andric } 2095f757f3fSDimitry Andric 2105f757f3fSDimitry Andric bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction( 2115f757f3fSDimitry Andric MachineFunction &MF) { 212*0fca6ea1SDimitry Andric MachineDominatorTree &DT = 213*0fca6ea1SDimitry Andric getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); 214*0fca6ea1SDimitry Andric MachinePostDominatorTree &PDT = 215*0fca6ea1SDimitry Andric getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); 216*0fca6ea1SDimitry Andric MachineUniformityInfo &MUI = 217*0fca6ea1SDimitry Andric getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo(); 218*0fca6ea1SDimitry Andric 219*0fca6ea1SDimitry Andric DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI); 220*0fca6ea1SDimitry Andric 221*0fca6ea1SDimitry Andric return Helper.lowerPhis(); 2225f757f3fSDimitry Andric } 223