xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp (revision 4b24c2dfb5c02896ec2e9855ac72eb0771d0764d)
16892c175SPetar Avramovic //===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
26892c175SPetar Avramovic //
36892c175SPetar Avramovic // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
46892c175SPetar Avramovic // See https://llvm.org/LICENSE.txt for license information.
56892c175SPetar Avramovic // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66892c175SPetar Avramovic //
76892c175SPetar Avramovic //===----------------------------------------------------------------------===//
86892c175SPetar Avramovic //
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
/// Handles all cases of temporal divergence.
/// For divergent non-phi i1 and uniform i1 uses outside of the cycle, this
/// pass currently depends on LCSSA to insert phis with a single incoming value.
156892c175SPetar Avramovic //
166892c175SPetar Avramovic //===----------------------------------------------------------------------===//
176892c175SPetar Avramovic 
186892c175SPetar Avramovic #include "AMDGPU.h"
1906f711a9SPetar Avramovic #include "SILowerI1Copies.h"
2006f711a9SPetar Avramovic #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
216892c175SPetar Avramovic #include "llvm/CodeGen/MachineFunctionPass.h"
2206f711a9SPetar Avramovic #include "llvm/CodeGen/MachineUniformityAnalysis.h"
2306f711a9SPetar Avramovic #include "llvm/InitializePasses.h"
246892c175SPetar Avramovic 
256892c175SPetar Avramovic #define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
266892c175SPetar Avramovic 
276892c175SPetar Avramovic using namespace llvm;
286892c175SPetar Avramovic 
296892c175SPetar Avramovic namespace {
306892c175SPetar Avramovic 
316892c175SPetar Avramovic class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
326892c175SPetar Avramovic public:
336892c175SPetar Avramovic   static char ID;
346892c175SPetar Avramovic 
356892c175SPetar Avramovic public:
AMDGPUGlobalISelDivergenceLowering()366892c175SPetar Avramovic   AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {
376892c175SPetar Avramovic     initializeAMDGPUGlobalISelDivergenceLoweringPass(
386892c175SPetar Avramovic         *PassRegistry::getPassRegistry());
396892c175SPetar Avramovic   }
406892c175SPetar Avramovic 
416892c175SPetar Avramovic   bool runOnMachineFunction(MachineFunction &MF) override;
426892c175SPetar Avramovic 
getPassName() const436892c175SPetar Avramovic   StringRef getPassName() const override {
446892c175SPetar Avramovic     return "AMDGPU GlobalISel divergence lowering";
456892c175SPetar Avramovic   }
466892c175SPetar Avramovic 
getAnalysisUsage(AnalysisUsage & AU) const476892c175SPetar Avramovic   void getAnalysisUsage(AnalysisUsage &AU) const override {
486892c175SPetar Avramovic     AU.setPreservesCFG();
49837dc542Spaperchalice     AU.addRequired<MachineDominatorTreeWrapperPass>();
50*4b24c2dfSpaperchalice     AU.addRequired<MachinePostDominatorTreeWrapperPass>();
5106f711a9SPetar Avramovic     AU.addRequired<MachineUniformityAnalysisPass>();
526892c175SPetar Avramovic     MachineFunctionPass::getAnalysisUsage(AU);
536892c175SPetar Avramovic   }
546892c175SPetar Avramovic };
556892c175SPetar Avramovic 
5606f711a9SPetar Avramovic class DivergenceLoweringHelper : public PhiLoweringHelper {
5706f711a9SPetar Avramovic public:
5806f711a9SPetar Avramovic   DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
5906f711a9SPetar Avramovic                            MachinePostDominatorTree *PDT,
6006f711a9SPetar Avramovic                            MachineUniformityInfo *MUI);
6106f711a9SPetar Avramovic 
6206f711a9SPetar Avramovic private:
6306f711a9SPetar Avramovic   MachineUniformityInfo *MUI = nullptr;
6406f711a9SPetar Avramovic   MachineIRBuilder B;
6506f711a9SPetar Avramovic   Register buildRegCopyToLaneMask(Register Reg);
6606f711a9SPetar Avramovic 
6706f711a9SPetar Avramovic public:
6806f711a9SPetar Avramovic   void markAsLaneMask(Register DstReg) const override;
6906f711a9SPetar Avramovic   void getCandidatesForLowering(
7006f711a9SPetar Avramovic       SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
7106f711a9SPetar Avramovic   void collectIncomingValuesFromPhi(
7206f711a9SPetar Avramovic       const MachineInstr *MI,
7306f711a9SPetar Avramovic       SmallVectorImpl<Incoming> &Incomings) const override;
7406f711a9SPetar Avramovic   void replaceDstReg(Register NewReg, Register OldReg,
7506f711a9SPetar Avramovic                      MachineBasicBlock *MBB) override;
7606f711a9SPetar Avramovic   void buildMergeLaneMasks(MachineBasicBlock &MBB,
7706f711a9SPetar Avramovic                            MachineBasicBlock::iterator I, const DebugLoc &DL,
7806f711a9SPetar Avramovic                            Register DstReg, Register PrevReg,
7906f711a9SPetar Avramovic                            Register CurReg) override;
8006f711a9SPetar Avramovic   void constrainAsLaneMask(Incoming &In) override;
8106f711a9SPetar Avramovic };
8206f711a9SPetar Avramovic 
DivergenceLoweringHelper(MachineFunction * MF,MachineDominatorTree * DT,MachinePostDominatorTree * PDT,MachineUniformityInfo * MUI)8306f711a9SPetar Avramovic DivergenceLoweringHelper::DivergenceLoweringHelper(
8406f711a9SPetar Avramovic     MachineFunction *MF, MachineDominatorTree *DT,
8506f711a9SPetar Avramovic     MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
8606f711a9SPetar Avramovic     : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}
8706f711a9SPetar Avramovic 
8806f711a9SPetar Avramovic // _(s1) -> SReg_32/64(s1)
markAsLaneMask(Register DstReg) const8906f711a9SPetar Avramovic void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
9006f711a9SPetar Avramovic   assert(MRI->getType(DstReg) == LLT::scalar(1));
9106f711a9SPetar Avramovic 
9206f711a9SPetar Avramovic   if (MRI->getRegClassOrNull(DstReg)) {
9306f711a9SPetar Avramovic     if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
9406f711a9SPetar Avramovic       return;
9506f711a9SPetar Avramovic     llvm_unreachable("Failed to constrain register class");
9606f711a9SPetar Avramovic   }
9706f711a9SPetar Avramovic 
9806f711a9SPetar Avramovic   MRI->setRegClass(DstReg, ST->getBoolRC());
9906f711a9SPetar Avramovic }
10006f711a9SPetar Avramovic 
getCandidatesForLowering(SmallVectorImpl<MachineInstr * > & Vreg1Phis) const10106f711a9SPetar Avramovic void DivergenceLoweringHelper::getCandidatesForLowering(
10206f711a9SPetar Avramovic     SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
10306f711a9SPetar Avramovic   LLT S1 = LLT::scalar(1);
10406f711a9SPetar Avramovic 
10506f711a9SPetar Avramovic   // Add divergent i1 phis to the list
10606f711a9SPetar Avramovic   for (MachineBasicBlock &MBB : *MF) {
10706f711a9SPetar Avramovic     for (MachineInstr &MI : MBB.phis()) {
10806f711a9SPetar Avramovic       Register Dst = MI.getOperand(0).getReg();
10906f711a9SPetar Avramovic       if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
11006f711a9SPetar Avramovic         Vreg1Phis.push_back(&MI);
11106f711a9SPetar Avramovic     }
11206f711a9SPetar Avramovic   }
11306f711a9SPetar Avramovic }
11406f711a9SPetar Avramovic 
collectIncomingValuesFromPhi(const MachineInstr * MI,SmallVectorImpl<Incoming> & Incomings) const11506f711a9SPetar Avramovic void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
11606f711a9SPetar Avramovic     const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
11706f711a9SPetar Avramovic   for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
11806f711a9SPetar Avramovic     Incomings.emplace_back(MI->getOperand(i).getReg(),
11906f711a9SPetar Avramovic                            MI->getOperand(i + 1).getMBB(), Register());
12006f711a9SPetar Avramovic   }
12106f711a9SPetar Avramovic }
12206f711a9SPetar Avramovic 
replaceDstReg(Register NewReg,Register OldReg,MachineBasicBlock * MBB)12306f711a9SPetar Avramovic void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
12406f711a9SPetar Avramovic                                              MachineBasicBlock *MBB) {
12506f711a9SPetar Avramovic   BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
12606f711a9SPetar Avramovic       .addReg(NewReg);
12706f711a9SPetar Avramovic }
12806f711a9SPetar Avramovic 
12906f711a9SPetar Avramovic // Copy Reg to new lane mask register, insert a copy after instruction that
13006f711a9SPetar Avramovic // defines Reg while skipping phis if needed.
buildRegCopyToLaneMask(Register Reg)13106f711a9SPetar Avramovic Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
13206f711a9SPetar Avramovic   Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
13306f711a9SPetar Avramovic   MachineInstr *Instr = MRI->getVRegDef(Reg);
13406f711a9SPetar Avramovic   MachineBasicBlock *MBB = Instr->getParent();
13506f711a9SPetar Avramovic   B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
13606f711a9SPetar Avramovic   B.buildCopy(LaneMask, Reg);
13706f711a9SPetar Avramovic   return LaneMask;
13806f711a9SPetar Avramovic }
13906f711a9SPetar Avramovic 
14006f711a9SPetar Avramovic // bb.previous
14106f711a9SPetar Avramovic //   %PrevReg = ...
14206f711a9SPetar Avramovic //
14306f711a9SPetar Avramovic // bb.current
14406f711a9SPetar Avramovic //   %CurReg = ...
14506f711a9SPetar Avramovic //
14606f711a9SPetar Avramovic //   %DstReg - not defined
14706f711a9SPetar Avramovic //
14806f711a9SPetar Avramovic // -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
14906f711a9SPetar Avramovic //
15006f711a9SPetar Avramovic // bb.previous
15106f711a9SPetar Avramovic //   %PrevReg = ...
15206f711a9SPetar Avramovic //   %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
15306f711a9SPetar Avramovic //
15406f711a9SPetar Avramovic // bb.current
15506f711a9SPetar Avramovic //   %CurReg = ...
15606f711a9SPetar Avramovic //   %CurRegCopy:sreg_32(s1) = COPY %CurReg
15706f711a9SPetar Avramovic //   ...
15806f711a9SPetar Avramovic //   %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0
15906f711a9SPetar Avramovic //   %CurMaskedReg:sreg_32(s1)  = AND %ExecReg, CurRegCopy - inactive lanes to 0
16006f711a9SPetar Avramovic //   %DstReg:sreg_32(s1)        = OR %PrevMaskedReg, CurMaskedReg
16106f711a9SPetar Avramovic //
16206f711a9SPetar Avramovic // DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
buildMergeLaneMasks(MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register DstReg,Register PrevReg,Register CurReg)16306f711a9SPetar Avramovic void DivergenceLoweringHelper::buildMergeLaneMasks(
16406f711a9SPetar Avramovic     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
16506f711a9SPetar Avramovic     Register DstReg, Register PrevReg, Register CurReg) {
16606f711a9SPetar Avramovic   // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
16706f711a9SPetar Avramovic   // TODO: check if inputs are constants or results of a compare.
16806f711a9SPetar Avramovic 
16906f711a9SPetar Avramovic   Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
17006f711a9SPetar Avramovic   Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
17106f711a9SPetar Avramovic   Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
17206f711a9SPetar Avramovic   Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
17306f711a9SPetar Avramovic 
17406f711a9SPetar Avramovic   B.setInsertPt(MBB, I);
17506f711a9SPetar Avramovic   B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
17606f711a9SPetar Avramovic   B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
17706f711a9SPetar Avramovic   B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
17806f711a9SPetar Avramovic }
17906f711a9SPetar Avramovic 
1806c2eec5cSPetar Avramovic // GlobalISel has to constrain S1 incoming taken as-is with lane mask register
1816c2eec5cSPetar Avramovic // class. Insert a copy of Incoming.Reg to new lane mask inside Incoming.Block,
1826c2eec5cSPetar Avramovic // Incoming.Reg becomes that new lane mask.
constrainAsLaneMask(Incoming & In)1836c2eec5cSPetar Avramovic void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
1846c2eec5cSPetar Avramovic   B.setInsertPt(*In.Block, In.Block->getFirstTerminator());
1856c2eec5cSPetar Avramovic 
1866c2eec5cSPetar Avramovic   auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
1876c2eec5cSPetar Avramovic   MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
1886c2eec5cSPetar Avramovic   In.Reg = Copy.getReg(0);
1896c2eec5cSPetar Avramovic }
19006f711a9SPetar Avramovic 
1916892c175SPetar Avramovic } // End anonymous namespace.
1926892c175SPetar Avramovic 
1936892c175SPetar Avramovic INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
1946892c175SPetar Avramovic                       "AMDGPU GlobalISel divergence lowering", false, false)
195837dc542Spaperchalice INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
196*4b24c2dfSpaperchalice INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
19706f711a9SPetar Avramovic INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
1986892c175SPetar Avramovic INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
1996892c175SPetar Avramovic                     "AMDGPU GlobalISel divergence lowering", false, false)
2006892c175SPetar Avramovic 
2016892c175SPetar Avramovic char AMDGPUGlobalISelDivergenceLowering::ID = 0;
2026892c175SPetar Avramovic 
2036892c175SPetar Avramovic char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
2046892c175SPetar Avramovic     AMDGPUGlobalISelDivergenceLowering::ID;
2056892c175SPetar Avramovic 
createAMDGPUGlobalISelDivergenceLoweringPass()2066892c175SPetar Avramovic FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
2076892c175SPetar Avramovic   return new AMDGPUGlobalISelDivergenceLowering();
2086892c175SPetar Avramovic }
2096892c175SPetar Avramovic 
runOnMachineFunction(MachineFunction & MF)2106892c175SPetar Avramovic bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
2116892c175SPetar Avramovic     MachineFunction &MF) {
212837dc542Spaperchalice   MachineDominatorTree &DT =
213837dc542Spaperchalice       getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
214*4b24c2dfSpaperchalice   MachinePostDominatorTree &PDT =
215*4b24c2dfSpaperchalice       getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
21606f711a9SPetar Avramovic   MachineUniformityInfo &MUI =
21706f711a9SPetar Avramovic       getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
21806f711a9SPetar Avramovic 
21906f711a9SPetar Avramovic   DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
22006f711a9SPetar Avramovic 
22106f711a9SPetar Avramovic   return Helper.lowerPhis();
2226892c175SPetar Avramovic }
223