//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "SILowerI1Copies.h"
#include "AMDGPU.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/CGPassBuilderOption.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
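// Lowering helper used on the SelectionDAG path: implements the
// PhiLoweringHelper hooks in terms of vreg_1 virtual registers and the
// wave-wide boolean register class, and records registers that must later be
// constrained to SReg_1_XEXEC (see cleanConstrainRegs()).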
class Vreg1LoweringHelper : public PhiLoweringHelper {
public:
  Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT);

private:
  DenseSet<Register> ConstrainRegs;

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerCopiesFromI1();
  bool lowerCopiesToI1();
  bool cleanConstrainRegs(bool Changed);
  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }
};

Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
                                         MachineDominatorTree *DT,
                                         MachinePostDominatorTree *PDT)
    : PhiLoweringHelper(MF, DT, PDT) {}

bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
  assert(Changed || ConstrainRegs.empty());
  for (Register Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return Changed;
}

/// Helper class that analyzes the relationship between incoming values of a
/// phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;
  const SIInstrInfo *TII;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
      : PDT(PDT), TII(TII) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (auto Incoming : Incomings) {
      MachineBasicBlock *MBB = Incoming.Block;
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      append_range(Stack, MBB->successors());
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
/// A-+-+
/// | | |
/// B-+ |
/// |   |
/// C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
/// relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      MachineRegisterInfo &MRI,
                      MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
                      ArrayRef<Incoming> Incomings = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (auto &Incoming : Incomings)
      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);

    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
      SSAUpdater.AddAvailableValue(
          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
          SSAUpdater.AddAvailableValue(
              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<Incoming> Incomings) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    for (auto &Incoming : Incomings)
      if (Incoming.Block == &MBB)
        return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

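    // Record the nearest common dominator of everything visited on this
    // level; addLoopEntries() later uses it to seed the SSAUpdater.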
    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

Register
llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
                        MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  return MRI->createVirtualRegister(LaneMaskRegAttrs);
}

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  // Only need to run this in SelectionDAG path.
  if (TheMF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  Vreg1LoweringHelper Helper(
      &TheMF, &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(),
      &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree());

  bool Changed = false;
  Changed |= Helper.lowerCopiesFromI1();
  Changed |= Helper.lowerPhis();
  Changed |= Helper.lowerCopiesToI1();
  return Helper.cleanConstrainRegs(Changed);
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

bool Vreg1LoweringHelper::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
                                     MachineDominatorTree *DT,
                                     MachinePostDominatorTree *PDT)
    : MF(MF), DT(DT), PDT(PDT) {
  MRI = &MF->getRegInfo();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }
}

bool PhiLoweringHelper::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT, TII);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<Incoming, 4> Incomings;

  getCandidatesForLowering(Vreg1Phis);
  if (Vreg1Phis.empty())
    return false;

  DT->getBase().updateDFSNumbers();
  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    markAsLaneMask(DstReg);
    initializeLaneMaskRegisterAttributes(DstReg);

    collectIncomingValuesFromPhi(MI, Incomings);

    // Sort the incomings such that incoming values that dominate other incoming
    // values are sorted earlier. This allows us to do some amount of on-the-fly
    // constant folding.
    // Incomings with a smaller DFSNumIn go first; DFSNumIn is 0 for the entry
    // block.
    llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
      return DT->getNode(LHS.Block)->getDFSNumIn() <
             DT->getNode(RHS.Block)->getDFSNumIn();
    });

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
                        Incomings);

      for (auto &Incoming : Incomings) {
        Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
      }

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, Incomings);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(
            MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs));

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        if (PIA.isSource(IMBB)) {
          constrainAsLaneMask(Incoming);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
        } else {
          Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
        }
      }

      for (auto &Incoming : Incomings) {
        if (!Incoming.UpdatedReg.isValid())
          continue;

        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      replaceDstReg(NewReg, DstReg, &MBB);
      MI->eraseFromParent();
    }

    Incomings.clear();
  }
  return true;
}

bool Vreg1LoweringHelper::lowerCopiesToI1() {
  bool Changed = false;
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      Changed = true;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      markAsLaneMask(DstReg);
      initializeLaneMaskRegisterAttributes(DstReg);

      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

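      // From here on MI is a COPY into a vreg_1. If the source is not already
      // a lane mask (e.g. a plain 32-bit register), it is compared against
      // zero below to produce one.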
      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      } else {
        // SrcReg needs to be live beyond copy.
        MI.getOperand(1).setIsKill(false);
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void Vreg1LoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }
}
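// Collect the (value, block) pairs of a vreg_1 PHI, looking through plain
// copies; incoming values that are IMPLICIT_DEF (undef) are simply dropped.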
void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    assert(i + 1 < MI->getNumOperands());
    Register IncomingReg = MI->getOperand(i).getReg();
    MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
    MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

    if (IncomingDef->getOpcode() == AMDGPU::COPY) {
      IncomingReg = IncomingDef->getOperand(1).getReg();
      assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
      assert(!IncomingDef->getOperand(1).getSubReg());
    } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
      continue;
    } else {
      assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
    }

    Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
  }
}

void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) {
  MRI->replaceRegWith(NewReg, OldReg);
}

void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator I,
                                              const DebugLoc &DL,
                                              Register DstReg, Register PrevReg,
                                              Register CurReg) {
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  Register PrevMaskedReg;
  Register CurMaskedReg;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}

void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}