xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp (revision 7f230feeeac8a67b335f52bd2e900a05c6098f20)
1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11 /// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12 /// with sequential versions where possible.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "GCNSubtarget.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/LiveIntervals.h"
21 #include "llvm/CodeGen/LiveRegMatrix.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/VirtRegMap.h"
24 #include "llvm/InitializePasses.h"
25 
26 using namespace llvm;
27 
28 #define DEBUG_TYPE "amdgpu-nsa-reassign"
29 
// Pass statistics (printed with -stats): how many NSA instructions with a
// non-sequential address were seen, and how many were made sequential.
STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
34 
35 namespace {
36 
37 class GCNNSAReassign : public MachineFunctionPass {
38 public:
39   static char ID;
40 
41   GCNNSAReassign() : MachineFunctionPass(ID) {
42     initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
43   }
44 
45   bool runOnMachineFunction(MachineFunction &MF) override;
46 
47   StringRef getPassName() const override { return "GCN NSA Reassign"; }
48 
49   void getAnalysisUsage(AnalysisUsage &AU) const override {
50     AU.addRequired<LiveIntervals>();
51     AU.addRequired<VirtRegMap>();
52     AU.addRequired<LiveRegMatrix>();
53     AU.setPreservesAll();
54     MachineFunctionPass::getAnalysisUsage(AU);
55   }
56 
57 private:
58   typedef enum {
59     NOT_NSA,        // Not an NSA instruction
60     FIXED,          // NSA which we cannot modify
61     NON_CONTIGUOUS, // NSA with non-sequential address which we can try
62                     // to optimize.
63     CONTIGUOUS      // NSA with all sequential address registers
64   } NSA_Status;
65 
66   const GCNSubtarget *ST;
67 
68   const MachineRegisterInfo *MRI;
69 
70   const SIRegisterInfo *TRI;
71 
72   VirtRegMap *VRM;
73 
74   LiveRegMatrix *LRM;
75 
76   LiveIntervals *LIS;
77 
78   unsigned MaxNumVGPRs;
79 
80   const MCPhysReg *CSRegs;
81 
82   NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
83 
84   bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
85                           unsigned StartReg) const;
86 
87   bool canAssign(unsigned StartReg, unsigned NumRegs) const;
88 
89   bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
90 };
91 
92 } // End anonymous namespace.
93 
// Pass registration: declares the analyses this pass depends on so the pass
// manager schedules LiveIntervals, VirtRegMap and LiveRegMatrix before it.
INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)


char GCNNSAReassign::ID = 0;

// External handle used by the target to create/reference this pass by ID.
char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
106 
107 bool
108 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
109                                    unsigned StartReg) const {
110   unsigned NumRegs = Intervals.size();
111 
112   for (unsigned N = 0; N < NumRegs; ++N)
113     if (VRM->hasPhys(Intervals[N]->reg()))
114       LRM->unassign(*Intervals[N]);
115 
116   for (unsigned N = 0; N < NumRegs; ++N)
117     if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
118       return false;
119 
120   for (unsigned N = 0; N < NumRegs; ++N)
121     LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
122 
123   return true;
124 }
125 
126 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
127   for (unsigned N = 0; N < NumRegs; ++N) {
128     unsigned Reg = StartReg + N;
129     if (!MRI->isAllocatable(Reg))
130       return false;
131 
132     for (unsigned I = 0; CSRegs[I]; ++I)
133       if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
134           !LRM->isPhysRegUsed(CSRegs[I]))
135       return false;
136   }
137 
138   return true;
139 }
140 
141 bool
142 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
143   unsigned NumRegs = Intervals.size();
144 
145   if (NumRegs > MaxNumVGPRs)
146     return false;
147   unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
148 
149   for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
150     if (!canAssign(Reg, NumRegs))
151       continue;
152 
153     if (tryAssignRegisters(Intervals, Reg))
154       return true;
155   }
156 
157   return false;
158 }
159 
// Classify \p MI: not an NSA instruction (NOT_NSA), an NSA instruction we
// must leave alone (FIXED), one whose address registers are already
// sequential (CONTIGUOUS), or one we may try to optimize (NON_CONTIGUOUS).
// With \p Fast set, only the physical register layout is checked and all of
// the more expensive legality checks are skipped.
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return NSA_Status::NOT_NSA;

  int VAddr0Idx =
    AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  // In the NSA form each address dword is a separate operand starting at
  // vaddr0. Walk them all and compare against the first assigned register.
  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    // Only allocator-assigned virtual registers can be moved.
    if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // Bail if address is not a VGPR32. That should be possible to extend the
      // optimization to work with subregs of a wider register tuples, but the
      // logic to find free registers will be much more complicated with much
      // less chances for success. That seems reasonable to assume that in most
      // cases a tuple is used because a vector variable contains different
      // parts of an address and it is either already consecutive or cannot
      // be reassigned if not. If needed it is better to rely on register
      // coalescer to process such address tuples.
      if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
        return NSA_Status::FIXED;

      // InlineSpiller does not call LRM::assign() after an LI split leaving
      // it in an inconsistent state, so we cannot call LRM::unassign().
      // See llvm bug #48911.
      // Skip reassign if a register has originated from such split.
      // FIXME: Remove the workaround when bug #48911 is fixed.
      if (VRM->getPreSplitReg(Reg))
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      // If the register is defined by a copy from its own assigned physreg,
      // moving it would reintroduce the copy the allocator eliminated.
      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      // Likewise for uses: implicit uses pin the register, and a copy back
      // into the same physreg would become a real copy after a move.
      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      // A live interval is required for the interference checks performed
      // when reassigning; without one the register cannot be moved.
      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    // Record the base VGPR on the first dword; flag non-contiguity as soon
    // as any later dword breaks the VgprBase + I sequence.
    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}
227 
// Entry point. Collects all NSA candidates in the function, then for each
// instruction with a non-sequential address tries to scavenge a contiguous
// VGPR range and reassign the address registers into it, rolling back on
// failure or if the move would break an already-contiguous candidate.
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  // The NSA encoding only exists on GFX10+.
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Clamp the usable VGPR range by both the subtarget limit and the limit
  // implied by the function's occupancy, so reassignment cannot push
  // register usage past the occupancy target.
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

  // Candidate = (instruction, "already contiguous" flag). Contiguous
  // instructions are kept too so later reassignments can be checked against
  // them for regressions.
  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    // A reassignment done for an earlier candidate may already have made
    // this instruction sequential.
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happen to be fixed.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    // Gather the live interval of every address dword, remembering the
    // original physical assignment for rollback, plus the combined
    // [MinInd, MaxInd) slot range covered by all address live ranges.
    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // Same register used, unable to make sequential
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    // Cleared above when a register is repeated: nothing to do.
    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI
                    : Intervals) dbgs()
               << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      // scavengeRegs may have unassigned the intervals even on failure; only
      // skip the rollback below if the allocation is still intact.
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      // Candidates were collected in instruction order, so binary-search for
      // the first one whose slot index falls inside the affected range.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
              LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        // A previously contiguous candidate that is no longer contiguous
        // means our move regressed it; abort this reassignment.
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      // Roll back: restore every interval to its saved original register.
      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        if (VRM->hasPhys(Intervals[I]->reg()))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
               << " : "
               << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}
358