xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp (revision be187369a03bf2df8bdbc76ecd381377b3bb6074)
1 //===-------------- GCNRewritePartialRegUses.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// RenameIndependentSubregs pass leaves large partially used super registers,
10 /// for example:
11 ///   undef %0.sub4:VReg_1024 = ...
12 ///   %0.sub5:VReg_1024 = ...
13 ///   %0.sub6:VReg_1024 = ...
14 ///   %0.sub7:VReg_1024 = ...
15 ///   use %0.sub4_sub5_sub6_sub7
16 ///   use %0.sub6_sub7
17 ///
18 /// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and
19 /// rewrites such partially used super registers with registers of minimal size:
20 ///   undef %0.sub0:VReg_128 = ...
21 ///   %0.sub1:VReg_128 = ...
22 ///   %0.sub2:VReg_128 = ...
23 ///   %0.sub3:VReg_128 = ...
24 ///   use %0.sub0_sub1_sub2_sub3
25 ///   use %0.sub2_sub3
26 ///
/// This makes it possible to avoid tracking subreg lanemasks during register
/// pressure calculation and creates more opportunities for code that is
/// unaware of lanemasks.
29 //===----------------------------------------------------------------------===//
30 
31 #include "AMDGPU.h"
32 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
33 #include "SIRegisterInfo.h"
34 #include "llvm/CodeGen/LiveInterval.h"
35 #include "llvm/CodeGen/LiveIntervals.h"
36 #include "llvm/CodeGen/MachineFunctionPass.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/TargetInstrInfo.h"
39 #include "llvm/Pass.h"
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "rewrite-partial-reg-uses"
44 
45 namespace {
46 
/// Pass that replaces a partially used super register with a register of the
/// minimal class that can still hold all used subregisters (see \file comment).
class GCNRewritePartialRegUses : public MachineFunctionPass {
public:
  static char ID;
  GCNRewritePartialRegUses() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "Rewrite Partial Register Uses";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only renames vregs/subregs; CFG and (updated) liveness survive.
    AU.setPreservesCFG();
    AU.addPreserved<LiveIntervalsWrapperPass>();
    AU.addPreserved<SlotIndexesWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

private:
  // Cached per-function pointers, set up in runOnMachineFunction.
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  LiveIntervals *LIS; // May be null if the analysis isn't available.

  /// Rewrite partially used register Reg by shifting all its subregisters to
  /// the right and replacing the original register with a register of minimal
  /// size. Return true if the change has been made.
  bool rewriteReg(Register Reg) const;

  /// Value type for SubRegMap below.
  struct SubRegInfo {
    /// Register class required to hold the value stored in the SubReg.
    const TargetRegisterClass *RC;

    /// Index for the right-shifted subregister. If 0 this is the "covering"
    /// subreg i.e. subreg that covers all others. Covering subreg becomes the
    /// whole register after the replacement.
    unsigned SubReg = AMDGPU::NoSubRegister;
    SubRegInfo(const TargetRegisterClass *RC_ = nullptr) : RC(RC_) {}
  };

  /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container.
  using SubRegMap = SmallDenseMap<unsigned, SubRegInfo>;

  /// Given register class RC and the set of used subregs as keys in the SubRegs
  /// map return new register class and indexes of right-shifted subregs as
  /// values in SubRegs map such that the resulting regclass would contain
  /// registers of minimal size.
  const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC,
                                           SubRegMap &SubRegs) const;

  /// Given regclass RC and pairs of [OldSubReg, SubRegRC] in SubRegs try to
  /// find new regclass such that:
  ///   1. It has subregs obtained by shifting each OldSubReg by RShift number
  ///      of bits to the right. Every "shifted" subreg should have the same
  ///      SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers"
  ///      all other subregs in pairs. Basically such subreg becomes a whole
  ///      register.
  ///   2. Resulting register class contains registers of minimal size but not
  ///      less than RegNumBits.
  ///
  /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out
  /// parameter:
  ///   OldSubReg - input parameter,
  ///   SubRegRC  - input parameter (cannot be null),
  ///   NewSubReg - output, contains shifted subregs on return.
  const TargetRegisterClass *
  getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
                                unsigned RegNumBits, unsigned CoverSubregIdx,
                                SubRegMap &SubRegs) const;

  /// Update live intervals after rewriting OldReg to NewReg with SubRegs map
  /// describing OldSubReg -> NewSubReg mapping.
  void updateLiveIntervals(Register OldReg, Register NewReg,
                           SubRegMap &SubRegs) const;

  /// Helper methods.

  /// Return reg class expected by a MO's parent instruction for a given MO.
  const TargetRegisterClass *getOperandRegClass(MachineOperand &MO) const;

  /// Find right-shifted by RShift amount version of the SubReg if it exists,
  /// return 0 otherwise.
  unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const;

  /// Find subreg index with a given Offset and Size, return 0 if there is no
  /// such subregister index. The result is cached in SubRegs data-member.
  unsigned getSubReg(unsigned Offset, unsigned Size) const;

  /// Cache for getSubReg method: {Offset, Size} -> SubReg index.
  mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs;

  /// Return bit mask that contains all register classes that are projected into
  /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member.
  const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC,
                                       unsigned SubRegIdx) const;

  /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask.
  mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>,
                        const uint32_t *>
      SuperRegMasks;

  /// Return bitmask containing all allocatable register classes with registers
  /// aligned at AlignNumBits. The result is cached in
  /// AllocatableAndAlignedRegClassMasks data-member.
  const BitVector &
  getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const;

  /// Cache for getAllocatableAndAlignedRegClassMask method:
  ///   AlignNumBits -> Class bitmask.
  mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks;
};
159 
160 } // end anonymous namespace
161 
162 // TODO: move this to the tablegen and use binary search by Offset.
163 unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset,
164                                              unsigned Size) const {
165   const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0);
166   if (Inserted) {
167     for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
168       if (TRI->getSubRegIdxOffset(Idx) == Offset &&
169           TRI->getSubRegIdxSize(Idx) == Size) {
170         I->second = Idx;
171         break;
172       }
173     }
174   }
175   return I->second;
176 }
177 
178 unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg,
179                                                unsigned RShift) const {
180   unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift;
181   return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg));
182 }
183 
184 const uint32_t *
185 GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC,
186                                                unsigned SubRegIdx) const {
187   const auto [I, Inserted] =
188       SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr);
189   if (Inserted) {
190     for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) {
191       if (RCI.getSubReg() == SubRegIdx) {
192         I->second = RCI.getMask();
193         break;
194       }
195     }
196   }
197   return I->second;
198 }
199 
200 const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask(
201     unsigned AlignNumBits) const {
202   const auto [I, Inserted] =
203       AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits);
204   if (Inserted) {
205     BitVector &BV = I->second;
206     BV.resize(TRI->getNumRegClasses());
207     for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) {
208       auto *RC = TRI->getRegClass(ClassID);
209       if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits))
210         BV.set(ClassID);
211     }
212   }
213   return I->second;
214 }
215 
const TargetRegisterClass *
GCNRewritePartialRegUses::getRegClassWithShiftedSubregs(
    const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits,
    unsigned CoverSubregIdx, SubRegMap &SubRegs) const {

  // Candidate classes must provide at least the alignment of the original RC.
  unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC);
  LLVM_DEBUG(dbgs() << "  Shift " << RShift << ", reg align " << RCAlign
                    << '\n');

  // Start from all allocatable, suitably aligned classes and progressively
  // intersect away classes that cannot host one of the shifted subregs.
  BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign));
  for (auto &[OldSubReg, SRI] : SubRegs) {
    auto &[SubRegRC, NewSubReg] = SRI;
    assert(SubRegRC);

    LLVM_DEBUG(dbgs() << "  " << TRI->getSubRegIndexName(OldSubReg) << ':'
                      << TRI->getRegClassName(SubRegRC)
                      << (SubRegRC->isAllocatable() ? "" : " not alloc")
                      << " -> ");

    if (OldSubReg == CoverSubregIdx) {
      // Covering subreg will become a full register, RC should be allocatable.
      assert(SubRegRC->isAllocatable());
      NewSubReg = AMDGPU::NoSubRegister;
      LLVM_DEBUG(dbgs() << "whole reg");
    } else {
      NewSubReg = shiftSubReg(OldSubReg, RShift);
      if (!NewSubReg) {
        // No subreg index exists at the shifted position: this shift amount
        // cannot be realized at all.
        LLVM_DEBUG(dbgs() << "none\n");
        return nullptr;
      }
      LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg));
    }

    // For a real (non-zero) shifted subreg, keep only classes that project
    // into SubRegRC via NewSubReg; for the covering subreg (which becomes the
    // whole register) keep only subclasses of SubRegRC itself.
    const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg)
                                     : SubRegRC->getSubClassMask();
    if (!Mask)
      llvm_unreachable("no register class mask?");

    ClassMask.clearBitsNotInMask(Mask);
    // Don't try to early exit because checking if ClassMask has set bits isn't
    // that cheap and we expect it to pass in most cases.
    LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n');
  }

  // ClassMask is the set of all register classes such that each class is
  // allocatable, aligned, has all shifted subregs and each subreg has required
  // register class (see SubRegRC above). Now select first (that is largest)
  // register class with registers of minimal but not less than RegNumBits size.
  // We have to check register size because we may encounter classes of smaller
  // registers like VReg_1 in some situations.
  const TargetRegisterClass *MinRC = nullptr;
  unsigned MinNumBits = std::numeric_limits<unsigned>::max();
  for (unsigned ClassID : ClassMask.set_bits()) {
    auto *RC = TRI->getRegClass(ClassID);
    unsigned NumBits = TRI->getRegSizeInBits(*RC);
    if (NumBits < MinNumBits && NumBits >= RegNumBits) {
      MinNumBits = NumBits;
      MinRC = RC;
    }
    // RegNumBits is a hard lower bound, so a class of exactly that size is
    // optimal - stop scanning.
    if (MinNumBits == RegNumBits)
      break;
  }
#ifndef NDEBUG
  if (MinRC) {
    assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign));
    for (auto [SubReg, SRI] : SubRegs)
      // Check that all registers in MinRC support SRI.SubReg subregister.
      assert(MinRC == TRI->getSubClassWithSubReg(MinRC, SRI.SubReg));
  }
#endif
  // There might be zero RShift - in this case we are just trying to find a
  // smaller register class; returning the same class would be no improvement.
  return (MinRC != RC || RShift != 0) ? MinRC : nullptr;
}
290 
const TargetRegisterClass *
GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC,
                                        SubRegMap &SubRegs) const {
  // Look for a "covering" subreg: one whose [offset, end) span is exactly the
  // union of the spans of all used subregs. Offset/End track the union's
  // bounds; whenever either bound moves, any previously found candidate is
  // invalidated.
  unsigned CoverSubreg = AMDGPU::NoSubRegister;
  unsigned Offset = std::numeric_limits<unsigned>::max();
  unsigned End = 0;
  for (auto [SubReg, SRI] : SubRegs) {
    unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg);
    unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg);
    if (SubRegOffset < Offset) {
      Offset = SubRegOffset;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegEnd > End) {
      End = SubRegEnd;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegOffset == Offset && SubRegEnd == End)
      CoverSubreg = SubReg;
  }
  // If covering subreg is found shift everything so the covering subreg would
  // be in the rightmost position.
  if (CoverSubreg != AMDGPU::NoSubRegister)
    return getRegClassWithShiftedSubregs(RC, Offset, End - Offset, CoverSubreg,
                                         SubRegs);

  // Otherwise find subreg with maximum required alignment and shift it and all
  // other subregs to the rightmost possible position with respect to the
  // alignment.
  unsigned MaxAlign = 0;
  for (auto [SubReg, SRI] : SubRegs)
    MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg));

  // Find the lowest offset among the maximally-aligned subregs; the shift is
  // chosen so that this subreg lands on the smallest aligned offset.
  unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max();
  for (auto [SubReg, SRI] : SubRegs) {
    if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign)
      continue;
    FirstMaxAlignedSubRegOffset =
        std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg));
    // Offset is the minimum over all subregs, so nothing can be smaller.
    if (FirstMaxAlignedSubRegOffset == Offset)
      break;
  }

  // New (post-shift) offset of that subreg: its distance from Offset, rounded
  // up to the required alignment.
  unsigned NewOffsetOfMaxAlignedSubReg =
      alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign);

  if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset)
    llvm_unreachable("misaligned subreg");

  unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg;
  return getRegClassWithShiftedSubregs(RC, RShift, End - RShift, 0, SubRegs);
}
343 
// Only the subrange's lanemasks of the original interval need to be modified.
// Subrange for a covering subreg becomes the main range.
void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg,
                                                   Register NewReg,
                                                   SubRegMap &SubRegs) const {
  if (!LIS->hasInterval(OldReg))
    return;

  auto &OldLI = LIS->getInterval(OldReg);
  auto &NewLI = LIS->createEmptyInterval(NewReg);

  auto &Allocator = LIS->getVNInfoAllocator();
  NewLI.setWeight(OldLI.weight());

  for (auto &SR : OldLI.subranges()) {
    // Find the used subreg whose lanemask exactly matches this subrange.
    auto I = find_if(SubRegs, [&](auto &P) {
      return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first);
    });

    if (I == SubRegs.end()) {
      // There might be a situation when subranges don't exactly match used
      // subregs, for example:
      // %120 [160r,1392r:0) 0@160r
      //    L000000000000C000 [160r,1392r:0) 0@160r
      //    L0000000000003000 [160r,1392r:0) 0@160r
      //    L0000000000000C00 [160r,1392r:0) 0@160r
      //    L0000000000000300 [160r,1392r:0) 0@160r
      //    L0000000000000003 [160r,1104r:0) 0@160r
      //    L000000000000000C [160r,1104r:0) 0@160r
      //    L0000000000000030 [160r,1104r:0) 0@160r
      //    L00000000000000C0 [160r,1104r:0) 0@160r
      // but used subregs are:
      //    sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF
      //    sub0_sub1_sub2_sub3, L00000000000000FF
      //    sub4_sub5_sub6_sub7, L000000000000FF00
      // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7
      // have several subranges with the same lifetime. For such cases just
      // recreate the interval.
      LIS->removeInterval(OldReg);
      LIS->removeInterval(NewReg);
      LIS->createAndComputeVirtRegInterval(NewReg);
      return;
    }

    if (unsigned NewSubReg = I->second.SubReg)
      NewLI.createSubRangeFrom(Allocator,
                               TRI->getSubRegIndexLaneMask(NewSubReg), SR);
    else // This is the covering subreg (0 index) - set it as main range.
      NewLI.assign(SR, Allocator);

    // Each subrange consumes its map entry so matches are one-to-one.
    SubRegs.erase(I);
  }
  // If no subrange became the main range (no covering subreg), inherit the
  // old interval's main range.
  if (NewLI.empty())
    NewLI.assign(OldLI, Allocator);
  assert(NewLI.verify(MRI));
  LIS->removeInterval(OldReg);
}
401 
402 const TargetRegisterClass *
403 GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const {
404   MachineInstr *MI = MO.getParent();
405   return TII->getRegClass(TII->get(MI->getOpcode()), MI->getOperandNo(&MO), TRI,
406                           *MI->getParent()->getParent());
407 }
408 
bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const {
  // Bail out on unused registers and on any whole-register (no subreg) access:
  // the pass only handles registers that are used strictly through subregs.
  auto Range = MRI->reg_nodbg_operands(Reg);
  if (Range.empty() || any_of(Range, [](MachineOperand &MO) {
        return MO.getSubReg() == AMDGPU::NoSubRegister; // Whole reg used. [1]
      }))
    return false;

  auto *RC = MRI->getRegClass(Reg);
  LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
                    << ':' << TRI->getRegClassName(RC) << '\n');

  // Collect used subregs and their reg classes inferred from instruction
  // operands.
  SubRegMap SubRegs;
  for (MachineOperand &MO : Range) {
    const unsigned SubReg = MO.getSubReg();
    assert(SubReg != AMDGPU::NoSubRegister); // Due to [1].
    LLVM_DEBUG(dbgs() << "  " << TRI->getSubRegIndexName(SubReg) << ':');

    const auto [I, Inserted] = SubRegs.try_emplace(SubReg);
    const TargetRegisterClass *&SubRegRC = I->second.RC;

    // First time we see this subreg: start from the class the subreg
    // projection of RC gives us.
    if (Inserted)
      SubRegRC = TRI->getSubRegisterClass(RC, SubReg);

    // Narrow the class by what each instruction operand requires; repeated
    // uses of the same subreg keep intersecting into the map entry.
    if (SubRegRC) {
      if (const TargetRegisterClass *OpDescRC = getOperandRegClass(MO)) {
        LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << " & "
                          << TRI->getRegClassName(OpDescRC) << " = ");
        SubRegRC = TRI->getCommonSubClass(SubRegRC, OpDescRC);
      }
    }

    // An empty intersection means no single class can satisfy all users.
    if (!SubRegRC) {
      LLVM_DEBUG(dbgs() << "couldn't find target regclass\n");
      return false;
    }
    LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << '\n');
  }

  // Compute the minimal-size replacement class; on return SubRegs also holds
  // the OldSubReg -> NewSubReg mapping.
  auto *NewRC = getMinSizeReg(RC, SubRegs);
  if (!NewRC) {
    LLVM_DEBUG(dbgs() << "  No improvement achieved\n");
    return false;
  }

  Register NewReg = MRI->createVirtualRegister(NewRC);
  LLVM_DEBUG(dbgs() << "  Success " << printReg(Reg, TRI) << ':'
                    << TRI->getRegClassName(RC) << " -> "
                    << printReg(NewReg, TRI) << ':'
                    << TRI->getRegClassName(NewRC) << '\n');

  // Rewrite all operands (including debug uses) to the new register with the
  // shifted subreg indexes.
  for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) {
    MO.setReg(NewReg);
    // Debug info can refer to the whole reg, just leave it as it is for now.
    // TODO: create some DI shift expression?
    if (MO.isDebug() && MO.getSubReg() == 0)
      continue;
    unsigned SubReg = SubRegs[MO.getSubReg()].SubReg;
    MO.setSubReg(SubReg);
    // A def of the covering subreg now defines the whole register, so the
    // undef flag (used for partial defs) no longer applies.
    if (SubReg == AMDGPU::NoSubRegister && MO.isDef())
      MO.setIsUndef(false);
  }

  if (LIS)
    updateLiveIntervals(Reg, NewReg, SubRegs);

  return true;
}
478 
479 bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) {
480   MRI = &MF.getRegInfo();
481   TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
482   TII = MF.getSubtarget().getInstrInfo();
483   auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
484   LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
485   bool Changed = false;
486   for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
487     Changed |= rewriteReg(Register::index2VirtReg(I));
488   }
489   return Changed;
490 }
491 
// Pass identification: the address of ID uniquely identifies the pass.
char GCNRewritePartialRegUses::ID;

// Exported handle used to reference this pass from the AMDGPU pass setup.
char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID;

INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE,
                      "Rewrite Partial Register Uses", false, false)
INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE,
                    "Rewrite Partial Register Uses", false, false)
500