1 //===-------------- GCNRewritePartialRegUses.cpp --------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// RenameIndependentSubregs pass leaves large partially used super registers, 10 /// for example: 11 /// undef %0.sub4:VReg_1024 = ... 12 /// %0.sub5:VReg_1024 = ... 13 /// %0.sub6:VReg_1024 = ... 14 /// %0.sub7:VReg_1024 = ... 15 /// use %0.sub4_sub5_sub6_sub7 16 /// use %0.sub6_sub7 17 /// 18 /// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and 19 /// rewrites such partially used super registers with registers of minimal size: 20 /// undef %0.sub0:VReg_128 = ... 21 /// %0.sub1:VReg_128 = ... 22 /// %0.sub2:VReg_128 = ... 23 /// %0.sub3:VReg_128 = ... 24 /// use %0.sub0_sub1_sub2_sub3 25 /// use %0.sub2_sub3 26 /// 27 /// This allows to avoid subreg lanemasks tracking during register pressure 28 /// calculation and creates more possibilities for the code unaware of lanemasks 29 //===----------------------------------------------------------------------===// 30 31 #include "AMDGPU.h" 32 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 33 #include "SIRegisterInfo.h" 34 #include "llvm/CodeGen/LiveInterval.h" 35 #include "llvm/CodeGen/LiveIntervals.h" 36 #include "llvm/CodeGen/MachineFunctionPass.h" 37 #include "llvm/CodeGen/MachineRegisterInfo.h" 38 #include "llvm/CodeGen/TargetInstrInfo.h" 39 #include "llvm/Pass.h" 40 41 using namespace llvm; 42 43 #define DEBUG_TYPE "rewrite-partial-reg-uses" 44 45 namespace { 46 47 class GCNRewritePartialRegUses : public MachineFunctionPass { 48 public: 49 static char ID; 50 GCNRewritePartialRegUses() : MachineFunctionPass(ID) {} 51 52 StringRef getPassName() const override { 53 return "Rewrite Partial Register Uses"; 54 } 55 56 void getAnalysisUsage(AnalysisUsage &AU) const override { 57 AU.setPreservesCFG(); 58 AU.addPreserved<LiveIntervalsWrapperPass>(); 59 AU.addPreserved<SlotIndexesWrapperPass>(); 60 MachineFunctionPass::getAnalysisUsage(AU); 61 } 62 63 bool runOnMachineFunction(MachineFunction &MF) override; 64 65 private: 66 MachineRegisterInfo *MRI; 67 const SIRegisterInfo *TRI; 68 const TargetInstrInfo *TII; 69 LiveIntervals *LIS; 70 71 /// Rewrite partially used register Reg by shifting all its subregisters to 72 /// the right and replacing the original register with a register of minimal 73 /// size. Return true if the change has been made. 74 bool rewriteReg(Register Reg) const; 75 76 /// Value type for SubRegMap below. 77 struct SubRegInfo { 78 /// Register class required to hold the value stored in the SubReg. 79 const TargetRegisterClass *RC; 80 81 /// Index for the right-shifted subregister. If 0 this is the "covering" 82 /// subreg i.e. subreg that covers all others. Covering subreg becomes the 83 /// whole register after the replacement. 84 unsigned SubReg = AMDGPU::NoSubRegister; 85 SubRegInfo(const TargetRegisterClass *RC_ = nullptr) : RC(RC_) {} 86 }; 87 88 /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container. 89 using SubRegMap = SmallDenseMap<unsigned, SubRegInfo>; 90 91 /// Given register class RC and the set of used subregs as keys in the SubRegs 92 /// map return new register class and indexes of right-shifted subregs as 93 /// values in SubRegs map such that the resulting regclass would contain 94 /// registers of minimal size. 95 const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC, 96 SubRegMap &SubRegs) const; 97 98 /// Given regclass RC and pairs of [OldSubReg, SubRegRC] in SubRegs try to 99 /// find new regclass such that: 100 /// 1. It has subregs obtained by shifting each OldSubReg by RShift number 101 /// of bits to the right. Every "shifted" subreg should have the same 102 /// SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers" 103 /// all other subregs in pairs. Basically such subreg becomes a whole 104 /// register. 105 /// 2. Resulting register class contains registers of minimal size but not 106 /// less than RegNumBits. 107 /// 108 /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out 109 /// parameter: 110 /// OldSubReg - input parameter, 111 /// SubRegRC - input parameter (cannot be null), 112 /// NewSubReg - output, contains shifted subregs on return. 113 const TargetRegisterClass * 114 getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift, 115 unsigned RegNumBits, unsigned CoverSubregIdx, 116 SubRegMap &SubRegs) const; 117 118 /// Update live intervals after rewriting OldReg to NewReg with SubRegs map 119 /// describing OldSubReg -> NewSubReg mapping. 120 void updateLiveIntervals(Register OldReg, Register NewReg, 121 SubRegMap &SubRegs) const; 122 123 /// Helper methods. 124 125 /// Return reg class expected by a MO's parent instruction for a given MO. 126 const TargetRegisterClass *getOperandRegClass(MachineOperand &MO) const; 127 128 /// Find right-shifted by RShift amount version of the SubReg if it exists, 129 /// return 0 otherwise. 130 unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const; 131 132 /// Find subreg index with a given Offset and Size, return 0 if there is no 133 /// such subregister index. The result is cached in SubRegs data-member. 134 unsigned getSubReg(unsigned Offset, unsigned Size) const; 135 136 /// Cache for getSubReg method: {Offset, Size} -> SubReg index. 137 mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs; 138 139 /// Return bit mask that contains all register classes that are projected into 140 /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member. 141 const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC, 142 unsigned SubRegIdx) const; 143 144 /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask. 145 mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>, 146 const uint32_t *> 147 SuperRegMasks; 148 149 /// Return bitmask containing all allocatable register classes with registers 150 /// aligned at AlignNumBits. The result is cached in 151 /// AllocatableAndAlignedRegClassMasks data-member. 152 const BitVector & 153 getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const; 154 155 /// Cache for getAllocatableAndAlignedRegClassMask method: 156 /// AlignNumBits -> Class bitmask. 157 mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks; 158 }; 159 160 } // end anonymous namespace 161 162 // TODO: move this to the tablegen and use binary search by Offset. 163 unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset, 164 unsigned Size) const { 165 const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0); 166 if (Inserted) { 167 for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) { 168 if (TRI->getSubRegIdxOffset(Idx) == Offset && 169 TRI->getSubRegIdxSize(Idx) == Size) { 170 I->second = Idx; 171 break; 172 } 173 } 174 } 175 return I->second; 176 } 177 178 unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg, 179 unsigned RShift) const { 180 unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift; 181 return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg)); 182 } 183 184 const uint32_t * 185 GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC, 186 unsigned SubRegIdx) const { 187 const auto [I, Inserted] = 188 SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr); 189 if (Inserted) { 190 for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) { 191 if (RCI.getSubReg() == SubRegIdx) { 192 I->second = RCI.getMask(); 193 break; 194 } 195 } 196 } 197 return I->second; 198 } 199 200 const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask( 201 unsigned AlignNumBits) const { 202 const auto [I, Inserted] = 203 AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits); 204 if (Inserted) { 205 BitVector &BV = I->second; 206 BV.resize(TRI->getNumRegClasses()); 207 for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) { 208 auto *RC = TRI->getRegClass(ClassID); 209 if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits)) 210 BV.set(ClassID); 211 } 212 } 213 return I->second; 214 } 215 216 const TargetRegisterClass * 217 GCNRewritePartialRegUses::getRegClassWithShiftedSubregs( 218 const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits, 219 unsigned CoverSubregIdx, SubRegMap &SubRegs) const { 220 221 unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC); 222 LLVM_DEBUG(dbgs() << " Shift " << RShift << ", reg align " << RCAlign 223 << '\n'); 224 225 BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign)); 226 for (auto &[OldSubReg, SRI] : SubRegs) { 227 auto &[SubRegRC, NewSubReg] = SRI; 228 assert(SubRegRC); 229 230 LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(OldSubReg) << ':' 231 << TRI->getRegClassName(SubRegRC) 232 << (SubRegRC->isAllocatable() ? "" : " not alloc") 233 << " -> "); 234 235 if (OldSubReg == CoverSubregIdx) { 236 // Covering subreg will become a full register, RC should be allocatable. 237 assert(SubRegRC->isAllocatable()); 238 NewSubReg = AMDGPU::NoSubRegister; 239 LLVM_DEBUG(dbgs() << "whole reg"); 240 } else { 241 NewSubReg = shiftSubReg(OldSubReg, RShift); 242 if (!NewSubReg) { 243 LLVM_DEBUG(dbgs() << "none\n"); 244 return nullptr; 245 } 246 LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg)); 247 } 248 249 const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg) 250 : SubRegRC->getSubClassMask(); 251 if (!Mask) 252 llvm_unreachable("no register class mask?"); 253 254 ClassMask.clearBitsNotInMask(Mask); 255 // Don't try to early exit because checking if ClassMask has set bits isn't 256 // that cheap and we expect it to pass in most cases. 257 LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n'); 258 } 259 260 // ClassMask is the set of all register classes such that each class is 261 // allocatable, aligned, has all shifted subregs and each subreg has required 262 // register class (see SubRegRC above). Now select first (that is largest) 263 // register class with registers of minimal but not less than RegNumBits size. 264 // We have to check register size because we may encounter classes of smaller 265 // registers like VReg_1 in some situations. 266 const TargetRegisterClass *MinRC = nullptr; 267 unsigned MinNumBits = std::numeric_limits<unsigned>::max(); 268 for (unsigned ClassID : ClassMask.set_bits()) { 269 auto *RC = TRI->getRegClass(ClassID); 270 unsigned NumBits = TRI->getRegSizeInBits(*RC); 271 if (NumBits < MinNumBits && NumBits >= RegNumBits) { 272 MinNumBits = NumBits; 273 MinRC = RC; 274 } 275 if (MinNumBits == RegNumBits) 276 break; 277 } 278 #ifndef NDEBUG 279 if (MinRC) { 280 assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign)); 281 for (auto [SubReg, SRI] : SubRegs) 282 // Check that all registers in MinRC support SRI.SubReg subregister. 283 assert(MinRC == TRI->getSubClassWithSubReg(MinRC, SRI.SubReg)); 284 } 285 #endif 286 // There might be zero RShift - in this case we just trying to find smaller 287 // register. 288 return (MinRC != RC || RShift != 0) ? MinRC : nullptr; 289 } 290 291 const TargetRegisterClass * 292 GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC, 293 SubRegMap &SubRegs) const { 294 unsigned CoverSubreg = AMDGPU::NoSubRegister; 295 unsigned Offset = std::numeric_limits<unsigned>::max(); 296 unsigned End = 0; 297 for (auto [SubReg, SRI] : SubRegs) { 298 unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg); 299 unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg); 300 if (SubRegOffset < Offset) { 301 Offset = SubRegOffset; 302 CoverSubreg = AMDGPU::NoSubRegister; 303 } 304 if (SubRegEnd > End) { 305 End = SubRegEnd; 306 CoverSubreg = AMDGPU::NoSubRegister; 307 } 308 if (SubRegOffset == Offset && SubRegEnd == End) 309 CoverSubreg = SubReg; 310 } 311 // If covering subreg is found shift everything so the covering subreg would 312 // be in the rightmost position. 313 if (CoverSubreg != AMDGPU::NoSubRegister) 314 return getRegClassWithShiftedSubregs(RC, Offset, End - Offset, CoverSubreg, 315 SubRegs); 316 317 // Otherwise find subreg with maximum required alignment and shift it and all 318 // other subregs to the rightmost possible position with respect to the 319 // alignment. 320 unsigned MaxAlign = 0; 321 for (auto [SubReg, SRI] : SubRegs) 322 MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg)); 323 324 unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max(); 325 for (auto [SubReg, SRI] : SubRegs) { 326 if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign) 327 continue; 328 FirstMaxAlignedSubRegOffset = 329 std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg)); 330 if (FirstMaxAlignedSubRegOffset == Offset) 331 break; 332 } 333 334 unsigned NewOffsetOfMaxAlignedSubReg = 335 alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign); 336 337 if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset) 338 llvm_unreachable("misaligned subreg"); 339 340 unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg; 341 return getRegClassWithShiftedSubregs(RC, RShift, End - RShift, 0, SubRegs); 342 } 343 344 // Only the subrange's lanemasks of the original interval need to be modified. 345 // Subrange for a covering subreg becomes the main range. 346 void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg, 347 Register NewReg, 348 SubRegMap &SubRegs) const { 349 if (!LIS->hasInterval(OldReg)) 350 return; 351 352 auto &OldLI = LIS->getInterval(OldReg); 353 auto &NewLI = LIS->createEmptyInterval(NewReg); 354 355 auto &Allocator = LIS->getVNInfoAllocator(); 356 NewLI.setWeight(OldLI.weight()); 357 358 for (auto &SR : OldLI.subranges()) { 359 auto I = find_if(SubRegs, [&](auto &P) { 360 return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first); 361 }); 362 363 if (I == SubRegs.end()) { 364 // There might be a situation when subranges don't exactly match used 365 // subregs, for example: 366 // %120 [160r,1392r:0) 0@160r 367 // L000000000000C000 [160r,1392r:0) 0@160r 368 // L0000000000003000 [160r,1392r:0) 0@160r 369 // L0000000000000C00 [160r,1392r:0) 0@160r 370 // L0000000000000300 [160r,1392r:0) 0@160r 371 // L0000000000000003 [160r,1104r:0) 0@160r 372 // L000000000000000C [160r,1104r:0) 0@160r 373 // L0000000000000030 [160r,1104r:0) 0@160r 374 // L00000000000000C0 [160r,1104r:0) 0@160r 375 // but used subregs are: 376 // sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF 377 // sub0_sub1_sub2_sub3, L00000000000000FF 378 // sub4_sub5_sub6_sub7, L000000000000FF00 379 // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7 380 // have several subranges with the same lifetime. For such cases just 381 // recreate the interval. 382 LIS->removeInterval(OldReg); 383 LIS->removeInterval(NewReg); 384 LIS->createAndComputeVirtRegInterval(NewReg); 385 return; 386 } 387 388 if (unsigned NewSubReg = I->second.SubReg) 389 NewLI.createSubRangeFrom(Allocator, 390 TRI->getSubRegIndexLaneMask(NewSubReg), SR); 391 else // This is the covering subreg (0 index) - set it as main range. 392 NewLI.assign(SR, Allocator); 393 394 SubRegs.erase(I); 395 } 396 if (NewLI.empty()) 397 NewLI.assign(OldLI, Allocator); 398 assert(NewLI.verify(MRI)); 399 LIS->removeInterval(OldReg); 400 } 401 402 const TargetRegisterClass * 403 GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const { 404 MachineInstr *MI = MO.getParent(); 405 return TII->getRegClass(TII->get(MI->getOpcode()), MI->getOperandNo(&MO), TRI, 406 *MI->getParent()->getParent()); 407 } 408 409 bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const { 410 auto Range = MRI->reg_nodbg_operands(Reg); 411 if (Range.empty() || any_of(Range, [](MachineOperand &MO) { 412 return MO.getSubReg() == AMDGPU::NoSubRegister; // Whole reg used. [1] 413 })) 414 return false; 415 416 auto *RC = MRI->getRegClass(Reg); 417 LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI) 418 << ':' << TRI->getRegClassName(RC) << '\n'); 419 420 // Collect used subregs and their reg classes infered from instruction 421 // operands. 422 SubRegMap SubRegs; 423 for (MachineOperand &MO : Range) { 424 const unsigned SubReg = MO.getSubReg(); 425 assert(SubReg != AMDGPU::NoSubRegister); // Due to [1]. 426 LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(SubReg) << ':'); 427 428 const auto [I, Inserted] = SubRegs.try_emplace(SubReg); 429 const TargetRegisterClass *&SubRegRC = I->second.RC; 430 431 if (Inserted) 432 SubRegRC = TRI->getSubRegisterClass(RC, SubReg); 433 434 if (SubRegRC) { 435 if (const TargetRegisterClass *OpDescRC = getOperandRegClass(MO)) { 436 LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << " & " 437 << TRI->getRegClassName(OpDescRC) << " = "); 438 SubRegRC = TRI->getCommonSubClass(SubRegRC, OpDescRC); 439 } 440 } 441 442 if (!SubRegRC) { 443 LLVM_DEBUG(dbgs() << "couldn't find target regclass\n"); 444 return false; 445 } 446 LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << '\n'); 447 } 448 449 auto *NewRC = getMinSizeReg(RC, SubRegs); 450 if (!NewRC) { 451 LLVM_DEBUG(dbgs() << " No improvement achieved\n"); 452 return false; 453 } 454 455 Register NewReg = MRI->createVirtualRegister(NewRC); 456 LLVM_DEBUG(dbgs() << " Success " << printReg(Reg, TRI) << ':' 457 << TRI->getRegClassName(RC) << " -> " 458 << printReg(NewReg, TRI) << ':' 459 << TRI->getRegClassName(NewRC) << '\n'); 460 461 for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) { 462 MO.setReg(NewReg); 463 // Debug info can refer to the whole reg, just leave it as it is for now. 464 // TODO: create some DI shift expression? 465 if (MO.isDebug() && MO.getSubReg() == 0) 466 continue; 467 unsigned SubReg = SubRegs[MO.getSubReg()].SubReg; 468 MO.setSubReg(SubReg); 469 if (SubReg == AMDGPU::NoSubRegister && MO.isDef()) 470 MO.setIsUndef(false); 471 } 472 473 if (LIS) 474 updateLiveIntervals(Reg, NewReg, SubRegs); 475 476 return true; 477 } 478 479 bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) { 480 MRI = &MF.getRegInfo(); 481 TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo()); 482 TII = MF.getSubtarget().getInstrInfo(); 483 auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>(); 484 LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; 485 bool Changed = false; 486 for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) { 487 Changed |= rewriteReg(Register::index2VirtReg(I)); 488 } 489 return Changed; 490 } 491 492 char GCNRewritePartialRegUses::ID; 493 494 char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID; 495 496 INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE, 497 "Rewrite Partial Register Uses", false, false) 498 INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE, 499 "Rewrite Partial Register Uses", false, false) 500