1 //===-- SIFormMemoryClauses.cpp -------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This pass creates bundles of SMEM and VMEM instructions forming memory 11 /// clauses if XNACK is enabled. Def operands of clauses are marked as early 12 /// clobber to make sure we will not override any source within a clause. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUSubtarget.h" 18 #include "GCNRegPressure.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/InitializePasses.h" 21 22 using namespace llvm; 23 24 #define DEBUG_TYPE "si-form-memory-clauses" 25 26 // Clauses longer then 15 instructions would overflow one of the counters 27 // and stall. They can stall even earlier if there are outstanding counters. 
28 static cl::opt<unsigned> 29 MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15), 30 cl::desc("Maximum length of a memory clause, instructions")); 31 32 namespace { 33 34 class SIFormMemoryClauses : public MachineFunctionPass { 35 typedef DenseMap<unsigned, std::pair<unsigned, LaneBitmask>> RegUse; 36 37 public: 38 static char ID; 39 40 public: 41 SIFormMemoryClauses() : MachineFunctionPass(ID) { 42 initializeSIFormMemoryClausesPass(*PassRegistry::getPassRegistry()); 43 } 44 45 bool runOnMachineFunction(MachineFunction &MF) override; 46 47 StringRef getPassName() const override { 48 return "SI Form memory clauses"; 49 } 50 51 void getAnalysisUsage(AnalysisUsage &AU) const override { 52 AU.addRequired<LiveIntervals>(); 53 AU.setPreservesAll(); 54 MachineFunctionPass::getAnalysisUsage(AU); 55 } 56 57 private: 58 template <typename Callable> 59 void forAllLanes(Register Reg, LaneBitmask LaneMask, Callable Func) const; 60 61 bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const; 62 bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT); 63 void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const; 64 bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses, 65 GCNDownwardRPTracker &RPT); 66 67 const GCNSubtarget *ST; 68 const SIRegisterInfo *TRI; 69 const MachineRegisterInfo *MRI; 70 SIMachineFunctionInfo *MFI; 71 72 unsigned LastRecordedOccupancy; 73 unsigned MaxVGPRs; 74 unsigned MaxSGPRs; 75 }; 76 77 } // End anonymous namespace. 

INITIALIZE_PASS_BEGIN(SIFormMemoryClauses, DEBUG_TYPE,
                      "SI Form memory clauses", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIFormMemoryClauses, DEBUG_TYPE,
                    "SI Form memory clauses", false, false)


char SIFormMemoryClauses::ID = 0;

char &llvm::SIFormMemoryClausesID = SIFormMemoryClauses::ID;

FunctionPass *llvm::createSIFormMemoryClausesPass() {
  return new SIFormMemoryClauses();
}

// Returns true if MI can belong to a VMEM clause (FLAT or VMEM instruction).
static bool isVMEMClauseInst(const MachineInstr &MI) {
  return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI);
}

// Returns true if MI can belong to an SMEM clause (scalar memory read).
static bool isSMEMClauseInst(const MachineInstr &MI) {
  return SIInstrInfo::isSMRD(MI);
}

// There is no sense to create store clauses, they do not define anything,
// thus there is nothing to set early-clobber.
static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
  if (MI.isDebugValue() || MI.isBundled())
    return false;
  // Only pure loads qualify: stores (and atomics below) define nothing that
  // needs early-clobber protection.
  if (!MI.mayLoad() || MI.mayStore())
    return false;
  if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 ||
      AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1)
    return false;
  // The instruction must match the kind of clause being built.
  if (IsVMEMClause && !isVMEMClauseInst(MI))
    return false;
  if (!IsVMEMClause && !isSMEMClauseInst(MI))
    return false;
  // If this is a load instruction where the result has been coalesced with
  // an operand, then we cannot clause it.
  for (const MachineOperand &ResMO : MI.defs()) {
    Register ResReg = ResMO.getReg();
    for (const MachineOperand &MO : MI.uses()) {
      if (!MO.isReg() || MO.isDef())
        continue;
      if (MO.getReg() == ResReg)
        return false;
    }
    break; // Only check the first def.
  }
  return true;
}

// Translates the flags of operand MO into an equivalent RegState bitmask,
// as accepted by MachineInstrBuilder::addDef/addUse.
static unsigned getMopState(const MachineOperand &MO) {
  unsigned S = 0;
  if (MO.isImplicit())
    S |= RegState::Implicit;
  if (MO.isDead())
    S |= RegState::Dead;
  if (MO.isUndef())
    S |= RegState::Undef;
  if (MO.isKill())
    S |= RegState::Kill;
  if (MO.isEarlyClobber())
    S |= RegState::EarlyClobber;
  if (MO.getReg().isPhysical() && MO.isRenamable())
    S |= RegState::Renamable;
  return S;
}

// Calls Func(SubRegIdx) for a set of subregister indices that together cover
// exactly LaneMask of Reg. When the mask covers the whole register (or Reg is
// physical) a single Func(0) call is made instead.
template <typename Callable>
void SIFormMemoryClauses::forAllLanes(Register Reg, LaneBitmask LaneMask,
                                      Callable Func) const {
  if (LaneMask.all() || Reg.isPhysical() ||
      LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
    Func(0);
    return;
  }

  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
  unsigned E = TRI->getNumSubRegIndices();
  SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs;
  for (unsigned Idx = 1; Idx < E; ++Idx) {
    // Is this index even compatible with the given class?
    if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
      continue;
    LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
    // Early exit if we found a perfect match.
    if (SubRegMask == LaneMask) {
      Func(Idx);
      return;
    }

    // Skip indices that spill outside the requested mask or contribute
    // nothing to it.
    if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
      continue;

    CoveringSubregs.push_back(Idx);
  }

  // Greedy cover: visit the widest (and then highest) candidates first so the
  // mask is consumed with as few indices as possible.
  llvm::sort(CoveringSubregs, [this](unsigned A, unsigned B) {
    LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
    LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
    unsigned NA = MaskA.getNumLanes();
    unsigned NB = MaskB.getNumLanes();
    if (NA != NB)
      return NA > NB;
    return MaskA.getHighestLane() > MaskB.getHighestLane();
  });

  for (unsigned Idx : CoveringSubregs) {
    LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
    // Re-check against the shrinking mask: an earlier pick may have consumed
    // some of this index's lanes.
    if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
      continue;

    Func(Idx);
    LaneMask &= ~SubRegMask;
    if (LaneMask.none())
      return;
  }

  llvm_unreachable("Failed to find all subregs to cover lane mask");
}

// Returns false if there is a use of a def already in the map.
// In this case we must break the clause.
bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
                                    RegUse &Defs, RegUse &Uses) const {
  // Check interference with defs.
  for (const MachineOperand &MO : MI.operands()) {
    // TODO: Prologue/Epilogue Insertion pass does not process bundled
    // instructions.
    if (MO.isFI())
      return false;

    if (!MO.isReg())
      continue;

    Register Reg = MO.getReg();

    // If it is tied we will need to write same register as we read.
    if (MO.isTied())
      return false;

    // A def conflicts with earlier uses of the register, a use with earlier
    // defs — hence the cross lookup.
    RegUse &Map = MO.isDef() ? Uses : Defs;
    auto Conflict = Map.find(Reg);
    if (Conflict == Map.end())
      continue;

    if (Reg.isPhysical())
      return false;

    // Virtual registers only conflict if the lane masks overlap.
    LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
    if ((Conflict->second.second & Mask).any())
      return false;
  }

  return true;
}

// Since all defs in the clause are early clobber we can run out of registers.
// Function returns false if pressure would hit the limit if instruction is
// bundled into a memory clause.
bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
                                        GCNDownwardRPTracker &RPT) {
  // NB: skip advanceBeforeNext() call. Since all defs will be marked
  // early-clobber they will all stay alive at least to the end of the
  // clause. Therefore we should not decrease pressure even if load
  // pointer becomes dead and could otherwise be reused for destination.
  RPT.advanceToNext();
  GCNRegPressure MaxPressure = RPT.moveMaxPressure();
  unsigned Occupancy = MaxPressure.getOccupancy(*ST);
  if (Occupancy >= MFI->getMinAllowedOccupancy() &&
      MaxPressure.getVGPRNum() <= MaxVGPRs &&
      MaxPressure.getSGPRNum() <= MaxSGPRs) {
    // Remember the occupancy this clause implies; applied via limitOccupancy
    // only if the clause is actually formed.
    LastRecordedOccupancy = Occupancy;
    return true;
  }
  return false;
}

// Collect register defs and uses along with their lane masks and states.
void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
                                         RegUse &Defs, RegUse &Uses) const {
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;

    // Physical registers are always tracked with a full lane mask.
    LaneBitmask Mask = Reg.isVirtual()
                           ? TRI->getSubRegIndexLaneMask(MO.getSubReg())
                           : LaneBitmask::getAll();
    RegUse &Map = MO.isDef() ? Defs : Uses;

    // Merge state flags and lanes with any earlier operand on this register.
    auto Loc = Map.find(Reg);
    unsigned State = getMopState(MO);
    if (Loc == Map.end()) {
      Map[Reg] = std::make_pair(State, Mask);
    } else {
      Loc->second.first |= State;
      Loc->second.second |= Mask;
    }
  }
}

// Check register def/use conflicts, occupancy limits and collect def/use maps.
// Return true if instruction can be bundled with previous. If it cannot,
// def/use maps are not updated.
bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
                                         RegUse &Defs, RegUse &Uses,
                                         GCNDownwardRPTracker &RPT) {
  if (!canBundle(MI, Defs, Uses))
    return false;

  if (!checkPressure(MI, RPT))
    return false;

  collectRegUses(MI, Defs, Uses);
  return true;
}

// Scan every block for runs of valid clause loads, bundle each run longer
// than one instruction, mark bundle defs early-clobber, and refresh the live
// intervals of every register the bundle touches.
bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  ST = &MF.getSubtarget<GCNSubtarget>();
  // Clauses only matter when XNACK replay is enabled.
  if (!ST->isXNACKEnabled())
    return false;

  const SIInstrInfo *TII = ST->getInstrInfo();
  TRI = ST->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();
  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
  SlotIndexes *Ind = LIS->getSlotIndexes();
  bool Changed = false;

  MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
  MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
  // The function attribute may override the command-line clause limit.
  unsigned FuncMaxClause = AMDGPU::getIntegerAttribute(
      MF.getFunction(), "amdgpu-max-memory-clause", MaxClause);

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::instr_iterator Next;
    for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) {
      MachineInstr &MI = *I;
      Next = std::next(I);

      bool IsVMEM = isVMEMClauseInst(MI);

      if (!isValidClauseInst(MI, IsVMEM))
        continue;

      RegUse Defs, Uses;
      GCNDownwardRPTracker RPT(*LIS);
      RPT.reset(MI);

      if (!processRegUses(MI, Defs, Uses, RPT))
        continue;

      // Grow the clause while instructions stay valid, conflict-free and
      // within the pressure/length limits.
      unsigned Length = 1;
      for ( ; Next != E && Length < FuncMaxClause; ++Next) {
        if (!isValidClauseInst(*Next, IsVMEM))
          break;

        // A load from pointer which was loaded inside the same bundle is an
        // impossible clause because we will need to write and read the same
        // register inside. In this case processRegUses will return false.
        if (!processRegUses(*Next, Defs, Uses, RPT))
          break;

        ++Length;
      }
      // A single instruction is not a clause.
      if (Length < 2)
        continue;

      Changed = true;
      MFI->limitOccupancy(LastRecordedOccupancy);

      // Create the BUNDLE header and fold [I, Next) into it.
      auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE));
      Ind->insertMachineInstrInMaps(*B);

      for (auto BI = I; BI != Next; ++BI) {
        BI->bundleWithPred();
        Ind->removeSingleMachineInstrFromMaps(*BI);

        // Defs read within the bundle become internal reads once bundled.
        for (MachineOperand &MO : BI->defs())
          if (MO.readsReg())
            MO.setIsInternalRead(true);
      }

      // Re-expose the bundle's defs on the header, forcing early-clobber.
      for (auto &&R : Defs) {
        forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
          unsigned S = R.second.first | RegState::EarlyClobber;
          if (!SubReg)
            S &= ~(RegState::Undef | RegState::Dead);
          B.addDef(R.first, S, SubReg);
        });
      }

      // Re-expose the uses; kills are dropped since liveness is recomputed.
      for (auto &&R : Uses) {
        forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
          B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg);
        });
      }

      // Recompute live intervals for every virtual register defined by the
      // bundle (registers both used and defined are handled once).
      for (auto &&R : Defs) {
        Register Reg = R.first;
        Uses.erase(Reg);
        if (Reg.isPhysical())
          continue;
        LIS->removeInterval(Reg);
        LIS->createAndComputeVirtRegInterval(Reg);
      }

      // And for every remaining use-only virtual register.
      for (auto &&R : Uses) {
        Register Reg = R.first;
        if (Reg.isPhysical())
          continue;
        LIS->removeInterval(Reg);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
    }
  }

  return Changed;
}