1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements hazard recognizers for scheduling on GCN processors. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "GCNHazardRecognizer.h" 14 #include "GCNSubtarget.h" 15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 16 #include "SIMachineFunctionInfo.h" 17 #include "llvm/ADT/PostOrderIterator.h" 18 #include "llvm/CodeGen/MachineFrameInfo.h" 19 #include "llvm/CodeGen/MachineFunction.h" 20 #include "llvm/CodeGen/ScheduleDAG.h" 21 #include "llvm/TargetParser/TargetParser.h" 22 23 using namespace llvm; 24 25 namespace { 26 27 struct MFMAPaddingRatioParser : public cl::parser<unsigned> { 28 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {} 29 30 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) { 31 if (Arg.getAsInteger(0, Value)) 32 return O.error("'" + Arg + "' value invalid for uint argument!"); 33 34 if (Value > 100) 35 return O.error("'" + Arg + "' value must be in the range [0, 100]!"); 36 37 return false; 38 } 39 }; 40 41 } // end anonymous namespace 42 43 static cl::opt<unsigned, false, MFMAPaddingRatioParser> 44 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, 45 cl::desc("Fill a percentage of the latency between " 46 "neighboring MFMA with s_nops.")); 47 48 static cl::opt<unsigned> MaxExhaustiveHazardSearch( 49 "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, 50 cl::desc("Maximum function size for exhaustive hazard search")); 51 52 //===----------------------------------------------------------------------===// 53 // Hazard Recognizer Implementation 54 //===----------------------------------------------------------------------===// 55 56 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, 57 const GCNSubtarget &ST); 58 59 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) 60 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), 61 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), 62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), 63 UseVALUReadHazardExhaustiveSearch(false), 64 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { 65 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ?
19 : 5; 66 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); 67 } 68 69 void GCNHazardRecognizer::Reset() { 70 EmittedInstrs.clear(); 71 } 72 73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { 74 EmitInstruction(SU->getInstr()); 75 } 76 77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { 78 CurrCycleInstr = MI; 79 } 80 81 static bool isDivFMas(unsigned Opcode) { 82 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64; 83 } 84 85 static bool isSGetReg(unsigned Opcode) { 86 return Opcode == AMDGPU::S_GETREG_B32; 87 } 88 89 static bool isSSetReg(unsigned Opcode) { 90 switch (Opcode) { 91 case AMDGPU::S_SETREG_B32: 92 case AMDGPU::S_SETREG_B32_mode: 93 case AMDGPU::S_SETREG_IMM32_B32: 94 case AMDGPU::S_SETREG_IMM32_B32_mode: 95 return true; 96 } 97 return false; 98 } 99 100 static bool isRWLane(unsigned Opcode) { 101 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; 102 } 103 104 static bool isRFE(unsigned Opcode) { 105 return Opcode == AMDGPU::S_RFE_B64; 106 } 107 108 static bool isSMovRel(unsigned Opcode) { 109 switch (Opcode) { 110 case AMDGPU::S_MOVRELS_B32: 111 case AMDGPU::S_MOVRELS_B64: 112 case AMDGPU::S_MOVRELD_B32: 113 case AMDGPU::S_MOVRELD_B64: 114 return true; 115 default: 116 return false; 117 } 118 } 119 120 static bool isDGEMM(unsigned Opcode) { 121 return AMDGPU::getMAIIsDGEMM(Opcode); 122 } 123 124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { 125 unsigned Opcode = MI.getOpcode(); 126 127 if (!SIInstrInfo::isMAI(MI) || 128 isDGEMM(Opcode) || 129 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || 130 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) 131 return false; 132 133 if (!ST.hasGFX940Insts()) 134 return true; 135 136 return AMDGPU::getMAIIsGFX940XDL(Opcode); 137 } 138 139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, 140 const MachineInstr &MI) { 141 if (TII.isAlwaysGDS(MI.getOpcode())) 142 return true; 143 144 switch (MI.getOpcode()) { 145 case AMDGPU::S_SENDMSG: 146 case AMDGPU::S_SENDMSGHALT: 147 case AMDGPU::S_TTRACEDATA: 148 return true; 149 // These DS opcodes don't support GDS. 
150 case AMDGPU::DS_NOP: 151 case AMDGPU::DS_PERMUTE_B32: 152 case AMDGPU::DS_BPERMUTE_B32: 153 return false; 154 default: 155 if (TII.isDS(MI.getOpcode())) { 156 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 157 AMDGPU::OpName::gds); 158 if (MI.getOperand(GDS).getImm()) 159 return true; 160 } 161 return false; 162 } 163 } 164 165 static bool isPermlane(const MachineInstr &MI) { 166 unsigned Opcode = MI.getOpcode(); 167 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || 168 Opcode == AMDGPU::V_PERMLANE64_B32 || 169 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || 170 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || 171 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64; 172 } 173 174 static bool isLdsDma(const MachineInstr &MI) { 175 return SIInstrInfo::isVALU(MI) && 176 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)); 177 } 178 179 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { 180 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, 181 AMDGPU::OpName::simm16); 182 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm())); 183 } 184 185 ScheduleHazardRecognizer::HazardType 186 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { 187 MachineInstr *MI = SU->getInstr(); 188 // If we are not in "HazardRecognizerMode" and therefore not being run from 189 // the scheduler, track possible stalls from hazards but don't insert noops. 190 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; 191 192 if (MI->isBundle()) 193 return NoHazard; 194 195 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) 196 return HazardType; 197 198 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) 199 return HazardType; 200 201 if (checkFPAtomicToDenormModeHazard(MI) > 0) 202 return HazardType; 203 204 if (ST.hasNoDataDepHazard()) 205 return NoHazard; 206 207 // FIXME: Should flat be considered vmem? 
208 if ((SIInstrInfo::isVMEM(*MI) || 209 SIInstrInfo::isFLAT(*MI)) 210 && checkVMEMHazards(MI) > 0) 211 return HazardType; 212 213 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) 214 return HazardType; 215 216 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) 217 return HazardType; 218 219 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) 220 return HazardType; 221 222 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) 223 return HazardType; 224 225 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 226 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 227 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 228 return HazardType; 229 230 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) 231 return HazardType; 232 233 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) 234 return HazardType; 235 236 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) 237 return HazardType; 238 239 if (((ST.hasReadM0MovRelInterpHazard() && 240 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 241 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 242 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 243 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 244 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 245 (ST.hasReadM0LdsDirectHazard() && 246 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) && 247 checkReadM0Hazards(MI) > 0) 248 return HazardType; 249 250 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) 251 return HazardType; 252 253 if ((SIInstrInfo::isVMEM(*MI) || 254 SIInstrInfo::isFLAT(*MI) || 255 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0) 256 return HazardType; 257 258 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) 259 return HazardType; 260 261 return NoHazard; 262 } 263 264 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, 265 unsigned Quantity) { 266 while (Quantity > 0) { 267 unsigned Arg = std::min(Quantity, 8u); 268 Quantity -= Arg; 269 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) 270 .addImm(Arg - 1); 271 } 272 } 273 274 unsigned 275 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { 276 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI); 277 assert(TSchedModel.getWriteProcResBegin(SC) != 278 TSchedModel.getWriteProcResEnd(SC)); 279 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle; 280 } 281 282 void GCNHazardRecognizer::processBundle() { 283 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); 284 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); 285 // Check bundled MachineInstr's for hazards. 286 for (; MI != E && MI->isInsideBundle(); ++MI) { 287 CurrCycleInstr = &*MI; 288 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); 289 290 if (IsHazardRecognizerMode) { 291 fixHazards(CurrCycleInstr); 292 293 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates); 294 } 295 296 // It’s unnecessary to track more than MaxLookAhead instructions. Since we 297 // include the bundled MI directly after, only add a maximum of 298 // (MaxLookAhead - 1) noops to EmittedInstrs. 
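// As a worked example of the accounting above (illustrative numbers only):
// with MaxLookAhead == 5 and WaitStates == 7, std::min(7, 5 - 1) == 4 nullptr
// placeholders are queued, the bundled MI itself is pushed next, and the
// resize below keeps EmittedInstrs at 5 entries.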
299 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) 300 EmittedInstrs.push_front(nullptr); 301 302 EmittedInstrs.push_front(CurrCycleInstr); 303 EmittedInstrs.resize(MaxLookAhead); 304 } 305 CurrCycleInstr = nullptr; 306 } 307 308 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) { 309 assert(IsHazardRecognizerMode); 310 311 unsigned NumPreNoops = PreEmitNoops(MI); 312 EmitNoops(NumPreNoops); 313 if (MI->isInsideBundle()) 314 insertNoopsInBundle(MI, TII, NumPreNoops); 315 else 316 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI), 317 NumPreNoops); 318 EmitInstruction(MI); 319 AdvanceCycle(); 320 } 321 322 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { 323 IsHazardRecognizerMode = true; 324 CurrCycleInstr = MI; 325 unsigned W = PreEmitNoopsCommon(MI); 326 fixHazards(MI); 327 CurrCycleInstr = nullptr; 328 return W; 329 } 330 331 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { 332 if (MI->isBundle()) 333 return 0; 334 335 int WaitStates = 0; 336 337 if (SIInstrInfo::isSMRD(*MI)) 338 return std::max(WaitStates, checkSMRDHazards(MI)); 339 340 if (ST.hasNSAtoVMEMBug()) 341 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); 342 343 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); 344 345 if (ST.hasNoDataDepHazard()) 346 return WaitStates; 347 348 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) 349 WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); 350 351 if (SIInstrInfo::isVALU(*MI)) 352 WaitStates = std::max(WaitStates, checkVALUHazards(MI)); 353 354 if (SIInstrInfo::isDPP(*MI)) 355 WaitStates = std::max(WaitStates, checkDPPHazards(MI)); 356 357 if (isDivFMas(MI->getOpcode())) 358 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); 359 360 if (isRWLane(MI->getOpcode())) 361 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); 362 363 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 364 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 365 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 366 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI)); 367 368 if (MI->isInlineAsm()) 369 return std::max(WaitStates, checkInlineAsmHazards(MI)); 370 371 if (isSGetReg(MI->getOpcode())) 372 return std::max(WaitStates, checkGetRegHazards(MI)); 373 374 if (isSSetReg(MI->getOpcode())) 375 return std::max(WaitStates, checkSetRegHazards(MI)); 376 377 if (isRFE(MI->getOpcode())) 378 return std::max(WaitStates, checkRFEHazards(MI)); 379 380 if ((ST.hasReadM0MovRelInterpHazard() && 381 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 382 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 383 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 384 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 385 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 386 (ST.hasReadM0LdsDirectHazard() && 387 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) 388 return std::max(WaitStates, checkReadM0Hazards(MI)); 389 390 if (SIInstrInfo::isMAI(*MI)) 391 return std::max(WaitStates, checkMAIHazards(MI)); 392 393 if (SIInstrInfo::isVMEM(*MI) || 394 SIInstrInfo::isFLAT(*MI) || 395 SIInstrInfo::isDS(*MI)) 396 return std::max(WaitStates, checkMAILdStHazards(MI)); 397 398 return WaitStates; 399 } 400 401 void GCNHazardRecognizer::EmitNoop() { 402 EmittedInstrs.push_front(nullptr); 403 } 404 405 void GCNHazardRecognizer::AdvanceCycle() { 406 // When the scheduler detects a stall, it will call AdvanceCycle() without 407 // emitting 
any instructions. 408 if (!CurrCycleInstr) { 409 EmittedInstrs.push_front(nullptr); 410 return; 411 } 412 413 if (CurrCycleInstr->isBundle()) { 414 processBundle(); 415 return; 416 } 417 418 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); 419 if (!NumWaitStates) { 420 CurrCycleInstr = nullptr; 421 return; 422 } 423 424 // Keep track of emitted instructions 425 EmittedInstrs.push_front(CurrCycleInstr); 426 427 // Add a nullptr for each additional wait state after the first. Make sure 428 // not to add more than getMaxLookAhead() items to the list, since we 429 // truncate the list to that size right after this loop. 430 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); 431 i < e; ++i) { 432 EmittedInstrs.push_front(nullptr); 433 } 434 435 // getMaxLookahead() is the largest number of wait states we will ever need 436 // to insert, so there is no point in keeping track of more than that many 437 // wait states. 438 EmittedInstrs.resize(getMaxLookAhead()); 439 440 CurrCycleInstr = nullptr; 441 } 442 443 void GCNHazardRecognizer::RecedeCycle() { 444 llvm_unreachable("hazard recognizer does not support bottom-up scheduling."); 445 } 446 447 //===----------------------------------------------------------------------===// 448 // Helper Functions 449 //===----------------------------------------------------------------------===// 450 451 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; 452 453 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; 454 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; 455 456 // Search for a hazard in a block and its predecessors. 457 template <typename StateT> 458 static bool 459 hasHazard(StateT State, 460 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard, 461 function_ref<void(StateT &, const MachineInstr &)> UpdateState, 462 const MachineBasicBlock *MBB, 463 MachineBasicBlock::const_reverse_instr_iterator I, 464 DenseSet<const MachineBasicBlock *> &Visited) { 465 for (auto E = MBB->instr_rend(); I != E; ++I) { 466 // No need to look at parent BUNDLE instructions. 467 if (I->isBundle()) 468 continue; 469 470 switch (IsHazard(State, *I)) { 471 case HazardFound: 472 return true; 473 case HazardExpired: 474 return false; 475 default: 476 // Continue search 477 break; 478 } 479 480 if (I->isInlineAsm() || I->isMetaInstruction()) 481 continue; 482 483 UpdateState(State, *I); 484 } 485 486 for (MachineBasicBlock *Pred : MBB->predecessors()) { 487 if (!Visited.insert(Pred).second) 488 continue; 489 490 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(), 491 Visited)) 492 return true; 493 } 494 495 return false; 496 } 497 498 // Returns a minimum wait states since \p I walking all predecessors. 499 // Only scans until \p IsExpired does not return true. 500 // Can only be run in a hazard recognizer mode. 501 static int getWaitStatesSince( 502 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, 503 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, 504 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, 505 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { 506 for (auto E = MBB->instr_rend(); I != E; ++I) { 507 // Don't add WaitStates for parent BUNDLE instructions. 
508 if (I->isBundle()) 509 continue; 510 511 if (IsHazard(*I)) 512 return WaitStates; 513 514 if (I->isInlineAsm()) 515 continue; 516 517 WaitStates += GetNumWaitStates(*I); 518 519 if (IsExpired(*I, WaitStates)) 520 return std::numeric_limits<int>::max(); 521 } 522 523 int MinWaitStates = std::numeric_limits<int>::max(); 524 for (MachineBasicBlock *Pred : MBB->predecessors()) { 525 if (!Visited.insert(Pred).second) 526 continue; 527 528 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates, 529 IsExpired, Visited, GetNumWaitStates); 530 531 MinWaitStates = std::min(MinWaitStates, W); 532 } 533 534 return MinWaitStates; 535 } 536 537 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, 538 const MachineInstr *MI, IsExpiredFn IsExpired) { 539 DenseSet<const MachineBasicBlock *> Visited; 540 return getWaitStatesSince(IsHazard, MI->getParent(), 541 std::next(MI->getReverseIterator()), 542 0, IsExpired, Visited); 543 } 544 545 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { 546 if (IsHazardRecognizerMode) { 547 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { 548 return WaitStates >= Limit; 549 }; 550 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); 551 } 552 553 int WaitStates = 0; 554 for (MachineInstr *MI : EmittedInstrs) { 555 if (MI) { 556 if (IsHazard(*MI)) 557 return WaitStates; 558 559 if (MI->isInlineAsm()) 560 continue; 561 } 562 ++WaitStates; 563 564 if (WaitStates >= Limit) 565 break; 566 } 567 return std::numeric_limits<int>::max(); 568 } 569 570 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, 571 IsHazardFn IsHazardDef, 572 int Limit) { 573 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 574 575 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) { 576 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI); 577 }; 578 579 return getWaitStatesSince(IsHazardFn, Limit); 580 } 581 582 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, 583 int Limit) { 584 auto IsHazardFn = [IsHazard](const MachineInstr &MI) { 585 return isSSetReg(MI.getOpcode()) && IsHazard(MI); 586 }; 587 588 return getWaitStatesSince(IsHazardFn, Limit); 589 } 590 591 //===----------------------------------------------------------------------===// 592 // No-op Hazard Detection 593 //===----------------------------------------------------------------------===// 594 595 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, 596 MCRegister Reg) { 597 for (MCRegUnit Unit : TRI.regunits(Reg)) 598 BV.set(Unit); 599 } 600 601 static void addRegsToSet(const SIRegisterInfo &TRI, 602 iterator_range<MachineInstr::const_mop_iterator> Ops, 603 BitVector &DefSet, BitVector &UseSet) { 604 for (const MachineOperand &Op : Ops) { 605 if (Op.isReg()) 606 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg()); 607 } 608 } 609 610 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { 611 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses); 612 } 613 614 static bool breaksSMEMSoftClause(MachineInstr *MI) { 615 return !SIInstrInfo::isSMRD(*MI); 616 } 617 618 static bool breaksVMEMSoftClause(MachineInstr *MI) { 619 return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI); 620 } 621 622 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { 623 // SMEM soft clause are only present on VI+, and only matter if xnack is 624 // enabled. 
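// Illustrative clause that trips the check below (registers are hypothetical):
//   s_load_dwordx2 s[4:5], s[0:1], 0x0   ; uses s[0:1] as the address
//   s_load_dword   s0, s[8:9], 0x0       ; defines s0, which the first load reads
// With XNACK enabled the first load may be replayed after s0 has been
// clobbered, so the def/use overlap is reported as a hazard.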
625 if (!ST.isXNACKEnabled()) 626 return 0; 627 628 bool IsSMRD = TII.isSMRD(*MEM); 629 630 resetClause(); 631 632 // A soft-clause is any group of consecutive SMEM instructions. The 633 // instructions in this group may return out of order and/or may be 634 // replayed (i.e. the same instruction issued more than once). 635 // 636 // In order to handle these situations correctly we need to make sure that 637 // when a clause has more than one instruction, no instruction in the clause 638 // writes to a register that is read by another instruction in the clause 639 // (including itself). If we encounter this situation, we need to break the 640 // clause by inserting a non SMEM instruction. 641 642 for (MachineInstr *MI : EmittedInstrs) { 643 // When we hit a non-SMEM instruction then we have passed the start of the 644 // clause and we can stop. 645 if (!MI) 646 break; 647 648 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) 649 break; 650 651 addClauseInst(*MI); 652 } 653 654 if (ClauseDefs.none()) 655 return 0; 656 657 // We need to make sure not to put loads and stores in the same clause if they 658 // use the same address. For now, just start a new clause whenever we see a 659 // store. 660 if (MEM->mayStore()) 661 return 1; 662 663 addClauseInst(*MEM); 664 665 // If the set of defs and uses intersect then we cannot add this instruction 666 // to the clause, so we have a hazard. 667 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0; 668 } 669 670 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { 671 int WaitStatesNeeded = 0; 672 673 WaitStatesNeeded = checkSoftClauseHazards(SMRD); 674 675 // This SMRD hazard only affects SI. 676 if (!ST.hasSMRDReadVALUDefHazard()) 677 return WaitStatesNeeded; 678 679 // A read of an SGPR by SMRD instruction requires 4 wait states when the 680 // SGPR was written by a VALU instruction. 681 int SmrdSgprWaitStates = 4; 682 auto IsHazardDefFn = [this](const MachineInstr &MI) { 683 return TII.isVALU(MI); 684 }; 685 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) { 686 return TII.isSALU(MI); 687 }; 688 689 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); 690 691 for (const MachineOperand &Use : SMRD->uses()) { 692 if (!Use.isReg()) 693 continue; 694 int WaitStatesNeededForUse = 695 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 696 SmrdSgprWaitStates); 697 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 698 699 // This fixes what appears to be undocumented hardware behavior in SI where 700 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor 701 // needs some number of nops in between. We don't know how many we need, but 702 // let's use 4. This wasn't discovered before probably because the only 703 // case when this happens is when we expand a 64-bit pointer into a full 704 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was 705 // probably never encountered in the closed-source land. 
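// Illustrative sequence for the undocumented case described above
// (hypothetical registers and descriptor value):
//   s_mov_b32 s7, 0x27000                ; SALU writes part of the descriptor
//   s_buffer_load_dword s0, s[4:7], 0x0  ; buffer SMRD reads the descriptor
// The buffer-SMRD path below applies the same 4 wait states to SALU defs.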
706 if (IsBufferSMRD) { 707 int WaitStatesNeededForUse = 708 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 709 IsBufferHazardDefFn, 710 SmrdSgprWaitStates); 711 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 712 } 713 } 714 715 return WaitStatesNeeded; 716 } 717 718 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { 719 if (!ST.hasVMEMReadSGPRVALUDefHazard()) 720 return 0; 721 722 int WaitStatesNeeded = checkSoftClauseHazards(VMEM); 723 724 // A read of an SGPR by a VMEM instruction requires 5 wait states when the 725 // SGPR was written by a VALU Instruction. 726 const int VmemSgprWaitStates = 5; 727 auto IsHazardDefFn = [this](const MachineInstr &MI) { 728 return TII.isVALU(MI); 729 }; 730 for (const MachineOperand &Use : VMEM->uses()) { 731 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg())) 732 continue; 733 734 int WaitStatesNeededForUse = 735 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 736 VmemSgprWaitStates); 737 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 738 } 739 return WaitStatesNeeded; 740 } 741 742 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { 743 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 744 const SIInstrInfo *TII = ST.getInstrInfo(); 745 746 // Check for DPP VGPR read after VALU VGPR write and EXEC write. 747 int DppVgprWaitStates = 2; 748 int DppExecWaitStates = 5; 749 int WaitStatesNeeded = 0; 750 auto IsHazardDefFn = [TII](const MachineInstr &MI) { 751 return TII->isVALU(MI); 752 }; 753 754 for (const MachineOperand &Use : DPP->uses()) { 755 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) 756 continue; 757 int WaitStatesNeededForUse = 758 DppVgprWaitStates - getWaitStatesSinceDef( 759 Use.getReg(), 760 [](const MachineInstr &) { return true; }, 761 DppVgprWaitStates); 762 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 763 } 764 765 WaitStatesNeeded = std::max( 766 WaitStatesNeeded, 767 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, 768 DppExecWaitStates)); 769 770 return WaitStatesNeeded; 771 } 772 773 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { 774 const SIInstrInfo *TII = ST.getInstrInfo(); 775 776 // v_div_fmas requires 4 wait states after a write to vcc from a VALU 777 // instruction. 
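// Illustrative hazard (registers are hypothetical):
//   v_cmp_gt_f32 vcc, v0, v1        ; VALU write of vcc
//   v_div_fmas_f32 v2, v3, v4, v5   ; implicitly reads vcc
// The check below returns 4 minus the wait states already elapsed since the
// vcc write, so back-to-back issue requires 4 inserted wait states.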
778 const int DivFMasWaitStates = 4; 779 auto IsHazardDefFn = [TII](const MachineInstr &MI) { 780 return TII->isVALU(MI); 781 }; 782 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn, 783 DivFMasWaitStates); 784 785 return DivFMasWaitStates - WaitStatesNeeded; 786 } 787 788 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { 789 const SIInstrInfo *TII = ST.getInstrInfo(); 790 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr); 791 792 const int GetRegWaitStates = 2; 793 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) { 794 return GetRegHWReg == getHWReg(TII, MI); 795 }; 796 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates); 797 798 return GetRegWaitStates - WaitStatesNeeded; 799 } 800 801 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { 802 const SIInstrInfo *TII = ST.getInstrInfo(); 803 unsigned HWReg = getHWReg(TII, *SetRegInstr); 804 805 const int SetRegWaitStates = ST.getSetRegWaitStates(); 806 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) { 807 return HWReg == getHWReg(TII, MI); 808 }; 809 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates); 810 return SetRegWaitStates - WaitStatesNeeded; 811 } 812 813 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { 814 if (!MI.mayStore()) 815 return -1; 816 817 const SIInstrInfo *TII = ST.getInstrInfo(); 818 unsigned Opcode = MI.getOpcode(); 819 const MCInstrDesc &Desc = MI.getDesc(); 820 821 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); 822 int VDataRCID = -1; 823 if (VDataIdx != -1) 824 VDataRCID = Desc.operands()[VDataIdx].RegClass; 825 826 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) { 827 // There is no hazard if the instruction does not use vector regs 828 // (like wbinvl1) 829 if (VDataIdx == -1) 830 return -1; 831 // For MUBUF/MTBUF instructions this hazard only exists if the 832 // instruction is not using a register in the soffset field. 833 const MachineOperand *SOffset = 834 TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 835 // If we have no soffset operand, then assume this field has been 836 // hardcoded to zero. 837 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 && 838 (!SOffset || !SOffset->isReg())) 839 return VDataIdx; 840 } 841 842 // MIMG instructions create a hazard if they don't use a 256-bit T# and 843 // the store size is greater than 8 bytes and they have more than two bits 844 // of their dmask set. 845 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them. 846 if (TII->isMIMG(MI)) { 847 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 848 assert(SRsrcIdx != -1 && 849 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256); 850 (void)SRsrcIdx; 851 } 852 853 if (TII->isFLAT(MI)) { 854 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); 855 if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64) 856 return DataIdx; 857 } 858 859 return -1; 860 } 861 862 int 863 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, 864 const MachineRegisterInfo &MRI) { 865 // Helper to check for the hazard where VMEM instructions that store more than 866 // 8 bytes can have their store data overwritten by the next instruction. 867 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 868 869 const int VALUWaitStates = ST.hasGFX940Insts() ?
2 : 1; 870 int WaitStatesNeeded = 0; 871 872 if (!TRI->isVectorRegister(MRI, Def.getReg())) 873 return WaitStatesNeeded; 874 Register Reg = Def.getReg(); 875 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) { 876 int DataIdx = createsVALUHazard(MI); 877 return DataIdx >= 0 && 878 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg); 879 }; 880 881 int WaitStatesNeededForDef = 882 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates); 883 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 884 885 return WaitStatesNeeded; 886 } 887 888 /// Dest sel forwarding issue occurs if additional logic is needed to swizzle / 889 /// pack the computed value into the correct bit position of the dest register. This 890 /// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with 891 /// dst_sel that is not aligned to the register. This function analyzes the \p 892 /// MI and \returns an operand with a dst forwarding issue, or nullptr if 893 /// none exists. 894 static const MachineOperand * 895 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { 896 if (!SIInstrInfo::isVALU(MI)) 897 return nullptr; 898 899 const SIInstrInfo *TII = ST.getInstrInfo(); 900 901 unsigned Opcode = MI.getOpcode(); 902 903 // There are three different types of instructions 904 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 905 // which writes hi bits (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and 906 // CVT_SR_BF8_F32 with op_sel[3:2] 907 // != 0 908 if (SIInstrInfo::isSDWA(MI)) { 909 // Type 1: SDWA with dst_sel != DWORD 910 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) 911 if (DstSel->getImm() == AMDGPU::SDWA::DWORD) 912 return nullptr; 913 } else { 914 // Type 2 && Type 3: (VOP3 which writes the hi bits) || (CVT_SR_FP8_F32 and 915 // CVT_SR_BF8_F32 with op_sel[3:2] != 0) 916 if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) || 917 !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & 918 SISrcMods::DST_OP_SEL || 919 (AMDGPU::isFP8DstSelInst(Opcode) && 920 (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & 921 SISrcMods::OP_SEL_0)))) 922 return nullptr; 923 } 924 925 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 926 } 927 928 /// Checks whether the provided \p MI "consumes" the operand with a Dest sel 929 /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit 930 /// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW) 931 static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, 932 const MachineOperand *Dst, 933 const SIRegisterInfo *TRI) { 934 // We must consider implicit reads of the VALU. SDWA with dst_sel and 935 // UNUSED_PRESERVE will implicitly read the result from forwarded dest, 936 // and we must account for that hazard. 937 // We also must account for WAW hazards. In particular, WAW with dest 938 // preserve semantics (e.g. VOP3 with op_sel, VOP2 && 939 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity 940 // check for ECC. Without accounting for this hazard, the ECC will be 941 // wrong. 942 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
943 // complete zeroesHigh16BitsOfDest) 944 for (auto &Operand : VALU->operands()) { 945 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { 946 return true; 947 } 948 } 949 return false; 950 } 951 952 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { 953 int WaitStatesNeeded = 0; 954 955 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) { 956 const int TransDefWaitstates = 1; 957 958 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) { 959 if (!SIInstrInfo::isTRANS(MI)) 960 return false; 961 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 962 const SIInstrInfo *TII = ST.getInstrInfo(); 963 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); 964 965 for (const MachineOperand &Use : VALU->explicit_uses()) { 966 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) 967 return true; 968 } 969 970 return false; 971 }; 972 973 int WaitStatesNeededForDef = 974 TransDefWaitstates - 975 getWaitStatesSince(IsTransDefFn, TransDefWaitstates); 976 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 977 } 978 979 if (ST.hasDstSelForwardingHazard()) { 980 const int Shift16DefWaitstates = 1; 981 982 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) { 983 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 984 const MachineOperand *ForwardedDst = 985 getDstSelForwardingOperand(ProducerMI, ST); 986 if (ForwardedDst) { 987 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI); 988 } 989 990 if (ProducerMI.isInlineAsm()) { 991 // Assume inline asm has dst forwarding hazard 992 for (auto &Def : ProducerMI.all_defs()) { 993 if (consumesDstSelForwardingOperand(VALU, &Def, TRI)) 994 return true; 995 } 996 } 997 998 return false; 999 }; 1000 1001 int WaitStatesNeededForDef = 1002 Shift16DefWaitstates - 1003 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); 1004 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1005 } 1006 1007 if (ST.hasVDecCoExecHazard()) { 1008 const int VALUWriteSGPRVALUReadWaitstates = 2; 1009 const int VALUWriteEXECRWLane = 4; 1010 const int VALUWriteVGPRReadlaneRead = 1; 1011 1012 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1013 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1014 Register UseReg; 1015 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) { 1016 if (!SIInstrInfo::isVALU(MI)) 1017 return false; 1018 return MI.modifiesRegister(UseReg, TRI); 1019 }; 1020 1021 for (const MachineOperand &Use : VALU->explicit_uses()) { 1022 if (!Use.isReg()) 1023 continue; 1024 1025 UseReg = Use.getReg(); 1026 if (TRI->isSGPRReg(MRI, UseReg)) { 1027 int WaitStatesNeededForDef = 1028 VALUWriteSGPRVALUReadWaitstates - 1029 getWaitStatesSince(IsVALUDefSGPRFn, 1030 VALUWriteSGPRVALUReadWaitstates); 1031 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1032 } 1033 } 1034 1035 if (VALU->readsRegister(AMDGPU::VCC, TRI)) { 1036 UseReg = AMDGPU::VCC; 1037 int WaitStatesNeededForDef = 1038 VALUWriteSGPRVALUReadWaitstates - 1039 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); 1040 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1041 } 1042 1043 switch (VALU->getOpcode()) { 1044 case AMDGPU::V_READLANE_B32: 1045 case AMDGPU::V_READFIRSTLANE_B32: { 1046 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0); 1047 UseReg = Src->getReg(); 1048 int WaitStatesNeededForDef = 1049 VALUWriteVGPRReadlaneRead - 1050 getWaitStatesSince(IsVALUDefSGPRFn, 
VALUWriteVGPRReadlaneRead); 1051 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1052 } 1053 [[fallthrough]]; 1054 case AMDGPU::V_WRITELANE_B32: { 1055 UseReg = AMDGPU::EXEC; 1056 int WaitStatesNeededForDef = 1057 VALUWriteEXECRWLane - 1058 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane); 1059 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1060 break; 1061 } 1062 default: 1063 break; 1064 } 1065 } 1066 1067 // This checks for the hazard where VMEM instructions that store more than 1068 // 8 bytes can have there store data over written by the next instruction. 1069 if (!ST.has12DWordStoreHazard()) 1070 return WaitStatesNeeded; 1071 1072 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1073 1074 for (const MachineOperand &Def : VALU->defs()) { 1075 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI)); 1076 } 1077 1078 return WaitStatesNeeded; 1079 } 1080 1081 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { 1082 // This checks for hazards associated with inline asm statements. 1083 // Since inline asms can contain just about anything, we use this 1084 // to call/leverage other check*Hazard routines. Note that 1085 // this function doesn't attempt to address all possible inline asm 1086 // hazards (good luck), but is a collection of what has been 1087 // problematic thus far. 1088 1089 // see checkVALUHazards() 1090 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard()) 1091 return 0; 1092 1093 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1094 int WaitStatesNeeded = 0; 1095 1096 for (const MachineOperand &Op : 1097 llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) { 1098 if (Op.isReg() && Op.isDef()) { 1099 if (!TRI.isVectorRegister(MRI, Op.getReg())) 1100 continue; 1101 1102 if (ST.has12DWordStoreHazard()) { 1103 WaitStatesNeeded = 1104 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); 1105 } 1106 } 1107 } 1108 1109 if (ST.hasDstSelForwardingHazard()) { 1110 const int Shift16DefWaitstates = 1; 1111 1112 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) { 1113 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST); 1114 // Assume inline asm reads the dst 1115 if (Dst) 1116 return IA->modifiesRegister(Dst->getReg(), &TRI) || 1117 IA->readsRegister(Dst->getReg(), &TRI); 1118 1119 if (ProducerMI.isInlineAsm()) { 1120 // If MI is inline asm, assume it has dst forwarding hazard 1121 for (auto &Def : ProducerMI.all_defs()) { 1122 if (IA->modifiesRegister(Def.getReg(), &TRI) || 1123 IA->readsRegister(Def.getReg(), &TRI)) { 1124 return true; 1125 } 1126 } 1127 } 1128 1129 return false; 1130 }; 1131 1132 int WaitStatesNeededForDef = 1133 Shift16DefWaitstates - 1134 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); 1135 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1136 } 1137 1138 return WaitStatesNeeded; 1139 } 1140 1141 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { 1142 const SIInstrInfo *TII = ST.getInstrInfo(); 1143 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1144 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1145 1146 const MachineOperand *LaneSelectOp = 1147 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1); 1148 1149 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg())) 1150 return 0; 1151 1152 Register LaneSelectReg = LaneSelectOp->getReg(); 1153 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); }; 1154 
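// Illustrative hazard (registers are hypothetical): v_readfirstlane_b32 s0, v1
// is a VALU write of SGPR s0; a following v_readlane_b32 v2, v3, s0 uses s0 as
// its lane select and therefore needs the 4 wait states checked below.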
1155 const int RWLaneWaitStates = 4; 1156 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, 1157 RWLaneWaitStates); 1158 return RWLaneWaitStates - WaitStatesSince; 1159 } 1160 1161 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { 1162 if (!ST.hasRFEHazards()) 1163 return 0; 1164 1165 const SIInstrInfo *TII = ST.getInstrInfo(); 1166 1167 const int RFEWaitStates = 1; 1168 1169 auto IsHazardFn = [TII](const MachineInstr &MI) { 1170 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS; 1171 }; 1172 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); 1173 return RFEWaitStates - WaitStatesNeeded; 1174 } 1175 1176 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { 1177 const SIInstrInfo *TII = ST.getInstrInfo(); 1178 const int ReadM0WaitStates = 1; 1179 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; 1180 return ReadM0WaitStates - 1181 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); 1182 } 1183 1184 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { 1185 fixVMEMtoScalarWriteHazards(MI); 1186 fixVcmpxPermlaneHazards(MI); 1187 fixSMEMtoVectorWriteHazards(MI); 1188 fixVcmpxExecWARHazard(MI); 1189 fixLdsBranchVmemWARHazard(MI); 1190 if (ST.hasLdsDirect()) { 1191 fixLdsDirectVALUHazard(MI); 1192 fixLdsDirectVMEMHazard(MI); 1193 } 1194 fixVALUPartialForwardingHazard(MI); 1195 fixVALUTransUseHazard(MI); 1196 fixWMMAHazards(MI); 1197 fixShift64HighRegBug(MI); 1198 fixVALUMaskWriteHazard(MI); 1199 fixVALUReadSGPRHazard(MI); 1200 fixRequiredExportPriority(MI); 1201 } 1202 1203 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { 1204 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI)) 1205 return false; 1206 1207 const SIInstrInfo *TII = ST.getInstrInfo(); 1208 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1209 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { 1210 return (TII->isVOPC(MI) || 1211 ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) && 1212 MI.modifiesRegister(AMDGPU::EXEC, TRI); 1213 }; 1214 1215 auto IsExpiredFn = [](const MachineInstr &MI, int) { 1216 unsigned Opc = MI.getOpcode(); 1217 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 && 1218 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa; 1219 }; 1220 1221 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1222 std::numeric_limits<int>::max()) 1223 return false; 1224 1225 // V_NOP will be discarded by SQ. 1226 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* 1227 // which is always a VGPR and available. 1228 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); 1229 Register Reg = Src0->getReg(); 1230 bool IsUndef = Src0->isUndef(); 1231 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1232 TII->get(AMDGPU::V_MOV_B32_e32)) 1233 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) 1234 .addReg(Reg, IsUndef ? 
RegState::Undef : RegState::Kill); 1235 1236 return true; 1237 } 1238 1239 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { 1240 if (!ST.hasVMEMtoScalarWriteHazard()) 1241 return false; 1242 assert(!ST.hasExtendedWaitCounts()); 1243 1244 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI)) 1245 return false; 1246 1247 if (MI->getNumDefs() == 0) 1248 return false; 1249 1250 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1251 1252 auto IsHazardFn = [TRI, MI](const MachineInstr &I) { 1253 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) && 1254 !SIInstrInfo::isFLAT(I)) 1255 return false; 1256 1257 for (const MachineOperand &Def : MI->defs()) { 1258 const MachineOperand *Op = 1259 I.findRegisterUseOperand(Def.getReg(), TRI, false); 1260 if (!Op) 1261 continue; 1262 return true; 1263 } 1264 return false; 1265 }; 1266 1267 auto IsExpiredFn = [](const MachineInstr &MI, int) { 1268 return SIInstrInfo::isVALU(MI) || 1269 (MI.getOpcode() == AMDGPU::S_WAITCNT && 1270 !MI.getOperand(0).getImm()) || 1271 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1272 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0); 1273 }; 1274 1275 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1276 std::numeric_limits<int>::max()) 1277 return false; 1278 1279 const SIInstrInfo *TII = ST.getInstrInfo(); 1280 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1281 TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 1282 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); 1283 return true; 1284 } 1285 1286 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { 1287 if (!ST.hasSMEMtoVectorWriteHazard()) 1288 return false; 1289 assert(!ST.hasExtendedWaitCounts()); 1290 1291 if (!SIInstrInfo::isVALU(*MI)) 1292 return false; 1293 1294 unsigned SDSTName; 1295 switch (MI->getOpcode()) { 1296 case AMDGPU::V_READLANE_B32: 1297 case AMDGPU::V_READFIRSTLANE_B32: 1298 SDSTName = AMDGPU::OpName::vdst; 1299 break; 1300 default: 1301 SDSTName = AMDGPU::OpName::sdst; 1302 break; 1303 } 1304 1305 const SIInstrInfo *TII = ST.getInstrInfo(); 1306 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1307 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU()); 1308 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName); 1309 if (!SDST) { 1310 for (const auto &MO : MI->implicit_operands()) { 1311 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) { 1312 SDST = &MO; 1313 break; 1314 } 1315 } 1316 } 1317 1318 if (!SDST) 1319 return false; 1320 1321 const Register SDSTReg = SDST->getReg(); 1322 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) { 1323 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI); 1324 }; 1325 1326 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) { 1327 if (TII->isSALU(MI)) { 1328 switch (MI.getOpcode()) { 1329 case AMDGPU::S_SETVSKIP: 1330 case AMDGPU::S_VERSION: 1331 case AMDGPU::S_WAITCNT_VSCNT: 1332 case AMDGPU::S_WAITCNT_VMCNT: 1333 case AMDGPU::S_WAITCNT_EXPCNT: 1334 // These instructions cannot mitigate the hazard. 1335 return false; 1336 case AMDGPU::S_WAITCNT_LGKMCNT: 1337 // Reducing lgkmcnt count to 0 always mitigates the hazard. 1338 return (MI.getOperand(1).getImm() == 0) && 1339 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL); 1340 case AMDGPU::S_WAITCNT: { 1341 const int64_t Imm = MI.getOperand(0).getImm(); 1342 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); 1343 // DsCnt corresponds to LGKMCnt here.
1344 return (Decoded.DsCnt == 0); 1345 } 1346 default: 1347 // SOPP instructions cannot mitigate the hazard. 1348 if (TII->isSOPP(MI)) 1349 return false; 1350 // At this point the SALU can be assumed to mitigate the hazard 1351 // because either: 1352 // (a) it is independent of the at risk SMEM (breaking chain), 1353 // or 1354 // (b) it is dependent on the SMEM, in which case an appropriate 1355 // s_waitcnt lgkmcnt _must_ exist between it and the at risk 1356 // SMEM instruction. 1357 return true; 1358 } 1359 } 1360 return false; 1361 }; 1362 1363 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1364 std::numeric_limits<int>::max()) 1365 return false; 1366 1367 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1368 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL) 1369 .addImm(0); 1370 return true; 1371 } 1372 1373 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { 1374 if (!ST.hasVcmpxExecWARHazard()) 1375 return false; 1376 assert(!ST.hasExtendedWaitCounts()); 1377 1378 if (!SIInstrInfo::isVALU(*MI)) 1379 return false; 1380 1381 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1382 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) 1383 return false; 1384 1385 auto IsHazardFn = [TRI](const MachineInstr &I) { 1386 if (SIInstrInfo::isVALU(I)) 1387 return false; 1388 return I.readsRegister(AMDGPU::EXEC, TRI); 1389 }; 1390 1391 const SIInstrInfo *TII = ST.getInstrInfo(); 1392 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) { 1393 if (SIInstrInfo::isVALU(MI)) { 1394 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) 1395 return true; 1396 for (auto MO : MI.implicit_operands()) 1397 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) 1398 return true; 1399 } 1400 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1401 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0) 1402 return true; 1403 return false; 1404 }; 1405 1406 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1407 std::numeric_limits<int>::max()) 1408 return false; 1409 1410 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1411 TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 1412 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 1413 return true; 1414 } 1415 1416 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, 1417 const GCNSubtarget &ST) { 1418 if (!ST.hasLdsBranchVmemWARHazard()) 1419 return false; 1420 1421 // Check if the necessary condition for the hazard is met: both LDS and VMEM 1422 // instructions need to appear in the same function. 
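// For example, a kernel that only issues VMEM loads and never touches LDS (or
// vice versa) cannot trigger the hazard, so the per-instruction fixup is
// skipped for the whole function.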
1423 bool HasLds = false; 1424 bool HasVmem = false; 1425 for (auto &MBB : MF) { 1426 for (auto &MI : MBB) { 1427 HasLds |= SIInstrInfo::isDS(MI); 1428 HasVmem |= 1429 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI); 1430 if (HasLds && HasVmem) 1431 return true; 1432 } 1433 } 1434 return false; 1435 } 1436 1437 static bool isStoreCountWaitZero(const MachineInstr &I) { 1438 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && 1439 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && 1440 !I.getOperand(1).getImm(); 1441 } 1442 1443 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { 1444 if (!RunLdsBranchVmemWARHazardFixup) 1445 return false; 1446 1447 assert(ST.hasLdsBranchVmemWARHazard()); 1448 assert(!ST.hasExtendedWaitCounts()); 1449 1450 auto IsHazardInst = [](const MachineInstr &MI) { 1451 if (SIInstrInfo::isDS(MI)) 1452 return 1; 1453 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) 1454 return 2; 1455 return 0; 1456 }; 1457 1458 auto InstType = IsHazardInst(*MI); 1459 if (!InstType) 1460 return false; 1461 1462 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { 1463 return IsHazardInst(I) || isStoreCountWaitZero(I); 1464 }; 1465 1466 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { 1467 if (!I.isBranch()) 1468 return false; 1469 1470 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { 1471 auto InstType2 = IsHazardInst(I); 1472 return InstType2 && InstType != InstType2; 1473 }; 1474 1475 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { 1476 auto InstType2 = IsHazardInst(I); 1477 if (InstType == InstType2) 1478 return true; 1479 1480 return isStoreCountWaitZero(I); 1481 }; 1482 1483 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) != 1484 std::numeric_limits<int>::max(); 1485 }; 1486 1487 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1488 std::numeric_limits<int>::max()) 1489 return false; 1490 1491 const SIInstrInfo *TII = ST.getInstrInfo(); 1492 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1493 TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1494 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1495 .addImm(0); 1496 1497 return true; 1498 } 1499 1500 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { 1501 if (!SIInstrInfo::isLDSDIR(*MI)) 1502 return false; 1503 1504 const int NoHazardWaitStates = 15; 1505 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1506 const Register VDSTReg = VDST->getReg(); 1507 1508 bool VisitedTrans = false; 1509 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { 1510 if (!SIInstrInfo::isVALU(I)) 1511 return false; 1512 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I); 1513 // Cover both WAR and WAW 1514 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1515 }; 1516 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { 1517 if (WaitStates >= NoHazardWaitStates) 1518 return true; 1519 // Instructions which cause va_vdst==0 expire hazard 1520 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1521 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I); 1522 }; 1523 auto GetWaitStatesFn = [](const MachineInstr &MI) { 1524 return SIInstrInfo::isVALU(MI) ? 
1 : 0; 1525 }; 1526 1527 DenseSet<const MachineBasicBlock *> Visited; 1528 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 1529 std::next(MI->getReverseIterator()), 0, 1530 IsExpiredFn, Visited, GetWaitStatesFn); 1531 1532 // Transcendentals can execute in parallel to other VALUs. 1533 // This makes va_vdst count unusable with a mixture of VALU and TRANS. 1534 if (VisitedTrans) 1535 Count = 0; 1536 1537 MachineOperand *WaitVdstOp = 1538 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst); 1539 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates)); 1540 1541 return true; 1542 } 1543 1544 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { 1545 if (!SIInstrInfo::isLDSDIR(*MI)) 1546 return false; 1547 1548 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1549 const Register VDSTReg = VDST->getReg(); 1550 1551 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { 1552 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) && 1553 !SIInstrInfo::isDS(I)) 1554 return false; 1555 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1556 }; 1557 bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); 1558 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT 1559 // according to the type of VMEM instruction. 1560 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { 1561 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || 1562 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || 1563 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1564 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) || 1565 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) && 1566 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm()); 1567 }; 1568 1569 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1570 std::numeric_limits<int>::max()) 1571 return false; 1572 1573 if (LdsdirCanWait) { 1574 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0); 1575 } else { 1576 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1577 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1578 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); 1579 } 1580 1581 return true; 1582 } 1583 1584 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { 1585 if (!ST.hasVALUPartialForwardingHazard()) 1586 return false; 1587 assert(!ST.hasExtendedWaitCounts()); 1588 1589 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI)) 1590 return false; 1591 1592 SmallSetVector<Register, 4> SrcVGPRs; 1593 1594 for (const MachineOperand &Use : MI->explicit_uses()) { 1595 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 1596 SrcVGPRs.insert(Use.getReg()); 1597 } 1598 1599 // Only applies with >= 2 unique VGPR sources 1600 if (SrcVGPRs.size() <= 1) 1601 return false; 1602 1603 // Look for the following pattern: 1604 // Va <- VALU [PreExecPos] 1605 // intv1 1606 // Exec <- SALU [ExecPos] 1607 // intv2 1608 // Vb <- VALU [PostExecPos] 1609 // intv3 1610 // MI Va, Vb (WaitState = 0) 1611 // 1612 // Where: 1613 // intv1 + intv2 <= 2 VALUs 1614 // intv3 <= 4 VALUs 1615 // 1616 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 
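// A concrete (illustrative) instance of the pattern on wave64:
//   v_cndmask_b32 v0, v2, v3, vcc   ; Va written before the EXEC change
//   s_mov_b64 exec, s[2:3]          ; SALU writes EXEC
//   v_add_f32 v1, v4, v5            ; Vb written after the EXEC change
//   v_fma_f32 v6, v0, v1, v6        ; MI reads both Va and Vb
// With intv1, intv2 and intv3 all empty this falls within the limits above and
// the S_WAITCNT_DEPCTR is required.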
1617 1618 const int Intv1plus2MaxVALUs = 2; 1619 const int Intv3MaxVALUs = 4; 1620 const int IntvMaxVALUs = 6; 1621 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2; 1622 1623 struct StateType { 1624 SmallDenseMap<Register, int, 4> DefPos; 1625 int ExecPos = std::numeric_limits<int>::max(); 1626 int VALUs = 0; 1627 }; 1628 1629 StateType State; 1630 1631 // This overloads expiry testing with all the hazard detection 1632 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { 1633 // Too many VALU states have passed 1634 if (State.VALUs > NoHazardVALUWaitStates) 1635 return HazardExpired; 1636 1637 // Instructions which cause va_vdst==0 expire hazard 1638 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1639 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || 1640 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1641 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) 1642 return HazardExpired; 1643 1644 // Track registers writes 1645 bool Changed = false; 1646 if (SIInstrInfo::isVALU(I)) { 1647 for (Register Src : SrcVGPRs) { 1648 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) { 1649 State.DefPos[Src] = State.VALUs; 1650 Changed = true; 1651 } 1652 } 1653 } else if (SIInstrInfo::isSALU(I)) { 1654 if (State.ExecPos == std::numeric_limits<int>::max()) { 1655 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) { 1656 State.ExecPos = State.VALUs; 1657 Changed = true; 1658 } 1659 } 1660 } 1661 1662 // Early expiration: too many VALUs in intv3 1663 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty()) 1664 return HazardExpired; 1665 1666 // Only evaluate state if something changed 1667 if (!Changed) 1668 return NoHazardFound; 1669 1670 // Determine positions of VALUs pre/post exec change 1671 if (State.ExecPos == std::numeric_limits<int>::max()) 1672 return NoHazardFound; 1673 1674 int PreExecPos = std::numeric_limits<int>::max(); 1675 int PostExecPos = std::numeric_limits<int>::max(); 1676 1677 for (auto Entry : State.DefPos) { 1678 int DefVALUs = Entry.second; 1679 if (DefVALUs != std::numeric_limits<int>::max()) { 1680 if (DefVALUs >= State.ExecPos) 1681 PreExecPos = std::min(PreExecPos, DefVALUs); 1682 else 1683 PostExecPos = std::min(PostExecPos, DefVALUs); 1684 } 1685 } 1686 1687 // Need a VALUs post exec change 1688 if (PostExecPos == std::numeric_limits<int>::max()) 1689 return NoHazardFound; 1690 1691 // Too many VALUs in intv3? 1692 int Intv3VALUs = PostExecPos; 1693 if (Intv3VALUs > Intv3MaxVALUs) 1694 return HazardExpired; 1695 1696 // Too many VALUs in intv2? 1697 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1; 1698 if (Intv2VALUs > Intv1plus2MaxVALUs) 1699 return HazardExpired; 1700 1701 // Need a VALUs pre exec change 1702 if (PreExecPos == std::numeric_limits<int>::max()) 1703 return NoHazardFound; 1704 1705 // Too many VALUs in intv1? 
1706     int Intv1VALUs = PreExecPos - State.ExecPos;
1707     if (Intv1VALUs > Intv1plus2MaxVALUs)
1708       return HazardExpired;
1709
1710     // Too many VALUs in intv1 + intv2?
1711     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1712       return HazardExpired;
1713
1714     return HazardFound;
1715   };
1716   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1717     if (SIInstrInfo::isVALU(MI))
1718       State.VALUs += 1;
1719   };
1720
1721   DenseSet<const MachineBasicBlock *> Visited;
1722   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1723                             std::next(MI->getReverseIterator()), Visited))
1724     return false;
1725
1726   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1727           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1728       .addImm(0x0fff);
1729
1730   return true;
1731 }
1732
1733 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1734   if (!ST.hasVALUTransUseHazard())
1735     return false;
1736   assert(!ST.hasExtendedWaitCounts());
1737
1738   if (!SIInstrInfo::isVALU(*MI))
1739     return false;
1740
1741   SmallSet<Register, 4> SrcVGPRs;
1742
1743   for (const MachineOperand &Use : MI->explicit_uses()) {
1744     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1745       SrcVGPRs.insert(Use.getReg());
1746   }
1747
1748   // Look for the following pattern:
1749   // Va <- TRANS VALU
1750   // intv
1751   // MI Va (WaitState = 0)
1752   //
1753   // Where:
1754   // intv <= 5 VALUs / 1 TRANS
1755   //
1756   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1757
1758   const int IntvMaxVALUs = 5;
1759   const int IntvMaxTRANS = 1;
1760
1761   struct StateType {
1762     int VALUs = 0;
1763     int TRANS = 0;
1764   };
1765
1766   StateType State;
1767
1768   // This overloads expiry testing with all the hazard detection
1769   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1770     // Too many VALU states have passed
1771     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1772       return HazardExpired;
1773
1774     // Instructions which cause va_vdst==0 expire hazard
1775     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1776         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1777         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1778          I.getOperand(0).getImm() == 0x0fff))
1779       return HazardExpired;
1780
1781     // Track register writes
1782     if (SIInstrInfo::isTRANS(I)) {
1783       for (Register Src : SrcVGPRs) {
1784         if (I.modifiesRegister(Src, &TRI)) {
1785           return HazardFound;
1786         }
1787       }
1788     }
1789
1790     return NoHazardFound;
1791   };
1792   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1793     if (SIInstrInfo::isVALU(MI))
1794       State.VALUs += 1;
1795     if (SIInstrInfo::isTRANS(MI))
1796       State.TRANS += 1;
1797   };
1798
1799   DenseSet<const MachineBasicBlock *> Visited;
1800   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1801                             std::next(MI->getReverseIterator()), Visited))
1802     return false;
1803
1804   // Hazard is observed - insert a wait on va_vdst counter to ensure hazard is
1805   // avoided.
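  // Setting the va_vdst field of S_WAITCNT_DEPCTR to zero makes the wait
  // cover all outstanding VALU VGPR writes, including the TRANS result.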
1806 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1807 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1808 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); 1809 1810 return true; 1811 } 1812 1813 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { 1814 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) 1815 return false; 1816 1817 const SIInstrInfo *TII = ST.getInstrInfo(); 1818 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1819 1820 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { 1821 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) 1822 return false; 1823 1824 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps 1825 // with the dest(matrix D) of the previous wmma. 1826 const Register CurSrc0Reg = 1827 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); 1828 const Register CurSrc1Reg = 1829 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); 1830 1831 const Register PrevDstReg = 1832 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); 1833 1834 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) || 1835 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) { 1836 return true; 1837 } 1838 1839 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) 1840 // but Index can't overlap with PrevDstReg. 1841 if (AMDGPU::isGFX12Plus(ST)) { 1842 if (SIInstrInfo::isSWMMAC(*MI)) { 1843 const Register CurIndex = 1844 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); 1845 if (TRI->regsOverlap(PrevDstReg, CurIndex)) 1846 return true; 1847 } 1848 return false; 1849 } 1850 1851 return false; 1852 }; 1853 1854 auto IsExpiredFn = [](const MachineInstr &I, int) { 1855 return SIInstrInfo::isVALU(I); 1856 }; 1857 1858 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1859 std::numeric_limits<int>::max()) 1860 return false; 1861 1862 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); 1863 1864 return true; 1865 } 1866 1867 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { 1868 if (!ST.hasShift64HighRegBug()) 1869 return false; 1870 assert(!ST.hasExtendedWaitCounts()); 1871 1872 switch (MI->getOpcode()) { 1873 default: 1874 return false; 1875 case AMDGPU::V_LSHLREV_B64_e64: 1876 case AMDGPU::V_LSHRREV_B64_e64: 1877 case AMDGPU::V_ASHRREV_I64_e64: 1878 break; 1879 } 1880 1881 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0); 1882 if (!Amt->isReg()) 1883 return false; 1884 1885 Register AmtReg = Amt->getReg(); 1886 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1887 // Check if this is a last VGPR in the allocation block. 1888 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7) 1889 return false; 1890 1891 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) 1892 return false; 1893 1894 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); 1895 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); 1896 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); 1897 bool Overlapped = OverlappedSrc || OverlappedDst; 1898 1899 assert(!OverlappedDst || !OverlappedSrc || 1900 Src1->getReg() == MI->getOperand(0).getReg()); 1901 assert(ST.needsAlignedVGPRs()); 1902 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); 1903 1904 Register NewReg; 1905 for (MCRegister Reg : Overlapped ? 
AMDGPU::VReg_64_Align2RegClass
1906                                    : AMDGPU::VGPR_32RegClass) {
1907     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1908       NewReg = Reg;
1909       break;
1910     }
1911   }
1912
1913   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1914                                : NewReg;
1915   Register NewAmtLo;
1916
1917   if (Overlapped)
1918     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1919
1920   DebugLoc DL = MI->getDebugLoc();
1921   MachineBasicBlock *MBB = MI->getParent();
1922   // Insert a full wait count because the found register might have a wait pending.
1923   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1924       .addImm(0);
1925
1926   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1927   if (Overlapped)
1928     runOnInstruction(
1929         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1930             .addDef(AmtReg - 1)
1931             .addReg(AmtReg - 1, RegState::Undef)
1932             .addReg(NewAmtLo, RegState::Undef));
1933   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1934                        .addDef(AmtReg)
1935                        .addReg(AmtReg, RegState::Undef)
1936                        .addReg(NewAmt, RegState::Undef));
1937
1938   // Instructions emitted after the current instruction will be processed by the
1939   // parent loop of the hazard recognizer in a natural way.
1940   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1941           AmtReg)
1942       .addDef(NewAmt)
1943       .addReg(NewAmt)
1944       .addReg(AmtReg);
1945   if (Overlapped)
1946     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1947             AmtReg - 1)
1948         .addDef(NewAmtLo)
1949         .addReg(NewAmtLo)
1950         .addReg(AmtReg - 1);
1951
1952   // Re-running the hazard recognizer on the modified instruction is not
1953   // necessary; the inserted V_SWAP_B32 has already both read and written the
1954   // new registers, so hazards related to these registers are already handled.
1955   Amt->setReg(NewAmt);
1956   Amt->setIsKill(false);
1957   // We do not update liveness, so the verifier may see it as undef.
1958 Amt->setIsUndef(); 1959 if (OverlappedDst) 1960 MI->getOperand(0).setReg(NewReg); 1961 if (OverlappedSrc) { 1962 Src1->setReg(NewReg); 1963 Src1->setIsKill(false); 1964 Src1->setIsUndef(); 1965 } 1966 1967 return true; 1968 } 1969 1970 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { 1971 int NSAtoVMEMWaitStates = 1; 1972 1973 if (!ST.hasNSAtoVMEMBug()) 1974 return 0; 1975 1976 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI)) 1977 return 0; 1978 1979 const SIInstrInfo *TII = ST.getInstrInfo(); 1980 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 1981 if (!Offset || (Offset->getImm() & 6) == 0) 1982 return 0; 1983 1984 auto IsHazardFn = [TII](const MachineInstr &I) { 1985 if (!SIInstrInfo::isMIMG(I)) 1986 return false; 1987 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode()); 1988 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && 1989 TII->getInstSizeInBytes(I) >= 16; 1990 }; 1991 1992 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); 1993 } 1994 1995 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { 1996 int FPAtomicToDenormModeWaitStates = 3; 1997 1998 if (!ST.hasFPAtomicToDenormModeHazard()) 1999 return 0; 2000 assert(!ST.hasExtendedWaitCounts()); 2001 2002 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) 2003 return 0; 2004 2005 auto IsHazardFn = [](const MachineInstr &I) { 2006 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I)) 2007 return false; 2008 return SIInstrInfo::isFPAtomic(I); 2009 }; 2010 2011 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { 2012 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) 2013 return true; 2014 2015 switch (MI.getOpcode()) { 2016 case AMDGPU::S_WAITCNT: 2017 case AMDGPU::S_WAITCNT_VSCNT: 2018 case AMDGPU::S_WAITCNT_VMCNT: 2019 case AMDGPU::S_WAITCNT_EXPCNT: 2020 case AMDGPU::S_WAITCNT_LGKMCNT: 2021 case AMDGPU::S_WAIT_IDLE: 2022 return true; 2023 default: 2024 break; 2025 } 2026 2027 return false; 2028 }; 2029 2030 return FPAtomicToDenormModeWaitStates - 2031 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); 2032 } 2033 2034 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { 2035 assert(SIInstrInfo::isMAI(*MI)); 2036 2037 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); 2038 } 2039 2040 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { 2041 // Early exit if no padding is requested. 
2042 if (MFMAPaddingRatio == 0) 2043 return 0; 2044 2045 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2046 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2) 2047 return 0; 2048 2049 int NeighborMFMALatency = 0; 2050 auto IsNeighboringMFMA = [&NeighborMFMALatency, 2051 this](const MachineInstr &MI) { 2052 if (!SIInstrInfo::isMFMA(MI)) 2053 return false; 2054 2055 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); 2056 return true; 2057 }; 2058 2059 const int MaxMFMAPipelineWaitStates = 16; 2060 int WaitStatesSinceNeighborMFMA = 2061 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates); 2062 2063 int NeighborMFMAPaddingNeeded = 2064 (NeighborMFMALatency * MFMAPaddingRatio / 100) - 2065 WaitStatesSinceNeighborMFMA; 2066 2067 return std::max(0, NeighborMFMAPaddingNeeded); 2068 } 2069 2070 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { 2071 int WaitStatesNeeded = 0; 2072 unsigned Opc = MI->getOpcode(); 2073 2074 auto IsVALUFn = [](const MachineInstr &MI) { 2075 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm(); 2076 }; 2077 2078 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write 2079 const int LegacyVALUWritesVGPRWaitStates = 2; 2080 const int VALUWritesExecWaitStates = 4; 2081 const int MaxWaitStates = 4; 2082 2083 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2084 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates); 2085 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2086 2087 if (WaitStatesNeeded < MaxWaitStates) { 2088 for (const MachineOperand &Use : MI->explicit_uses()) { 2089 const int MaxWaitStates = 2; 2090 2091 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 2092 continue; 2093 2094 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - 2095 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates); 2096 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2097 2098 if (WaitStatesNeeded == MaxWaitStates) 2099 break; 2100 } 2101 } 2102 } 2103 2104 for (const MachineOperand &Op : MI->explicit_operands()) { 2105 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) 2106 continue; 2107 2108 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2109 continue; 2110 2111 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; 2112 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; 2113 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; 2114 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; 2115 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; 2116 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; 2117 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; 2118 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; 2119 const int MaxWaitStates = 18; 2120 Register Reg = Op.getReg(); 2121 unsigned HazardDefLatency = 0; 2122 2123 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, 2124 this](const MachineInstr &MI) { 2125 if (!SIInstrInfo::isMFMA(MI)) 2126 return false; 2127 Register DstReg = MI.getOperand(0).getReg(); 2128 if (DstReg == Reg) 2129 return false; 2130 HazardDefLatency = 2131 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2132 return TRI.regsOverlap(DstReg, Reg); 2133 }; 2134 2135 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, 2136 MaxWaitStates); 2137 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; 2138 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2139 int OpNo = Op.getOperandNo(); 
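    // Refine the requirement below: the src2 (accumulator) operand needs
    // fewer wait states than src0/src1, and for v_accvgpr_read/write the
    // requirement scales with the producer MFMA's latency (2, 8 or 16
    // cycles), per the constants defined above.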
2140 if (OpNo == SrcCIdx) { 2141 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; 2142 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) { 2143 switch (HazardDefLatency) { 2144 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; 2145 break; 2146 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; 2147 break; 2148 case 16: [[fallthrough]]; 2149 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; 2150 break; 2151 } 2152 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2153 switch (HazardDefLatency) { 2154 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; 2155 break; 2156 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; 2157 break; 2158 case 16: [[fallthrough]]; 2159 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; 2160 break; 2161 } 2162 } 2163 2164 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2165 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2166 2167 if (WaitStatesNeeded == MaxWaitStates) 2168 return WaitStatesNeeded; // Early exit. 2169 2170 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) { 2171 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2172 return false; 2173 Register DstReg = MI.getOperand(0).getReg(); 2174 return TRI.regsOverlap(Reg, DstReg); 2175 }; 2176 2177 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; 2178 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; 2179 const int AccVGPRWriteAccVgprReadWaitStates = 3; 2180 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; 2181 if (OpNo == SrcCIdx) 2182 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; 2183 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) 2184 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; 2185 2186 WaitStatesNeededForUse = NeedWaitStates - 2187 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates); 2188 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2189 2190 if (WaitStatesNeeded == MaxWaitStates) 2191 return WaitStatesNeeded; // Early exit. 2192 } 2193 2194 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2195 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; 2196 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; 2197 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; 2198 const int MaxWaitStates = 13; 2199 Register DstReg = MI->getOperand(0).getReg(); 2200 unsigned HazardDefLatency = 0; 2201 2202 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, 2203 this](const MachineInstr &MI) { 2204 if (!SIInstrInfo::isMFMA(MI)) 2205 return false; 2206 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); 2207 HazardDefLatency = 2208 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2209 return TRI.regsOverlap(Reg, DstReg); 2210 }; 2211 2212 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates); 2213 int NeedWaitStates; 2214 switch (HazardDefLatency) { 2215 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; 2216 break; 2217 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; 2218 break; 2219 case 16: [[fallthrough]]; 2220 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; 2221 break; 2222 } 2223 2224 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; 2225 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2226 } 2227 2228 // Pad neighboring MFMA with noops for better inter-wave performance. 
2229 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2230 2231 return WaitStatesNeeded; 2232 } 2233 2234 static int 2235 GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses, 2236 bool IsGFX950) { 2237 // xdl def cycles | gfx940 | gfx950 2238 // 2 pass | 3 4 2239 // 4 pass | 5 6 2240 // 8 pass | 9 10 2241 // 16 pass | 17 18 2242 return NumPasses + 1 + IsGFX950; 2243 } 2244 2245 static int 2246 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) { 2247 // 2 pass -> 2 2248 // 4 pass -> 4 2249 // 8 pass -> 8 2250 // 16 pass -> 16 2251 return NumPasses; 2252 } 2253 2254 static int 2255 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2256 // 2 pass -> 4 2257 // 4 pass -> 6 2258 // 8 pass -> 10 2259 // 16 pass -> 18 2260 return NumPasses + 2; 2261 } 2262 2263 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2264 // 2 pass -> 5 2265 // 4 pass -> 7 2266 // 8 pass -> 11 2267 // 16 pass -> 19 2268 return NumPasses + 3; 2269 } 2270 2271 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { 2272 int WaitStatesNeeded = 0; 2273 unsigned Opc = MI->getOpcode(); 2274 2275 auto IsLegacyVALUFn = [](const MachineInstr &MI) { 2276 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); 2277 }; 2278 2279 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { 2280 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && 2281 !SIInstrInfo::isDOT(MI); 2282 }; 2283 2284 if (!SIInstrInfo::isMFMA(*MI)) 2285 return WaitStatesNeeded; 2286 2287 const int VALUWritesExecWaitStates = 4; 2288 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2289 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn, 2290 VALUWritesExecWaitStates); 2291 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2292 2293 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2294 2295 // Loop for both DGEMM and S/HGEMM 2nd instruction. 
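  // For each register use, find the most recent overlapping MFMA def and map
  // its pass count (and DGEMM/XDL/SMFMA kind) to the wait-state requirement.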
2296 for (const MachineOperand &Use : MI->explicit_uses()) { 2297 const int LegacyVALUNotDotWritesVGPRWaitStates = 2; 2298 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; 2299 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; 2300 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; 2301 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; 2302 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; 2303 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; 2304 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; 2305 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17; 2306 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; 2307 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; 2308 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; 2309 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; 2310 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; 2311 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; 2312 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; 2313 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; 2314 const int MaxWaitStates = 19; 2315 2316 if (!Use.isReg()) 2317 continue; 2318 Register Reg = Use.getReg(); 2319 bool FullReg; 2320 const MachineInstr *MI1; 2321 2322 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, 2323 this](const MachineInstr &MI) { 2324 if (!SIInstrInfo::isMFMA(MI)) 2325 return false; 2326 Register DstReg = MI.getOperand(0).getReg(); 2327 FullReg = (DstReg == Reg); 2328 MI1 = &MI; 2329 return TRI.regsOverlap(DstReg, Reg); 2330 }; 2331 2332 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - 2333 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates); 2334 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2335 2336 int NumWaitStates = 2337 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates); 2338 if (NumWaitStates == std::numeric_limits<int>::max()) 2339 continue; 2340 2341 int OpNo = Use.getOperandNo(); 2342 unsigned Opc1 = MI1->getOpcode(); 2343 int NeedWaitStates = 0; 2344 if (OpNo == SrcCIdx) { 2345 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) { 2346 NeedWaitStates = 0; 2347 } else if (FullReg) { 2348 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2349 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && 2350 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2351 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) 2352 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; 2353 else if (ST.hasGFX940Insts() && 2354 TSchedModel.computeInstrLatency(MI1) == 2) 2355 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; 2356 } else { 2357 switch (Opc1) { 2358 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2359 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2360 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2361 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2362 if (!isXDL(ST, *MI)) 2363 NeedWaitStates = 2364 ST.hasGFX950Insts() 2365 ? 
GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates 2366 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates; 2367 break; 2368 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2369 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2370 if (!isXDL(ST, *MI)) 2371 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; 2372 break; 2373 default: 2374 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2375 if (ST.hasGFX940Insts()) { 2376 if (isXDL(ST, *MI) && !isXDL(ST, *MI1)) 2377 break; 2378 2379 NeedWaitStates = 2380 isXDL(ST, *MI1) 2381 ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( 2382 NumPasses, ST.hasGFX950Insts()) 2383 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( 2384 NumPasses); 2385 break; 2386 } 2387 2388 switch (NumPasses) { 2389 case 2: 2390 NeedWaitStates = 2391 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates 2392 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; 2393 break; 2394 case 8: 2395 NeedWaitStates = 2396 isDGEMM(Opc) 2397 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates 2398 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; 2399 break; 2400 case 16: 2401 NeedWaitStates = 2402 isDGEMM(Opc) 2403 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates 2404 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; 2405 break; 2406 default: 2407 llvm_unreachable("unexpected number of passes"); 2408 } 2409 } 2410 } 2411 } else { 2412 switch (Opc1) { 2413 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2414 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2415 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2416 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2417 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; 2418 break; 2419 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2420 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2421 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; 2422 break; 2423 default: 2424 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2425 2426 if (ST.hasGFX940Insts()) { 2427 NeedWaitStates = 2428 isXDL(ST, *MI1) 2429 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates( 2430 NumPasses) 2431 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates( 2432 NumPasses); 2433 break; 2434 } 2435 2436 switch (NumPasses) { 2437 case 2: 2438 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; 2439 break; 2440 case 4: 2441 llvm_unreachable("unexpected number of passes for mfma"); 2442 case 8: 2443 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; 2444 break; 2445 case 16: 2446 default: 2447 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; 2448 } 2449 } 2450 } 2451 if (WaitStatesNeeded >= NeedWaitStates) 2452 continue; 2453 2454 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; 2455 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2456 2457 if (WaitStatesNeeded == MaxWaitStates) 2458 break; 2459 } 2460 2461 // Pad neighboring MFMA with noops for better inter-wave performance. 
2462 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2463 2464 return WaitStatesNeeded; 2465 } 2466 2467 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { 2468 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards() 2469 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) 2470 return 0; 2471 2472 int WaitStatesNeeded = 0; 2473 2474 auto IsAccVgprReadFn = [](const MachineInstr &MI) { 2475 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; 2476 }; 2477 2478 for (const MachineOperand &Op : MI->explicit_uses()) { 2479 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) 2480 continue; 2481 2482 Register Reg = Op.getReg(); 2483 2484 const int AccVgprReadLdStWaitStates = 2; 2485 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1; 2486 const int MaxWaitStates = 2; 2487 2488 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - 2489 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates); 2490 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2491 2492 if (WaitStatesNeeded == MaxWaitStates) 2493 return WaitStatesNeeded; // Early exit. 2494 2495 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) { 2496 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && 2497 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2498 return false; 2499 auto IsVALUFn = [](const MachineInstr &MI) { 2500 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI); 2501 }; 2502 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) < 2503 std::numeric_limits<int>::max(); 2504 }; 2505 2506 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates - 2507 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates); 2508 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2509 } 2510 2511 return WaitStatesNeeded; 2512 } 2513 2514 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2515 // 2 pass -> 4 2516 // 4 pass -> 6 2517 // 8 pass -> 10 2518 // 16 pass -> 18 2519 return NumPasses + 2; 2520 } 2521 2522 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2523 // 2 pass -> 5 2524 // 4 pass -> 7 2525 // 8 pass -> 11 2526 // 16 pass -> 19 2527 return NumPasses + 3; 2528 } 2529 2530 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { 2531 // 2 pass -> 5 2532 // 4 pass -> 7 2533 // 8 pass -> 11 2534 // 16 pass -> 19 2535 return NumPasses + 3; 2536 } 2537 2538 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { 2539 // 2 pass -> 4 2540 // 4 pass -> 6 2541 // 8 pass -> 10 2542 // 16 pass -> 18 2543 return NumPasses + 2; 2544 } 2545 2546 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { 2547 if (!ST.hasGFX90AInsts()) 2548 return 0; 2549 2550 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { 2551 return isDGEMM(MI.getOpcode()); 2552 }; 2553 2554 // This is checked in checkMAIHazards90A() 2555 if (SIInstrInfo::isMFMA(*MI)) 2556 return 0; 2557 2558 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2559 2560 int WaitStatesNeeded = 0; 2561 2562 bool IsMem = SIInstrInfo::isVMEM(*MI) || 2563 SIInstrInfo::isFLAT(*MI) || 2564 SIInstrInfo::isDS(*MI); 2565 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI); 2566 bool IsVALU = SIInstrInfo::isVALU(*MI); 2567 2568 const MachineInstr *MFMA = nullptr; 2569 unsigned Reg; 2570 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) { 2571 if (!SIInstrInfo::isMFMA(MI) || 2572 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) 2573 
return false;
2574     MFMA = &MI;
2575     return true;
2576   };
2577
2578   const MachineInstr *DOT = nullptr;
2579   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2580     if (!SIInstrInfo::isDOT(MI) ||
2581         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2582       return false;
2583     DOT = &MI;
2584     return true;
2585   };
2586
2587   bool DGEMMAfterVALUWrite = false;
2588   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2589     // Found DGEMM on reverse traversal to def.
2590     if (isDGEMM(MI.getOpcode()))
2591       DGEMMAfterVALUWrite = true;
2592
2593     // Only hazard if register is defined by a VALU and a DGEMM is found after
2594     // the def.
2595     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2596       return false;
2597
2598     return true;
2599   };
2600
2601   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2602                                            AMDGPU::OpName::src2);
2603
2604   if (IsMemOrExport || IsVALU) {
2605     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2606     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2607     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2608     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2609     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2610     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2611     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2612     const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2613     const int DotWriteSameDotReadSrcAB = 3;
2614     const int DotWriteDifferentVALURead = 3;
2615     const int DMFMABetweenVALUWriteVMEMRead = 2;
2616     const int MaxWaitStates = 19;
2617
2618     for (const MachineOperand &Use : MI->explicit_uses()) {
2619       if (!Use.isReg())
2620         continue;
2621       Reg = Use.getReg();
2622
2623       DOT = nullptr;
2624       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2625                                                      MaxWaitStates);
2626       if (DOT) {
2627         int NeedWaitStates = 0;
2628         if (DOT->getOpcode() == MI->getOpcode()) {
2629           if (&Use - &MI->getOperand(0) != SrcCIdx)
2630             NeedWaitStates = DotWriteSameDotReadSrcAB;
2631         } else {
2632           NeedWaitStates = DotWriteDifferentVALURead;
2633         }
2634
2635         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2636         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2637       }
2638
2639       // Workaround for HW data hazard bug observed only in GFX90A. When there
2640       // is a DGEMM instruction in-between a VALU and a VMEM instruction it
2641       // causes the SQ to incorrectly not insert two wait states between the two
2642       // instructions needed to avoid data hazard.
2643       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2644         DGEMMAfterVALUWrite = false;
2645         if (TRI.isVectorRegister(MRI, Reg)) {
2646           int WaitStatesNeededForUse =
2647                 DMFMABetweenVALUWriteVMEMRead -
2648                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2649                                       DMFMABetweenVALUWriteVMEMRead);
2650
2651           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2652         }
2653       }
2654
2655       MFMA = nullptr;
2656       WaitStatesSinceDef =
2657           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2658       if (!MFMA)
2659         continue;
2660
2661       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2662       int NumPasses = HazardDefLatency;
2663       int NeedWaitStates = MaxWaitStates;
2664
2665       if (isDGEMM(MFMA->getOpcode())) {
2666         switch (HazardDefLatency) {
2667         case 4:
2668           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2669                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2670           break;
2671         case 8:
2672         case 16:
2673           NeedWaitStates =
2674               IsMemOrExport
2675                   ?
DMFMA16x16WriteVgprMemExpReadWaitStates 2676 : (ST.hasGFX950Insts() 2677 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates 2678 : DMFMA16x16WriteVgprVALUReadWaitStates); 2679 break; 2680 default: 2681 llvm_unreachable("unexpected dgemm"); 2682 } 2683 } else if (ST.hasGFX940Insts()) { 2684 NeedWaitStates = 2685 isXDL(ST, *MFMA) 2686 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) 2687 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( 2688 NumPasses); 2689 } else { 2690 switch (HazardDefLatency) { 2691 case 2: 2692 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; 2693 break; 2694 case 8: 2695 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; 2696 break; 2697 case 16: 2698 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates; 2699 break; 2700 default: 2701 llvm_unreachable("unexpected number of passes for mfma"); 2702 } 2703 } 2704 2705 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2706 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2707 2708 if (WaitStatesNeeded == MaxWaitStates) 2709 break; 2710 } 2711 } 2712 2713 unsigned Opc = MI->getOpcode(); 2714 const int DMFMAToFMA64WaitStates = 2; 2715 if ((Opc == AMDGPU::V_FMA_F64_e64 || 2716 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || 2717 Opc == AMDGPU::V_FMAC_F64_dpp) && 2718 WaitStatesNeeded < DMFMAToFMA64WaitStates) { 2719 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - 2720 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates); 2721 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2722 } 2723 2724 if (!IsVALU && !IsMemOrExport) 2725 return WaitStatesNeeded; 2726 2727 for (const MachineOperand &Def : MI->defs()) { 2728 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; 2729 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; 2730 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; 2731 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; 2732 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; 2733 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; 2734 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; 2735 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; 2736 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; 2737 const int DotWriteDifferentVALUWrite = 3; 2738 const int MaxWaitStates = 19; 2739 const int MaxWarWaitStates = 15; 2740 2741 Reg = Def.getReg(); 2742 2743 DOT = nullptr; 2744 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, 2745 MaxWaitStates); 2746 if (DOT && DOT->getOpcode() != MI->getOpcode()) 2747 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite - 2748 WaitStatesSinceDef); 2749 2750 MFMA = nullptr; 2751 WaitStatesSinceDef = 2752 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); 2753 if (MFMA) { 2754 int NeedWaitStates = MaxWaitStates; 2755 int NumPasses = TSchedModel.computeInstrLatency(MFMA); 2756 2757 if (isDGEMM(MFMA->getOpcode())) { 2758 switch (NumPasses) { 2759 case 4: 2760 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; 2761 break; 2762 case 8: 2763 case 16: 2764 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates; 2765 break; 2766 default: 2767 llvm_unreachable("unexpected number of cycles for dgemm"); 2768 } 2769 } else if (ST.hasGFX940Insts()) { 2770 NeedWaitStates = 2771 isXDL(ST, *MFMA) 2772 ? 
GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses) 2773 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); 2774 } else { 2775 switch (NumPasses) { 2776 case 2: 2777 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; 2778 break; 2779 case 8: 2780 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; 2781 break; 2782 case 16: 2783 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates; 2784 break; 2785 default: 2786 llvm_unreachable("Unexpected number of passes for mfma"); 2787 } 2788 } 2789 2790 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2791 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2792 2793 if (WaitStatesNeeded == MaxWaitStates) 2794 break; 2795 } 2796 2797 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) { 2798 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) || 2799 !MI.readsRegister(Reg, &TRI)) 2800 return false; 2801 2802 if (ST.hasGFX940Insts() && !isXDL(ST, MI)) 2803 return false; 2804 2805 const MachineOperand *SrcC = 2806 TII.getNamedOperand(MI, AMDGPU::OpName::src2); 2807 assert(SrcC); 2808 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg)) 2809 return false; 2810 2811 MFMA = &MI; 2812 return true; 2813 }; 2814 2815 MFMA = nullptr; 2816 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn, 2817 MaxWarWaitStates); 2818 if (!MFMA) 2819 continue; 2820 2821 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2822 int NeedWaitStates = MaxWaitStates; 2823 switch (HazardDefLatency) { 2824 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; 2825 break; 2826 case 4: assert(ST.hasGFX940Insts()); 2827 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; 2828 break; 2829 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; 2830 break; 2831 case 16: [[fallthrough]]; 2832 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; 2833 break; 2834 } 2835 2836 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; 2837 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2838 } 2839 2840 return WaitStatesNeeded; 2841 } 2842 2843 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { 2844 if (!SU->isInstr()) 2845 return false; 2846 2847 const MachineInstr *MAI = nullptr; 2848 2849 auto IsMFMAFn = [&MAI](const MachineInstr &MI) { 2850 MAI = nullptr; 2851 if (SIInstrInfo::isMFMA(MI)) 2852 MAI = &MI; 2853 return MAI != nullptr; 2854 }; 2855 2856 MachineInstr *MI = SU->getInstr(); 2857 if (IsMFMAFn(*MI)) { 2858 int W = getWaitStatesSince(IsMFMAFn, 16); 2859 if (MAI) 2860 return W < (int)TSchedModel.computeInstrLatency(MAI); 2861 } 2862 2863 return false; 2864 } 2865 2866 // Adjust global offsets for instructions bundled with S_GETPC_B64 after 2867 // insertion of a new instruction. 2868 static void updateGetPCBundle(MachineInstr *NewMI) { 2869 if (!NewMI->isBundled()) 2870 return; 2871 2872 // Find start of bundle. 2873 auto I = NewMI->getIterator(); 2874 while (I->isBundledWithPred()) 2875 I--; 2876 if (I->isBundle()) 2877 I++; 2878 2879 // Bail if this is not an S_GETPC bundle. 2880 if (I->getOpcode() != AMDGPU::S_GETPC_B64) 2881 return; 2882 2883 // Update offsets of any references in the bundle. 
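  // The inserted S_WAITCNT_DEPCTR is 4 bytes long, so PC-relative global
  // operands later in the bundle must be rebased by that amount.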
2884 const unsigned NewBytes = 4; 2885 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 2886 "Unexpected instruction insertion in bundle"); 2887 auto NextMI = std::next(NewMI->getIterator()); 2888 auto End = NewMI->getParent()->end(); 2889 while (NextMI != End && NextMI->isBundledWithPred()) { 2890 for (auto &Operand : NextMI->operands()) { 2891 if (Operand.isGlobal()) 2892 Operand.setOffset(Operand.getOffset() + NewBytes); 2893 } 2894 NextMI++; 2895 } 2896 } 2897 2898 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { 2899 if (!ST.hasVALUMaskWriteHazard()) 2900 return false; 2901 assert(!ST.hasExtendedWaitCounts()); 2902 2903 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) 2904 return false; 2905 2906 // The hazard sequence is three instructions: 2907 // 1. VALU reads SGPR as mask 2908 // 2. SALU writes SGPR 2909 // 3. SALU reads SGPR 2910 // The hazard can expire if the distance between 2 and 3 is sufficient. 2911 // In practice this happens <10% of the time, hence this always assumes 2912 // the hazard exists if 1 and 2 are present to avoid searching. 2913 2914 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); 2915 if (!SDSTOp || !SDSTOp->isReg()) 2916 return false; 2917 2918 const Register HazardReg = SDSTOp->getReg(); 2919 if (HazardReg == AMDGPU::EXEC || 2920 HazardReg == AMDGPU::EXEC_LO || 2921 HazardReg == AMDGPU::EXEC_HI || 2922 HazardReg == AMDGPU::M0) 2923 return false; 2924 2925 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { 2926 switch (I.getOpcode()) { 2927 case AMDGPU::V_ADDC_U32_e32: 2928 case AMDGPU::V_ADDC_U32_dpp: 2929 case AMDGPU::V_CNDMASK_B16_e32: 2930 case AMDGPU::V_CNDMASK_B16_dpp: 2931 case AMDGPU::V_CNDMASK_B32_e32: 2932 case AMDGPU::V_CNDMASK_B32_dpp: 2933 case AMDGPU::V_DIV_FMAS_F32_e64: 2934 case AMDGPU::V_DIV_FMAS_F64_e64: 2935 case AMDGPU::V_SUBB_U32_e32: 2936 case AMDGPU::V_SUBB_U32_dpp: 2937 case AMDGPU::V_SUBBREV_U32_e32: 2938 case AMDGPU::V_SUBBREV_U32_dpp: 2939 // These implicitly read VCC as mask source. 2940 return HazardReg == AMDGPU::VCC || 2941 HazardReg == AMDGPU::VCC_LO || 2942 HazardReg == AMDGPU::VCC_HI; 2943 case AMDGPU::V_ADDC_U32_e64: 2944 case AMDGPU::V_ADDC_U32_e64_dpp: 2945 case AMDGPU::V_CNDMASK_B16_e64: 2946 case AMDGPU::V_CNDMASK_B16_e64_dpp: 2947 case AMDGPU::V_CNDMASK_B32_e64: 2948 case AMDGPU::V_CNDMASK_B32_e64_dpp: 2949 case AMDGPU::V_SUBB_U32_e64: 2950 case AMDGPU::V_SUBB_U32_e64_dpp: 2951 case AMDGPU::V_SUBBREV_U32_e64: 2952 case AMDGPU::V_SUBBREV_U32_e64_dpp: { 2953 // Only check mask register overlaps. 2954 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); 2955 assert(SSRCOp); 2956 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); 2957 } 2958 default: 2959 return false; 2960 } 2961 }; 2962 2963 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2964 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { 2965 // s_waitcnt_depctr sa_sdst(0) mitigates hazard. 2966 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 2967 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 2968 return true; 2969 2970 // VALU access to any SGPR or literal constant other than HazardReg 2971 // mitigates hazard. No need to check HazardReg here as this will 2972 // only be called when !IsHazardFn. 
2973     if (!SIInstrInfo::isVALU(I))
2974       return false;
2975     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2976       const MachineOperand &Op = I.getOperand(OpNo);
2977       if (Op.isReg()) {
2978         Register OpReg = Op.getReg();
2979         // Only consider uses
2980         if (!Op.isUse())
2981           continue;
2982         // Ignore EXEC
2983         if (OpReg == AMDGPU::EXEC ||
2984             OpReg == AMDGPU::EXEC_LO ||
2985             OpReg == AMDGPU::EXEC_HI)
2986           continue;
2987         // Ignore all implicit uses except VCC
2988         if (Op.isImplicit()) {
2989           if (OpReg == AMDGPU::VCC ||
2990               OpReg == AMDGPU::VCC_LO ||
2991               OpReg == AMDGPU::VCC_HI)
2992             return true;
2993           continue;
2994         }
2995         if (TRI.isSGPRReg(MRI, OpReg))
2996           return true;
2997       } else {
2998         const MCInstrDesc &InstDesc = I.getDesc();
2999         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3000         if (!TII.isInlineConstant(Op, OpInfo))
3001           return true;
3002       }
3003     }
3004     return false;
3005   };
3006
3007   // Check for hazard
3008   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3009       std::numeric_limits<int>::max())
3010     return false;
3011
3012   auto NextMI = std::next(MI->getIterator());
3013
3014   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3015   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3016                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3017                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3018
3019   // SALU write may be s_getpc in a bundle.
3020   updateGetPCBundle(NewMI);
3021
3022   return true;
3023 }
3024
3025 // Return the numeric ID 0-63 of a 64b SGPR pair for a given SGPR.
3026 // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc
3027 static std::optional<unsigned> sgprPairNumber(Register Reg,
3028                                               const SIRegisterInfo &TRI) {
3029   switch (Reg) {
3030   case AMDGPU::M0:
3031   case AMDGPU::EXEC:
3032   case AMDGPU::EXEC_LO:
3033   case AMDGPU::EXEC_HI:
3034   case AMDGPU::SGPR_NULL:
3035   case AMDGPU::SGPR_NULL64:
3036     return {};
3037   default:
3038     break;
3039   }
3040   unsigned RegN = TRI.getEncodingValue(Reg);
3041   if (RegN > 127)
3042     return {};
3043   return (RegN >> 1) & 0x3f;
3044 }
3045
3046 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3047 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3048   assert(MMF == &MF);
3049
3050   // Assume non-empty vector means it has already been computed.
3051   if (!VALUReadHazardSGPRs.empty())
3052     return;
3053
3054   auto CallingConv = MF.getFunction().getCallingConv();
3055   bool IsCallFree =
3056       AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3057
3058   // Exhaustive search is only viable in non-caller/callee functions where
3059   // VALUs will be exposed to the hazard recognizer.
3060   UseVALUReadHazardExhaustiveSearch =
3061       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3062       MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
3063
3064   // Consider all SGPRs as hazards if the shader uses function calls or is a callee.
3065   bool UseVALUUseCache =
3066       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3067   VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3068   if (!UseVALUUseCache)
3069     return;
3070
3071   // Perform a post-order reverse scan to find VALUs which read an SGPR
3072   // before a SALU write to the same SGPR. This provides a reduction in
3073   // hazard insertion when all VALU access to an SGPR occurs after its last
3074   // SALU write, when compared to a linear scan.
3075   const MachineRegisterInfo &MRI = MF.getRegInfo();
3076   BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3077   MachineCycleInfo CI;
3078   CI.compute(*MMF);
3079
3080   for (auto *MBB : post_order(&MF)) {
3081     bool InCycle = CI.getCycle(MBB) != nullptr;
3082     for (auto &MI : reverse(MBB->instrs())) {
3083       bool IsVALU = SIInstrInfo::isVALU(MI);
3084       bool IsSALU = SIInstrInfo::isSALU(MI);
3085       if (!IsVALU && !IsSALU)
3086         continue;
3087
3088       for (const MachineOperand &Op : MI.operands()) {
3089         if (!Op.isReg())
3090           continue;
3091         Register Reg = Op.getReg();
3092         assert(!Op.getSubReg());
3093         // Only consider implicit operands of VCC.
3094         if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3095                                  Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3096           continue;
3097         if (!TRI.isSGPRReg(MRI, Reg))
3098           continue;
3099         auto RegN = sgprPairNumber(Reg, TRI);
3100         if (!RegN)
3101           continue;
3102         if (IsVALU && Op.isUse()) {
3103           // Note: any access within a cycle must be considered a hazard.
3104           if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3105             VALUReadHazardSGPRs.set(*RegN);
3106           ReadSGPRs.set(*RegN);
3107         } else if (IsSALU) {
3108           if (Op.isDef())
3109             SALUWriteSGPRs.set(*RegN);
3110           else
3111             ReadSGPRs.set(*RegN);
3112         }
3113       }
3114     }
3115   }
3116 }
3117
3118 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3119   if (!ST.hasVALUReadSGPRHazard())
3120     return false;
3121
3122   // The hazard sequence is fundamentally three instructions:
3123   // 1. VALU reads SGPR
3124   // 2. SALU writes SGPR
3125   // 3. VALU/SALU reads SGPR
3126   // Try to avoid searching for (1) because the expiry point of the hazard is
3127   // indeterminate; however, the hazard between (2) and (3) can expire if the
3128   // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3129   // Note: SGPRs must be considered as 64-bit pairs as the hazard exists
3130   // even if only individual SGPRs are accessed.
3131
3132   bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3133   bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3134   if (!(MIIsSALU || MIIsVALU))
3135     return false;
3136
3137   // Avoid expensive search when compile time is a priority by
3138   // mitigating every SALU which writes an SGPR.
3139   if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3140     if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3141       return false;
3142
3143     const MachineOperand *SDSTOp =
3144         TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3145     if (!SDSTOp || !SDSTOp->isReg())
3146       return false;
3147
3148     const Register HazardReg = SDSTOp->getReg();
3149     if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3150         HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3151       return false;
3152
3153     // Add s_wait_alu sa_sdst(0) after SALU write.
3154     auto NextMI = std::next(MI->getIterator());
3155     auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3156                          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3157                      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3158
3159     // SALU write may be s_getpc in a bundle.
3160     updateGetPCBundle(NewMI);
3161
3162     return true;
3163   }
3164
3165   // Pre-compute set of SGPR pairs read by VALUs.
3166   // Note: pass mutable pointer to MachineFunction for CycleInfo.
3167   computeVALUHazardSGPRs(MI->getMF());
3168
3169   // If no VALU hazard SGPRs exist then there is nothing to do.
3170   if (VALUReadHazardSGPRs.none())
3171     return false;
3172
3173   // All SGPR writes before a call/return must be flushed as the callee/caller
3174   // will not see the hazard chain, i.e. (2) to (3) described above.
3175 const bool IsSetPC = (MI->isCall() || MI->isReturn()) && 3176 !(MI->getOpcode() == AMDGPU::S_ENDPGM || 3177 MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED); 3178 3179 // Collect all SGPR sources for MI which are read by a VALU. 3180 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3181 SmallSet<Register, 4> SGPRsUsed; 3182 3183 if (!IsSetPC) { 3184 for (const MachineOperand &Op : MI->all_uses()) { 3185 Register OpReg = Op.getReg(); 3186 3187 // Only consider VCC implicit uses on VALUs. 3188 // The only expected SALU implicit access is SCC which is no hazard. 3189 if (MIIsSALU && Op.isImplicit()) 3190 continue; 3191 3192 if (!TRI.isSGPRReg(MRI, OpReg)) 3193 continue; 3194 3195 auto RegN = sgprPairNumber(OpReg, TRI); 3196 if (!RegN) 3197 continue; 3198 3199 if (!VALUReadHazardSGPRs[*RegN]) 3200 continue; 3201 3202 SGPRsUsed.insert(OpReg); 3203 } 3204 3205 // No SGPRs -> nothing to do. 3206 if (SGPRsUsed.empty()) 3207 return false; 3208 } 3209 3210 // A hazard is any SALU which writes one of the SGPRs read by MI. 3211 auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { 3212 if (!SIInstrInfo::isSALU(I)) 3213 return false; 3214 // Ensure SGPR flush before call/return by conservatively assuming every 3215 // SALU writes an SGPR. 3216 if (IsSetPC && I.getNumDefs() > 0) 3217 return true; 3218 // Check for any register writes. 3219 return any_of(SGPRsUsed, [this, &I](Register Reg) { 3220 return I.modifiesRegister(Reg, &TRI); 3221 }); 3222 }; 3223 3224 const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; 3225 auto IsExpiredFn = [&](const MachineInstr &I, int Count) { 3226 if (Count >= SALUExpiryCount) 3227 return true; 3228 // s_wait_alu sa_sdst(0) on path mitigates hazard. 3229 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 3230 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 3231 return true; 3232 return false; 3233 }; 3234 3235 auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { 3236 // Only count true SALUs as wait states. 3237 if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) 3238 return 0; 3239 // SALU must be unrelated to any hazard registers. 3240 if (any_of(SGPRsUsed, 3241 [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); })) 3242 return 0; 3243 return 1; 3244 }; 3245 3246 // Check for the hazard. 3247 DenseSet<const MachineBasicBlock *> Visited; 3248 int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 3249 std::next(MI->getReverseIterator()), 0, 3250 IsExpiredFn, Visited, WaitStatesFn); 3251 3252 if (WaitStates >= SALUExpiryCount) 3253 return false; 3254 3255 // Validate hazard through an exhaustive search. 3256 if (UseVALUReadHazardExhaustiveSearch) { 3257 // A hazard is any VALU which reads one of the paired SGPRs read by MI. 3258 // This is searching for (1) in the hazard description. 3259 auto hazardPair = [this](Register Reg) { 3260 if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) 3261 return Register(AMDGPU::VCC); 3262 auto RegN = sgprPairNumber(Reg, TRI); 3263 return Register(AMDGPU::SGPR0_SGPR1 + *RegN); 3264 }; 3265 auto SearchHazardFn = [this, hazardPair, 3266 &SGPRsUsed](const MachineInstr &I) { 3267 if (!SIInstrInfo::isVALU(I)) 3268 return false; 3269 // Check for any register reads. 
3270 return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { 3271 return I.readsRegister(hazardPair(Reg), &TRI); 3272 }); 3273 }; 3274 auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { 3275 return false; 3276 }; 3277 if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == 3278 std::numeric_limits<int>::max()) 3279 return false; 3280 } 3281 3282 // Add s_wait_alu sa_sdst(0) before SALU read. 3283 auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 3284 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3285 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3286 3287 // SALU read may be after s_getpc in a bundle. 3288 updateGetPCBundle(NewMI); 3289 3290 return true; 3291 } 3292 3293 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, 3294 const SIInstrInfo &TII) { 3295 MachineBasicBlock &EntryMBB = MF->front(); 3296 if (EntryMBB.begin() != EntryMBB.end()) { 3297 auto &EntryMI = *EntryMBB.begin(); 3298 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO && 3299 EntryMI.getOperand(0).getImm() >= Priority) 3300 return false; 3301 } 3302 3303 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO)) 3304 .addImm(Priority); 3305 return true; 3306 } 3307 3308 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { 3309 if (!ST.hasRequiredExportPriority()) 3310 return false; 3311 3312 // Assume the following shader types will never have exports, 3313 // and avoid adding or adjusting S_SETPRIO. 3314 MachineBasicBlock *MBB = MI->getParent(); 3315 MachineFunction *MF = MBB->getParent(); 3316 auto CC = MF->getFunction().getCallingConv(); 3317 switch (CC) { 3318 case CallingConv::AMDGPU_CS: 3319 case CallingConv::AMDGPU_CS_Chain: 3320 case CallingConv::AMDGPU_CS_ChainPreserve: 3321 case CallingConv::AMDGPU_KERNEL: 3322 return false; 3323 default: 3324 break; 3325 } 3326 3327 const int MaxPriority = 3; 3328 const int NormalPriority = 2; 3329 const int PostExportPriority = 0; 3330 3331 auto It = MI->getIterator(); 3332 switch (MI->getOpcode()) { 3333 case AMDGPU::S_ENDPGM: 3334 case AMDGPU::S_ENDPGM_SAVED: 3335 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE: 3336 case AMDGPU::SI_RETURN_TO_EPILOG: 3337 // Ensure shader with calls raises priority at entry. 3338 // This ensures correct priority if exports exist in callee. 3339 if (MF->getFrameInfo().hasCalls()) 3340 return ensureEntrySetPrio(MF, NormalPriority, TII); 3341 return false; 3342 case AMDGPU::S_SETPRIO: { 3343 // Raise minimum priority unless in workaround. 3344 auto &PrioOp = MI->getOperand(0); 3345 int Prio = PrioOp.getImm(); 3346 bool InWA = (Prio == PostExportPriority) && 3347 (It != MBB->begin() && TII.isEXP(*std::prev(It))); 3348 if (InWA || Prio >= NormalPriority) 3349 return false; 3350 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority)); 3351 return true; 3352 } 3353 default: 3354 if (!TII.isEXP(*MI)) 3355 return false; 3356 break; 3357 } 3358 3359 // Check entry priority at each export (as there will only be a few). 3360 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. 3361 bool Changed = false; 3362 if (CC != CallingConv::AMDGPU_Gfx) 3363 Changed = ensureEntrySetPrio(MF, NormalPriority, TII); 3364 3365 auto NextMI = std::next(It); 3366 bool EndOfShader = false; 3367 if (NextMI != MBB->end()) { 3368 // Only need WA at end of sequence of exports. 3369 if (TII.isEXP(*NextMI)) 3370 return Changed; 3371 // Assume appropriate S_SETPRIO after export means WA already applied. 
3372 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO && 3373 NextMI->getOperand(0).getImm() == PostExportPriority) 3374 return Changed; 3375 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM; 3376 } 3377 3378 const DebugLoc &DL = MI->getDebugLoc(); 3379 3380 // Lower priority. 3381 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3382 .addImm(PostExportPriority); 3383 3384 if (!EndOfShader) { 3385 // Wait for exports to complete. 3386 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT)) 3387 .addReg(AMDGPU::SGPR_NULL) 3388 .addImm(0); 3389 } 3390 3391 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3392 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3393 3394 if (!EndOfShader) { 3395 // Return to normal (higher) priority. 3396 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3397 .addImm(NormalPriority); 3398 } 3399 3400 return true; 3401 } 3402