//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

static cl::opt<unsigned> MaxExhaustiveHazardSearch(
    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
    cl::desc("Maximum function size for exhaustive hazard search"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      UseVALUReadHazardExhaustiveSearch(false),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ?
19 : 5; 66 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); 67 } 68 69 void GCNHazardRecognizer::Reset() { 70 EmittedInstrs.clear(); 71 } 72 73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { 74 EmitInstruction(SU->getInstr()); 75 } 76 77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { 78 CurrCycleInstr = MI; 79 } 80 81 static bool isDivFMas(unsigned Opcode) { 82 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64; 83 } 84 85 static bool isSGetReg(unsigned Opcode) { 86 return Opcode == AMDGPU::S_GETREG_B32; 87 } 88 89 static bool isSSetReg(unsigned Opcode) { 90 switch (Opcode) { 91 case AMDGPU::S_SETREG_B32: 92 case AMDGPU::S_SETREG_B32_mode: 93 case AMDGPU::S_SETREG_IMM32_B32: 94 case AMDGPU::S_SETREG_IMM32_B32_mode: 95 return true; 96 } 97 return false; 98 } 99 100 static bool isRWLane(unsigned Opcode) { 101 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; 102 } 103 104 static bool isRFE(unsigned Opcode) { 105 return Opcode == AMDGPU::S_RFE_B64; 106 } 107 108 static bool isSMovRel(unsigned Opcode) { 109 switch (Opcode) { 110 case AMDGPU::S_MOVRELS_B32: 111 case AMDGPU::S_MOVRELS_B64: 112 case AMDGPU::S_MOVRELD_B32: 113 case AMDGPU::S_MOVRELD_B64: 114 return true; 115 default: 116 return false; 117 } 118 } 119 120 static bool isDGEMM(unsigned Opcode) { 121 return AMDGPU::getMAIIsDGEMM(Opcode); 122 } 123 124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { 125 unsigned Opcode = MI.getOpcode(); 126 127 if (!SIInstrInfo::isMAI(MI) || 128 isDGEMM(Opcode) || 129 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || 130 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) 131 return false; 132 133 if (!ST.hasGFX940Insts()) 134 return true; 135 136 return AMDGPU::getMAIIsGFX940XDL(Opcode); 137 } 138 139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, 140 const MachineInstr &MI) { 141 if (TII.isAlwaysGDS(MI.getOpcode())) 142 return true; 143 144 switch (MI.getOpcode()) { 145 case AMDGPU::S_SENDMSG: 146 case AMDGPU::S_SENDMSGHALT: 147 case AMDGPU::S_TTRACEDATA: 148 return true; 149 // These DS opcodes don't support GDS. 
150 case AMDGPU::DS_NOP: 151 case AMDGPU::DS_PERMUTE_B32: 152 case AMDGPU::DS_BPERMUTE_B32: 153 return false; 154 default: 155 if (TII.isDS(MI.getOpcode())) { 156 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 157 AMDGPU::OpName::gds); 158 if (MI.getOperand(GDS).getImm()) 159 return true; 160 } 161 return false; 162 } 163 } 164 165 static bool isPermlane(const MachineInstr &MI) { 166 unsigned Opcode = MI.getOpcode(); 167 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || 168 Opcode == AMDGPU::V_PERMLANE64_B32 || 169 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || 170 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || 171 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 || 172 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 || 173 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 || 174 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 || 175 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64; 176 } 177 178 static bool isLdsDma(const MachineInstr &MI) { 179 return SIInstrInfo::isVALU(MI) && 180 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)); 181 } 182 183 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { 184 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, 185 AMDGPU::OpName::simm16); 186 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm())); 187 } 188 189 ScheduleHazardRecognizer::HazardType 190 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { 191 MachineInstr *MI = SU->getInstr(); 192 // If we are not in "HazardRecognizerMode" and therefore not being run from 193 // the scheduler, track possible stalls from hazards but don't insert noops. 194 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; 195 196 if (MI->isBundle()) 197 return NoHazard; 198 199 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) 200 return HazardType; 201 202 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) 203 return HazardType; 204 205 if (checkFPAtomicToDenormModeHazard(MI) > 0) 206 return HazardType; 207 208 if (ST.hasNoDataDepHazard()) 209 return NoHazard; 210 211 // FIXME: Should flat be considered vmem? 
212 if ((SIInstrInfo::isVMEM(*MI) || 213 SIInstrInfo::isFLAT(*MI)) 214 && checkVMEMHazards(MI) > 0) 215 return HazardType; 216 217 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) 218 return HazardType; 219 220 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) 221 return HazardType; 222 223 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) 224 return HazardType; 225 226 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) 227 return HazardType; 228 229 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 230 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 231 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 232 return HazardType; 233 234 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) 235 return HazardType; 236 237 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) 238 return HazardType; 239 240 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) 241 return HazardType; 242 243 if (((ST.hasReadM0MovRelInterpHazard() && 244 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 245 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 246 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 247 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 248 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 249 (ST.hasReadM0LdsDirectHazard() && 250 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) && 251 checkReadM0Hazards(MI) > 0) 252 return HazardType; 253 254 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) 255 return HazardType; 256 257 if ((SIInstrInfo::isVMEM(*MI) || 258 SIInstrInfo::isFLAT(*MI) || 259 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0) 260 return HazardType; 261 262 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) 263 return HazardType; 264 265 return NoHazard; 266 } 267 268 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, 269 unsigned Quantity) { 270 while (Quantity > 0) { 271 unsigned Arg = std::min(Quantity, 8u); 272 Quantity -= Arg; 273 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) 274 .addImm(Arg - 1); 275 } 276 } 277 278 unsigned 279 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { 280 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI); 281 assert(TSchedModel.getWriteProcResBegin(SC) != 282 TSchedModel.getWriteProcResEnd(SC)); 283 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle; 284 } 285 286 void GCNHazardRecognizer::processBundle() { 287 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); 288 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); 289 // Check bundled MachineInstr's for hazards. 290 for (; MI != E && MI->isInsideBundle(); ++MI) { 291 CurrCycleInstr = &*MI; 292 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); 293 294 if (IsHazardRecognizerMode) { 295 fixHazards(CurrCycleInstr); 296 297 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates); 298 } 299 300 // It’s unnecessary to track more than MaxLookAhead instructions. Since we 301 // include the bundled MI directly after, only add a maximum of 302 // (MaxLookAhead - 1) noops to EmittedInstrs. 
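    // Each nullptr entry stands for one wait state in which no instruction
    // was issued, matching how AdvanceCycle() records stalls.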
303 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) 304 EmittedInstrs.push_front(nullptr); 305 306 EmittedInstrs.push_front(CurrCycleInstr); 307 EmittedInstrs.resize(MaxLookAhead); 308 } 309 CurrCycleInstr = nullptr; 310 } 311 312 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) { 313 assert(IsHazardRecognizerMode); 314 315 unsigned NumPreNoops = PreEmitNoops(MI); 316 EmitNoops(NumPreNoops); 317 if (MI->isInsideBundle()) 318 insertNoopsInBundle(MI, TII, NumPreNoops); 319 else 320 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI), 321 NumPreNoops); 322 EmitInstruction(MI); 323 AdvanceCycle(); 324 } 325 326 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { 327 IsHazardRecognizerMode = true; 328 CurrCycleInstr = MI; 329 unsigned W = PreEmitNoopsCommon(MI); 330 fixHazards(MI); 331 CurrCycleInstr = nullptr; 332 return W; 333 } 334 335 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { 336 if (MI->isBundle()) 337 return 0; 338 339 int WaitStates = 0; 340 341 if (SIInstrInfo::isSMRD(*MI)) 342 return std::max(WaitStates, checkSMRDHazards(MI)); 343 344 if (ST.hasNSAtoVMEMBug()) 345 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); 346 347 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); 348 349 if (ST.hasNoDataDepHazard()) 350 return WaitStates; 351 352 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) 353 WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); 354 355 if (SIInstrInfo::isVALU(*MI)) 356 WaitStates = std::max(WaitStates, checkVALUHazards(MI)); 357 358 if (SIInstrInfo::isDPP(*MI)) 359 WaitStates = std::max(WaitStates, checkDPPHazards(MI)); 360 361 if (isDivFMas(MI->getOpcode())) 362 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); 363 364 if (isRWLane(MI->getOpcode())) 365 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); 366 367 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 368 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 369 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 370 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI)); 371 372 if (MI->isInlineAsm()) 373 return std::max(WaitStates, checkInlineAsmHazards(MI)); 374 375 if (isSGetReg(MI->getOpcode())) 376 return std::max(WaitStates, checkGetRegHazards(MI)); 377 378 if (isSSetReg(MI->getOpcode())) 379 return std::max(WaitStates, checkSetRegHazards(MI)); 380 381 if (isRFE(MI->getOpcode())) 382 return std::max(WaitStates, checkRFEHazards(MI)); 383 384 if ((ST.hasReadM0MovRelInterpHazard() && 385 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 386 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 387 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 388 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 389 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 390 (ST.hasReadM0LdsDirectHazard() && 391 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) 392 return std::max(WaitStates, checkReadM0Hazards(MI)); 393 394 if (SIInstrInfo::isMAI(*MI)) 395 return std::max(WaitStates, checkMAIHazards(MI)); 396 397 if (SIInstrInfo::isVMEM(*MI) || 398 SIInstrInfo::isFLAT(*MI) || 399 SIInstrInfo::isDS(*MI)) 400 return std::max(WaitStates, checkMAILdStHazards(MI)); 401 402 if (ST.hasGFX950Insts() && isPermlane(*MI)) 403 return std::max(WaitStates, checkPermlaneHazards(MI)); 404 405 return WaitStates; 406 } 407 408 void GCNHazardRecognizer::EmitNoop() { 409 EmittedInstrs.push_front(nullptr); 410 } 411 412 void 
GCNHazardRecognizer::AdvanceCycle() { 413 // When the scheduler detects a stall, it will call AdvanceCycle() without 414 // emitting any instructions. 415 if (!CurrCycleInstr) { 416 EmittedInstrs.push_front(nullptr); 417 return; 418 } 419 420 if (CurrCycleInstr->isBundle()) { 421 processBundle(); 422 return; 423 } 424 425 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); 426 if (!NumWaitStates) { 427 CurrCycleInstr = nullptr; 428 return; 429 } 430 431 // Keep track of emitted instructions 432 EmittedInstrs.push_front(CurrCycleInstr); 433 434 // Add a nullptr for each additional wait state after the first. Make sure 435 // not to add more than getMaxLookAhead() items to the list, since we 436 // truncate the list to that size right after this loop. 437 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); 438 i < e; ++i) { 439 EmittedInstrs.push_front(nullptr); 440 } 441 442 // getMaxLookahead() is the largest number of wait states we will ever need 443 // to insert, so there is no point in keeping track of more than that many 444 // wait states. 445 EmittedInstrs.resize(getMaxLookAhead()); 446 447 CurrCycleInstr = nullptr; 448 } 449 450 void GCNHazardRecognizer::RecedeCycle() { 451 llvm_unreachable("hazard recognizer does not support bottom-up scheduling."); 452 } 453 454 //===----------------------------------------------------------------------===// 455 // Helper Functions 456 //===----------------------------------------------------------------------===// 457 458 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; 459 460 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; 461 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; 462 463 // Search for a hazard in a block and its predecessors. 464 template <typename StateT> 465 static bool 466 hasHazard(StateT State, 467 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard, 468 function_ref<void(StateT &, const MachineInstr &)> UpdateState, 469 const MachineBasicBlock *MBB, 470 MachineBasicBlock::const_reverse_instr_iterator I, 471 DenseSet<const MachineBasicBlock *> &Visited) { 472 for (auto E = MBB->instr_rend(); I != E; ++I) { 473 // No need to look at parent BUNDLE instructions. 474 if (I->isBundle()) 475 continue; 476 477 switch (IsHazard(State, *I)) { 478 case HazardFound: 479 return true; 480 case HazardExpired: 481 return false; 482 default: 483 // Continue search 484 break; 485 } 486 487 if (I->isInlineAsm() || I->isMetaInstruction()) 488 continue; 489 490 UpdateState(State, *I); 491 } 492 493 for (MachineBasicBlock *Pred : MBB->predecessors()) { 494 if (!Visited.insert(Pred).second) 495 continue; 496 497 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(), 498 Visited)) 499 return true; 500 } 501 502 return false; 503 } 504 505 // Returns a minimum wait states since \p I walking all predecessors. 506 // Only scans until \p IsExpired does not return true. 507 // Can only be run in a hazard recognizer mode. 508 static int getWaitStatesSince( 509 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, 510 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, 511 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, 512 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { 513 for (auto E = MBB->instr_rend(); I != E; ++I) { 514 // Don't add WaitStates for parent BUNDLE instructions. 
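    // The bundled instructions themselves are visited by this walk, so
    // counting the BUNDLE header too would double-count wait states.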
515 if (I->isBundle()) 516 continue; 517 518 if (IsHazard(*I)) 519 return WaitStates; 520 521 if (I->isInlineAsm()) 522 continue; 523 524 WaitStates += GetNumWaitStates(*I); 525 526 if (IsExpired(*I, WaitStates)) 527 return std::numeric_limits<int>::max(); 528 } 529 530 int MinWaitStates = std::numeric_limits<int>::max(); 531 for (MachineBasicBlock *Pred : MBB->predecessors()) { 532 if (!Visited.insert(Pred).second) 533 continue; 534 535 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates, 536 IsExpired, Visited, GetNumWaitStates); 537 538 MinWaitStates = std::min(MinWaitStates, W); 539 } 540 541 return MinWaitStates; 542 } 543 544 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, 545 const MachineInstr *MI, IsExpiredFn IsExpired) { 546 DenseSet<const MachineBasicBlock *> Visited; 547 return getWaitStatesSince(IsHazard, MI->getParent(), 548 std::next(MI->getReverseIterator()), 549 0, IsExpired, Visited); 550 } 551 552 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { 553 if (IsHazardRecognizerMode) { 554 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { 555 return WaitStates >= Limit; 556 }; 557 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); 558 } 559 560 int WaitStates = 0; 561 for (MachineInstr *MI : EmittedInstrs) { 562 if (MI) { 563 if (IsHazard(*MI)) 564 return WaitStates; 565 566 if (MI->isInlineAsm()) 567 continue; 568 } 569 ++WaitStates; 570 571 if (WaitStates >= Limit) 572 break; 573 } 574 return std::numeric_limits<int>::max(); 575 } 576 577 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, 578 IsHazardFn IsHazardDef, 579 int Limit) { 580 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 581 582 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) { 583 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI); 584 }; 585 586 return getWaitStatesSince(IsHazardFn, Limit); 587 } 588 589 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, 590 int Limit) { 591 auto IsHazardFn = [IsHazard](const MachineInstr &MI) { 592 return isSSetReg(MI.getOpcode()) && IsHazard(MI); 593 }; 594 595 return getWaitStatesSince(IsHazardFn, Limit); 596 } 597 598 //===----------------------------------------------------------------------===// 599 // No-op Hazard Detection 600 //===----------------------------------------------------------------------===// 601 602 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, 603 MCRegister Reg) { 604 for (MCRegUnit Unit : TRI.regunits(Reg)) 605 BV.set(Unit); 606 } 607 608 static void addRegsToSet(const SIRegisterInfo &TRI, 609 iterator_range<MachineInstr::const_mop_iterator> Ops, 610 BitVector &DefSet, BitVector &UseSet) { 611 for (const MachineOperand &Op : Ops) { 612 if (Op.isReg()) 613 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg()); 614 } 615 } 616 617 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { 618 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses); 619 } 620 621 static bool breaksSMEMSoftClause(MachineInstr *MI) { 622 return !SIInstrInfo::isSMRD(*MI); 623 } 624 625 static bool breaksVMEMSoftClause(MachineInstr *MI) { 626 return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI); 627 } 628 629 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { 630 // SMEM soft clause are only present on VI+, and only matter if xnack is 631 // enabled. 
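  // Replay of clause instructions only happens with XNACK, so with it
  // disabled there is no clause hazard to break.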
632 if (!ST.isXNACKEnabled()) 633 return 0; 634 635 bool IsSMRD = TII.isSMRD(*MEM); 636 637 resetClause(); 638 639 // A soft-clause is any group of consecutive SMEM instructions. The 640 // instructions in this group may return out of order and/or may be 641 // replayed (i.e. the same instruction issued more than once). 642 // 643 // In order to handle these situations correctly we need to make sure that 644 // when a clause has more than one instruction, no instruction in the clause 645 // writes to a register that is read by another instruction in the clause 646 // (including itself). If we encounter this situation, we need to break the 647 // clause by inserting a non SMEM instruction. 648 649 for (MachineInstr *MI : EmittedInstrs) { 650 // When we hit a non-SMEM instruction then we have passed the start of the 651 // clause and we can stop. 652 if (!MI) 653 break; 654 655 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) 656 break; 657 658 addClauseInst(*MI); 659 } 660 661 if (ClauseDefs.none()) 662 return 0; 663 664 // We need to make sure not to put loads and stores in the same clause if they 665 // use the same address. For now, just start a new clause whenever we see a 666 // store. 667 if (MEM->mayStore()) 668 return 1; 669 670 addClauseInst(*MEM); 671 672 // If the set of defs and uses intersect then we cannot add this instruction 673 // to the clause, so we have a hazard. 674 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0; 675 } 676 677 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { 678 int WaitStatesNeeded = 0; 679 680 WaitStatesNeeded = checkSoftClauseHazards(SMRD); 681 682 // This SMRD hazard only affects SI. 683 if (!ST.hasSMRDReadVALUDefHazard()) 684 return WaitStatesNeeded; 685 686 // A read of an SGPR by SMRD instruction requires 4 wait states when the 687 // SGPR was written by a VALU instruction. 688 int SmrdSgprWaitStates = 4; 689 auto IsHazardDefFn = [this](const MachineInstr &MI) { 690 return TII.isVALU(MI); 691 }; 692 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) { 693 return TII.isSALU(MI); 694 }; 695 696 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); 697 698 for (const MachineOperand &Use : SMRD->uses()) { 699 if (!Use.isReg()) 700 continue; 701 int WaitStatesNeededForUse = 702 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 703 SmrdSgprWaitStates); 704 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 705 706 // This fixes what appears to be undocumented hardware behavior in SI where 707 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor 708 // needs some number of nops in between. We don't know how many we need, but 709 // let's use 4. This wasn't discovered before probably because the only 710 // case when this happens is when we expand a 64-bit pointer into a full 711 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was 712 // probably never encountered in the closed-source land. 
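    // Conservatively apply the same 4 wait states after an SALU write of any
    // SGPR used by a buffer SMRD.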
713 if (IsBufferSMRD) { 714 int WaitStatesNeededForUse = 715 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 716 IsBufferHazardDefFn, 717 SmrdSgprWaitStates); 718 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 719 } 720 } 721 722 return WaitStatesNeeded; 723 } 724 725 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { 726 if (!ST.hasVMEMReadSGPRVALUDefHazard()) 727 return 0; 728 729 int WaitStatesNeeded = checkSoftClauseHazards(VMEM); 730 731 // A read of an SGPR by a VMEM instruction requires 5 wait states when the 732 // SGPR was written by a VALU Instruction. 733 const int VmemSgprWaitStates = 5; 734 auto IsHazardDefFn = [this](const MachineInstr &MI) { 735 return TII.isVALU(MI); 736 }; 737 for (const MachineOperand &Use : VMEM->uses()) { 738 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg())) 739 continue; 740 741 int WaitStatesNeededForUse = 742 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 743 VmemSgprWaitStates); 744 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 745 } 746 return WaitStatesNeeded; 747 } 748 749 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { 750 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 751 const SIInstrInfo *TII = ST.getInstrInfo(); 752 753 // Check for DPP VGPR read after VALU VGPR write and EXEC write. 754 int DppVgprWaitStates = 2; 755 int DppExecWaitStates = 5; 756 int WaitStatesNeeded = 0; 757 auto IsHazardDefFn = [TII](const MachineInstr &MI) { 758 return TII->isVALU(MI); 759 }; 760 761 for (const MachineOperand &Use : DPP->uses()) { 762 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) 763 continue; 764 int WaitStatesNeededForUse = 765 DppVgprWaitStates - getWaitStatesSinceDef( 766 Use.getReg(), 767 [](const MachineInstr &) { return true; }, 768 DppVgprWaitStates); 769 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 770 } 771 772 WaitStatesNeeded = std::max( 773 WaitStatesNeeded, 774 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, 775 DppExecWaitStates)); 776 777 return WaitStatesNeeded; 778 } 779 780 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { 781 const SIInstrInfo *TII = ST.getInstrInfo(); 782 783 // v_div_fmas requires 4 wait states after a write to vcc from a VALU 784 // instruction. 
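  // VCC is read implicitly by v_div_fmas, so check writes of VCC itself
  // rather than scanning the explicit operands.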
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into the correct bit position of the dest register.
/// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the \p
/// MI and \returns an operand with a dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions which produce a
  // forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 which writes the
  // hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst (instructions with
  // dest byte sel, e.g. CVT_SR_BF8_F32) with op_sel[3:2] != 0.
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which writes the hi bits
    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
        SISrcMods::DST_OP_SEL)
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Type 3: FP8DstSelInst with op_sel[3:2] != 0
    if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
         SISrcMods::OP_SEL_0))
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  // Special case: a nop is required for all the opsel values of the fp4 sr
  // variant cvt scale instructions.
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  return nullptr;
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
Without accounting for this hazard, the ECC will be 958 // wrong. 959 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e. 960 // complete zeroesHigh16BitsOfDest) 961 for (auto &Operand : VALU->operands()) { 962 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { 963 return true; 964 } 965 } 966 return false; 967 } 968 969 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { 970 int WaitStatesNeeded = 0; 971 972 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) { 973 const int TransDefWaitstates = 1; 974 975 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) { 976 if (!SIInstrInfo::isTRANS(MI)) 977 return false; 978 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 979 const SIInstrInfo *TII = ST.getInstrInfo(); 980 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); 981 982 for (const MachineOperand &Use : VALU->explicit_uses()) { 983 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) 984 return true; 985 } 986 987 return false; 988 }; 989 990 int WaitStatesNeededForDef = 991 TransDefWaitstates - 992 getWaitStatesSince(IsTransDefFn, TransDefWaitstates); 993 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 994 } 995 996 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) { 997 const int Shift16DefWaitstates = 1; 998 999 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) { 1000 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1001 const MachineOperand *ForwardedDst = 1002 getDstSelForwardingOperand(ProducerMI, ST); 1003 if (ForwardedDst) { 1004 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI); 1005 } 1006 1007 if (ProducerMI.isInlineAsm()) { 1008 // Assume inline asm has dst forwarding hazard 1009 for (auto &Def : ProducerMI.all_defs()) { 1010 if (consumesDstSelForwardingOperand(VALU, &Def, TRI)) 1011 return true; 1012 } 1013 } 1014 1015 return false; 1016 }; 1017 1018 int WaitStatesNeededForDef = 1019 Shift16DefWaitstates - 1020 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); 1021 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1022 } 1023 1024 if (ST.hasVDecCoExecHazard()) { 1025 const int VALUWriteSGPRVALUReadWaitstates = 2; 1026 const int VALUWriteEXECRWLane = 4; 1027 const int VALUWriteVGPRReadlaneRead = 1; 1028 1029 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1030 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1031 Register UseReg; 1032 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) { 1033 if (!SIInstrInfo::isVALU(MI)) 1034 return false; 1035 return MI.modifiesRegister(UseReg, TRI); 1036 }; 1037 1038 for (const MachineOperand &Use : VALU->explicit_uses()) { 1039 if (!Use.isReg()) 1040 continue; 1041 1042 UseReg = Use.getReg(); 1043 if (TRI->isSGPRReg(MRI, UseReg)) { 1044 int WaitStatesNeededForDef = 1045 VALUWriteSGPRVALUReadWaitstates - 1046 getWaitStatesSince(IsVALUDefSGPRFn, 1047 VALUWriteSGPRVALUReadWaitstates); 1048 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1049 } 1050 } 1051 1052 if (VALU->readsRegister(AMDGPU::VCC, TRI)) { 1053 UseReg = AMDGPU::VCC; 1054 int WaitStatesNeededForDef = 1055 VALUWriteSGPRVALUReadWaitstates - 1056 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); 1057 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1058 } 1059 1060 switch (VALU->getOpcode()) { 1061 case AMDGPU::V_READLANE_B32: 1062 case AMDGPU::V_READFIRSTLANE_B32: { 1063 MachineOperand *Src 
= TII.getNamedOperand(*VALU, AMDGPU::OpName::src0); 1064 UseReg = Src->getReg(); 1065 int WaitStatesNeededForDef = 1066 VALUWriteVGPRReadlaneRead - 1067 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead); 1068 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1069 } 1070 [[fallthrough]]; 1071 case AMDGPU::V_WRITELANE_B32: { 1072 UseReg = AMDGPU::EXEC; 1073 int WaitStatesNeededForDef = 1074 VALUWriteEXECRWLane - 1075 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane); 1076 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1077 break; 1078 } 1079 default: 1080 break; 1081 } 1082 } 1083 1084 // This checks for the hazard where VMEM instructions that store more than 1085 // 8 bytes can have there store data over written by the next instruction. 1086 if (!ST.has12DWordStoreHazard()) 1087 return WaitStatesNeeded; 1088 1089 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1090 1091 for (const MachineOperand &Def : VALU->defs()) { 1092 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI)); 1093 } 1094 1095 return WaitStatesNeeded; 1096 } 1097 1098 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { 1099 // This checks for hazards associated with inline asm statements. 1100 // Since inline asms can contain just about anything, we use this 1101 // to call/leverage other check*Hazard routines. Note that 1102 // this function doesn't attempt to address all possible inline asm 1103 // hazards (good luck), but is a collection of what has been 1104 // problematic thus far. 1105 1106 // see checkVALUHazards() 1107 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() && 1108 !ST.hasCvtScaleForwardingHazard()) 1109 return 0; 1110 1111 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1112 int WaitStatesNeeded = 0; 1113 1114 for (const MachineOperand &Op : 1115 llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) { 1116 if (Op.isReg() && Op.isDef()) { 1117 if (!TRI.isVectorRegister(MRI, Op.getReg())) 1118 continue; 1119 1120 if (ST.has12DWordStoreHazard()) { 1121 WaitStatesNeeded = 1122 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); 1123 } 1124 } 1125 } 1126 1127 if (ST.hasDstSelForwardingHazard()) { 1128 const int Shift16DefWaitstates = 1; 1129 1130 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) { 1131 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST); 1132 // Assume inline asm reads the dst 1133 if (Dst) 1134 return IA->modifiesRegister(Dst->getReg(), &TRI) || 1135 IA->readsRegister(Dst->getReg(), &TRI); 1136 1137 if (ProducerMI.isInlineAsm()) { 1138 // If MI is inline asm, assume it has dst forwarding hazard 1139 for (auto &Def : ProducerMI.all_defs()) { 1140 if (IA->modifiesRegister(Def.getReg(), &TRI) || 1141 IA->readsRegister(Def.getReg(), &TRI)) { 1142 return true; 1143 } 1144 } 1145 } 1146 1147 return false; 1148 }; 1149 1150 int WaitStatesNeededForDef = 1151 Shift16DefWaitstates - 1152 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); 1153 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1154 } 1155 1156 return WaitStatesNeeded; 1157 } 1158 1159 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { 1160 const SIInstrInfo *TII = ST.getInstrInfo(); 1161 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1162 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1163 1164 const MachineOperand *LaneSelectOp = 1165 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1); 1166 1167 if 
(!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg())) 1168 return 0; 1169 1170 Register LaneSelectReg = LaneSelectOp->getReg(); 1171 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); }; 1172 1173 const int RWLaneWaitStates = 4; 1174 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, 1175 RWLaneWaitStates); 1176 return RWLaneWaitStates - WaitStatesSince; 1177 } 1178 1179 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { 1180 if (!ST.hasRFEHazards()) 1181 return 0; 1182 1183 const SIInstrInfo *TII = ST.getInstrInfo(); 1184 1185 const int RFEWaitStates = 1; 1186 1187 auto IsHazardFn = [TII](const MachineInstr &MI) { 1188 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS; 1189 }; 1190 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); 1191 return RFEWaitStates - WaitStatesNeeded; 1192 } 1193 1194 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { 1195 const SIInstrInfo *TII = ST.getInstrInfo(); 1196 const int ReadM0WaitStates = 1; 1197 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; 1198 return ReadM0WaitStates - 1199 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); 1200 } 1201 1202 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { 1203 fixVMEMtoScalarWriteHazards(MI); 1204 fixVcmpxPermlaneHazards(MI); 1205 fixSMEMtoVectorWriteHazards(MI); 1206 fixVcmpxExecWARHazard(MI); 1207 fixLdsBranchVmemWARHazard(MI); 1208 if (ST.hasLdsDirect()) { 1209 fixLdsDirectVALUHazard(MI); 1210 fixLdsDirectVMEMHazard(MI); 1211 } 1212 fixVALUPartialForwardingHazard(MI); 1213 fixVALUTransUseHazard(MI); 1214 fixWMMAHazards(MI); 1215 fixShift64HighRegBug(MI); 1216 fixVALUMaskWriteHazard(MI); 1217 fixVALUReadSGPRHazard(MI); 1218 fixRequiredExportPriority(MI); 1219 } 1220 1221 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, 1222 const MachineInstr &MI) { 1223 return (TII.isVOPC(MI) || 1224 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) && 1225 MI.modifiesRegister(AMDGPU::EXEC, &TRI); 1226 } 1227 1228 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { 1229 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI)) 1230 return false; 1231 1232 const SIInstrInfo *TII = ST.getInstrInfo(); 1233 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1234 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { 1235 return isVCmpXWritesExec(*TII, *TRI, MI); 1236 }; 1237 1238 auto IsExpiredFn = [](const MachineInstr &MI, int) { 1239 unsigned Opc = MI.getOpcode(); 1240 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 && 1241 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa; 1242 }; 1243 1244 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1245 std::numeric_limits<int>::max()) 1246 return false; 1247 1248 // V_NOP will be discarded by SQ. 1249 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* 1250 // which is always a VGPR and available. 1251 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); 1252 Register Reg = Src0->getReg(); 1253 bool IsUndef = Src0->isUndef(); 1254 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1255 TII->get(AMDGPU::V_MOV_B32_e32)) 1256 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) 1257 .addReg(Reg, IsUndef ? 
RegState::Undef : RegState::Kill); 1258 1259 return true; 1260 } 1261 1262 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { 1263 if (!ST.hasVMEMtoScalarWriteHazard()) 1264 return false; 1265 assert(!ST.hasExtendedWaitCounts()); 1266 1267 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI)) 1268 return false; 1269 1270 if (MI->getNumDefs() == 0) 1271 return false; 1272 1273 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1274 1275 auto IsHazardFn = [TRI, MI](const MachineInstr &I) { 1276 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) && 1277 !SIInstrInfo::isFLAT(I)) 1278 return false; 1279 1280 for (const MachineOperand &Def : MI->defs()) { 1281 const MachineOperand *Op = 1282 I.findRegisterUseOperand(Def.getReg(), TRI, false); 1283 if (!Op) 1284 continue; 1285 return true; 1286 } 1287 return false; 1288 }; 1289 1290 auto IsExpiredFn = [](const MachineInstr &MI, int) { 1291 return SIInstrInfo::isVALU(MI) || 1292 (MI.getOpcode() == AMDGPU::S_WAITCNT && 1293 !MI.getOperand(0).getImm()) || 1294 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1295 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0); 1296 }; 1297 1298 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1299 std::numeric_limits<int>::max()) 1300 return false; 1301 1302 const SIInstrInfo *TII = ST.getInstrInfo(); 1303 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1304 TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 1305 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); 1306 return true; 1307 } 1308 1309 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { 1310 if (!ST.hasSMEMtoVectorWriteHazard()) 1311 return false; 1312 assert(!ST.hasExtendedWaitCounts()); 1313 1314 if (!SIInstrInfo::isVALU(*MI)) 1315 return false; 1316 1317 unsigned SDSTName; 1318 switch (MI->getOpcode()) { 1319 case AMDGPU::V_READLANE_B32: 1320 case AMDGPU::V_READFIRSTLANE_B32: 1321 SDSTName = AMDGPU::OpName::vdst; 1322 break; 1323 default: 1324 SDSTName = AMDGPU::OpName::sdst; 1325 break; 1326 } 1327 1328 const SIInstrInfo *TII = ST.getInstrInfo(); 1329 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1330 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU()); 1331 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName); 1332 if (!SDST) { 1333 for (const auto &MO : MI->implicit_operands()) { 1334 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) { 1335 SDST = &MO; 1336 break; 1337 } 1338 } 1339 } 1340 1341 if (!SDST) 1342 return false; 1343 1344 const Register SDSTReg = SDST->getReg(); 1345 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) { 1346 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI); 1347 }; 1348 1349 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) { 1350 if (TII->isSALU(MI)) { 1351 switch (MI.getOpcode()) { 1352 case AMDGPU::S_SETVSKIP: 1353 case AMDGPU::S_VERSION: 1354 case AMDGPU::S_WAITCNT_VSCNT: 1355 case AMDGPU::S_WAITCNT_VMCNT: 1356 case AMDGPU::S_WAITCNT_EXPCNT: 1357 // These instructions cannot not mitigate the hazard. 1358 return false; 1359 case AMDGPU::S_WAITCNT_LGKMCNT: 1360 // Reducing lgkmcnt count to 0 always mitigates the hazard. 1361 return (MI.getOperand(1).getImm() == 0) && 1362 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL); 1363 case AMDGPU::S_WAITCNT: { 1364 const int64_t Imm = MI.getOperand(0).getImm(); 1365 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); 1366 // DsCnt corresponds to LGKMCnt here. 
1367 return (Decoded.DsCnt == 0); 1368 } 1369 default: 1370 // SOPP instructions cannot mitigate the hazard. 1371 if (TII->isSOPP(MI)) 1372 return false; 1373 // At this point the SALU can be assumed to mitigate the hazard 1374 // because either: 1375 // (a) it is independent of the at risk SMEM (breaking chain), 1376 // or 1377 // (b) it is dependent on the SMEM, in which case an appropriate 1378 // s_waitcnt lgkmcnt _must_ exist between it and the at risk 1379 // SMEM instruction. 1380 return true; 1381 } 1382 } 1383 return false; 1384 }; 1385 1386 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1387 std::numeric_limits<int>::max()) 1388 return false; 1389 1390 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1391 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL) 1392 .addImm(0); 1393 return true; 1394 } 1395 1396 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { 1397 if (!ST.hasVcmpxExecWARHazard()) 1398 return false; 1399 assert(!ST.hasExtendedWaitCounts()); 1400 1401 if (!SIInstrInfo::isVALU(*MI)) 1402 return false; 1403 1404 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1405 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) 1406 return false; 1407 1408 auto IsHazardFn = [TRI](const MachineInstr &I) { 1409 if (SIInstrInfo::isVALU(I)) 1410 return false; 1411 return I.readsRegister(AMDGPU::EXEC, TRI); 1412 }; 1413 1414 const SIInstrInfo *TII = ST.getInstrInfo(); 1415 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) { 1416 if (SIInstrInfo::isVALU(MI)) { 1417 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) 1418 return true; 1419 for (auto MO : MI.implicit_operands()) 1420 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) 1421 return true; 1422 } 1423 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1424 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0) 1425 return true; 1426 return false; 1427 }; 1428 1429 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1430 std::numeric_limits<int>::max()) 1431 return false; 1432 1433 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1434 TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 1435 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 1436 return true; 1437 } 1438 1439 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, 1440 const GCNSubtarget &ST) { 1441 if (!ST.hasLdsBranchVmemWARHazard()) 1442 return false; 1443 1444 // Check if the necessary condition for the hazard is met: both LDS and VMEM 1445 // instructions need to appear in the same function. 
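  // This whole-function scan runs once; the result is cached by the
  // constructor in RunLdsBranchVmemWARHazardFixup so the per-instruction
  // fixup can bail out cheaply.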
1446 bool HasLds = false; 1447 bool HasVmem = false; 1448 for (auto &MBB : MF) { 1449 for (auto &MI : MBB) { 1450 HasLds |= SIInstrInfo::isDS(MI); 1451 HasVmem |= 1452 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI); 1453 if (HasLds && HasVmem) 1454 return true; 1455 } 1456 } 1457 return false; 1458 } 1459 1460 static bool isStoreCountWaitZero(const MachineInstr &I) { 1461 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && 1462 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && 1463 !I.getOperand(1).getImm(); 1464 } 1465 1466 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { 1467 if (!RunLdsBranchVmemWARHazardFixup) 1468 return false; 1469 1470 assert(ST.hasLdsBranchVmemWARHazard()); 1471 assert(!ST.hasExtendedWaitCounts()); 1472 1473 auto IsHazardInst = [](const MachineInstr &MI) { 1474 if (SIInstrInfo::isDS(MI)) 1475 return 1; 1476 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) 1477 return 2; 1478 return 0; 1479 }; 1480 1481 auto InstType = IsHazardInst(*MI); 1482 if (!InstType) 1483 return false; 1484 1485 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { 1486 return IsHazardInst(I) || isStoreCountWaitZero(I); 1487 }; 1488 1489 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { 1490 if (!I.isBranch()) 1491 return false; 1492 1493 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { 1494 auto InstType2 = IsHazardInst(I); 1495 return InstType2 && InstType != InstType2; 1496 }; 1497 1498 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { 1499 auto InstType2 = IsHazardInst(I); 1500 if (InstType == InstType2) 1501 return true; 1502 1503 return isStoreCountWaitZero(I); 1504 }; 1505 1506 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) != 1507 std::numeric_limits<int>::max(); 1508 }; 1509 1510 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1511 std::numeric_limits<int>::max()) 1512 return false; 1513 1514 const SIInstrInfo *TII = ST.getInstrInfo(); 1515 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1516 TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1517 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1518 .addImm(0); 1519 1520 return true; 1521 } 1522 1523 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { 1524 if (!SIInstrInfo::isLDSDIR(*MI)) 1525 return false; 1526 1527 const int NoHazardWaitStates = 15; 1528 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1529 const Register VDSTReg = VDST->getReg(); 1530 1531 bool VisitedTrans = false; 1532 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { 1533 if (!SIInstrInfo::isVALU(I)) 1534 return false; 1535 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I); 1536 // Cover both WAR and WAW 1537 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1538 }; 1539 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { 1540 if (WaitStates >= NoHazardWaitStates) 1541 return true; 1542 // Instructions which cause va_vdst==0 expire hazard 1543 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1544 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I); 1545 }; 1546 auto GetWaitStatesFn = [](const MachineInstr &MI) { 1547 return SIInstrInfo::isVALU(MI) ? 
1 : 0; 1548 }; 1549 1550 DenseSet<const MachineBasicBlock *> Visited; 1551 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 1552 std::next(MI->getReverseIterator()), 0, 1553 IsExpiredFn, Visited, GetWaitStatesFn); 1554 1555 // Transcendentals can execute in parallel to other VALUs. 1556 // This makes va_vdst count unusable with a mixture of VALU and TRANS. 1557 if (VisitedTrans) 1558 Count = 0; 1559 1560 MachineOperand *WaitVdstOp = 1561 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst); 1562 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates)); 1563 1564 return true; 1565 } 1566 1567 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { 1568 if (!SIInstrInfo::isLDSDIR(*MI)) 1569 return false; 1570 1571 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1572 const Register VDSTReg = VDST->getReg(); 1573 1574 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { 1575 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) && 1576 !SIInstrInfo::isDS(I)) 1577 return false; 1578 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1579 }; 1580 bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); 1581 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT 1582 // according to the type of VMEM instruction. 1583 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { 1584 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || 1585 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || 1586 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1587 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) || 1588 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) && 1589 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm()); 1590 }; 1591 1592 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1593 std::numeric_limits<int>::max()) 1594 return false; 1595 1596 if (LdsdirCanWait) { 1597 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0); 1598 } else { 1599 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1600 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1601 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); 1602 } 1603 1604 return true; 1605 } 1606 1607 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { 1608 if (!ST.hasVALUPartialForwardingHazard()) 1609 return false; 1610 assert(!ST.hasExtendedWaitCounts()); 1611 1612 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI)) 1613 return false; 1614 1615 SmallSetVector<Register, 4> SrcVGPRs; 1616 1617 for (const MachineOperand &Use : MI->explicit_uses()) { 1618 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 1619 SrcVGPRs.insert(Use.getReg()); 1620 } 1621 1622 // Only applies with >= 2 unique VGPR sources 1623 if (SrcVGPRs.size() <= 1) 1624 return false; 1625 1626 // Look for the following pattern: 1627 // Va <- VALU [PreExecPos] 1628 // intv1 1629 // Exec <- SALU [ExecPos] 1630 // intv2 1631 // Vb <- VALU [PostExecPos] 1632 // intv3 1633 // MI Va, Vb (WaitState = 0) 1634 // 1635 // Where: 1636 // intv1 + intv2 <= 2 VALUs 1637 // intv3 <= 4 VALUs 1638 // 1639 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 
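  // The backward walk below counts VALUs seen since MI; DefPos and ExecPos
  // record those counts for the VGPR defs and the EXEC write respectively.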
1640 1641 const int Intv1plus2MaxVALUs = 2; 1642 const int Intv3MaxVALUs = 4; 1643 const int IntvMaxVALUs = 6; 1644 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2; 1645 1646 struct StateType { 1647 SmallDenseMap<Register, int, 4> DefPos; 1648 int ExecPos = std::numeric_limits<int>::max(); 1649 int VALUs = 0; 1650 }; 1651 1652 StateType State; 1653 1654 // This overloads expiry testing with all the hazard detection 1655 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { 1656 // Too many VALU states have passed 1657 if (State.VALUs > NoHazardVALUWaitStates) 1658 return HazardExpired; 1659 1660 // Instructions which cause va_vdst==0 expire hazard 1661 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1662 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || 1663 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1664 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) 1665 return HazardExpired; 1666 1667 // Track registers writes 1668 bool Changed = false; 1669 if (SIInstrInfo::isVALU(I)) { 1670 for (Register Src : SrcVGPRs) { 1671 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) { 1672 State.DefPos[Src] = State.VALUs; 1673 Changed = true; 1674 } 1675 } 1676 } else if (SIInstrInfo::isSALU(I)) { 1677 if (State.ExecPos == std::numeric_limits<int>::max()) { 1678 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) { 1679 State.ExecPos = State.VALUs; 1680 Changed = true; 1681 } 1682 } 1683 } 1684 1685 // Early expiration: too many VALUs in intv3 1686 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty()) 1687 return HazardExpired; 1688 1689 // Only evaluate state if something changed 1690 if (!Changed) 1691 return NoHazardFound; 1692 1693 // Determine positions of VALUs pre/post exec change 1694 if (State.ExecPos == std::numeric_limits<int>::max()) 1695 return NoHazardFound; 1696 1697 int PreExecPos = std::numeric_limits<int>::max(); 1698 int PostExecPos = std::numeric_limits<int>::max(); 1699 1700 for (auto Entry : State.DefPos) { 1701 int DefVALUs = Entry.second; 1702 if (DefVALUs != std::numeric_limits<int>::max()) { 1703 if (DefVALUs >= State.ExecPos) 1704 PreExecPos = std::min(PreExecPos, DefVALUs); 1705 else 1706 PostExecPos = std::min(PostExecPos, DefVALUs); 1707 } 1708 } 1709 1710 // Need a VALUs post exec change 1711 if (PostExecPos == std::numeric_limits<int>::max()) 1712 return NoHazardFound; 1713 1714 // Too many VALUs in intv3? 1715 int Intv3VALUs = PostExecPos; 1716 if (Intv3VALUs > Intv3MaxVALUs) 1717 return HazardExpired; 1718 1719 // Too many VALUs in intv2? 1720 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1; 1721 if (Intv2VALUs > Intv1plus2MaxVALUs) 1722 return HazardExpired; 1723 1724 // Need a VALUs pre exec change 1725 if (PreExecPos == std::numeric_limits<int>::max()) 1726 return NoHazardFound; 1727 1728 // Too many VALUs in intv1? 
1729 int Intv1VALUs = PreExecPos - State.ExecPos; 1730 if (Intv1VALUs > Intv1plus2MaxVALUs) 1731 return HazardExpired; 1732 1733 // Too many VALUs in intv1 + intv2? 1734 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs) 1735 return HazardExpired; 1736 1737 return HazardFound; 1738 }; 1739 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { 1740 if (SIInstrInfo::isVALU(MI)) 1741 State.VALUs += 1; 1742 }; 1743 1744 DenseSet<const MachineBasicBlock *> Visited; 1745 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(), 1746 std::next(MI->getReverseIterator()), Visited)) 1747 return false; 1748 1749 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1750 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1751 .addImm(0x0fff); 1752 1753 return true; 1754 } 1755 1756 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { 1757 if (!ST.hasVALUTransUseHazard()) 1758 return false; 1759 assert(!ST.hasExtendedWaitCounts()); 1760 1761 if (!SIInstrInfo::isVALU(*MI)) 1762 return false; 1763 1764 SmallSet<Register, 4> SrcVGPRs; 1765 1766 for (const MachineOperand &Use : MI->explicit_uses()) { 1767 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 1768 SrcVGPRs.insert(Use.getReg()); 1769 } 1770 1771 // Look for the following pattern: 1772 // Va <- TRANS VALU 1773 // intv 1774 // MI Va (WaitState = 0) 1775 // 1776 // Where: 1777 // intv <= 5 VALUs / 1 TRANS 1778 // 1779 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 1780 1781 const int IntvMaxVALUs = 5; 1782 const int IntvMaxTRANS = 1; 1783 1784 struct StateType { 1785 int VALUs = 0; 1786 int TRANS = 0; 1787 }; 1788 1789 StateType State; 1790 1791 // This lambda overloads expiry testing with all of the hazard detection. 1792 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { 1793 // Too many VALU states have passed 1794 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS) 1795 return HazardExpired; 1796 1797 // Instructions which cause va_vdst==0 expire hazard 1798 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1799 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || 1800 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1801 I.getOperand(0).getImm() == 0x0fff)) 1802 return HazardExpired; 1803 1804 // Check for a TRANS write to one of the source VGPRs 1805 if (SIInstrInfo::isTRANS(I)) { 1806 for (Register Src : SrcVGPRs) { 1807 if (I.modifiesRegister(Src, &TRI)) { 1808 return HazardFound; 1809 } 1810 } 1811 } 1812 1813 return NoHazardFound; 1814 }; 1815 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { 1816 if (SIInstrInfo::isVALU(MI)) 1817 State.VALUs += 1; 1818 if (SIInstrInfo::isTRANS(MI)) 1819 State.TRANS += 1; 1820 }; 1821 1822 DenseSet<const MachineBasicBlock *> Visited; 1823 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(), 1824 std::next(MI->getReverseIterator()), Visited)) 1825 return false; 1826 1827 // Hazard is observed - insert a wait on the va_vdst counter to ensure the 1828 // hazard is avoided.
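// For example (hypothetical sequence; opcodes and registers are illustrative):
//   v_exp_f32 v1, v2         ; Va <- TRANS
//   v_fma_f32 v3, v1, v4, v5 ; MI reads Va with no intervening va_vdst wait
// The instruction built below is equivalent to assembly of the form
// "s_waitcnt_depctr va_vdst(0)" (exact printed syntax depends on the target).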
1829 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1830 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1831 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); 1832 1833 return true; 1834 } 1835 1836 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { 1837 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) 1838 return false; 1839 1840 const SIInstrInfo *TII = ST.getInstrInfo(); 1841 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1842 1843 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { 1844 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) 1845 return false; 1846 1847 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps 1848 // with the dest(matrix D) of the previous wmma. 1849 const Register CurSrc0Reg = 1850 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); 1851 const Register CurSrc1Reg = 1852 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); 1853 1854 const Register PrevDstReg = 1855 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); 1856 1857 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) || 1858 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) { 1859 return true; 1860 } 1861 1862 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) 1863 // but Index can't overlap with PrevDstReg. 1864 if (AMDGPU::isGFX12Plus(ST)) { 1865 if (SIInstrInfo::isSWMMAC(*MI)) { 1866 const Register CurIndex = 1867 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); 1868 if (TRI->regsOverlap(PrevDstReg, CurIndex)) 1869 return true; 1870 } 1871 return false; 1872 } 1873 1874 return false; 1875 }; 1876 1877 auto IsExpiredFn = [](const MachineInstr &I, int) { 1878 return SIInstrInfo::isVALU(I); 1879 }; 1880 1881 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1882 std::numeric_limits<int>::max()) 1883 return false; 1884 1885 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); 1886 1887 return true; 1888 } 1889 1890 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { 1891 if (!ST.hasShift64HighRegBug()) 1892 return false; 1893 assert(!ST.hasExtendedWaitCounts()); 1894 1895 switch (MI->getOpcode()) { 1896 default: 1897 return false; 1898 case AMDGPU::V_LSHLREV_B64_e64: 1899 case AMDGPU::V_LSHRREV_B64_e64: 1900 case AMDGPU::V_ASHRREV_I64_e64: 1901 break; 1902 } 1903 1904 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0); 1905 if (!Amt->isReg()) 1906 return false; 1907 1908 Register AmtReg = Amt->getReg(); 1909 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1910 // Check if this is a last VGPR in the allocation block. 1911 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7) 1912 return false; 1913 1914 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) 1915 return false; 1916 1917 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); 1918 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); 1919 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); 1920 bool Overlapped = OverlappedSrc || OverlappedDst; 1921 1922 assert(!OverlappedDst || !OverlappedSrc || 1923 Src1->getReg() == MI->getOperand(0).getReg()); 1924 assert(ST.needsAlignedVGPRs()); 1925 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); 1926 1927 Register NewReg; 1928 for (MCRegister Reg : Overlapped ? 
AMDGPU::VReg_64_Align2RegClass 1929 : AMDGPU::VGPR_32RegClass) { 1930 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) { 1931 NewReg = Reg; 1932 break; 1933 } 1934 } 1935 1936 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1) 1937 : NewReg; 1938 Register NewAmtLo; 1939 1940 if (Overlapped) 1941 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0); 1942 1943 DebugLoc DL = MI->getDebugLoc(); 1944 MachineBasicBlock *MBB = MI->getParent(); 1945 // Insert a full wait count because the chosen register might have a pending wait. 1946 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT)) 1947 .addImm(0); 1948 1949 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them. 1950 if (Overlapped) 1951 runOnInstruction( 1952 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo) 1953 .addDef(AmtReg - 1) 1954 .addReg(AmtReg - 1, RegState::Undef) 1955 .addReg(NewAmtLo, RegState::Undef)); 1956 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt) 1957 .addDef(AmtReg) 1958 .addReg(AmtReg, RegState::Undef) 1959 .addReg(NewAmt, RegState::Undef)); 1960 1961 // Instructions emitted after the current instruction will be processed by the 1962 // parent loop of the hazard recognizer in a natural way. 1963 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32), 1964 AmtReg) 1965 .addDef(NewAmt) 1966 .addReg(NewAmt) 1967 .addReg(AmtReg); 1968 if (Overlapped) 1969 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32), 1970 AmtReg - 1) 1971 .addDef(NewAmtLo) 1972 .addReg(NewAmtLo) 1973 .addReg(AmtReg - 1); 1974 1975 // Re-running the hazard recognizer on the modified instruction is not necessary: 1976 // the inserted V_SWAP_B32 instructions have already both read and written the new 1977 // registers, so hazards related to these registers have already been handled. 1978 Amt->setReg(NewAmt); 1979 Amt->setIsKill(false); 1980 // We do not update liveness, so the verifier may see it as undef.
1981 Amt->setIsUndef(); 1982 if (OverlappedDst) 1983 MI->getOperand(0).setReg(NewReg); 1984 if (OverlappedSrc) { 1985 Src1->setReg(NewReg); 1986 Src1->setIsKill(false); 1987 Src1->setIsUndef(); 1988 } 1989 1990 return true; 1991 } 1992 1993 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { 1994 int NSAtoVMEMWaitStates = 1; 1995 1996 if (!ST.hasNSAtoVMEMBug()) 1997 return 0; 1998 1999 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI)) 2000 return 0; 2001 2002 const SIInstrInfo *TII = ST.getInstrInfo(); 2003 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 2004 if (!Offset || (Offset->getImm() & 6) == 0) 2005 return 0; 2006 2007 auto IsHazardFn = [TII](const MachineInstr &I) { 2008 if (!SIInstrInfo::isMIMG(I)) 2009 return false; 2010 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode()); 2011 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && 2012 TII->getInstSizeInBytes(I) >= 16; 2013 }; 2014 2015 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); 2016 } 2017 2018 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { 2019 int FPAtomicToDenormModeWaitStates = 3; 2020 2021 if (!ST.hasFPAtomicToDenormModeHazard()) 2022 return 0; 2023 assert(!ST.hasExtendedWaitCounts()); 2024 2025 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) 2026 return 0; 2027 2028 auto IsHazardFn = [](const MachineInstr &I) { 2029 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I)) 2030 return false; 2031 return SIInstrInfo::isFPAtomic(I); 2032 }; 2033 2034 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { 2035 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) 2036 return true; 2037 2038 switch (MI.getOpcode()) { 2039 case AMDGPU::S_WAITCNT: 2040 case AMDGPU::S_WAITCNT_VSCNT: 2041 case AMDGPU::S_WAITCNT_VMCNT: 2042 case AMDGPU::S_WAITCNT_EXPCNT: 2043 case AMDGPU::S_WAITCNT_LGKMCNT: 2044 case AMDGPU::S_WAIT_IDLE: 2045 return true; 2046 default: 2047 break; 2048 } 2049 2050 return false; 2051 }; 2052 2053 return FPAtomicToDenormModeWaitStates - 2054 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); 2055 } 2056 2057 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { 2058 assert(SIInstrInfo::isMAI(*MI)); 2059 2060 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); 2061 } 2062 2063 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { 2064 // Early exit if no padding is requested. 
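// The padding target computed below is
//   NeighborMFMALatency * MFMAPaddingRatio / 100
// e.g. (hypothetical values) a padding ratio of 50 and a neighboring MFMA with
// a 16 wait-state pipeline give a target of 8 wait states, reduced by however
// many wait states have already elapsed since that MFMA.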
2065 if (MFMAPaddingRatio == 0) 2066 return 0; 2067 2068 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2069 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2) 2070 return 0; 2071 2072 int NeighborMFMALatency = 0; 2073 auto IsNeighboringMFMA = [&NeighborMFMALatency, 2074 this](const MachineInstr &MI) { 2075 if (!SIInstrInfo::isMFMA(MI)) 2076 return false; 2077 2078 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); 2079 return true; 2080 }; 2081 2082 const int MaxMFMAPipelineWaitStates = 16; 2083 int WaitStatesSinceNeighborMFMA = 2084 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates); 2085 2086 int NeighborMFMAPaddingNeeded = 2087 (NeighborMFMALatency * MFMAPaddingRatio / 100) - 2088 WaitStatesSinceNeighborMFMA; 2089 2090 return std::max(0, NeighborMFMAPaddingNeeded); 2091 } 2092 2093 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { 2094 int WaitStatesNeeded = 0; 2095 unsigned Opc = MI->getOpcode(); 2096 2097 auto IsVALUFn = [](const MachineInstr &MI) { 2098 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm(); 2099 }; 2100 2101 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write 2102 const int LegacyVALUWritesVGPRWaitStates = 2; 2103 const int VALUWritesExecWaitStates = 4; 2104 const int MaxWaitStates = 4; 2105 2106 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2107 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates); 2108 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2109 2110 if (WaitStatesNeeded < MaxWaitStates) { 2111 for (const MachineOperand &Use : MI->explicit_uses()) { 2112 const int MaxWaitStates = 2; 2113 2114 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 2115 continue; 2116 2117 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - 2118 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates); 2119 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2120 2121 if (WaitStatesNeeded == MaxWaitStates) 2122 break; 2123 } 2124 } 2125 } 2126 2127 for (const MachineOperand &Op : MI->explicit_operands()) { 2128 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) 2129 continue; 2130 2131 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2132 continue; 2133 2134 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; 2135 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; 2136 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; 2137 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; 2138 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; 2139 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; 2140 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; 2141 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; 2142 const int MaxWaitStates = 18; 2143 Register Reg = Op.getReg(); 2144 unsigned HazardDefLatency = 0; 2145 2146 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, 2147 this](const MachineInstr &MI) { 2148 if (!SIInstrInfo::isMFMA(MI)) 2149 return false; 2150 Register DstReg = MI.getOperand(0).getReg(); 2151 if (DstReg == Reg) 2152 return false; 2153 HazardDefLatency = 2154 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2155 return TRI.regsOverlap(DstReg, Reg); 2156 }; 2157 2158 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, 2159 MaxWaitStates); 2160 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; 2161 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2162 int OpNo = Op.getOperandNo(); 
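// Select the wait-state requirement for this operand below: for an MFMA
// consumer, src2 (SrcC) needs fewer waits than the A/B operands, while for
// v_accvgpr_read/v_accvgpr_write the requirement scales with the producing
// MFMA's latency (2/8/16 passes).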
2163 if (OpNo == SrcCIdx) { 2164 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; 2165 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) { 2166 switch (HazardDefLatency) { 2167 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; 2168 break; 2169 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; 2170 break; 2171 case 16: [[fallthrough]]; 2172 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; 2173 break; 2174 } 2175 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2176 switch (HazardDefLatency) { 2177 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; 2178 break; 2179 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; 2180 break; 2181 case 16: [[fallthrough]]; 2182 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; 2183 break; 2184 } 2185 } 2186 2187 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2188 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2189 2190 if (WaitStatesNeeded == MaxWaitStates) 2191 return WaitStatesNeeded; // Early exit. 2192 2193 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) { 2194 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2195 return false; 2196 Register DstReg = MI.getOperand(0).getReg(); 2197 return TRI.regsOverlap(Reg, DstReg); 2198 }; 2199 2200 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; 2201 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; 2202 const int AccVGPRWriteAccVgprReadWaitStates = 3; 2203 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; 2204 if (OpNo == SrcCIdx) 2205 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; 2206 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) 2207 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; 2208 2209 WaitStatesNeededForUse = NeedWaitStates - 2210 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates); 2211 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2212 2213 if (WaitStatesNeeded == MaxWaitStates) 2214 return WaitStatesNeeded; // Early exit. 2215 } 2216 2217 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2218 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; 2219 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; 2220 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; 2221 const int MaxWaitStates = 13; 2222 Register DstReg = MI->getOperand(0).getReg(); 2223 unsigned HazardDefLatency = 0; 2224 2225 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, 2226 this](const MachineInstr &MI) { 2227 if (!SIInstrInfo::isMFMA(MI)) 2228 return false; 2229 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); 2230 HazardDefLatency = 2231 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2232 return TRI.regsOverlap(Reg, DstReg); 2233 }; 2234 2235 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates); 2236 int NeedWaitStates; 2237 switch (HazardDefLatency) { 2238 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; 2239 break; 2240 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; 2241 break; 2242 case 16: [[fallthrough]]; 2243 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; 2244 break; 2245 } 2246 2247 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; 2248 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2249 } 2250 2251 // Pad neighboring MFMA with noops for better inter-wave performance. 
2252 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2253 2254 return WaitStatesNeeded; 2255 } 2256 2257 static int 2258 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, 2259 bool IsGFX950) { 2260 // xdl def cycles | gfx940 | gfx950 2261 // 2 pass | 3 4 2262 // 4 pass | 5 6 2263 // 8 pass | 9 10 2264 // 16 pass | 17 18 2265 return NumPasses + 1 + IsGFX950; 2266 } 2267 2268 static int 2269 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, 2270 bool IsGFX950) { 2271 // xdl def cycles | gfx940 | gfx950 2272 // 2 pass | 3 3 2273 // 4 pass | 5 6 2274 // 8 pass | 9 10 2275 // 16 pass | 17 18 2276 return NumPasses + 1 + (NumPasses != 2 && IsGFX950); 2277 } 2278 2279 static int 2280 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) { 2281 // 2 pass -> 2 2282 // 4 pass -> 4 2283 // 8 pass -> 8 2284 // 16 pass -> 16 2285 return NumPasses; 2286 } 2287 2288 static int 2289 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2290 // 2 pass -> 4 2291 // 4 pass -> 6 2292 // 8 pass -> 10 2293 // 16 pass -> 18 2294 return NumPasses + 2; 2295 } 2296 2297 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2298 // 2 pass -> 5 2299 // 4 pass -> 7 2300 // 8 pass -> 11 2301 // 16 pass -> 19 2302 return NumPasses + 3; 2303 } 2304 2305 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { 2306 int WaitStatesNeeded = 0; 2307 unsigned Opc = MI->getOpcode(); 2308 2309 auto IsLegacyVALUFn = [](const MachineInstr &MI) { 2310 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); 2311 }; 2312 2313 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { 2314 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && 2315 !SIInstrInfo::isDOT(MI); 2316 }; 2317 2318 if (!SIInstrInfo::isMFMA(*MI)) 2319 return WaitStatesNeeded; 2320 2321 const int VALUWritesExecWaitStates = 4; 2322 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2323 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn, 2324 VALUWritesExecWaitStates); 2325 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2326 2327 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2328 2329 // Loop for both DGEMM and S/HGEMM 2nd instruction. 
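// As a worked example (hypothetical producer/consumer pair): on gfx940, an
// 8-pass XDL MFMA writing a register that a later XDL MFMA consumes as src2
// requires GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(8,
// false) == 9 wait states, matching the table above.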
2330 for (const MachineOperand &Use : MI->explicit_uses()) { 2331 const int LegacyVALUNotDotWritesVGPRWaitStates = 2; 2332 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; 2333 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; 2334 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; 2335 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; 2336 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; 2337 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; 2338 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; 2339 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17; 2340 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; 2341 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; 2342 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; 2343 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; 2344 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; 2345 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; 2346 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19; 2347 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; 2348 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; 2349 const int MaxWaitStates = 19; 2350 2351 if (!Use.isReg()) 2352 continue; 2353 Register Reg = Use.getReg(); 2354 bool FullReg; 2355 const MachineInstr *MI1; 2356 2357 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, 2358 this](const MachineInstr &MI) { 2359 if (!SIInstrInfo::isMFMA(MI)) 2360 return false; 2361 Register DstReg = MI.getOperand(0).getReg(); 2362 FullReg = (DstReg == Reg); 2363 MI1 = &MI; 2364 return TRI.regsOverlap(DstReg, Reg); 2365 }; 2366 2367 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - 2368 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates); 2369 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2370 2371 int NumWaitStates = 2372 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates); 2373 if (NumWaitStates == std::numeric_limits<int>::max()) 2374 continue; 2375 2376 int OpNo = Use.getOperandNo(); 2377 unsigned Opc1 = MI1->getOpcode(); 2378 int NeedWaitStates = 0; 2379 if (OpNo == SrcCIdx) { 2380 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) { 2381 NeedWaitStates = 0; 2382 } else if (FullReg) { 2383 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2384 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && 2385 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2386 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) 2387 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; 2388 else if (ST.hasGFX940Insts() && 2389 TSchedModel.computeInstrLatency(MI1) == 2) 2390 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; 2391 } else { 2392 switch (Opc1) { 2393 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2394 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2395 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2396 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2397 if (!isXDL(ST, *MI)) 2398 NeedWaitStates = 2399 ST.hasGFX950Insts() 2400 ? 
GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates 2401 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates; 2402 break; 2403 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2404 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2405 if (!isXDL(ST, *MI)) 2406 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; 2407 break; 2408 default: 2409 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2410 if (ST.hasGFX940Insts()) { 2411 if (isXDL(ST, *MI) && !isXDL(ST, *MI1)) 2412 break; 2413 2414 NeedWaitStates = 2415 isXDL(ST, *MI1) 2416 ? (isXDL(ST, *MI) 2417 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates( 2418 NumPasses, ST.hasGFX950Insts()) 2419 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates( 2420 NumPasses, ST.hasGFX950Insts())) 2421 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( 2422 NumPasses); 2423 break; 2424 } 2425 2426 switch (NumPasses) { 2427 case 2: 2428 NeedWaitStates = 2429 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates 2430 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; 2431 break; 2432 case 8: 2433 NeedWaitStates = 2434 isDGEMM(Opc) 2435 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates 2436 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; 2437 break; 2438 case 16: 2439 NeedWaitStates = 2440 isDGEMM(Opc) 2441 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates 2442 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; 2443 break; 2444 default: 2445 llvm_unreachable("unexpected number of passes"); 2446 } 2447 } 2448 } 2449 } else { 2450 switch (Opc1) { 2451 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2452 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2453 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2454 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2455 NeedWaitStates = 2456 ST.hasGFX950Insts() 2457 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates 2458 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; 2459 break; 2460 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2461 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2462 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; 2463 break; 2464 default: 2465 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2466 2467 if (ST.hasGFX940Insts()) { 2468 NeedWaitStates = 2469 isXDL(ST, *MI1) 2470 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates( 2471 NumPasses) 2472 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates( 2473 NumPasses); 2474 break; 2475 } 2476 2477 switch (NumPasses) { 2478 case 2: 2479 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; 2480 break; 2481 case 4: 2482 llvm_unreachable("unexpected number of passes for mfma"); 2483 case 8: 2484 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; 2485 break; 2486 case 16: 2487 default: 2488 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; 2489 } 2490 } 2491 } 2492 if (WaitStatesNeeded >= NeedWaitStates) 2493 continue; 2494 2495 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; 2496 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2497 2498 if (WaitStatesNeeded == MaxWaitStates) 2499 break; 2500 } 2501 2502 // Pad neighboring MFMA with noops for better inter-wave performance. 
2503 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2504 2505 return WaitStatesNeeded; 2506 } 2507 2508 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { 2509 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards() 2510 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) 2511 return 0; 2512 2513 int WaitStatesNeeded = 0; 2514 2515 auto IsAccVgprReadFn = [](const MachineInstr &MI) { 2516 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; 2517 }; 2518 2519 for (const MachineOperand &Op : MI->explicit_uses()) { 2520 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) 2521 continue; 2522 2523 Register Reg = Op.getReg(); 2524 2525 const int AccVgprReadLdStWaitStates = 2; 2526 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1; 2527 const int MaxWaitStates = 2; 2528 2529 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - 2530 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates); 2531 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2532 2533 if (WaitStatesNeeded == MaxWaitStates) 2534 return WaitStatesNeeded; // Early exit. 2535 2536 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) { 2537 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && 2538 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2539 return false; 2540 auto IsVALUFn = [](const MachineInstr &MI) { 2541 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI); 2542 }; 2543 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) < 2544 std::numeric_limits<int>::max(); 2545 }; 2546 2547 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates - 2548 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates); 2549 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2550 } 2551 2552 return WaitStatesNeeded; 2553 } 2554 2555 int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) { 2556 assert(!ST.hasVcmpxPermlaneHazard() && 2557 "this is a different vcmpx+permlane hazard"); 2558 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 2559 const SIInstrInfo *TII = ST.getInstrInfo(); 2560 2561 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) { 2562 return isVCmpXWritesExec(*TII, *TRI, MI); 2563 }; 2564 2565 auto IsVALUFn = [](const MachineInstr &MI) { 2566 return SIInstrInfo::isVALU(MI); 2567 }; 2568 2569 const int VCmpXWritesExecWaitStates = 4; 2570 const int VALUWritesVDstWaitStates = 2; 2571 int WaitStatesNeeded = 0; 2572 2573 for (const MachineOperand &Op : MI->explicit_uses()) { 2574 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg())) 2575 continue; 2576 Register Reg = Op.getReg(); 2577 2578 int WaitStatesSinceDef = 2579 VALUWritesVDstWaitStates - 2580 getWaitStatesSinceDef(Reg, IsVALUFn, 2581 /*MaxWaitStates=*/VALUWritesVDstWaitStates); 2582 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef); 2583 if (WaitStatesNeeded >= VALUWritesVDstWaitStates) 2584 break; 2585 } 2586 2587 int VCmpXHazardWaits = 2588 VCmpXWritesExecWaitStates - 2589 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates); 2590 2591 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits); 2592 return WaitStatesNeeded; 2593 } 2594 2595 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2596 // 2 pass -> 4 2597 // 4 pass -> 6 2598 // 8 pass -> 10 2599 // 16 pass -> 18 2600 return NumPasses + 2; 2601 } 2602 2603 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2604 // 2 pass -> 5 2605 // 4 pass -> 7 2606 
// 8 pass -> 11 2607 // 16 pass -> 19 2608 return NumPasses + 3; 2609 } 2610 2611 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { 2612 // 2 pass -> 5 2613 // 4 pass -> 7 2614 // 8 pass -> 11 2615 // 16 pass -> 19 2616 return NumPasses + 3; 2617 } 2618 2619 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { 2620 // 2 pass -> 4 2621 // 4 pass -> 6 2622 // 8 pass -> 10 2623 // 16 pass -> 18 2624 return NumPasses + 2; 2625 } 2626 2627 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { 2628 if (!ST.hasGFX90AInsts()) 2629 return 0; 2630 2631 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { 2632 return isDGEMM(MI.getOpcode()); 2633 }; 2634 2635 // This is checked in checkMAIHazards90A() 2636 if (SIInstrInfo::isMFMA(*MI)) 2637 return 0; 2638 2639 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2640 2641 int WaitStatesNeeded = 0; 2642 2643 bool IsMem = SIInstrInfo::isVMEM(*MI) || 2644 SIInstrInfo::isFLAT(*MI) || 2645 SIInstrInfo::isDS(*MI); 2646 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI); 2647 bool IsVALU = SIInstrInfo::isVALU(*MI); 2648 2649 const MachineInstr *MFMA = nullptr; 2650 unsigned Reg; 2651 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) { 2652 if (!SIInstrInfo::isMFMA(MI) || 2653 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) 2654 return false; 2655 MFMA = &MI; 2656 return true; 2657 }; 2658 2659 const MachineInstr *DOT = nullptr; 2660 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) { 2661 if (!SIInstrInfo::isDOT(MI) || 2662 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) 2663 return false; 2664 DOT = &MI; 2665 return true; 2666 }; 2667 2668 bool DGEMMAfterVALUWrite = false; 2669 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) { 2670 // Found DGEMM on reverse traversal to def. 2671 if (isDGEMM(MI.getOpcode())) 2672 DGEMMAfterVALUWrite = true; 2673 2674 // Only a hazard if the register is defined by a VALU and a DGEMM is found 2675 // after the def.
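// For example (hypothetical sequence; opcodes are illustrative only):
//   v_mov_b32 v0, s0             ; VALU writes v0
//   v_mfma_f64_16x16x4f64 ...    ; DGEMM issued in between
//   flat_store_dword v[2:3], v0  ; VMEM reads v0 and needs extra wait states
// See the GFX90A-specific workaround where this lambda is used below.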
2676 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite) 2677 return false; 2678 2679 return true; 2680 }; 2681 2682 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2683 AMDGPU::OpName::src2); 2684 2685 if (IsMemOrExport || IsVALU) { 2686 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; 2687 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; 2688 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; 2689 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; 2690 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; 2691 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; 2692 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11; 2693 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19; 2694 const int DotWriteSameDotReadSrcAB = 3; 2695 const int DotWriteDifferentVALURead = 3; 2696 const int DMFMABetweenVALUWriteVMEMRead = 2; 2697 const int MaxWaitStates = 19; 2698 2699 for (const MachineOperand &Use : MI->explicit_uses()) { 2700 if (!Use.isReg()) 2701 continue; 2702 Reg = Use.getReg(); 2703 2704 DOT = nullptr; 2705 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, 2706 MaxWaitStates); 2707 if (DOT) { 2708 int NeedWaitStates = 0; 2709 if (DOT->getOpcode() == MI->getOpcode()) { 2710 if (&Use - &MI->getOperand(0) != SrcCIdx) 2711 NeedWaitStates = DotWriteSameDotReadSrcAB; 2712 } else { 2713 NeedWaitStates = DotWriteDifferentVALURead; 2714 } 2715 2716 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2717 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2718 } 2719 2720 // Workaround for HW data hazard bug observed only in GFX90A. When there 2721 // is a DGEMM instruction in-between a VALU and a VMEM instruction it 2722 // causes the SQ to incorrectly not insert two wait states between the two 2723 // instructions needed to avoid data hazard. 2724 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) { 2725 DGEMMAfterVALUWrite = false; 2726 if (TRI.isVectorRegister(MRI, Reg)) { 2727 int WaitStatesNeededForUse = 2728 DMFMABetweenVALUWriteVMEMRead - 2729 getWaitStatesSinceDef(Reg, IsDGEMMHazard, 2730 DMFMABetweenVALUWriteVMEMRead); 2731 2732 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2733 } 2734 } 2735 2736 MFMA = nullptr; 2737 WaitStatesSinceDef = 2738 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); 2739 if (!MFMA) 2740 continue; 2741 2742 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2743 int NumPasses = HazardDefLatency; 2744 int NeedWaitStates = MaxWaitStates; 2745 2746 if (isDGEMM(MFMA->getOpcode())) { 2747 switch (HazardDefLatency) { 2748 case 4: 2749 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates 2750 : DMFMA4x4WriteVgprVALUReadWaitStates; 2751 break; 2752 case 8: 2753 case 16: 2754 NeedWaitStates = 2755 IsMemOrExport 2756 ? DMFMA16x16WriteVgprMemExpReadWaitStates 2757 : (ST.hasGFX950Insts() 2758 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates 2759 : DMFMA16x16WriteVgprVALUReadWaitStates); 2760 break; 2761 default: 2762 llvm_unreachable("unexpected dgemm"); 2763 } 2764 } else if (ST.hasGFX940Insts()) { 2765 NeedWaitStates = 2766 isXDL(ST, *MFMA) 2767 ? 
GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) 2768 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( 2769 NumPasses); 2770 } else { 2771 switch (HazardDefLatency) { 2772 case 2: 2773 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; 2774 break; 2775 case 8: 2776 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; 2777 break; 2778 case 16: 2779 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates; 2780 break; 2781 default: 2782 llvm_unreachable("unexpected number of passes for mfma"); 2783 } 2784 } 2785 2786 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2787 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2788 2789 if (WaitStatesNeeded == MaxWaitStates) 2790 break; 2791 } 2792 } 2793 2794 unsigned Opc = MI->getOpcode(); 2795 const int DMFMAToFMA64WaitStates = 2; 2796 if ((Opc == AMDGPU::V_FMA_F64_e64 || 2797 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || 2798 Opc == AMDGPU::V_FMAC_F64_dpp) && 2799 WaitStatesNeeded < DMFMAToFMA64WaitStates) { 2800 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - 2801 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates); 2802 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2803 } 2804 2805 if (!IsVALU && !IsMemOrExport) 2806 return WaitStatesNeeded; 2807 2808 for (const MachineOperand &Def : MI->defs()) { 2809 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; 2810 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; 2811 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; 2812 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; 2813 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; 2814 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; 2815 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; 2816 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; 2817 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; 2818 const int DotWriteDifferentVALUWrite = 3; 2819 const int MaxWaitStates = 19; 2820 const int MaxWarWaitStates = 15; 2821 2822 Reg = Def.getReg(); 2823 2824 DOT = nullptr; 2825 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, 2826 MaxWaitStates); 2827 if (DOT && DOT->getOpcode() != MI->getOpcode()) 2828 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite - 2829 WaitStatesSinceDef); 2830 2831 MFMA = nullptr; 2832 WaitStatesSinceDef = 2833 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); 2834 if (MFMA) { 2835 int NeedWaitStates = MaxWaitStates; 2836 int NumPasses = TSchedModel.computeInstrLatency(MFMA); 2837 2838 if (isDGEMM(MFMA->getOpcode())) { 2839 switch (NumPasses) { 2840 case 4: 2841 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; 2842 break; 2843 case 8: 2844 case 16: 2845 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates; 2846 break; 2847 default: 2848 llvm_unreachable("unexpected number of cycles for dgemm"); 2849 } 2850 } else if (ST.hasGFX940Insts()) { 2851 NeedWaitStates = 2852 isXDL(ST, *MFMA) 2853 ? 
GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses) 2854 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); 2855 } else { 2856 switch (NumPasses) { 2857 case 2: 2858 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; 2859 break; 2860 case 8: 2861 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; 2862 break; 2863 case 16: 2864 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates; 2865 break; 2866 default: 2867 llvm_unreachable("Unexpected number of passes for mfma"); 2868 } 2869 } 2870 2871 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2872 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2873 2874 if (WaitStatesNeeded == MaxWaitStates) 2875 break; 2876 } 2877 2878 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) { 2879 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) || 2880 !MI.readsRegister(Reg, &TRI)) 2881 return false; 2882 2883 if (ST.hasGFX940Insts() && !isXDL(ST, MI)) 2884 return false; 2885 2886 const MachineOperand *SrcC = 2887 TII.getNamedOperand(MI, AMDGPU::OpName::src2); 2888 assert(SrcC); 2889 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg)) 2890 return false; 2891 2892 MFMA = &MI; 2893 return true; 2894 }; 2895 2896 MFMA = nullptr; 2897 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn, 2898 MaxWarWaitStates); 2899 if (!MFMA) 2900 continue; 2901 2902 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2903 int NeedWaitStates = MaxWaitStates; 2904 switch (HazardDefLatency) { 2905 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; 2906 break; 2907 case 4: assert(ST.hasGFX940Insts()); 2908 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; 2909 break; 2910 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; 2911 break; 2912 case 16: [[fallthrough]]; 2913 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; 2914 break; 2915 } 2916 2917 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; 2918 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2919 } 2920 2921 return WaitStatesNeeded; 2922 } 2923 2924 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { 2925 if (!SU->isInstr()) 2926 return false; 2927 2928 const MachineInstr *MAI = nullptr; 2929 2930 auto IsMFMAFn = [&MAI](const MachineInstr &MI) { 2931 MAI = nullptr; 2932 if (SIInstrInfo::isMFMA(MI)) 2933 MAI = &MI; 2934 return MAI != nullptr; 2935 }; 2936 2937 MachineInstr *MI = SU->getInstr(); 2938 if (IsMFMAFn(*MI)) { 2939 int W = getWaitStatesSince(IsMFMAFn, 16); 2940 if (MAI) 2941 return W < (int)TSchedModel.computeInstrLatency(MAI); 2942 } 2943 2944 return false; 2945 } 2946 2947 // Adjust global offsets for instructions bundled with S_GETPC_B64 after 2948 // insertion of a new instruction. 2949 static void updateGetPCBundle(MachineInstr *NewMI) { 2950 if (!NewMI->isBundled()) 2951 return; 2952 2953 // Find start of bundle. 2954 auto I = NewMI->getIterator(); 2955 while (I->isBundledWithPred()) 2956 I--; 2957 if (I->isBundle()) 2958 I++; 2959 2960 // Bail if this is not an S_GETPC bundle. 2961 if (I->getOpcode() != AMDGPU::S_GETPC_B64) 2962 return; 2963 2964 // Update offsets of any references in the bundle. 
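// A typical bundle looks roughly like this (hypothetical symbol; the exact
// addends depend on the lowering):
//   s_getpc_b64 s[0:1]
//   s_add_u32  s0, s0, sym@rel32@lo+4
//   s_addc_u32 s1, s1, sym@rel32@hi+12
// Inserting a 4-byte s_waitcnt_depctr after s_getpc_b64 moves every later
// fixup 4 bytes further from the PC value it produces, so each global
// operand's offset is bumped by NewBytes below.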
2965 const unsigned NewBytes = 4; 2966 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 2967 "Unexpected instruction insertion in bundle"); 2968 auto NextMI = std::next(NewMI->getIterator()); 2969 auto End = NewMI->getParent()->end(); 2970 while (NextMI != End && NextMI->isBundledWithPred()) { 2971 for (auto &Operand : NextMI->operands()) { 2972 if (Operand.isGlobal()) 2973 Operand.setOffset(Operand.getOffset() + NewBytes); 2974 } 2975 NextMI++; 2976 } 2977 } 2978 2979 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { 2980 if (!ST.hasVALUMaskWriteHazard()) 2981 return false; 2982 assert(!ST.hasExtendedWaitCounts()); 2983 2984 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) 2985 return false; 2986 2987 // The hazard sequence is three instructions: 2988 // 1. VALU reads SGPR as mask 2989 // 2. SALU writes SGPR 2990 // 3. SALU reads SGPR 2991 // The hazard can expire if the distance between 2 and 3 is sufficient. 2992 // In practice this happens <10% of the time, hence this always assumes 2993 // the hazard exists if 1 and 2 are present to avoid searching. 2994 2995 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); 2996 if (!SDSTOp || !SDSTOp->isReg()) 2997 return false; 2998 2999 const Register HazardReg = SDSTOp->getReg(); 3000 if (HazardReg == AMDGPU::EXEC || 3001 HazardReg == AMDGPU::EXEC_LO || 3002 HazardReg == AMDGPU::EXEC_HI || 3003 HazardReg == AMDGPU::M0) 3004 return false; 3005 3006 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { 3007 switch (I.getOpcode()) { 3008 case AMDGPU::V_ADDC_U32_e32: 3009 case AMDGPU::V_ADDC_U32_dpp: 3010 case AMDGPU::V_CNDMASK_B16_e32: 3011 case AMDGPU::V_CNDMASK_B16_dpp: 3012 case AMDGPU::V_CNDMASK_B32_e32: 3013 case AMDGPU::V_CNDMASK_B32_dpp: 3014 case AMDGPU::V_DIV_FMAS_F32_e64: 3015 case AMDGPU::V_DIV_FMAS_F64_e64: 3016 case AMDGPU::V_SUBB_U32_e32: 3017 case AMDGPU::V_SUBB_U32_dpp: 3018 case AMDGPU::V_SUBBREV_U32_e32: 3019 case AMDGPU::V_SUBBREV_U32_dpp: 3020 // These implicitly read VCC as mask source. 3021 return HazardReg == AMDGPU::VCC || 3022 HazardReg == AMDGPU::VCC_LO || 3023 HazardReg == AMDGPU::VCC_HI; 3024 case AMDGPU::V_ADDC_U32_e64: 3025 case AMDGPU::V_ADDC_U32_e64_dpp: 3026 case AMDGPU::V_CNDMASK_B16_e64: 3027 case AMDGPU::V_CNDMASK_B16_e64_dpp: 3028 case AMDGPU::V_CNDMASK_B32_e64: 3029 case AMDGPU::V_CNDMASK_B32_e64_dpp: 3030 case AMDGPU::V_SUBB_U32_e64: 3031 case AMDGPU::V_SUBB_U32_e64_dpp: 3032 case AMDGPU::V_SUBBREV_U32_e64: 3033 case AMDGPU::V_SUBBREV_U32_e64_dpp: { 3034 // Only check mask register overlaps. 3035 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); 3036 assert(SSRCOp); 3037 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); 3038 } 3039 default: 3040 return false; 3041 } 3042 }; 3043 3044 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3045 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { 3046 // s_waitcnt_depctr sa_sdst(0) mitigates hazard. 3047 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 3048 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 3049 return true; 3050 3051 // VALU access to any SGPR or literal constant other than HazardReg 3052 // mitigates hazard. No need to check HazardReg here as this will 3053 // only be called when !IsHazardFn. 
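// For example (hypothetical): "v_add_nc_u32 v0, s8, v1" reads an SGPR and
// would expire the search, as would a VALU consuming a literal (non-inline)
// constant operand.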
3054 if (!SIInstrInfo::isVALU(I)) 3055 return false; 3056 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { 3057 const MachineOperand &Op = I.getOperand(OpNo); 3058 if (Op.isReg()) { 3059 Register OpReg = Op.getReg(); 3060 // Only consider uses 3061 if (!Op.isUse()) 3062 continue; 3063 // Ignore EXEC 3064 if (OpReg == AMDGPU::EXEC || 3065 OpReg == AMDGPU::EXEC_LO || 3066 OpReg == AMDGPU::EXEC_HI) 3067 continue; 3068 // Ignore all implicit uses except VCC 3069 if (Op.isImplicit()) { 3070 if (OpReg == AMDGPU::VCC || 3071 OpReg == AMDGPU::VCC_LO || 3072 OpReg == AMDGPU::VCC_HI) 3073 return true; 3074 continue; 3075 } 3076 if (TRI.isSGPRReg(MRI, OpReg)) 3077 return true; 3078 } else { 3079 const MCInstrDesc &InstDesc = I.getDesc(); 3080 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; 3081 if (!TII.isInlineConstant(Op, OpInfo)) 3082 return true; 3083 } 3084 } 3085 return false; 3086 }; 3087 3088 // Check for hazard 3089 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 3090 std::numeric_limits<int>::max()) 3091 return false; 3092 3093 auto NextMI = std::next(MI->getIterator()); 3094 3095 // Add s_waitcnt_depctr sa_sdst(0) after SALU write. 3096 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), 3097 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3098 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3099 3100 // SALU write may be s_getpc in a bundle. 3101 updateGetPCBundle(NewMI); 3102 3103 return true; 3104 } 3105 3106 // Return the numeric ID 0-63 of an 64b SGPR pair for a given SGPR. 3107 // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc 3108 static std::optional<unsigned> sgprPairNumber(Register Reg, 3109 const SIRegisterInfo &TRI) { 3110 switch (Reg) { 3111 case AMDGPU::M0: 3112 case AMDGPU::EXEC: 3113 case AMDGPU::EXEC_LO: 3114 case AMDGPU::EXEC_HI: 3115 case AMDGPU::SGPR_NULL: 3116 case AMDGPU::SGPR_NULL64: 3117 return {}; 3118 default: 3119 break; 3120 } 3121 unsigned RegN = TRI.getEncodingValue(Reg); 3122 if (RegN > 127) 3123 return {}; 3124 return (RegN >> 1) & 0x3f; 3125 } 3126 3127 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. 3128 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { 3129 assert(MMF == &MF); 3130 3131 // Assume non-empty vector means it has already been computed. 3132 if (!VALUReadHazardSGPRs.empty()) 3133 return; 3134 3135 auto CallingConv = MF.getFunction().getCallingConv(); 3136 bool IsCallFree = 3137 AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); 3138 3139 // Exhaustive search is only viable in non-caller/callee functions where 3140 // VALUs will be exposed to the hazard recognizer. 3141 UseVALUReadHazardExhaustiveSearch = 3142 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && 3143 MF.getInstructionCount() <= MaxExhaustiveHazardSearch; 3144 3145 // Consider all SGPRs hazards if the shader uses function calls or is callee. 3146 bool UseVALUUseCache = 3147 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; 3148 VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); 3149 if (!UseVALUUseCache) 3150 return; 3151 3152 // Perform a post ordered reverse scan to find VALUs which read an SGPR 3153 // before a SALU write to the same SGPR. This provides a reduction in 3154 // hazard insertion when all VALU access to an SGPR occurs after its last 3155 // SALU write, when compared to a linear scan. 
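// For example (hypothetical): if the only SALU write to s[4:5] precedes every
// VALU read of the pair, the reverse scan visits the reads before the write,
// so the pair is never marked (outside of cycles) and no sa_sdst wait is
// inserted for its uses.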
const MachineRegisterInfo &MRI = MF.getRegInfo(); 3157 BitVector SALUWriteSGPRs(64), ReadSGPRs(64); 3158 MachineCycleInfo CI; 3159 CI.compute(*MMF); 3160 3161 for (auto *MBB : post_order(&MF)) { 3162 bool InCycle = CI.getCycle(MBB) != nullptr; 3163 for (auto &MI : reverse(MBB->instrs())) { 3164 bool IsVALU = SIInstrInfo::isVALU(MI); 3165 bool IsSALU = SIInstrInfo::isSALU(MI); 3166 if (!IsVALU && !IsSALU) 3167 continue; 3168 3169 for (const MachineOperand &Op : MI.operands()) { 3170 if (!Op.isReg()) 3171 continue; 3172 Register Reg = Op.getReg(); 3173 assert(!Op.getSubReg()); 3174 // Only consider implicit operands of VCC. 3175 if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO || 3176 Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC)) 3177 continue; 3178 if (!TRI.isSGPRReg(MRI, Reg)) 3179 continue; 3180 auto RegN = sgprPairNumber(Reg, TRI); 3181 if (!RegN) 3182 continue; 3183 if (IsVALU && Op.isUse()) { 3184 // Note: any access within a cycle must be considered a hazard. 3185 if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN])) 3186 VALUReadHazardSGPRs.set(*RegN); 3187 ReadSGPRs.set(*RegN); 3188 } else if (IsSALU) { 3189 if (Op.isDef()) 3190 SALUWriteSGPRs.set(*RegN); 3191 else 3192 ReadSGPRs.set(*RegN); 3193 } 3194 } 3195 } 3196 } 3197 } 3198 3199 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) { 3200 if (!ST.hasVALUReadSGPRHazard()) 3201 return false; 3202 3203 // The hazard sequence is fundamentally three instructions: 3204 // 1. VALU reads SGPR 3205 // 2. SALU writes SGPR 3206 // 3. VALU/SALU reads SGPR 3207 // Try to avoid searching for (1) because the expiry point of the hazard is 3208 // indeterminate; however, the hazard between (2) and (3) can expire if the 3209 // gap contains sufficient SALU instructions with no usage of SGPR from (1). 3210 // Note: SGPRs must be considered as 64-bit pairs as the hazard exists 3211 // even if only individual SGPRs are accessed. 3212 3213 bool MIIsSALU = SIInstrInfo::isSALU(*MI); 3214 bool MIIsVALU = SIInstrInfo::isVALU(*MI); 3215 if (!(MIIsSALU || MIIsVALU)) 3216 return false; 3217 3218 // Avoid an expensive search when compile time is a priority by 3219 // mitigating every SALU which writes an SGPR. 3220 if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) { 3221 if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI)) 3222 return false; 3223 3224 const MachineOperand *SDSTOp = 3225 TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); 3226 if (!SDSTOp || !SDSTOp->isReg()) 3227 return false; 3228 3229 const Register HazardReg = SDSTOp->getReg(); 3230 if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO || 3231 HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0) 3232 return false; 3233 3234 // Add s_wait_alu sa_sdst(0) after SALU write. 3235 auto NextMI = std::next(MI->getIterator()); 3236 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), 3237 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3238 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3239 3240 // SALU write may be s_getpc in a bundle. 3241 updateGetPCBundle(NewMI); 3242 3243 return true; 3244 } 3245 3246 // Pre-compute set of SGPR pairs read by VALUs. 3247 // Note: pass mutable pointer to MachineFunction for CycleInfo. 3248 computeVALUHazardSGPRs(MI->getMF()); 3249 3250 // If no VALU hazard SGPRs exist then there is nothing to do. 3251 if (VALUReadHazardSGPRs.none()) 3252 return false; 3253 3254 // All SGPR writes before a call/return must be flushed as the callee/caller 3255 // will not see the hazard chain, i.e. (2) to (3) described above.
3256 const bool IsSetPC = (MI->isCall() || MI->isReturn()) && 3257 !(MI->getOpcode() == AMDGPU::S_ENDPGM || 3258 MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED); 3259 3260 // Collect all SGPR sources for MI which are read by a VALU. 3261 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3262 SmallSet<Register, 4> SGPRsUsed; 3263 3264 if (!IsSetPC) { 3265 for (const MachineOperand &Op : MI->all_uses()) { 3266 Register OpReg = Op.getReg(); 3267 3268 // Only consider VCC implicit uses on VALUs. 3269 // The only expected SALU implicit access is SCC which is no hazard. 3270 if (MIIsSALU && Op.isImplicit()) 3271 continue; 3272 3273 if (!TRI.isSGPRReg(MRI, OpReg)) 3274 continue; 3275 3276 auto RegN = sgprPairNumber(OpReg, TRI); 3277 if (!RegN) 3278 continue; 3279 3280 if (!VALUReadHazardSGPRs[*RegN]) 3281 continue; 3282 3283 SGPRsUsed.insert(OpReg); 3284 } 3285 3286 // No SGPRs -> nothing to do. 3287 if (SGPRsUsed.empty()) 3288 return false; 3289 } 3290 3291 // A hazard is any SALU which writes one of the SGPRs read by MI. 3292 auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { 3293 if (!SIInstrInfo::isSALU(I)) 3294 return false; 3295 // Ensure SGPR flush before call/return by conservatively assuming every 3296 // SALU writes an SGPR. 3297 if (IsSetPC && I.getNumDefs() > 0) 3298 return true; 3299 // Check for any register writes. 3300 return any_of(SGPRsUsed, [this, &I](Register Reg) { 3301 return I.modifiesRegister(Reg, &TRI); 3302 }); 3303 }; 3304 3305 const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; 3306 auto IsExpiredFn = [&](const MachineInstr &I, int Count) { 3307 if (Count >= SALUExpiryCount) 3308 return true; 3309 // s_wait_alu sa_sdst(0) on path mitigates hazard. 3310 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 3311 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 3312 return true; 3313 return false; 3314 }; 3315 3316 auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { 3317 // Only count true SALUs as wait states. 3318 if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) 3319 return 0; 3320 // SALU must be unrelated to any hazard registers. 3321 if (any_of(SGPRsUsed, 3322 [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); })) 3323 return 0; 3324 return 1; 3325 }; 3326 3327 // Check for the hazard. 3328 DenseSet<const MachineBasicBlock *> Visited; 3329 int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 3330 std::next(MI->getReverseIterator()), 0, 3331 IsExpiredFn, Visited, WaitStatesFn); 3332 3333 if (WaitStates >= SALUExpiryCount) 3334 return false; 3335 3336 // Validate hazard through an exhaustive search. 3337 if (UseVALUReadHazardExhaustiveSearch) { 3338 // A hazard is any VALU which reads one of the paired SGPRs read by MI. 3339 // This is searching for (1) in the hazard description. 3340 auto hazardPair = [this](Register Reg) { 3341 if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) 3342 return Register(AMDGPU::VCC); 3343 auto RegN = sgprPairNumber(Reg, TRI); 3344 return Register(AMDGPU::SGPR0_SGPR1 + *RegN); 3345 }; 3346 auto SearchHazardFn = [this, hazardPair, 3347 &SGPRsUsed](const MachineInstr &I) { 3348 if (!SIInstrInfo::isVALU(I)) 3349 return false; 3350 // Check for any register reads. 
3351 return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { 3352 return I.readsRegister(hazardPair(Reg), &TRI); 3353 }); 3354 }; 3355 auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { 3356 return false; 3357 }; 3358 if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == 3359 std::numeric_limits<int>::max()) 3360 return false; 3361 } 3362 3363 // Add s_wait_alu sa_sdst(0) before SALU read. 3364 auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 3365 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3366 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3367 3368 // SALU read may be after s_getpc in a bundle. 3369 updateGetPCBundle(NewMI); 3370 3371 return true; 3372 } 3373 3374 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, 3375 const SIInstrInfo &TII) { 3376 MachineBasicBlock &EntryMBB = MF->front(); 3377 if (EntryMBB.begin() != EntryMBB.end()) { 3378 auto &EntryMI = *EntryMBB.begin(); 3379 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO && 3380 EntryMI.getOperand(0).getImm() >= Priority) 3381 return false; 3382 } 3383 3384 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO)) 3385 .addImm(Priority); 3386 return true; 3387 } 3388 3389 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { 3390 if (!ST.hasRequiredExportPriority()) 3391 return false; 3392 3393 // Assume the following shader types will never have exports, 3394 // and avoid adding or adjusting S_SETPRIO. 3395 MachineBasicBlock *MBB = MI->getParent(); 3396 MachineFunction *MF = MBB->getParent(); 3397 auto CC = MF->getFunction().getCallingConv(); 3398 switch (CC) { 3399 case CallingConv::AMDGPU_CS: 3400 case CallingConv::AMDGPU_CS_Chain: 3401 case CallingConv::AMDGPU_CS_ChainPreserve: 3402 case CallingConv::AMDGPU_KERNEL: 3403 return false; 3404 default: 3405 break; 3406 } 3407 3408 const int MaxPriority = 3; 3409 const int NormalPriority = 2; 3410 const int PostExportPriority = 0; 3411 3412 auto It = MI->getIterator(); 3413 switch (MI->getOpcode()) { 3414 case AMDGPU::S_ENDPGM: 3415 case AMDGPU::S_ENDPGM_SAVED: 3416 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE: 3417 case AMDGPU::SI_RETURN_TO_EPILOG: 3418 // Ensure shader with calls raises priority at entry. 3419 // This ensures correct priority if exports exist in callee. 3420 if (MF->getFrameInfo().hasCalls()) 3421 return ensureEntrySetPrio(MF, NormalPriority, TII); 3422 return false; 3423 case AMDGPU::S_SETPRIO: { 3424 // Raise minimum priority unless in workaround. 3425 auto &PrioOp = MI->getOperand(0); 3426 int Prio = PrioOp.getImm(); 3427 bool InWA = (Prio == PostExportPriority) && 3428 (It != MBB->begin() && TII.isEXP(*std::prev(It))); 3429 if (InWA || Prio >= NormalPriority) 3430 return false; 3431 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority)); 3432 return true; 3433 } 3434 default: 3435 if (!TII.isEXP(*MI)) 3436 return false; 3437 break; 3438 } 3439 3440 // Check entry priority at each export (as there will only be a few). 3441 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. 3442 bool Changed = false; 3443 if (CC != CallingConv::AMDGPU_Gfx) 3444 Changed = ensureEntrySetPrio(MF, NormalPriority, TII); 3445 3446 auto NextMI = std::next(It); 3447 bool EndOfShader = false; 3448 if (NextMI != MBB->end()) { 3449 // Only need WA at end of sequence of exports. 3450 if (TII.isEXP(*NextMI)) 3451 return Changed; 3452 // Assume appropriate S_SETPRIO after export means WA already applied. 
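// For reference, the workaround emitted further below is roughly (assembly
// shown for illustration; exact syntax depends on the assembler):
//   s_setprio 0
//   s_waitcnt_expcnt null, 0x0
//   s_nop 0
//   s_nop 0
//   s_setprio 2
// with the expcnt wait and the final s_setprio omitted at the end of the
// shader.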
3453 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO && 3454 NextMI->getOperand(0).getImm() == PostExportPriority) 3455 return Changed; 3456 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM; 3457 } 3458 3459 const DebugLoc &DL = MI->getDebugLoc(); 3460 3461 // Lower priority. 3462 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3463 .addImm(PostExportPriority); 3464 3465 if (!EndOfShader) { 3466 // Wait for exports to complete. 3467 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT)) 3468 .addReg(AMDGPU::SGPR_NULL) 3469 .addImm(0); 3470 } 3471 3472 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3473 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3474 3475 if (!EndOfShader) { 3476 // Return to normal (higher) priority. 3477 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3478 .addImm(NormalPriority); 3479 } 3480 3481 return true; 3482 } 3483