//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
             unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

static cl::opt<unsigned> MaxExhaustiveHazardSearch(
    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
    cl::desc("Maximum function size for exhaustive hazard search"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      UseVALUReadHazardExhaustiveSearch(false),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ?
19 : 5; 66 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); 67 } 68 69 void GCNHazardRecognizer::Reset() { 70 EmittedInstrs.clear(); 71 } 72 73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { 74 EmitInstruction(SU->getInstr()); 75 } 76 77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { 78 CurrCycleInstr = MI; 79 } 80 81 static bool isDivFMas(unsigned Opcode) { 82 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64; 83 } 84 85 static bool isSGetReg(unsigned Opcode) { 86 return Opcode == AMDGPU::S_GETREG_B32; 87 } 88 89 static bool isSSetReg(unsigned Opcode) { 90 switch (Opcode) { 91 case AMDGPU::S_SETREG_B32: 92 case AMDGPU::S_SETREG_B32_mode: 93 case AMDGPU::S_SETREG_IMM32_B32: 94 case AMDGPU::S_SETREG_IMM32_B32_mode: 95 return true; 96 } 97 return false; 98 } 99 100 static bool isRWLane(unsigned Opcode) { 101 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; 102 } 103 104 static bool isRFE(unsigned Opcode) { 105 return Opcode == AMDGPU::S_RFE_B64; 106 } 107 108 static bool isSMovRel(unsigned Opcode) { 109 switch (Opcode) { 110 case AMDGPU::S_MOVRELS_B32: 111 case AMDGPU::S_MOVRELS_B64: 112 case AMDGPU::S_MOVRELD_B32: 113 case AMDGPU::S_MOVRELD_B64: 114 return true; 115 default: 116 return false; 117 } 118 } 119 120 static bool isDGEMM(unsigned Opcode) { 121 return AMDGPU::getMAIIsDGEMM(Opcode); 122 } 123 124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { 125 unsigned Opcode = MI.getOpcode(); 126 127 if (!SIInstrInfo::isMAI(MI) || 128 isDGEMM(Opcode) || 129 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || 130 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) 131 return false; 132 133 if (!ST.hasGFX940Insts()) 134 return true; 135 136 return AMDGPU::getMAIIsGFX940XDL(Opcode); 137 } 138 139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, 140 const MachineInstr &MI) { 141 if (TII.isAlwaysGDS(MI.getOpcode())) 142 return true; 143 144 switch (MI.getOpcode()) { 145 case AMDGPU::S_SENDMSG: 146 case AMDGPU::S_SENDMSGHALT: 147 case AMDGPU::S_TTRACEDATA: 148 return true; 149 // These DS opcodes don't support GDS. 
150 case AMDGPU::DS_NOP: 151 case AMDGPU::DS_PERMUTE_B32: 152 case AMDGPU::DS_BPERMUTE_B32: 153 return false; 154 default: 155 if (TII.isDS(MI.getOpcode())) { 156 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 157 AMDGPU::OpName::gds); 158 if (MI.getOperand(GDS).getImm()) 159 return true; 160 } 161 return false; 162 } 163 } 164 165 static bool isPermlane(const MachineInstr &MI) { 166 unsigned Opcode = MI.getOpcode(); 167 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || 168 Opcode == AMDGPU::V_PERMLANE64_B32 || 169 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || 170 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || 171 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 || 172 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 || 173 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 || 174 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 || 175 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64; 176 } 177 178 static bool isLdsDma(const MachineInstr &MI) { 179 return SIInstrInfo::isVALU(MI) && 180 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)); 181 } 182 183 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { 184 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, 185 AMDGPU::OpName::simm16); 186 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm())); 187 } 188 189 ScheduleHazardRecognizer::HazardType 190 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { 191 MachineInstr *MI = SU->getInstr(); 192 // If we are not in "HazardRecognizerMode" and therefore not being run from 193 // the scheduler, track possible stalls from hazards but don't insert noops. 194 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; 195 196 if (MI->isBundle()) 197 return NoHazard; 198 199 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) 200 return HazardType; 201 202 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) 203 return HazardType; 204 205 if (checkFPAtomicToDenormModeHazard(MI) > 0) 206 return HazardType; 207 208 if (ST.hasNoDataDepHazard()) 209 return NoHazard; 210 211 // FIXME: Should flat be considered vmem? 
212 if ((SIInstrInfo::isVMEM(*MI) || 213 SIInstrInfo::isFLAT(*MI)) 214 && checkVMEMHazards(MI) > 0) 215 return HazardType; 216 217 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) 218 return HazardType; 219 220 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) 221 return HazardType; 222 223 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) 224 return HazardType; 225 226 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) 227 return HazardType; 228 229 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 230 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 231 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 232 return HazardType; 233 234 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) 235 return HazardType; 236 237 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) 238 return HazardType; 239 240 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) 241 return HazardType; 242 243 if (((ST.hasReadM0MovRelInterpHazard() && 244 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 245 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 246 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 247 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 248 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 249 (ST.hasReadM0LdsDirectHazard() && 250 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) && 251 checkReadM0Hazards(MI) > 0) 252 return HazardType; 253 254 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) 255 return HazardType; 256 257 if ((SIInstrInfo::isVMEM(*MI) || 258 SIInstrInfo::isFLAT(*MI) || 259 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0) 260 return HazardType; 261 262 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) 263 return HazardType; 264 265 return NoHazard; 266 } 267 268 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, 269 unsigned Quantity) { 270 while (Quantity > 0) { 271 unsigned Arg = std::min(Quantity, 8u); 272 Quantity -= Arg; 273 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) 274 .addImm(Arg - 1); 275 } 276 } 277 278 unsigned 279 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { 280 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI); 281 assert(TSchedModel.getWriteProcResBegin(SC) != 282 TSchedModel.getWriteProcResEnd(SC)); 283 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle; 284 } 285 286 void GCNHazardRecognizer::processBundle() { 287 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); 288 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); 289 // Check bundled MachineInstr's for hazards. 290 for (; MI != E && MI->isInsideBundle(); ++MI) { 291 CurrCycleInstr = &*MI; 292 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); 293 294 if (IsHazardRecognizerMode) { 295 fixHazards(CurrCycleInstr); 296 297 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates); 298 } 299 300 // It’s unnecessary to track more than MaxLookAhead instructions. Since we 301 // include the bundled MI directly after, only add a maximum of 302 // (MaxLookAhead - 1) noops to EmittedInstrs. 
303 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) 304 EmittedInstrs.push_front(nullptr); 305 306 EmittedInstrs.push_front(CurrCycleInstr); 307 EmittedInstrs.resize(MaxLookAhead); 308 } 309 CurrCycleInstr = nullptr; 310 } 311 312 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) { 313 assert(IsHazardRecognizerMode); 314 315 unsigned NumPreNoops = PreEmitNoops(MI); 316 EmitNoops(NumPreNoops); 317 if (MI->isInsideBundle()) 318 insertNoopsInBundle(MI, TII, NumPreNoops); 319 else 320 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI), 321 NumPreNoops); 322 EmitInstruction(MI); 323 AdvanceCycle(); 324 } 325 326 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { 327 IsHazardRecognizerMode = true; 328 CurrCycleInstr = MI; 329 unsigned W = PreEmitNoopsCommon(MI); 330 fixHazards(MI); 331 CurrCycleInstr = nullptr; 332 return W; 333 } 334 335 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { 336 if (MI->isBundle()) 337 return 0; 338 339 int WaitStates = 0; 340 341 if (SIInstrInfo::isSMRD(*MI)) 342 return std::max(WaitStates, checkSMRDHazards(MI)); 343 344 if (ST.hasNSAtoVMEMBug()) 345 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); 346 347 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); 348 349 if (ST.hasNoDataDepHazard()) 350 return WaitStates; 351 352 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) 353 WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); 354 355 if (SIInstrInfo::isVALU(*MI)) 356 WaitStates = std::max(WaitStates, checkVALUHazards(MI)); 357 358 if (SIInstrInfo::isDPP(*MI)) 359 WaitStates = std::max(WaitStates, checkDPPHazards(MI)); 360 361 if (isDivFMas(MI->getOpcode())) 362 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); 363 364 if (isRWLane(MI->getOpcode())) 365 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); 366 367 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 368 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 369 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 370 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI)); 371 372 if (MI->isInlineAsm()) 373 return std::max(WaitStates, checkInlineAsmHazards(MI)); 374 375 if (isSGetReg(MI->getOpcode())) 376 return std::max(WaitStates, checkGetRegHazards(MI)); 377 378 if (isSSetReg(MI->getOpcode())) 379 return std::max(WaitStates, checkSetRegHazards(MI)); 380 381 if (isRFE(MI->getOpcode())) 382 return std::max(WaitStates, checkRFEHazards(MI)); 383 384 if ((ST.hasReadM0MovRelInterpHazard() && 385 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 386 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 387 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 388 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 389 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 390 (ST.hasReadM0LdsDirectHazard() && 391 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) 392 return std::max(WaitStates, checkReadM0Hazards(MI)); 393 394 if (SIInstrInfo::isMAI(*MI)) 395 return std::max(WaitStates, checkMAIHazards(MI)); 396 397 if (SIInstrInfo::isVMEM(*MI) || 398 SIInstrInfo::isFLAT(*MI) || 399 SIInstrInfo::isDS(*MI)) 400 return std::max(WaitStates, checkMAILdStHazards(MI)); 401 402 if (ST.hasGFX950Insts() && isPermlane(*MI)) 403 return std::max(WaitStates, checkPermlaneHazards(MI)); 404 405 return WaitStates; 406 } 407 408 void GCNHazardRecognizer::EmitNoop() { 409 EmittedInstrs.push_front(nullptr); 410 } 411 412 void 
GCNHazardRecognizer::AdvanceCycle() { 413 // When the scheduler detects a stall, it will call AdvanceCycle() without 414 // emitting any instructions. 415 if (!CurrCycleInstr) { 416 EmittedInstrs.push_front(nullptr); 417 return; 418 } 419 420 if (CurrCycleInstr->isBundle()) { 421 processBundle(); 422 return; 423 } 424 425 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); 426 if (!NumWaitStates) { 427 CurrCycleInstr = nullptr; 428 return; 429 } 430 431 // Keep track of emitted instructions 432 EmittedInstrs.push_front(CurrCycleInstr); 433 434 // Add a nullptr for each additional wait state after the first. Make sure 435 // not to add more than getMaxLookAhead() items to the list, since we 436 // truncate the list to that size right after this loop. 437 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); 438 i < e; ++i) { 439 EmittedInstrs.push_front(nullptr); 440 } 441 442 // getMaxLookahead() is the largest number of wait states we will ever need 443 // to insert, so there is no point in keeping track of more than that many 444 // wait states. 445 EmittedInstrs.resize(getMaxLookAhead()); 446 447 CurrCycleInstr = nullptr; 448 } 449 450 void GCNHazardRecognizer::RecedeCycle() { 451 llvm_unreachable("hazard recognizer does not support bottom-up scheduling."); 452 } 453 454 //===----------------------------------------------------------------------===// 455 // Helper Functions 456 //===----------------------------------------------------------------------===// 457 458 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; 459 460 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; 461 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; 462 463 // Search for a hazard in a block and its predecessors. 464 template <typename StateT> 465 static bool 466 hasHazard(StateT State, 467 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard, 468 function_ref<void(StateT &, const MachineInstr &)> UpdateState, 469 const MachineBasicBlock *MBB, 470 MachineBasicBlock::const_reverse_instr_iterator I, 471 DenseSet<const MachineBasicBlock *> &Visited) { 472 for (auto E = MBB->instr_rend(); I != E; ++I) { 473 // No need to look at parent BUNDLE instructions. 474 if (I->isBundle()) 475 continue; 476 477 switch (IsHazard(State, *I)) { 478 case HazardFound: 479 return true; 480 case HazardExpired: 481 return false; 482 default: 483 // Continue search 484 break; 485 } 486 487 if (I->isInlineAsm() || I->isMetaInstruction()) 488 continue; 489 490 UpdateState(State, *I); 491 } 492 493 for (MachineBasicBlock *Pred : MBB->predecessors()) { 494 if (!Visited.insert(Pred).second) 495 continue; 496 497 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(), 498 Visited)) 499 return true; 500 } 501 502 return false; 503 } 504 505 // Returns a minimum wait states since \p I walking all predecessors. 506 // Only scans until \p IsExpired does not return true. 507 // Can only be run in a hazard recognizer mode. 508 static int getWaitStatesSince( 509 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, 510 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, 511 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, 512 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { 513 for (auto E = MBB->instr_rend(); I != E; ++I) { 514 // Don't add WaitStates for parent BUNDLE instructions. 
515 if (I->isBundle()) 516 continue; 517 518 if (IsHazard(*I)) 519 return WaitStates; 520 521 if (I->isInlineAsm()) 522 continue; 523 524 WaitStates += GetNumWaitStates(*I); 525 526 if (IsExpired(*I, WaitStates)) 527 return std::numeric_limits<int>::max(); 528 } 529 530 int MinWaitStates = std::numeric_limits<int>::max(); 531 for (MachineBasicBlock *Pred : MBB->predecessors()) { 532 if (!Visited.insert(Pred).second) 533 continue; 534 535 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates, 536 IsExpired, Visited, GetNumWaitStates); 537 538 MinWaitStates = std::min(MinWaitStates, W); 539 } 540 541 return MinWaitStates; 542 } 543 544 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, 545 const MachineInstr *MI, IsExpiredFn IsExpired) { 546 DenseSet<const MachineBasicBlock *> Visited; 547 return getWaitStatesSince(IsHazard, MI->getParent(), 548 std::next(MI->getReverseIterator()), 549 0, IsExpired, Visited); 550 } 551 552 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { 553 if (IsHazardRecognizerMode) { 554 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { 555 return WaitStates >= Limit; 556 }; 557 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); 558 } 559 560 int WaitStates = 0; 561 for (MachineInstr *MI : EmittedInstrs) { 562 if (MI) { 563 if (IsHazard(*MI)) 564 return WaitStates; 565 566 if (MI->isInlineAsm()) 567 continue; 568 } 569 ++WaitStates; 570 571 if (WaitStates >= Limit) 572 break; 573 } 574 return std::numeric_limits<int>::max(); 575 } 576 577 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, 578 IsHazardFn IsHazardDef, 579 int Limit) { 580 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 581 582 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) { 583 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI); 584 }; 585 586 return getWaitStatesSince(IsHazardFn, Limit); 587 } 588 589 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, 590 int Limit) { 591 auto IsHazardFn = [IsHazard](const MachineInstr &MI) { 592 return isSSetReg(MI.getOpcode()) && IsHazard(MI); 593 }; 594 595 return getWaitStatesSince(IsHazardFn, Limit); 596 } 597 598 //===----------------------------------------------------------------------===// 599 // No-op Hazard Detection 600 //===----------------------------------------------------------------------===// 601 602 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, 603 MCRegister Reg) { 604 for (MCRegUnit Unit : TRI.regunits(Reg)) 605 BV.set(Unit); 606 } 607 608 static void addRegsToSet(const SIRegisterInfo &TRI, 609 iterator_range<MachineInstr::const_mop_iterator> Ops, 610 BitVector &DefSet, BitVector &UseSet) { 611 for (const MachineOperand &Op : Ops) { 612 if (Op.isReg()) 613 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg()); 614 } 615 } 616 617 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { 618 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses); 619 } 620 621 static bool breaksSMEMSoftClause(MachineInstr *MI) { 622 return !SIInstrInfo::isSMRD(*MI); 623 } 624 625 static bool breaksVMEMSoftClause(MachineInstr *MI) { 626 return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI); 627 } 628 629 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { 630 // SMEM soft clause are only present on VI+, and only matter if xnack is 631 // enabled. 
632 if (!ST.isXNACKEnabled()) 633 return 0; 634 635 bool IsSMRD = TII.isSMRD(*MEM); 636 637 resetClause(); 638 639 // A soft-clause is any group of consecutive SMEM instructions. The 640 // instructions in this group may return out of order and/or may be 641 // replayed (i.e. the same instruction issued more than once). 642 // 643 // In order to handle these situations correctly we need to make sure that 644 // when a clause has more than one instruction, no instruction in the clause 645 // writes to a register that is read by another instruction in the clause 646 // (including itself). If we encounter this situation, we need to break the 647 // clause by inserting a non SMEM instruction. 648 649 for (MachineInstr *MI : EmittedInstrs) { 650 // When we hit a non-SMEM instruction then we have passed the start of the 651 // clause and we can stop. 652 if (!MI) 653 break; 654 655 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) 656 break; 657 658 addClauseInst(*MI); 659 } 660 661 if (ClauseDefs.none()) 662 return 0; 663 664 // We need to make sure not to put loads and stores in the same clause if they 665 // use the same address. For now, just start a new clause whenever we see a 666 // store. 667 if (MEM->mayStore()) 668 return 1; 669 670 addClauseInst(*MEM); 671 672 // If the set of defs and uses intersect then we cannot add this instruction 673 // to the clause, so we have a hazard. 674 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0; 675 } 676 677 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { 678 int WaitStatesNeeded = 0; 679 680 WaitStatesNeeded = checkSoftClauseHazards(SMRD); 681 682 // This SMRD hazard only affects SI. 683 if (!ST.hasSMRDReadVALUDefHazard()) 684 return WaitStatesNeeded; 685 686 // A read of an SGPR by SMRD instruction requires 4 wait states when the 687 // SGPR was written by a VALU instruction. 688 int SmrdSgprWaitStates = 4; 689 auto IsHazardDefFn = [this](const MachineInstr &MI) { 690 return TII.isVALU(MI); 691 }; 692 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) { 693 return TII.isSALU(MI); 694 }; 695 696 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); 697 698 for (const MachineOperand &Use : SMRD->uses()) { 699 if (!Use.isReg()) 700 continue; 701 int WaitStatesNeededForUse = 702 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 703 SmrdSgprWaitStates); 704 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 705 706 // This fixes what appears to be undocumented hardware behavior in SI where 707 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor 708 // needs some number of nops in between. We don't know how many we need, but 709 // let's use 4. This wasn't discovered before probably because the only 710 // case when this happens is when we expand a 64-bit pointer into a full 711 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was 712 // probably never encountered in the closed-source land. 
713 if (IsBufferSMRD) { 714 int WaitStatesNeededForUse = 715 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 716 IsBufferHazardDefFn, 717 SmrdSgprWaitStates); 718 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 719 } 720 } 721 722 return WaitStatesNeeded; 723 } 724 725 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { 726 if (!ST.hasVMEMReadSGPRVALUDefHazard()) 727 return 0; 728 729 int WaitStatesNeeded = checkSoftClauseHazards(VMEM); 730 731 // A read of an SGPR by a VMEM instruction requires 5 wait states when the 732 // SGPR was written by a VALU Instruction. 733 const int VmemSgprWaitStates = 5; 734 auto IsHazardDefFn = [this](const MachineInstr &MI) { 735 return TII.isVALU(MI); 736 }; 737 for (const MachineOperand &Use : VMEM->uses()) { 738 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg())) 739 continue; 740 741 int WaitStatesNeededForUse = 742 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 743 VmemSgprWaitStates); 744 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 745 } 746 return WaitStatesNeeded; 747 } 748 749 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { 750 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 751 const SIInstrInfo *TII = ST.getInstrInfo(); 752 753 // Check for DPP VGPR read after VALU VGPR write and EXEC write. 754 int DppVgprWaitStates = 2; 755 int DppExecWaitStates = 5; 756 int WaitStatesNeeded = 0; 757 auto IsHazardDefFn = [TII](const MachineInstr &MI) { 758 return TII->isVALU(MI); 759 }; 760 761 for (const MachineOperand &Use : DPP->uses()) { 762 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) 763 continue; 764 int WaitStatesNeededForUse = 765 DppVgprWaitStates - getWaitStatesSinceDef( 766 Use.getReg(), 767 [](const MachineInstr &) { return true; }, 768 DppVgprWaitStates); 769 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 770 } 771 772 WaitStatesNeeded = std::max( 773 WaitStatesNeeded, 774 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, 775 DppExecWaitStates)); 776 777 return WaitStatesNeeded; 778 } 779 780 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { 781 const SIInstrInfo *TII = ST.getInstrInfo(); 782 783 // v_div_fmas requires 4 wait states after a write to vcc from a VALU 784 // instruction. 
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    // There is no hazard if the instruction does not use vector regs
    if (VDataIdx == -1)
      return -1;

    if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into the correct bit position of the dest register.
/// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the \p
/// MI and \returns an operand with dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions
  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
  // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
  // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
  // op_sel[3:2] != 0
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which write the hi bits
    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
        SISrcMods::DST_OP_SEL)
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Type 3: FP8DstSelInst with op_sel[3:2] != 0
    if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
         SISrcMods::OP_SEL_0))
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  // Special case: nop is required for all the opsel values for fp4 sr variant
  // cvt scale instructions
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  return nullptr;
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC.
  // Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroesHigh16BitsOfDest)
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

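  // The hazard below only applies when the lane select is held in an SGPR
  // that a VALU has recently written; an immediate or non-SGPR lane select
  // has no such producer to wait on, so no wait states are needed.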
if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg())) 1171 return 0; 1172 1173 Register LaneSelectReg = LaneSelectOp->getReg(); 1174 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); }; 1175 1176 const int RWLaneWaitStates = 4; 1177 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, 1178 RWLaneWaitStates); 1179 return RWLaneWaitStates - WaitStatesSince; 1180 } 1181 1182 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { 1183 if (!ST.hasRFEHazards()) 1184 return 0; 1185 1186 const SIInstrInfo *TII = ST.getInstrInfo(); 1187 1188 const int RFEWaitStates = 1; 1189 1190 auto IsHazardFn = [TII](const MachineInstr &MI) { 1191 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS; 1192 }; 1193 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); 1194 return RFEWaitStates - WaitStatesNeeded; 1195 } 1196 1197 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { 1198 const SIInstrInfo *TII = ST.getInstrInfo(); 1199 const int ReadM0WaitStates = 1; 1200 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; 1201 return ReadM0WaitStates - 1202 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); 1203 } 1204 1205 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { 1206 fixVMEMtoScalarWriteHazards(MI); 1207 fixVcmpxPermlaneHazards(MI); 1208 fixSMEMtoVectorWriteHazards(MI); 1209 fixVcmpxExecWARHazard(MI); 1210 fixLdsBranchVmemWARHazard(MI); 1211 if (ST.hasLdsDirect()) { 1212 fixLdsDirectVALUHazard(MI); 1213 fixLdsDirectVMEMHazard(MI); 1214 } 1215 fixVALUPartialForwardingHazard(MI); 1216 fixVALUTransUseHazard(MI); 1217 fixWMMAHazards(MI); 1218 fixShift64HighRegBug(MI); 1219 fixVALUMaskWriteHazard(MI); 1220 fixVALUReadSGPRHazard(MI); 1221 fixRequiredExportPriority(MI); 1222 } 1223 1224 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, 1225 const MachineInstr &MI) { 1226 return (TII.isVOPC(MI) || 1227 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) && 1228 MI.modifiesRegister(AMDGPU::EXEC, &TRI); 1229 } 1230 1231 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { 1232 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI)) 1233 return false; 1234 1235 const SIInstrInfo *TII = ST.getInstrInfo(); 1236 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1237 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { 1238 return isVCmpXWritesExec(*TII, *TRI, MI); 1239 }; 1240 1241 auto IsExpiredFn = [](const MachineInstr &MI, int) { 1242 unsigned Opc = MI.getOpcode(); 1243 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 && 1244 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa; 1245 }; 1246 1247 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1248 std::numeric_limits<int>::max()) 1249 return false; 1250 1251 // V_NOP will be discarded by SQ. 1252 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* 1253 // which is always a VGPR and available. 1254 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); 1255 Register Reg = Src0->getReg(); 1256 bool IsUndef = Src0->isUndef(); 1257 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1258 TII->get(AMDGPU::V_MOV_B32_e32)) 1259 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) 1260 .addReg(Reg, IsUndef ? 
              RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
1370 return (Decoded.DsCnt == 0); 1371 } 1372 default: 1373 // SOPP instructions cannot mitigate the hazard. 1374 if (TII->isSOPP(MI)) 1375 return false; 1376 // At this point the SALU can be assumed to mitigate the hazard 1377 // because either: 1378 // (a) it is independent of the at risk SMEM (breaking chain), 1379 // or 1380 // (b) it is dependent on the SMEM, in which case an appropriate 1381 // s_waitcnt lgkmcnt _must_ exist between it and the at risk 1382 // SMEM instruction. 1383 return true; 1384 } 1385 } 1386 return false; 1387 }; 1388 1389 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1390 std::numeric_limits<int>::max()) 1391 return false; 1392 1393 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1394 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL) 1395 .addImm(0); 1396 return true; 1397 } 1398 1399 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { 1400 if (!ST.hasVcmpxExecWARHazard()) 1401 return false; 1402 assert(!ST.hasExtendedWaitCounts()); 1403 1404 if (!SIInstrInfo::isVALU(*MI)) 1405 return false; 1406 1407 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1408 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) 1409 return false; 1410 1411 auto IsHazardFn = [TRI](const MachineInstr &I) { 1412 if (SIInstrInfo::isVALU(I)) 1413 return false; 1414 return I.readsRegister(AMDGPU::EXEC, TRI); 1415 }; 1416 1417 const SIInstrInfo *TII = ST.getInstrInfo(); 1418 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) { 1419 if (SIInstrInfo::isVALU(MI)) { 1420 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) 1421 return true; 1422 for (auto MO : MI.implicit_operands()) 1423 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) 1424 return true; 1425 } 1426 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1427 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0) 1428 return true; 1429 return false; 1430 }; 1431 1432 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1433 std::numeric_limits<int>::max()) 1434 return false; 1435 1436 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1437 TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 1438 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 1439 return true; 1440 } 1441 1442 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, 1443 const GCNSubtarget &ST) { 1444 if (!ST.hasLdsBranchVmemWARHazard()) 1445 return false; 1446 1447 // Check if the necessary condition for the hazard is met: both LDS and VMEM 1448 // instructions need to appear in the same function. 
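  // The scan below runs once per function; the constructor caches its result
  // in RunLdsBranchVmemWARHazardFixup, which fixLdsBranchVmemWARHazard then
  // consults instead of rescanning.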
1449 bool HasLds = false; 1450 bool HasVmem = false; 1451 for (auto &MBB : MF) { 1452 for (auto &MI : MBB) { 1453 HasLds |= SIInstrInfo::isDS(MI); 1454 HasVmem |= 1455 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI); 1456 if (HasLds && HasVmem) 1457 return true; 1458 } 1459 } 1460 return false; 1461 } 1462 1463 static bool isStoreCountWaitZero(const MachineInstr &I) { 1464 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && 1465 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && 1466 !I.getOperand(1).getImm(); 1467 } 1468 1469 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { 1470 if (!RunLdsBranchVmemWARHazardFixup) 1471 return false; 1472 1473 assert(ST.hasLdsBranchVmemWARHazard()); 1474 assert(!ST.hasExtendedWaitCounts()); 1475 1476 auto IsHazardInst = [](const MachineInstr &MI) { 1477 if (SIInstrInfo::isDS(MI)) 1478 return 1; 1479 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) 1480 return 2; 1481 return 0; 1482 }; 1483 1484 auto InstType = IsHazardInst(*MI); 1485 if (!InstType) 1486 return false; 1487 1488 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { 1489 return IsHazardInst(I) || isStoreCountWaitZero(I); 1490 }; 1491 1492 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { 1493 if (!I.isBranch()) 1494 return false; 1495 1496 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { 1497 auto InstType2 = IsHazardInst(I); 1498 return InstType2 && InstType != InstType2; 1499 }; 1500 1501 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { 1502 auto InstType2 = IsHazardInst(I); 1503 if (InstType == InstType2) 1504 return true; 1505 1506 return isStoreCountWaitZero(I); 1507 }; 1508 1509 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) != 1510 std::numeric_limits<int>::max(); 1511 }; 1512 1513 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1514 std::numeric_limits<int>::max()) 1515 return false; 1516 1517 const SIInstrInfo *TII = ST.getInstrInfo(); 1518 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1519 TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1520 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1521 .addImm(0); 1522 1523 return true; 1524 } 1525 1526 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { 1527 if (!SIInstrInfo::isLDSDIR(*MI)) 1528 return false; 1529 1530 const int NoHazardWaitStates = 15; 1531 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1532 const Register VDSTReg = VDST->getReg(); 1533 1534 bool VisitedTrans = false; 1535 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { 1536 if (!SIInstrInfo::isVALU(I)) 1537 return false; 1538 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I); 1539 // Cover both WAR and WAW 1540 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1541 }; 1542 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { 1543 if (WaitStates >= NoHazardWaitStates) 1544 return true; 1545 // Instructions which cause va_vdst==0 expire hazard 1546 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1547 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I); 1548 }; 1549 auto GetWaitStatesFn = [](const MachineInstr &MI) { 1550 return SIInstrInfo::isVALU(MI) ? 
1 : 0; 1551 }; 1552 1553 DenseSet<const MachineBasicBlock *> Visited; 1554 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 1555 std::next(MI->getReverseIterator()), 0, 1556 IsExpiredFn, Visited, GetWaitStatesFn); 1557 1558 // Transcendentals can execute in parallel to other VALUs. 1559 // This makes va_vdst count unusable with a mixture of VALU and TRANS. 1560 if (VisitedTrans) 1561 Count = 0; 1562 1563 MachineOperand *WaitVdstOp = 1564 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst); 1565 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates)); 1566 1567 return true; 1568 } 1569 1570 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { 1571 if (!SIInstrInfo::isLDSDIR(*MI)) 1572 return false; 1573 1574 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1575 const Register VDSTReg = VDST->getReg(); 1576 1577 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { 1578 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) && 1579 !SIInstrInfo::isDS(I)) 1580 return false; 1581 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1582 }; 1583 bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); 1584 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT 1585 // according to the type of VMEM instruction. 1586 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { 1587 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || 1588 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || 1589 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1590 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) || 1591 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) && 1592 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm()); 1593 }; 1594 1595 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1596 std::numeric_limits<int>::max()) 1597 return false; 1598 1599 if (LdsdirCanWait) { 1600 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0); 1601 } else { 1602 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1603 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1604 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); 1605 } 1606 1607 return true; 1608 } 1609 1610 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { 1611 if (!ST.hasVALUPartialForwardingHazard()) 1612 return false; 1613 assert(!ST.hasExtendedWaitCounts()); 1614 1615 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI)) 1616 return false; 1617 1618 SmallSetVector<Register, 4> SrcVGPRs; 1619 1620 for (const MachineOperand &Use : MI->explicit_uses()) { 1621 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 1622 SrcVGPRs.insert(Use.getReg()); 1623 } 1624 1625 // Only applies with >= 2 unique VGPR sources 1626 if (SrcVGPRs.size() <= 1) 1627 return false; 1628 1629 // Look for the following pattern: 1630 // Va <- VALU [PreExecPos] 1631 // intv1 1632 // Exec <- SALU [ExecPos] 1633 // intv2 1634 // Vb <- VALU [PostExecPos] 1635 // intv3 1636 // MI Va, Vb (WaitState = 0) 1637 // 1638 // Where: 1639 // intv1 + intv2 <= 2 VALUs 1640 // intv3 <= 4 VALUs 1641 // 1642 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 
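  // The limits below count VALU instructions seen while scanning backwards:
  // intv1 + intv2 may contain at most 2 VALUs and intv3 at most 4, so
  // IntvMaxVALUs is their sum; NoHazardVALUWaitStates presumably adds 2 to
  // cover the Va and Vb defining writes themselves.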

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track register writes
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALU post exec change
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU pre exec change
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
1732 int Intv1VALUs = PreExecPos - State.ExecPos; 1733 if (Intv1VALUs > Intv1plus2MaxVALUs) 1734 return HazardExpired; 1735 1736 // Too many VALUs in intv1 + intv2 1737 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs) 1738 return HazardExpired; 1739 1740 return HazardFound; 1741 }; 1742 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { 1743 if (SIInstrInfo::isVALU(MI)) 1744 State.VALUs += 1; 1745 }; 1746 1747 DenseSet<const MachineBasicBlock *> Visited; 1748 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(), 1749 std::next(MI->getReverseIterator()), Visited)) 1750 return false; 1751 1752 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1753 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1754 .addImm(0x0fff); 1755 1756 return true; 1757 } 1758 1759 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { 1760 if (!ST.hasVALUTransUseHazard()) 1761 return false; 1762 assert(!ST.hasExtendedWaitCounts()); 1763 1764 if (!SIInstrInfo::isVALU(*MI)) 1765 return false; 1766 1767 SmallSet<Register, 4> SrcVGPRs; 1768 1769 for (const MachineOperand &Use : MI->explicit_uses()) { 1770 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 1771 SrcVGPRs.insert(Use.getReg()); 1772 } 1773 1774 // Look for the following pattern: 1775 // Va <- TRANS VALU 1776 // intv 1777 // MI Va (WaitState = 0) 1778 // 1779 // Where: 1780 // intv <= 5 VALUs / 1 TRANS 1781 // 1782 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 1783 1784 const int IntvMaxVALUs = 5; 1785 const int IntvMaxTRANS = 1; 1786 1787 struct StateType { 1788 int VALUs = 0; 1789 int TRANS = 0; 1790 }; 1791 1792 StateType State; 1793 1794 // This overloads expiry testing with all the hazard detection 1795 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { 1796 // Too many VALU states have passed 1797 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS) 1798 return HazardExpired; 1799 1800 // Instructions which cause va_vdst==0 expire hazard 1801 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1802 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || 1803 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1804 I.getOperand(0).getImm() == 0x0fff)) 1805 return HazardExpired; 1806 1807 // Track registers writes 1808 if (SIInstrInfo::isTRANS(I)) { 1809 for (Register Src : SrcVGPRs) { 1810 if (I.modifiesRegister(Src, &TRI)) { 1811 return HazardFound; 1812 } 1813 } 1814 } 1815 1816 return NoHazardFound; 1817 }; 1818 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { 1819 if (SIInstrInfo::isVALU(MI)) 1820 State.VALUs += 1; 1821 if (SIInstrInfo::isTRANS(MI)) 1822 State.TRANS += 1; 1823 }; 1824 1825 DenseSet<const MachineBasicBlock *> Visited; 1826 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(), 1827 std::next(MI->getReverseIterator()), Visited)) 1828 return false; 1829 1830 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is 1831 // avoided. 
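// The inserted instruction is s_waitcnt_depctr with va_vdst(0); the remaining
// depctr fields are left at their default, no-wait values.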
1832 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1833 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1834 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); 1835 1836 return true; 1837 } 1838 1839 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { 1840 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) 1841 return false; 1842 1843 const SIInstrInfo *TII = ST.getInstrInfo(); 1844 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1845 1846 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { 1847 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) 1848 return false; 1849 1850 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps 1851 // with the dest(matrix D) of the previous wmma. 1852 const Register CurSrc0Reg = 1853 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); 1854 const Register CurSrc1Reg = 1855 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); 1856 1857 const Register PrevDstReg = 1858 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); 1859 1860 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) || 1861 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) { 1862 return true; 1863 } 1864 1865 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) 1866 // but Index can't overlap with PrevDstReg. 1867 if (AMDGPU::isGFX12Plus(ST)) { 1868 if (SIInstrInfo::isSWMMAC(*MI)) { 1869 const Register CurIndex = 1870 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); 1871 if (TRI->regsOverlap(PrevDstReg, CurIndex)) 1872 return true; 1873 } 1874 return false; 1875 } 1876 1877 return false; 1878 }; 1879 1880 auto IsExpiredFn = [](const MachineInstr &I, int) { 1881 return SIInstrInfo::isVALU(I); 1882 }; 1883 1884 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1885 std::numeric_limits<int>::max()) 1886 return false; 1887 1888 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); 1889 1890 return true; 1891 } 1892 1893 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { 1894 if (!ST.hasShift64HighRegBug()) 1895 return false; 1896 assert(!ST.hasExtendedWaitCounts()); 1897 1898 switch (MI->getOpcode()) { 1899 default: 1900 return false; 1901 case AMDGPU::V_LSHLREV_B64_e64: 1902 case AMDGPU::V_LSHRREV_B64_e64: 1903 case AMDGPU::V_ASHRREV_I64_e64: 1904 break; 1905 } 1906 1907 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0); 1908 if (!Amt->isReg()) 1909 return false; 1910 1911 Register AmtReg = Amt->getReg(); 1912 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1913 // Check if this is a last VGPR in the allocation block. 1914 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7) 1915 return false; 1916 1917 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) 1918 return false; 1919 1920 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); 1921 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); 1922 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); 1923 bool Overlapped = OverlappedSrc || OverlappedDst; 1924 1925 assert(!OverlappedDst || !OverlappedSrc || 1926 Src1->getReg() == MI->getOperand(0).getReg()); 1927 assert(ST.needsAlignedVGPRs()); 1928 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); 1929 1930 Register NewReg; 1931 for (MCRegister Reg : Overlapped ? 
AMDGPU::VReg_64_Align2RegClass
1932                                    : AMDGPU::VGPR_32RegClass) {
1933     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1934       NewReg = Reg;
1935       break;
1936     }
1937   }
1938
1939   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1940                                : NewReg;
1941   Register NewAmtLo;
1942
1943   if (Overlapped)
1944     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1945
1946   DebugLoc DL = MI->getDebugLoc();
1947   MachineBasicBlock *MBB = MI->getParent();
1948   // Insert a full wait count because the found register might be pending a wait.
1949   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1950       .addImm(0);
1951
1952   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1953   if (Overlapped)
1954     runOnInstruction(
1955         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1956             .addDef(AmtReg - 1)
1957             .addReg(AmtReg - 1, RegState::Undef)
1958             .addReg(NewAmtLo, RegState::Undef));
1959   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1960                        .addDef(AmtReg)
1961                        .addReg(AmtReg, RegState::Undef)
1962                        .addReg(NewAmt, RegState::Undef));
1963
1964   // Instructions emitted after the current instruction will be processed by the
1965   // parent loop of the hazard recognizer in a natural way.
1966   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1967           AmtReg)
1968       .addDef(NewAmt)
1969       .addReg(NewAmt)
1970       .addReg(AmtReg);
1971   if (Overlapped)
1972     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1973             AmtReg - 1)
1974         .addDef(NewAmtLo)
1975         .addReg(NewAmtLo)
1976         .addReg(AmtReg - 1);
1977
1978   // Re-running the hazard recognizer on the modified instruction is not
1979   // necessary; the inserted V_SWAP_B32 has already both read and written the
1980   // new registers, so hazards related to these registers have already been handled.
1981   Amt->setReg(NewAmt);
1982   Amt->setIsKill(false);
1983   // We do not update liveness, so the verifier may see it as undef.
1984 Amt->setIsUndef(); 1985 if (OverlappedDst) 1986 MI->getOperand(0).setReg(NewReg); 1987 if (OverlappedSrc) { 1988 Src1->setReg(NewReg); 1989 Src1->setIsKill(false); 1990 Src1->setIsUndef(); 1991 } 1992 1993 return true; 1994 } 1995 1996 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { 1997 int NSAtoVMEMWaitStates = 1; 1998 1999 if (!ST.hasNSAtoVMEMBug()) 2000 return 0; 2001 2002 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI)) 2003 return 0; 2004 2005 const SIInstrInfo *TII = ST.getInstrInfo(); 2006 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 2007 if (!Offset || (Offset->getImm() & 6) == 0) 2008 return 0; 2009 2010 auto IsHazardFn = [TII](const MachineInstr &I) { 2011 if (!SIInstrInfo::isMIMG(I)) 2012 return false; 2013 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode()); 2014 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && 2015 TII->getInstSizeInBytes(I) >= 16; 2016 }; 2017 2018 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); 2019 } 2020 2021 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { 2022 int FPAtomicToDenormModeWaitStates = 3; 2023 2024 if (!ST.hasFPAtomicToDenormModeHazard()) 2025 return 0; 2026 assert(!ST.hasExtendedWaitCounts()); 2027 2028 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) 2029 return 0; 2030 2031 auto IsHazardFn = [](const MachineInstr &I) { 2032 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I)) 2033 return false; 2034 return SIInstrInfo::isFPAtomic(I); 2035 }; 2036 2037 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { 2038 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) 2039 return true; 2040 2041 switch (MI.getOpcode()) { 2042 case AMDGPU::S_WAITCNT: 2043 case AMDGPU::S_WAITCNT_VSCNT: 2044 case AMDGPU::S_WAITCNT_VMCNT: 2045 case AMDGPU::S_WAITCNT_EXPCNT: 2046 case AMDGPU::S_WAITCNT_LGKMCNT: 2047 case AMDGPU::S_WAIT_IDLE: 2048 return true; 2049 default: 2050 break; 2051 } 2052 2053 return false; 2054 }; 2055 2056 return FPAtomicToDenormModeWaitStates - 2057 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); 2058 } 2059 2060 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { 2061 assert(SIInstrInfo::isMAI(*MI)); 2062 2063 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); 2064 } 2065 2066 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { 2067 // Early exit if no padding is requested. 
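// (For example, with -amdgpu-mfma-padding-ratio=50 and a neighboring MFMA
// latency of 16 wait states, up to 16 * 50 / 100 = 8 wait states of s_nop
// padding may be requested, less any wait states that have already elapsed
// since that MFMA.)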
2068 if (MFMAPaddingRatio == 0) 2069 return 0; 2070 2071 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2072 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2) 2073 return 0; 2074 2075 int NeighborMFMALatency = 0; 2076 auto IsNeighboringMFMA = [&NeighborMFMALatency, 2077 this](const MachineInstr &MI) { 2078 if (!SIInstrInfo::isMFMA(MI)) 2079 return false; 2080 2081 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); 2082 return true; 2083 }; 2084 2085 const int MaxMFMAPipelineWaitStates = 16; 2086 int WaitStatesSinceNeighborMFMA = 2087 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates); 2088 2089 int NeighborMFMAPaddingNeeded = 2090 (NeighborMFMALatency * MFMAPaddingRatio / 100) - 2091 WaitStatesSinceNeighborMFMA; 2092 2093 return std::max(0, NeighborMFMAPaddingNeeded); 2094 } 2095 2096 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { 2097 int WaitStatesNeeded = 0; 2098 unsigned Opc = MI->getOpcode(); 2099 2100 auto IsVALUFn = [](const MachineInstr &MI) { 2101 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm(); 2102 }; 2103 2104 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write 2105 const int LegacyVALUWritesVGPRWaitStates = 2; 2106 const int VALUWritesExecWaitStates = 4; 2107 const int MaxWaitStates = 4; 2108 2109 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2110 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates); 2111 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2112 2113 if (WaitStatesNeeded < MaxWaitStates) { 2114 for (const MachineOperand &Use : MI->explicit_uses()) { 2115 const int MaxWaitStates = 2; 2116 2117 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 2118 continue; 2119 2120 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - 2121 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates); 2122 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2123 2124 if (WaitStatesNeeded == MaxWaitStates) 2125 break; 2126 } 2127 } 2128 } 2129 2130 for (const MachineOperand &Op : MI->explicit_operands()) { 2131 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) 2132 continue; 2133 2134 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2135 continue; 2136 2137 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; 2138 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; 2139 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; 2140 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; 2141 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; 2142 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; 2143 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; 2144 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; 2145 const int MaxWaitStates = 18; 2146 Register Reg = Op.getReg(); 2147 unsigned HazardDefLatency = 0; 2148 2149 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, 2150 this](const MachineInstr &MI) { 2151 if (!SIInstrInfo::isMFMA(MI)) 2152 return false; 2153 Register DstReg = MI.getOperand(0).getReg(); 2154 if (DstReg == Reg) 2155 return false; 2156 HazardDefLatency = 2157 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2158 return TRI.regsOverlap(DstReg, Reg); 2159 }; 2160 2161 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, 2162 MaxWaitStates); 2163 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; 2164 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2165 int OpNo = Op.getOperandNo(); 
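// Select the required wait states from the constants above: the choice depends
// on whether this operand is src2, on MI's opcode (accvgpr read/write vs.
// MFMA), and on the latency (pass count) of the overlapping def found below.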
2166 if (OpNo == SrcCIdx) { 2167 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; 2168 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) { 2169 switch (HazardDefLatency) { 2170 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; 2171 break; 2172 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; 2173 break; 2174 case 16: [[fallthrough]]; 2175 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; 2176 break; 2177 } 2178 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2179 switch (HazardDefLatency) { 2180 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; 2181 break; 2182 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; 2183 break; 2184 case 16: [[fallthrough]]; 2185 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; 2186 break; 2187 } 2188 } 2189 2190 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2191 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2192 2193 if (WaitStatesNeeded == MaxWaitStates) 2194 return WaitStatesNeeded; // Early exit. 2195 2196 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) { 2197 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2198 return false; 2199 Register DstReg = MI.getOperand(0).getReg(); 2200 return TRI.regsOverlap(Reg, DstReg); 2201 }; 2202 2203 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; 2204 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; 2205 const int AccVGPRWriteAccVgprReadWaitStates = 3; 2206 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; 2207 if (OpNo == SrcCIdx) 2208 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; 2209 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) 2210 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; 2211 2212 WaitStatesNeededForUse = NeedWaitStates - 2213 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates); 2214 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2215 2216 if (WaitStatesNeeded == MaxWaitStates) 2217 return WaitStatesNeeded; // Early exit. 2218 } 2219 2220 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2221 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; 2222 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; 2223 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; 2224 const int MaxWaitStates = 13; 2225 Register DstReg = MI->getOperand(0).getReg(); 2226 unsigned HazardDefLatency = 0; 2227 2228 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, 2229 this](const MachineInstr &MI) { 2230 if (!SIInstrInfo::isMFMA(MI)) 2231 return false; 2232 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); 2233 HazardDefLatency = 2234 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2235 return TRI.regsOverlap(Reg, DstReg); 2236 }; 2237 2238 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates); 2239 int NeedWaitStates; 2240 switch (HazardDefLatency) { 2241 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; 2242 break; 2243 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; 2244 break; 2245 case 16: [[fallthrough]]; 2246 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; 2247 break; 2248 } 2249 2250 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; 2251 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2252 } 2253 2254 // Pad neighboring MFMA with noops for better inter-wave performance. 
2255 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2256 2257 return WaitStatesNeeded; 2258 } 2259 2260 static int 2261 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, 2262 bool IsGFX950) { 2263 // xdl def cycles | gfx940 | gfx950 2264 // 2 pass | 3 4 2265 // 4 pass | 5 6 2266 // 8 pass | 9 10 2267 // 16 pass | 17 18 2268 return NumPasses + 1 + IsGFX950; 2269 } 2270 2271 static int 2272 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, 2273 bool IsGFX950) { 2274 // xdl def cycles | gfx940 | gfx950 2275 // 2 pass | 3 3 2276 // 4 pass | 5 6 2277 // 8 pass | 9 10 2278 // 16 pass | 17 18 2279 return NumPasses + 1 + (NumPasses != 2 && IsGFX950); 2280 } 2281 2282 static int 2283 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) { 2284 // 2 pass -> 2 2285 // 4 pass -> 4 2286 // 8 pass -> 8 2287 // 16 pass -> 16 2288 return NumPasses; 2289 } 2290 2291 static int 2292 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2293 // 2 pass -> 4 2294 // 4 pass -> 6 2295 // 8 pass -> 10 2296 // 16 pass -> 18 2297 return NumPasses + 2; 2298 } 2299 2300 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2301 // 2 pass -> 5 2302 // 4 pass -> 7 2303 // 8 pass -> 11 2304 // 16 pass -> 19 2305 return NumPasses + 3; 2306 } 2307 2308 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { 2309 int WaitStatesNeeded = 0; 2310 unsigned Opc = MI->getOpcode(); 2311 2312 auto IsLegacyVALUFn = [](const MachineInstr &MI) { 2313 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); 2314 }; 2315 2316 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { 2317 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && 2318 !SIInstrInfo::isDOT(MI); 2319 }; 2320 2321 if (!SIInstrInfo::isMFMA(*MI)) 2322 return WaitStatesNeeded; 2323 2324 const int VALUWritesExecWaitStates = 4; 2325 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2326 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn, 2327 VALUWritesExecWaitStates); 2328 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2329 2330 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2331 2332 // Loop for both DGEMM and S/HGEMM 2nd instruction. 
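// Here MI is the second (consuming) instruction; the producing MFMA whose
// destination overlaps one of MI's sources is located by IsOverlappedMFMAFn
// below.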
2333 for (const MachineOperand &Use : MI->explicit_uses()) { 2334 const int LegacyVALUNotDotWritesVGPRWaitStates = 2; 2335 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; 2336 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; 2337 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; 2338 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; 2339 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; 2340 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; 2341 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; 2342 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17; 2343 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; 2344 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; 2345 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; 2346 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; 2347 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; 2348 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; 2349 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19; 2350 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; 2351 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; 2352 const int MaxWaitStates = 19; 2353 2354 if (!Use.isReg()) 2355 continue; 2356 Register Reg = Use.getReg(); 2357 bool FullReg; 2358 const MachineInstr *MI1; 2359 2360 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, 2361 this](const MachineInstr &MI) { 2362 if (!SIInstrInfo::isMFMA(MI)) 2363 return false; 2364 Register DstReg = MI.getOperand(0).getReg(); 2365 FullReg = (DstReg == Reg); 2366 MI1 = &MI; 2367 return TRI.regsOverlap(DstReg, Reg); 2368 }; 2369 2370 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - 2371 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates); 2372 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2373 2374 int NumWaitStates = 2375 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates); 2376 if (NumWaitStates == std::numeric_limits<int>::max()) 2377 continue; 2378 2379 int OpNo = Use.getOperandNo(); 2380 unsigned Opc1 = MI1->getOpcode(); 2381 int NeedWaitStates = 0; 2382 if (OpNo == SrcCIdx) { 2383 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) { 2384 NeedWaitStates = 0; 2385 } else if (FullReg) { 2386 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2387 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && 2388 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2389 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) 2390 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; 2391 else if (ST.hasGFX940Insts() && 2392 TSchedModel.computeInstrLatency(MI1) == 2) 2393 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; 2394 } else { 2395 switch (Opc1) { 2396 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2397 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2398 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2399 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2400 if (!isXDL(ST, *MI)) 2401 NeedWaitStates = 2402 ST.hasGFX950Insts() 2403 ? 
GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates 2404 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates; 2405 break; 2406 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2407 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2408 if (!isXDL(ST, *MI)) 2409 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; 2410 break; 2411 default: 2412 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2413 if (ST.hasGFX940Insts()) { 2414 if (isXDL(ST, *MI) && !isXDL(ST, *MI1)) 2415 break; 2416 2417 NeedWaitStates = 2418 isXDL(ST, *MI1) 2419 ? (isXDL(ST, *MI) 2420 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates( 2421 NumPasses, ST.hasGFX950Insts()) 2422 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates( 2423 NumPasses, ST.hasGFX950Insts())) 2424 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( 2425 NumPasses); 2426 break; 2427 } 2428 2429 switch (NumPasses) { 2430 case 2: 2431 NeedWaitStates = 2432 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates 2433 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; 2434 break; 2435 case 8: 2436 NeedWaitStates = 2437 isDGEMM(Opc) 2438 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates 2439 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; 2440 break; 2441 case 16: 2442 NeedWaitStates = 2443 isDGEMM(Opc) 2444 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates 2445 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; 2446 break; 2447 default: 2448 llvm_unreachable("unexpected number of passes"); 2449 } 2450 } 2451 } 2452 } else { 2453 switch (Opc1) { 2454 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2455 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2456 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2457 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2458 NeedWaitStates = 2459 ST.hasGFX950Insts() 2460 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates 2461 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; 2462 break; 2463 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2464 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2465 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; 2466 break; 2467 default: 2468 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2469 2470 if (ST.hasGFX940Insts()) { 2471 NeedWaitStates = 2472 isXDL(ST, *MI1) 2473 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates( 2474 NumPasses) 2475 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates( 2476 NumPasses); 2477 break; 2478 } 2479 2480 switch (NumPasses) { 2481 case 2: 2482 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; 2483 break; 2484 case 4: 2485 llvm_unreachable("unexpected number of passes for mfma"); 2486 case 8: 2487 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; 2488 break; 2489 case 16: 2490 default: 2491 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; 2492 } 2493 } 2494 } 2495 if (WaitStatesNeeded >= NeedWaitStates) 2496 continue; 2497 2498 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; 2499 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2500 2501 if (WaitStatesNeeded == MaxWaitStates) 2502 break; 2503 } 2504 2505 // Pad neighboring MFMA with noops for better inter-wave performance. 
2506 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2507 2508 return WaitStatesNeeded; 2509 } 2510 2511 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { 2512 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards() 2513 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) 2514 return 0; 2515 2516 int WaitStatesNeeded = 0; 2517 2518 auto IsAccVgprReadFn = [](const MachineInstr &MI) { 2519 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; 2520 }; 2521 2522 for (const MachineOperand &Op : MI->explicit_uses()) { 2523 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) 2524 continue; 2525 2526 Register Reg = Op.getReg(); 2527 2528 const int AccVgprReadLdStWaitStates = 2; 2529 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1; 2530 const int MaxWaitStates = 2; 2531 2532 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - 2533 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates); 2534 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2535 2536 if (WaitStatesNeeded == MaxWaitStates) 2537 return WaitStatesNeeded; // Early exit. 2538 2539 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) { 2540 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && 2541 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2542 return false; 2543 auto IsVALUFn = [](const MachineInstr &MI) { 2544 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI); 2545 }; 2546 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) < 2547 std::numeric_limits<int>::max(); 2548 }; 2549 2550 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates - 2551 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates); 2552 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2553 } 2554 2555 return WaitStatesNeeded; 2556 } 2557 2558 int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) { 2559 assert(!ST.hasVcmpxPermlaneHazard() && 2560 "this is a different vcmpx+permlane hazard"); 2561 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 2562 const SIInstrInfo *TII = ST.getInstrInfo(); 2563 2564 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) { 2565 return isVCmpXWritesExec(*TII, *TRI, MI); 2566 }; 2567 2568 auto IsVALUFn = [](const MachineInstr &MI) { 2569 return SIInstrInfo::isVALU(MI); 2570 }; 2571 2572 const int VCmpXWritesExecWaitStates = 4; 2573 const int VALUWritesVDstWaitStates = 2; 2574 int WaitStatesNeeded = 0; 2575 2576 for (const MachineOperand &Op : MI->explicit_uses()) { 2577 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg())) 2578 continue; 2579 Register Reg = Op.getReg(); 2580 2581 int WaitStatesSinceDef = 2582 VALUWritesVDstWaitStates - 2583 getWaitStatesSinceDef(Reg, IsVALUFn, 2584 /*MaxWaitStates=*/VALUWritesVDstWaitStates); 2585 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef); 2586 if (WaitStatesNeeded >= VALUWritesVDstWaitStates) 2587 break; 2588 } 2589 2590 int VCmpXHazardWaits = 2591 VCmpXWritesExecWaitStates - 2592 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates); 2593 2594 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits); 2595 return WaitStatesNeeded; 2596 } 2597 2598 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2599 // 2 pass -> 4 2600 // 4 pass -> 6 2601 // 8 pass -> 10 2602 // 16 pass -> 18 2603 return NumPasses + 2; 2604 } 2605 2606 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2607 // 2 pass -> 5 2608 // 4 pass -> 7 2609 
// 8 pass -> 11
2610   // 16 pass -> 19
2611   return NumPasses + 3;
2612 }
2613
2614 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2615   // 2 pass -> 5
2616   // 4 pass -> 7
2617   // 8 pass -> 11
2618   // 16 pass -> 19
2619   return NumPasses + 3;
2620 }
2621
2622 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2623   // 2 pass -> 4
2624   // 4 pass -> 6
2625   // 8 pass -> 10
2626   // 16 pass -> 18
2627   return NumPasses + 2;
2628 }
2629
2630 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2631   if (!ST.hasGFX90AInsts())
2632     return 0;
2633
2634   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2635     return isDGEMM(MI.getOpcode());
2636   };
2637
2638   // This is checked in checkMAIHazards90A()
2639   if (SIInstrInfo::isMFMA(*MI))
2640     return 0;
2641
2642   const MachineRegisterInfo &MRI = MF.getRegInfo();
2643
2644   int WaitStatesNeeded = 0;
2645
2646   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2647                SIInstrInfo::isFLAT(*MI) ||
2648                SIInstrInfo::isDS(*MI);
2649   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2650   bool IsVALU = SIInstrInfo::isVALU(*MI);
2651
2652   const MachineInstr *MFMA = nullptr;
2653   unsigned Reg;
2654   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2655     if (!SIInstrInfo::isMFMA(MI) ||
2656         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2657       return false;
2658     MFMA = &MI;
2659     return true;
2660   };
2661
2662   const MachineInstr *DOT = nullptr;
2663   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2664     if (!SIInstrInfo::isDOT(MI) ||
2665         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2666       return false;
2667     DOT = &MI;
2668     return true;
2669   };
2670
2671   bool DGEMMAfterVALUWrite = false;
2672   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2673     // Found DGEMM on reverse traversal to def.
2674     if (isDGEMM(MI.getOpcode()))
2675       DGEMMAfterVALUWrite = true;
2676
2677     // Only hazard if the register is defined by a VALU and a DGEMM is found
2678     // after the def.
2679 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite) 2680 return false; 2681 2682 return true; 2683 }; 2684 2685 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2686 AMDGPU::OpName::src2); 2687 2688 if (IsMemOrExport || IsVALU) { 2689 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; 2690 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; 2691 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; 2692 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; 2693 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; 2694 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; 2695 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11; 2696 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19; 2697 const int DotWriteSameDotReadSrcAB = 3; 2698 const int DotWriteDifferentVALURead = 3; 2699 const int DMFMABetweenVALUWriteVMEMRead = 2; 2700 const int MaxWaitStates = 19; 2701 2702 for (const MachineOperand &Use : MI->explicit_uses()) { 2703 if (!Use.isReg()) 2704 continue; 2705 Reg = Use.getReg(); 2706 2707 DOT = nullptr; 2708 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, 2709 MaxWaitStates); 2710 if (DOT) { 2711 int NeedWaitStates = 0; 2712 if (DOT->getOpcode() == MI->getOpcode()) { 2713 if (&Use - &MI->getOperand(0) != SrcCIdx) 2714 NeedWaitStates = DotWriteSameDotReadSrcAB; 2715 } else { 2716 NeedWaitStates = DotWriteDifferentVALURead; 2717 } 2718 2719 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2720 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2721 } 2722 2723 // Workaround for HW data hazard bug observed only in GFX90A. When there 2724 // is a DGEMM instruction in-between a VALU and a VMEM instruction it 2725 // causes the SQ to incorrectly not insert two wait states between the two 2726 // instructions needed to avoid data hazard. 2727 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) { 2728 DGEMMAfterVALUWrite = false; 2729 if (TRI.isVectorRegister(MRI, Reg)) { 2730 int WaitStatesNeededForUse = 2731 DMFMABetweenVALUWriteVMEMRead - 2732 getWaitStatesSinceDef(Reg, IsDGEMMHazard, 2733 DMFMABetweenVALUWriteVMEMRead); 2734 2735 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2736 } 2737 } 2738 2739 MFMA = nullptr; 2740 WaitStatesSinceDef = 2741 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); 2742 if (!MFMA) 2743 continue; 2744 2745 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2746 int NumPasses = HazardDefLatency; 2747 int NeedWaitStates = MaxWaitStates; 2748 2749 if (isDGEMM(MFMA->getOpcode())) { 2750 switch (HazardDefLatency) { 2751 case 4: 2752 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates 2753 : DMFMA4x4WriteVgprVALUReadWaitStates; 2754 break; 2755 case 8: 2756 case 16: 2757 NeedWaitStates = 2758 IsMemOrExport 2759 ? DMFMA16x16WriteVgprMemExpReadWaitStates 2760 : (ST.hasGFX950Insts() 2761 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates 2762 : DMFMA16x16WriteVgprVALUReadWaitStates); 2763 break; 2764 default: 2765 llvm_unreachable("unexpected dgemm"); 2766 } 2767 } else if (ST.hasGFX940Insts()) { 2768 NeedWaitStates = 2769 isXDL(ST, *MFMA) 2770 ? 
GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) 2771 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( 2772 NumPasses); 2773 } else { 2774 switch (HazardDefLatency) { 2775 case 2: 2776 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; 2777 break; 2778 case 8: 2779 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; 2780 break; 2781 case 16: 2782 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates; 2783 break; 2784 default: 2785 llvm_unreachable("unexpected number of passes for mfma"); 2786 } 2787 } 2788 2789 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2790 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2791 2792 if (WaitStatesNeeded == MaxWaitStates) 2793 break; 2794 } 2795 } 2796 2797 unsigned Opc = MI->getOpcode(); 2798 const int DMFMAToFMA64WaitStates = 2; 2799 if ((Opc == AMDGPU::V_FMA_F64_e64 || 2800 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || 2801 Opc == AMDGPU::V_FMAC_F64_dpp) && 2802 WaitStatesNeeded < DMFMAToFMA64WaitStates) { 2803 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - 2804 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates); 2805 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2806 } 2807 2808 if (!IsVALU && !IsMemOrExport) 2809 return WaitStatesNeeded; 2810 2811 for (const MachineOperand &Def : MI->defs()) { 2812 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; 2813 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; 2814 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; 2815 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; 2816 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; 2817 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; 2818 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; 2819 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; 2820 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; 2821 const int DotWriteDifferentVALUWrite = 3; 2822 const int MaxWaitStates = 19; 2823 const int MaxWarWaitStates = 15; 2824 2825 Reg = Def.getReg(); 2826 2827 DOT = nullptr; 2828 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, 2829 MaxWaitStates); 2830 if (DOT && DOT->getOpcode() != MI->getOpcode()) 2831 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite - 2832 WaitStatesSinceDef); 2833 2834 MFMA = nullptr; 2835 WaitStatesSinceDef = 2836 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); 2837 if (MFMA) { 2838 int NeedWaitStates = MaxWaitStates; 2839 int NumPasses = TSchedModel.computeInstrLatency(MFMA); 2840 2841 if (isDGEMM(MFMA->getOpcode())) { 2842 switch (NumPasses) { 2843 case 4: 2844 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; 2845 break; 2846 case 8: 2847 case 16: 2848 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates; 2849 break; 2850 default: 2851 llvm_unreachable("unexpected number of cycles for dgemm"); 2852 } 2853 } else if (ST.hasGFX940Insts()) { 2854 NeedWaitStates = 2855 isXDL(ST, *MFMA) 2856 ? 
GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses) 2857 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); 2858 } else { 2859 switch (NumPasses) { 2860 case 2: 2861 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; 2862 break; 2863 case 8: 2864 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; 2865 break; 2866 case 16: 2867 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates; 2868 break; 2869 default: 2870 llvm_unreachable("Unexpected number of passes for mfma"); 2871 } 2872 } 2873 2874 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2875 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2876 2877 if (WaitStatesNeeded == MaxWaitStates) 2878 break; 2879 } 2880 2881 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) { 2882 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) || 2883 !MI.readsRegister(Reg, &TRI)) 2884 return false; 2885 2886 if (ST.hasGFX940Insts() && !isXDL(ST, MI)) 2887 return false; 2888 2889 const MachineOperand *SrcC = 2890 TII.getNamedOperand(MI, AMDGPU::OpName::src2); 2891 assert(SrcC); 2892 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg)) 2893 return false; 2894 2895 MFMA = &MI; 2896 return true; 2897 }; 2898 2899 MFMA = nullptr; 2900 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn, 2901 MaxWarWaitStates); 2902 if (!MFMA) 2903 continue; 2904 2905 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2906 int NeedWaitStates = MaxWaitStates; 2907 switch (HazardDefLatency) { 2908 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; 2909 break; 2910 case 4: assert(ST.hasGFX940Insts()); 2911 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; 2912 break; 2913 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; 2914 break; 2915 case 16: [[fallthrough]]; 2916 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; 2917 break; 2918 } 2919 2920 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; 2921 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2922 } 2923 2924 return WaitStatesNeeded; 2925 } 2926 2927 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { 2928 if (!SU->isInstr()) 2929 return false; 2930 2931 const MachineInstr *MAI = nullptr; 2932 2933 auto IsMFMAFn = [&MAI](const MachineInstr &MI) { 2934 MAI = nullptr; 2935 if (SIInstrInfo::isMFMA(MI)) 2936 MAI = &MI; 2937 return MAI != nullptr; 2938 }; 2939 2940 MachineInstr *MI = SU->getInstr(); 2941 if (IsMFMAFn(*MI)) { 2942 int W = getWaitStatesSince(IsMFMAFn, 16); 2943 if (MAI) 2944 return W < (int)TSchedModel.computeInstrLatency(MAI); 2945 } 2946 2947 return false; 2948 } 2949 2950 // Adjust global offsets for instructions bundled with S_GETPC_B64 after 2951 // insertion of a new instruction. 2952 static void updateGetPCBundle(MachineInstr *NewMI) { 2953 if (!NewMI->isBundled()) 2954 return; 2955 2956 // Find start of bundle. 2957 auto I = NewMI->getIterator(); 2958 while (I->isBundledWithPred()) 2959 I--; 2960 if (I->isBundle()) 2961 I++; 2962 2963 // Bail if this is not an S_GETPC bundle. 2964 if (I->getOpcode() != AMDGPU::S_GETPC_B64) 2965 return; 2966 2967 // Update offsets of any references in the bundle. 
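// The inserted s_waitcnt_depctr is 4 bytes, so global-address operands in the
// rest of the bundle (which are computed relative to the s_getpc_b64 result)
// must be rebased by that amount.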
2968 const unsigned NewBytes = 4; 2969 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 2970 "Unexpected instruction insertion in bundle"); 2971 auto NextMI = std::next(NewMI->getIterator()); 2972 auto End = NewMI->getParent()->end(); 2973 while (NextMI != End && NextMI->isBundledWithPred()) { 2974 for (auto &Operand : NextMI->operands()) { 2975 if (Operand.isGlobal()) 2976 Operand.setOffset(Operand.getOffset() + NewBytes); 2977 } 2978 NextMI++; 2979 } 2980 } 2981 2982 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { 2983 if (!ST.hasVALUMaskWriteHazard()) 2984 return false; 2985 assert(!ST.hasExtendedWaitCounts()); 2986 2987 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) 2988 return false; 2989 2990 // The hazard sequence is three instructions: 2991 // 1. VALU reads SGPR as mask 2992 // 2. SALU writes SGPR 2993 // 3. SALU reads SGPR 2994 // The hazard can expire if the distance between 2 and 3 is sufficient. 2995 // In practice this happens <10% of the time, hence this always assumes 2996 // the hazard exists if 1 and 2 are present to avoid searching. 2997 2998 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); 2999 if (!SDSTOp || !SDSTOp->isReg()) 3000 return false; 3001 3002 const Register HazardReg = SDSTOp->getReg(); 3003 if (HazardReg == AMDGPU::EXEC || 3004 HazardReg == AMDGPU::EXEC_LO || 3005 HazardReg == AMDGPU::EXEC_HI || 3006 HazardReg == AMDGPU::M0) 3007 return false; 3008 3009 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { 3010 switch (I.getOpcode()) { 3011 case AMDGPU::V_ADDC_U32_e32: 3012 case AMDGPU::V_ADDC_U32_dpp: 3013 case AMDGPU::V_CNDMASK_B16_fake16_e32: 3014 case AMDGPU::V_CNDMASK_B16_fake16_dpp: 3015 case AMDGPU::V_CNDMASK_B32_e32: 3016 case AMDGPU::V_CNDMASK_B32_dpp: 3017 case AMDGPU::V_DIV_FMAS_F32_e64: 3018 case AMDGPU::V_DIV_FMAS_F64_e64: 3019 case AMDGPU::V_SUBB_U32_e32: 3020 case AMDGPU::V_SUBB_U32_dpp: 3021 case AMDGPU::V_SUBBREV_U32_e32: 3022 case AMDGPU::V_SUBBREV_U32_dpp: 3023 // These implicitly read VCC as mask source. 3024 return HazardReg == AMDGPU::VCC || 3025 HazardReg == AMDGPU::VCC_LO || 3026 HazardReg == AMDGPU::VCC_HI; 3027 case AMDGPU::V_ADDC_U32_e64: 3028 case AMDGPU::V_ADDC_U32_e64_dpp: 3029 case AMDGPU::V_CNDMASK_B16_fake16_e64: 3030 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp: 3031 case AMDGPU::V_CNDMASK_B32_e64: 3032 case AMDGPU::V_CNDMASK_B32_e64_dpp: 3033 case AMDGPU::V_SUBB_U32_e64: 3034 case AMDGPU::V_SUBB_U32_e64_dpp: 3035 case AMDGPU::V_SUBBREV_U32_e64: 3036 case AMDGPU::V_SUBBREV_U32_e64_dpp: { 3037 // Only check mask register overlaps. 3038 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); 3039 assert(SSRCOp); 3040 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); 3041 } 3042 default: 3043 return false; 3044 } 3045 }; 3046 3047 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3048 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { 3049 // s_waitcnt_depctr sa_sdst(0) mitigates hazard. 3050 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 3051 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 3052 return true; 3053 3054 // VALU access to any SGPR or literal constant other than HazardReg 3055 // mitigates hazard. No need to check HazardReg here as this will 3056 // only be called when !IsHazardFn. 
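// For example, a VALU that reads any unrelated SGPR, or one that carries a
// non-inline literal operand, is enough to expire the hazard.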
3057 if (!SIInstrInfo::isVALU(I)) 3058 return false; 3059 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { 3060 const MachineOperand &Op = I.getOperand(OpNo); 3061 if (Op.isReg()) { 3062 Register OpReg = Op.getReg(); 3063 // Only consider uses 3064 if (!Op.isUse()) 3065 continue; 3066 // Ignore EXEC 3067 if (OpReg == AMDGPU::EXEC || 3068 OpReg == AMDGPU::EXEC_LO || 3069 OpReg == AMDGPU::EXEC_HI) 3070 continue; 3071 // Ignore all implicit uses except VCC 3072 if (Op.isImplicit()) { 3073 if (OpReg == AMDGPU::VCC || 3074 OpReg == AMDGPU::VCC_LO || 3075 OpReg == AMDGPU::VCC_HI) 3076 return true; 3077 continue; 3078 } 3079 if (TRI.isSGPRReg(MRI, OpReg)) 3080 return true; 3081 } else { 3082 const MCInstrDesc &InstDesc = I.getDesc(); 3083 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; 3084 if (!TII.isInlineConstant(Op, OpInfo)) 3085 return true; 3086 } 3087 } 3088 return false; 3089 }; 3090 3091 // Check for hazard 3092 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 3093 std::numeric_limits<int>::max()) 3094 return false; 3095 3096 auto NextMI = std::next(MI->getIterator()); 3097 3098 // Add s_waitcnt_depctr sa_sdst(0) after SALU write. 3099 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), 3100 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3101 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3102 3103 // SALU write may be s_getpc in a bundle. 3104 updateGetPCBundle(NewMI); 3105 3106 return true; 3107 } 3108 3109 // Return the numeric ID 0-63 of an 64b SGPR pair for a given SGPR. 3110 // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc 3111 static std::optional<unsigned> sgprPairNumber(Register Reg, 3112 const SIRegisterInfo &TRI) { 3113 switch (Reg) { 3114 case AMDGPU::M0: 3115 case AMDGPU::EXEC: 3116 case AMDGPU::EXEC_LO: 3117 case AMDGPU::EXEC_HI: 3118 case AMDGPU::SGPR_NULL: 3119 case AMDGPU::SGPR_NULL64: 3120 return {}; 3121 default: 3122 break; 3123 } 3124 unsigned RegN = TRI.getEncodingValue(Reg); 3125 if (RegN > 127) 3126 return {}; 3127 return (RegN >> 1) & 0x3f; 3128 } 3129 3130 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. 3131 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { 3132 assert(MMF == &MF); 3133 3134 // Assume non-empty vector means it has already been computed. 3135 if (!VALUReadHazardSGPRs.empty()) 3136 return; 3137 3138 auto CallingConv = MF.getFunction().getCallingConv(); 3139 bool IsCallFree = 3140 AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); 3141 3142 // Exhaustive search is only viable in non-caller/callee functions where 3143 // VALUs will be exposed to the hazard recognizer. 3144 UseVALUReadHazardExhaustiveSearch = 3145 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && 3146 MF.getInstructionCount() <= MaxExhaustiveHazardSearch; 3147 3148 // Consider all SGPRs hazards if the shader uses function calls or is callee. 3149 bool UseVALUUseCache = 3150 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; 3151 VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); 3152 if (!UseVALUUseCache) 3153 return; 3154 3155 // Perform a post ordered reverse scan to find VALUs which read an SGPR 3156 // before a SALU write to the same SGPR. This provides a reduction in 3157 // hazard insertion when all VALU access to an SGPR occurs after its last 3158 // SALU write, when compared to a linear scan. 
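// Note: tracking is per 64-bit SGPR pair (e.g. s4 and s5 both map to pair 2),
// matching the granularity at which the hazard applies.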
3159   const MachineRegisterInfo &MRI = MF.getRegInfo();
3160   BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3161   MachineCycleInfo CI;
3162   CI.compute(*MMF);
3163
3164   for (auto *MBB : post_order(&MF)) {
3165     bool InCycle = CI.getCycle(MBB) != nullptr;
3166     for (auto &MI : reverse(MBB->instrs())) {
3167       bool IsVALU = SIInstrInfo::isVALU(MI);
3168       bool IsSALU = SIInstrInfo::isSALU(MI);
3169       if (!IsVALU && !IsSALU)
3170         continue;
3171
3172       for (const MachineOperand &Op : MI.operands()) {
3173         if (!Op.isReg())
3174           continue;
3175         Register Reg = Op.getReg();
3176         assert(!Op.getSubReg());
3177         // Only consider implicit operands of VCC.
3178         if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3179                                  Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3180           continue;
3181         if (!TRI.isSGPRReg(MRI, Reg))
3182           continue;
3183         auto RegN = sgprPairNumber(Reg, TRI);
3184         if (!RegN)
3185           continue;
3186         if (IsVALU && Op.isUse()) {
3187           // Note: any access within a cycle must be considered a hazard.
3188           if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3189             VALUReadHazardSGPRs.set(*RegN);
3190           ReadSGPRs.set(*RegN);
3191         } else if (IsSALU) {
3192           if (Op.isDef())
3193             SALUWriteSGPRs.set(*RegN);
3194           else
3195             ReadSGPRs.set(*RegN);
3196         }
3197       }
3198     }
3199   }
3200 }
3201
3202 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3203   if (!ST.hasVALUReadSGPRHazard())
3204     return false;
3205
3206   // The hazard sequence is fundamentally three instructions:
3207   // 1. VALU reads SGPR
3208   // 2. SALU writes SGPR
3209   // 3. VALU/SALU reads SGPR
3210   // Try to avoid searching for (1) because the expiry point of the hazard is
3211   // indeterminate; however, the hazard between (2) and (3) can expire if the
3212   // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3213   // Note: SGPRs must be considered as 64-bit pairs as the hazard exists even
3214   // if only individual SGPRs are accessed.
3215
3216   bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3217   bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3218   if (!(MIIsSALU || MIIsVALU))
3219     return false;
3220
3221   // Avoid an expensive search when compile time is a priority by mitigating
3222   // every SALU which writes an SGPR.
3223   if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3224     if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3225       return false;
3226
3227     const MachineOperand *SDSTOp =
3228         TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3229     if (!SDSTOp || !SDSTOp->isReg())
3230       return false;
3231
3232     const Register HazardReg = SDSTOp->getReg();
3233     if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3234         HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3235       return false;
3236
3237     // Add s_wait_alu sa_sdst(0) after SALU write.
3238     auto NextMI = std::next(MI->getIterator());
3239     auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3240                          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3241                      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3242
3243     // SALU write may be s_getpc in a bundle.
3244     updateGetPCBundle(NewMI);
3245
3246     return true;
3247   }
3248
3249   // Pre-compute set of SGPR pairs read by VALUs.
3250   // Note: pass mutable pointer to MachineFunction for CycleInfo.
3251   computeVALUHazardSGPRs(MI->getMF());
3252
3253   // If no VALU hazard SGPRs exist then there is nothing to do.
3254   if (VALUReadHazardSGPRs.none())
3255     return false;
3256
3257   // All SGPR writes before a call/return must be flushed as the callee/caller
3258   // will not see the hazard chain, i.e. (2) to (3) described above.
3259 const bool IsSetPC = (MI->isCall() || MI->isReturn()) && 3260 !(MI->getOpcode() == AMDGPU::S_ENDPGM || 3261 MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED); 3262 3263 // Collect all SGPR sources for MI which are read by a VALU. 3264 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3265 SmallSet<Register, 4> SGPRsUsed; 3266 3267 if (!IsSetPC) { 3268 for (const MachineOperand &Op : MI->all_uses()) { 3269 Register OpReg = Op.getReg(); 3270 3271 // Only consider VCC implicit uses on VALUs. 3272 // The only expected SALU implicit access is SCC which is no hazard. 3273 if (MIIsSALU && Op.isImplicit()) 3274 continue; 3275 3276 if (!TRI.isSGPRReg(MRI, OpReg)) 3277 continue; 3278 3279 auto RegN = sgprPairNumber(OpReg, TRI); 3280 if (!RegN) 3281 continue; 3282 3283 if (!VALUReadHazardSGPRs[*RegN]) 3284 continue; 3285 3286 SGPRsUsed.insert(OpReg); 3287 } 3288 3289 // No SGPRs -> nothing to do. 3290 if (SGPRsUsed.empty()) 3291 return false; 3292 } 3293 3294 // A hazard is any SALU which writes one of the SGPRs read by MI. 3295 auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { 3296 if (!SIInstrInfo::isSALU(I)) 3297 return false; 3298 // Ensure SGPR flush before call/return by conservatively assuming every 3299 // SALU writes an SGPR. 3300 if (IsSetPC && I.getNumDefs() > 0) 3301 return true; 3302 // Check for any register writes. 3303 return any_of(SGPRsUsed, [this, &I](Register Reg) { 3304 return I.modifiesRegister(Reg, &TRI); 3305 }); 3306 }; 3307 3308 const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; 3309 auto IsExpiredFn = [&](const MachineInstr &I, int Count) { 3310 if (Count >= SALUExpiryCount) 3311 return true; 3312 // s_wait_alu sa_sdst(0) on path mitigates hazard. 3313 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 3314 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 3315 return true; 3316 return false; 3317 }; 3318 3319 auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { 3320 // Only count true SALUs as wait states. 3321 if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) 3322 return 0; 3323 // SALU must be unrelated to any hazard registers. 3324 if (any_of(SGPRsUsed, 3325 [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); })) 3326 return 0; 3327 return 1; 3328 }; 3329 3330 // Check for the hazard. 3331 DenseSet<const MachineBasicBlock *> Visited; 3332 int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 3333 std::next(MI->getReverseIterator()), 0, 3334 IsExpiredFn, Visited, WaitStatesFn); 3335 3336 if (WaitStates >= SALUExpiryCount) 3337 return false; 3338 3339 // Validate hazard through an exhaustive search. 3340 if (UseVALUReadHazardExhaustiveSearch) { 3341 // A hazard is any VALU which reads one of the paired SGPRs read by MI. 3342 // This is searching for (1) in the hazard description. 3343 auto hazardPair = [this](Register Reg) { 3344 if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) 3345 return Register(AMDGPU::VCC); 3346 auto RegN = sgprPairNumber(Reg, TRI); 3347 return Register(AMDGPU::SGPR0_SGPR1 + *RegN); 3348 }; 3349 auto SearchHazardFn = [this, hazardPair, 3350 &SGPRsUsed](const MachineInstr &I) { 3351 if (!SIInstrInfo::isVALU(I)) 3352 return false; 3353 // Check for any register reads. 
3354 return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { 3355 return I.readsRegister(hazardPair(Reg), &TRI); 3356 }); 3357 }; 3358 auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { 3359 return false; 3360 }; 3361 if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == 3362 std::numeric_limits<int>::max()) 3363 return false; 3364 } 3365 3366 // Add s_wait_alu sa_sdst(0) before SALU read. 3367 auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 3368 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3369 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3370 3371 // SALU read may be after s_getpc in a bundle. 3372 updateGetPCBundle(NewMI); 3373 3374 return true; 3375 } 3376 3377 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, 3378 const SIInstrInfo &TII) { 3379 MachineBasicBlock &EntryMBB = MF->front(); 3380 if (EntryMBB.begin() != EntryMBB.end()) { 3381 auto &EntryMI = *EntryMBB.begin(); 3382 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO && 3383 EntryMI.getOperand(0).getImm() >= Priority) 3384 return false; 3385 } 3386 3387 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO)) 3388 .addImm(Priority); 3389 return true; 3390 } 3391 3392 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { 3393 if (!ST.hasRequiredExportPriority()) 3394 return false; 3395 3396 // Assume the following shader types will never have exports, 3397 // and avoid adding or adjusting S_SETPRIO. 3398 MachineBasicBlock *MBB = MI->getParent(); 3399 MachineFunction *MF = MBB->getParent(); 3400 auto CC = MF->getFunction().getCallingConv(); 3401 switch (CC) { 3402 case CallingConv::AMDGPU_CS: 3403 case CallingConv::AMDGPU_CS_Chain: 3404 case CallingConv::AMDGPU_CS_ChainPreserve: 3405 case CallingConv::AMDGPU_KERNEL: 3406 return false; 3407 default: 3408 break; 3409 } 3410 3411 const int MaxPriority = 3; 3412 const int NormalPriority = 2; 3413 const int PostExportPriority = 0; 3414 3415 auto It = MI->getIterator(); 3416 switch (MI->getOpcode()) { 3417 case AMDGPU::S_ENDPGM: 3418 case AMDGPU::S_ENDPGM_SAVED: 3419 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE: 3420 case AMDGPU::SI_RETURN_TO_EPILOG: 3421 // Ensure shader with calls raises priority at entry. 3422 // This ensures correct priority if exports exist in callee. 3423 if (MF->getFrameInfo().hasCalls()) 3424 return ensureEntrySetPrio(MF, NormalPriority, TII); 3425 return false; 3426 case AMDGPU::S_SETPRIO: { 3427 // Raise minimum priority unless in workaround. 3428 auto &PrioOp = MI->getOperand(0); 3429 int Prio = PrioOp.getImm(); 3430 bool InWA = (Prio == PostExportPriority) && 3431 (It != MBB->begin() && TII.isEXP(*std::prev(It))); 3432 if (InWA || Prio >= NormalPriority) 3433 return false; 3434 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority)); 3435 return true; 3436 } 3437 default: 3438 if (!TII.isEXP(*MI)) 3439 return false; 3440 break; 3441 } 3442 3443 // Check entry priority at each export (as there will only be a few). 3444 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. 3445 bool Changed = false; 3446 if (CC != CallingConv::AMDGPU_Gfx) 3447 Changed = ensureEntrySetPrio(MF, NormalPriority, TII); 3448 3449 auto NextMI = std::next(It); 3450 bool EndOfShader = false; 3451 if (NextMI != MBB->end()) { 3452 // Only need WA at end of sequence of exports. 3453 if (TII.isEXP(*NextMI)) 3454 return Changed; 3455 // Assume appropriate S_SETPRIO after export means WA already applied. 
3456 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO && 3457 NextMI->getOperand(0).getImm() == PostExportPriority) 3458 return Changed; 3459 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM; 3460 } 3461 3462 const DebugLoc &DL = MI->getDebugLoc(); 3463 3464 // Lower priority. 3465 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3466 .addImm(PostExportPriority); 3467 3468 if (!EndOfShader) { 3469 // Wait for exports to complete. 3470 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT)) 3471 .addReg(AMDGPU::SGPR_NULL) 3472 .addImm(0); 3473 } 3474 3475 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3476 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3477 3478 if (!EndOfShader) { 3479 // Return to normal (higher) priority. 3480 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3481 .addImm(NormalPriority); 3482 } 3483 3484 return true; 3485 } 3486