//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

static cl::opt<unsigned> MaxExhaustiveHazardSearch(
    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
    cl::desc("Maximum function size for exhaustive hazard search"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return AMDGPU::getMAIIsDGEMM(Opcode);
}

static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
}

static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI)) && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI =
      std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E =
      CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstrs for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search.
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scanning stops once \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they use the same address. For now, just start a new clause whenever we
  // see a store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // an s_mov writing a descriptor and an s_buffer_load_dword reading that
    // descriptor need some number of nops in between. We don't know how many
    // we need, so use 4. This probably wasn't discovered before because the
    // only case where it happens is when we expand a 64-bit pointer into a
    // full descriptor and use s_buffer_load_dword instead of s_load_dword,
    // which was probably never encountered in closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into the correct bit position of the dest register.
/// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the \p
/// MI and \returns an operand with a dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions
  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
  // which write hi bits (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and
  // CVT_SR_BF8_F32 with op_sel[3:2] != 0
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
        return nullptr;
  } else {
    // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
    // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
    if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
        !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
              SISrcMods::DST_OP_SEL ||
          (AMDGPU::isFP8DstSelInst(Opcode) &&
           (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
            SISrcMods::OP_SEL_0))))
      return nullptr;
  }

  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroesHigh16BitsOfDest)
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst.
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has a dst forwarding hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

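  // Descriptive note (derived from the check below): a VALU write to the
  // lane-select SGPR needs 4 wait states before it can be consumed by
  // v_readlane/v_writelane.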
  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixVALUReadSGPRHazard(MI);
  fixRequiredExportPriority(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return (TII->isVOPC(MI) ||
            ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
           MI.modifiesRegister(AMDGPU::EXEC, TRI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() &&
            TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW.
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire the hazard.
    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
        !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
  // according to the type of VMEM instruction.
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  }

  return true;
}

bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources.
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  //
  // Where:
  //   intv1 + intv2 <= 2 VALUs
  //   intv3 <= 4 VALUs
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
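
  // Descriptive note (derived from the constants below): Intv1plus2MaxVALUs
  // and Intv3MaxVALUs encode the interval limits from the pattern above, and
  // NoHazardVALUWaitStates bounds how many VALUs back the search considers
  // before expiring.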
  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire the hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track register writes.
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3.
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed.
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change.
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALU post exec change.
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU pre exec change.
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2?
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);

  return true;
}

bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  // Va <- TRANS VALU
  // intv
  // MI Va (WaitState = 0)
  //
  // Where:
  // intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    int VALUs = 0;
    int TRANS = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

    // Track register writes
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // Hazard is observed - insert a wait on the va_vdst counter to ensure the
  // hazard is avoided.
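  // A minimal sketch of the mitigation emitted below (assuming the usual
  // disassembly syntax): s_waitcnt_depctr va_vdst(0), built via
  // AMDGPU::DepCtr::encodeFieldVaVdst(0).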
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));

  return true;
}

bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
      return false;

    // Src0 (matrix A) or Src1 (matrix B) of the current wmma instruction
    // overlaps with the dest (matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
    // but Index can't overlap with PrevDstReg.
    if (AMDGPU::isGFX12Plus(ST)) {
      if (SIInstrInfo::isSWMMAC(*MI)) {
        const Register CurIndex =
            TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
        if (TRI->regsOverlap(PrevDstReg, CurIndex))
          return true;
      }
      return false;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}

bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is the last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());
  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  // Insert a full wait count because the found register might be pending a
  // wait.
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
      .addImm(0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by
  // the parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running the hazard recognizer on the modified instruction is not
  // necessary: the inserted V_SWAP_B32 instructions have already both read and
  // written the new registers, so hazards related to these registers have
  // already been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so the verifier may see it as undef.
  Amt->setIsUndef();
  if (OverlappedDst)
    MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())
    return 0;
  assert(!ST.hasExtendedWaitCounts());

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}

int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
  // Early exit if no padding is requested.
  if (MFMAPaddingRatio == 0)
    return 0;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
    return 0;

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI))
      return false;

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
    return true;
  };

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}

int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
        getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
            getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
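    // Select the wait-state requirement for this AGPR operand: overlaps on
    // src2 (SrcC) are cheaper than overlaps on the A/B sources, and the
    // v_accvgpr_read/write cases are keyed off the producer's latency
    // (2/8/16 passes).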
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}

static int
GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass -> 3
  // 4 pass -> 5
  // 8 pass -> 9
  // 16 pass -> 17
  return NumPasses + 1;
}

static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass -> 2
  // 4 pass -> 4
  // 8 pass -> 8
  // 16 pass -> 16
  return NumPasses;
}

static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass -> 5
  // 4 pass -> 7
  // 8 pass -> 11
  // 16 pass -> 19
  return NumPasses + 3;
}

int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
  };

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
           !SIInstrInfo::isDOT(MI);
  };

  if (!SIInstrInfo::isMFMA(*MI))
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                            VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
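  // Example shape (hypothetical registers): a prior MFMA writing v[0:7]
  // followed by this MFMA reading an overlapping range as SrcC or SrcA/B; the
  // required wait states depend on the producer's pass count and on whether
  // either instruction is a DGEMM/XDL op, per the constants below.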
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
        getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          int NumPasses = TSchedModel.computeInstrLatency(MI1);
          if (ST.hasGFX940Insts()) {
            if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
              break;

            NeedWaitStates =
                isXDL(ST, *MI1)
                    ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses)
                    : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses);
            break;
          }

          switch (NumPasses) {
          case 2:
            NeedWaitStates =
                isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                             : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates =
                isDGEMM(Opc)
                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            NeedWaitStates =
                isDGEMM(Opc)
                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
          NeedWaitStates =
              isXDL(ST, *MI1)
                  ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses)
                  : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses);
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 4:
          llvm_unreachable("unexpected number of passes for mfma");
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
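    // Second case handled below (sketch; instruction names are illustrative
    // only): a plain VALU write feeding a v_accvgpr_read/v_accvgpr_write that
    // is then consumed by this load/store needs one more wait state
    // (VALUWriteAccVgprRdWrLdStDepVALUWaitStates).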
    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
        getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 5
  // 4 pass -> 7
  // 8 pass -> 11
  // 16 pass -> 19
  return NumPasses + 3;
}

static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 5
  // 4 pass -> 7
  // 8 pass -> 11
  // 16 pass -> 19
  return NumPasses + 3;
}

static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}

int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return isDGEMM(MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A()
  if (SIInstrInfo::isMFMA(*MI))
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  int WaitStatesNeeded = 0;

  bool IsMem = SIInstrInfo::isVMEM(*MI) ||
               SIInstrInfo::isFLAT(*MI) ||
               SIInstrInfo::isDS(*MI);
  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // Found DGEMM on reverse traversal to def.
    if (isDGEMM(MI.getOpcode()))
      DGEMMAfterVALUWrite = true;

    // Only a hazard if the register is defined by a VALU and a DGEMM is
    // found after the def.
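    // Illustrative sequence (hypothetical instructions and registers):
    //   v_mov_b32 v1, ...          ; VALU defines the register
    //   v_mfma_f64_16x16x4f64 ...  ; DGEMM issued after the def
    //   flat_store_dword ..., v1   ; VMEM access being checked
    // This is the GFX90A-only case covered by DMFMABetweenVALUWriteVMEMRead
    // below.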
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      // Workaround for a HW data hazard bug observed only on GFX90A. When
      // there is a DGEMM instruction in-between a VALU and a VMEM instruction,
      // it causes the SQ to incorrectly omit the two wait states between the
      // two instructions that are needed to avoid the data hazard.
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                    DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        }
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = IsMemOrExport
                               ? DMFMA16x16WriteVgprMemExpReadWaitStates
                               : DMFMA16x16WriteVgprVALUReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            isXDL(ST, *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
                : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
        getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            isXDL(ST, *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !isXDL(ST, MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 4:  assert(ST.hasGFX940Insts());
             NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;

  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMFMA(MI))
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}

// Adjust global offsets for instructions bundled with S_GETPC_B64 after
// insertion of a new instruction.
static void updateGetPCBundle(MachineInstr *NewMI) {
  if (!NewMI->isBundled())
    return;

  // Find start of bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    I--;
  if (I->isBundle())
    I++;

  // Bail if this is not an S_GETPC bundle.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  // Update offsets of any references in the bundle.
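  // Sketch of the situation (hypothetical bundle): an s_getpc_b64 followed by
  // bundled instructions carrying global-symbol operands whose offsets are
  // relative to the s_getpc_b64 result. The newly inserted 4-byte
  // s_waitcnt_depctr shifts those PC-relative offsets, so they are rebased by
  // NewBytes below.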
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    NextMI++;
  }
}

bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. SALU writes SGPR
  //   3. SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient.
  // In practice this happens <10% of the time, hence this always assumes
  // the hazard exists if 1 and 2 are present to avoid searching.

  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  const Register HazardReg = SDSTOp->getReg();
  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)
    return false;

  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_e32:
    case AMDGPU::V_CNDMASK_B16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These implicitly read VCC as mask source.
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_e64:
    case AMDGPU::V_CNDMASK_B16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
    }
    default:
      return false;
    }
  };

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
    // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
      return true;

    // VALU access to any SGPR or literal constant other than HazardReg
    // mitigates hazard. No need to check HazardReg here as this will
    // only be called when !IsHazardFn.
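    // For example (hypothetical): v_add_f32 v0, s10, v1 expires the hazard by
    // sourcing an unrelated SGPR, as would any VALU carrying a non-inline
    // literal operand; the scan below looks for such an access.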
    if (!SIInstrInfo::isVALU(I))
      return false;
    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      const MachineOperand &Op = I.getOperand(OpNo);
      if (Op.isReg()) {
        Register OpReg = Op.getReg();
        // Only consider uses.
        if (!Op.isUse())
          continue;
        // Ignore EXEC.
        if (OpReg == AMDGPU::EXEC ||
            OpReg == AMDGPU::EXEC_LO ||
            OpReg == AMDGPU::EXEC_HI)
          continue;
        // Ignore all implicit uses except VCC.
        if (Op.isImplicit()) {
          if (OpReg == AMDGPU::VCC ||
              OpReg == AMDGPU::VCC_LO ||
              OpReg == AMDGPU::VCC_HI)
            return true;
          continue;
        }
        if (TRI.isSGPRReg(MRI, OpReg))
          return true;
      } else {
        const MCInstrDesc &InstDesc = I.getDesc();
        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
        if (!TII.isInlineConstant(Op, OpInfo))
          return true;
      }
    }
    return false;
  };

  // Check for the hazard.
  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  auto NextMI = std::next(MI->getIterator());

  // Add s_waitcnt_depctr sa_sdst(0) after the SALU write.
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

  // The SALU write may be s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}

// Return the numeric ID 0-63 of the 64b SGPR pair for a given SGPR.
// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
static std::optional<unsigned> sgprPairNumber(Register Reg,
                                              const SIRegisterInfo &TRI) {
  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::SGPR_NULL:
  case AMDGPU::SGPR_NULL64:
    return {};
  default:
    break;
  }
  unsigned RegN = TRI.getEncodingValue(Reg);
  if (RegN > 127)
    return {};
  return (RegN >> 1) & 0x3f;
}

// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
  assert(MMF == &MF);

  // Assume a non-empty vector means it has already been computed.
  if (!VALUReadHazardSGPRs.empty())
    return;

  auto CallingConv = MF.getFunction().getCallingConv();
  bool IsCallFree =
      AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();

  // Exhaustive search is only viable in non-caller/callee functions where
  // VALUs will be exposed to the hazard recognizer.
  UseVALUReadHazardExhaustiveSearch =
      IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
      MF.getInstructionCount() <= MaxExhaustiveHazardSearch;

  // Consider all SGPRs as hazards if the shader uses function calls or is a
  // callee.
  bool UseVALUUseCache =
      IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
  VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
  if (!UseVALUUseCache)
    return;

  // Perform a post-ordered reverse scan to find VALUs which read an SGPR
  // before a SALU write to the same SGPR. This provides a reduction in
  // hazard insertion when all VALU access to an SGPR occurs after its last
  // SALU write, when compared to a linear scan.
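  // For example (illustrative, ignoring cycles): if every VALU read of the
  // pair s[4:5] occurs after the last SALU write to s[4:5] in program order,
  // the backward scan below never sees a read-then-write combination for that
  // pair, so it is not marked and no wait is inserted for it later.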
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
  MachineCycleInfo CI;
  CI.compute(*MMF);

  for (auto *MBB : post_order(&MF)) {
    bool InCycle = CI.getCycle(MBB) != nullptr;
    for (auto &MI : reverse(MBB->instrs())) {
      bool IsVALU = SIInstrInfo::isVALU(MI);
      bool IsSALU = SIInstrInfo::isSALU(MI);
      if (!IsVALU && !IsSALU)
        continue;

      for (const MachineOperand &Op : MI.operands()) {
        if (!Op.isReg())
          continue;
        Register Reg = Op.getReg();
        assert(!Op.getSubReg());
        // Only consider implicit operands of VCC.
        if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
                                 Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
          continue;
        if (!TRI.isSGPRReg(MRI, Reg))
          continue;
        auto RegN = sgprPairNumber(Reg, TRI);
        if (!RegN)
          continue;
        if (IsVALU && Op.isUse()) {
          // Note: any access within a cycle must be considered a hazard.
          if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
            VALUReadHazardSGPRs.set(*RegN);
          ReadSGPRs.set(*RegN);
        } else if (IsSALU) {
          if (Op.isDef())
            SALUWriteSGPRs.set(*RegN);
          else
            ReadSGPRs.set(*RegN);
        }
      }
    }
  }
}

bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
  if (!ST.hasVALUReadSGPRHazard())
    return false;

  // The hazard sequence is fundamentally three instructions:
  //   1. VALU reads SGPR
  //   2. SALU writes SGPR
  //   3. VALU/SALU reads SGPR
  // Try to avoid searching for (1) because the expiry point of the hazard is
  // indeterminate; however, the hazard between (2) and (3) can expire if the
  // gap contains sufficient SALU instructions with no usage of the SGPR from
  // (1).
  // Note: SGPRs must be considered as 64-bit pairs as the hazard exists
  // even if individual SGPRs are accessed.

  bool MIIsSALU = SIInstrInfo::isSALU(*MI);
  bool MIIsVALU = SIInstrInfo::isVALU(*MI);
  if (!(MIIsSALU || MIIsVALU))
    return false;

  // Avoid the expensive search when compile time is the priority by
  // mitigating every SALU which writes an SGPR.
  if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
    if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
      return false;

    const MachineOperand *SDSTOp =
        TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
    if (!SDSTOp || !SDSTOp->isReg())
      return false;

    const Register HazardReg = SDSTOp->getReg();
    if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
        HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
      return false;

    // Add s_wait_alu sa_sdst(0) after the SALU write.
    auto NextMI = std::next(MI->getIterator());
    auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                         TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                     .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

    // The SALU write may be s_getpc in a bundle.
    updateGetPCBundle(NewMI);

    return true;
  }

  // Pre-compute the set of SGPR pairs read by VALUs.
  // Note: pass a mutable pointer to the MachineFunction for CycleInfo.
  computeVALUHazardSGPRs(MI->getMF());

  // If no VALU hazard SGPRs exist then there is nothing to do.
  if (VALUReadHazardSGPRs.none())
    return false;

  // All SGPR writes before a call/return must be flushed as the callee/caller
  // will not see the hazard chain, i.e. (2) to (3) described above.
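  // e.g. (illustrative): at an s_swappc_b64 call site the callee may read an
  // SGPR pair written by a preceding SALU, so the code below conservatively
  // treats any prior SALU def as a hazard and flushes it with
  // s_wait_alu sa_sdst(0).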
  const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
                       !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
                         MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);

  // Collect all SGPR sources for MI which are read by a VALU.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallSet<Register, 4> SGPRsUsed;

  if (!IsSetPC) {
    for (const MachineOperand &Op : MI->all_uses()) {
      Register OpReg = Op.getReg();

      // Only consider VCC implicit uses on VALUs.
      // The only expected SALU implicit access is SCC which is no hazard.
      if (MIIsSALU && Op.isImplicit())
        continue;

      if (!TRI.isSGPRReg(MRI, OpReg))
        continue;

      auto RegN = sgprPairNumber(OpReg, TRI);
      if (!RegN)
        continue;

      if (!VALUReadHazardSGPRs[*RegN])
        continue;

      SGPRsUsed.insert(OpReg);
    }

    // No SGPRs -> nothing to do.
    if (SGPRsUsed.empty())
      return false;
  }

  // A hazard is any SALU which writes one of the SGPRs read by MI.
  auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
    if (!SIInstrInfo::isSALU(I))
      return false;
    // Ensure SGPR flush before call/return by conservatively assuming every
    // SALU writes an SGPR.
    if (IsSetPC && I.getNumDefs() > 0)
      return true;
    // Check for any register writes.
    return any_of(SGPRsUsed, [this, &I](Register Reg) {
      return I.modifiesRegister(Reg, &TRI);
    });
  };

  const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
  auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
    if (Count >= SALUExpiryCount)
      return true;
    // s_wait_alu sa_sdst(0) on path mitigates hazard.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
    // Only count true SALUs as wait states.
    if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
      return 0;
    // SALU must be unrelated to any hazard registers.
    if (any_of(SGPRsUsed,
               [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
      return 0;
    return 1;
  };

  // Check for the hazard.
  DenseSet<const MachineBasicBlock *> Visited;
  int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                        std::next(MI->getReverseIterator()), 0,
                                        IsExpiredFn, Visited, WaitStatesFn);

  if (WaitStates >= SALUExpiryCount)
    return false;

  // Validate hazard through an exhaustive search.
  if (UseVALUReadHazardExhaustiveSearch) {
    // A hazard is any VALU which reads one of the paired SGPRs read by MI.
    // This is searching for (1) in the hazard description.
    auto hazardPair = [this](Register Reg) {
      if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
        return Register(AMDGPU::VCC);
      auto RegN = sgprPairNumber(Reg, TRI);
      return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
    };
    auto SearchHazardFn = [this, hazardPair,
                           &SGPRsUsed](const MachineInstr &I) {
      if (!SIInstrInfo::isVALU(I))
        return false;
      // Check for any register reads.
      return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
        return I.readsRegister(hazardPair(Reg), &TRI);
      });
    };
    auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
      return false;
    };
    if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
        std::numeric_limits<int>::max())
      return false;
  }

  // Add s_wait_alu sa_sdst(0) before SALU read.
  auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

  // SALU read may be after s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}

static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}

bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure shader with calls raises priority at entry.
    // This ensures correct priority if exports exist in callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise minimum priority unless in workaround.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need WA at end of sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // Assume appropriate S_SETPRIO after export means WA already applied.
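    // The applied workaround emitted further below looks roughly like
    // (sketch only, assuming the usual disassembly syntax):
    //   s_setprio 0
    //   s_waitcnt_expcnt null, 0x0
    //   s_nop 0
    //   s_nop 0
    //   s_setprio 2
    // with the expcnt wait and the trailing s_setprio omitted at the end of
    // the shader.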
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}