//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

static cl::opt<unsigned> MaxExhaustiveHazardSearch(
    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
    cl::desc("Maximum function size for exhaustive hazard search"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      UseVALUReadHazardExhaustiveSearch(false),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ?
19 : 5; 66 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); 67 } 68 69 void GCNHazardRecognizer::Reset() { 70 EmittedInstrs.clear(); 71 } 72 73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { 74 EmitInstruction(SU->getInstr()); 75 } 76 77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { 78 CurrCycleInstr = MI; 79 } 80 81 static bool isDivFMas(unsigned Opcode) { 82 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64; 83 } 84 85 static bool isSGetReg(unsigned Opcode) { 86 return Opcode == AMDGPU::S_GETREG_B32; 87 } 88 89 static bool isSSetReg(unsigned Opcode) { 90 switch (Opcode) { 91 case AMDGPU::S_SETREG_B32: 92 case AMDGPU::S_SETREG_B32_mode: 93 case AMDGPU::S_SETREG_IMM32_B32: 94 case AMDGPU::S_SETREG_IMM32_B32_mode: 95 return true; 96 } 97 return false; 98 } 99 100 static bool isRWLane(unsigned Opcode) { 101 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; 102 } 103 104 static bool isRFE(unsigned Opcode) { 105 return Opcode == AMDGPU::S_RFE_B64; 106 } 107 108 static bool isSMovRel(unsigned Opcode) { 109 switch (Opcode) { 110 case AMDGPU::S_MOVRELS_B32: 111 case AMDGPU::S_MOVRELS_B64: 112 case AMDGPU::S_MOVRELD_B32: 113 case AMDGPU::S_MOVRELD_B64: 114 return true; 115 default: 116 return false; 117 } 118 } 119 120 static bool isDGEMM(unsigned Opcode) { 121 return AMDGPU::getMAIIsDGEMM(Opcode); 122 } 123 124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { 125 unsigned Opcode = MI.getOpcode(); 126 127 if (!SIInstrInfo::isMAI(MI) || 128 isDGEMM(Opcode) || 129 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || 130 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) 131 return false; 132 133 if (!ST.hasGFX940Insts()) 134 return true; 135 136 return AMDGPU::getMAIIsGFX940XDL(Opcode); 137 } 138 139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, 140 const MachineInstr &MI) { 141 if (TII.isAlwaysGDS(MI.getOpcode())) 142 return true; 143 144 switch (MI.getOpcode()) { 145 case AMDGPU::S_SENDMSG: 146 case AMDGPU::S_SENDMSGHALT: 147 case AMDGPU::S_TTRACEDATA: 148 return true; 149 // These DS opcodes don't support GDS. 
150 case AMDGPU::DS_NOP: 151 case AMDGPU::DS_PERMUTE_B32: 152 case AMDGPU::DS_BPERMUTE_B32: 153 return false; 154 default: 155 if (TII.isDS(MI.getOpcode())) { 156 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 157 AMDGPU::OpName::gds); 158 if (MI.getOperand(GDS).getImm()) 159 return true; 160 } 161 return false; 162 } 163 } 164 165 static bool isPermlane(const MachineInstr &MI) { 166 unsigned Opcode = MI.getOpcode(); 167 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || 168 Opcode == AMDGPU::V_PERMLANE64_B32 || 169 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || 170 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || 171 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 || 172 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 || 173 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 || 174 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 || 175 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64; 176 } 177 178 static bool isLdsDma(const MachineInstr &MI) { 179 return SIInstrInfo::isVALU(MI) && 180 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)); 181 } 182 183 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { 184 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, 185 AMDGPU::OpName::simm16); 186 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm())); 187 } 188 189 ScheduleHazardRecognizer::HazardType 190 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { 191 MachineInstr *MI = SU->getInstr(); 192 // If we are not in "HazardRecognizerMode" and therefore not being run from 193 // the scheduler, track possible stalls from hazards but don't insert noops. 194 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; 195 196 if (MI->isBundle()) 197 return NoHazard; 198 199 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) 200 return HazardType; 201 202 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) 203 return HazardType; 204 205 if (checkFPAtomicToDenormModeHazard(MI) > 0) 206 return HazardType; 207 208 if (ST.hasNoDataDepHazard()) 209 return NoHazard; 210 211 // FIXME: Should flat be considered vmem? 
212 if ((SIInstrInfo::isVMEM(*MI) || 213 SIInstrInfo::isFLAT(*MI)) 214 && checkVMEMHazards(MI) > 0) 215 return HazardType; 216 217 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) 218 return HazardType; 219 220 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) 221 return HazardType; 222 223 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) 224 return HazardType; 225 226 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) 227 return HazardType; 228 229 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 230 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 231 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 232 return HazardType; 233 234 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) 235 return HazardType; 236 237 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) 238 return HazardType; 239 240 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) 241 return HazardType; 242 243 if (((ST.hasReadM0MovRelInterpHazard() && 244 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 245 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 246 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 247 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 248 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 249 (ST.hasReadM0LdsDirectHazard() && 250 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) && 251 checkReadM0Hazards(MI) > 0) 252 return HazardType; 253 254 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) 255 return HazardType; 256 257 if ((SIInstrInfo::isVMEM(*MI) || 258 SIInstrInfo::isFLAT(*MI) || 259 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0) 260 return HazardType; 261 262 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) 263 return HazardType; 264 265 return NoHazard; 266 } 267 268 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, 269 unsigned Quantity) { 270 while (Quantity > 0) { 271 unsigned Arg = std::min(Quantity, 8u); 272 Quantity -= Arg; 273 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) 274 .addImm(Arg - 1); 275 } 276 } 277 278 unsigned 279 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { 280 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI); 281 assert(TSchedModel.getWriteProcResBegin(SC) != 282 TSchedModel.getWriteProcResEnd(SC)); 283 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle; 284 } 285 286 void GCNHazardRecognizer::processBundle() { 287 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); 288 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); 289 // Check bundled MachineInstr's for hazards. 290 for (; MI != E && MI->isInsideBundle(); ++MI) { 291 CurrCycleInstr = &*MI; 292 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); 293 294 if (IsHazardRecognizerMode) { 295 fixHazards(CurrCycleInstr); 296 297 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates); 298 } 299 300 // It’s unnecessary to track more than MaxLookAhead instructions. Since we 301 // include the bundled MI directly after, only add a maximum of 302 // (MaxLookAhead - 1) noops to EmittedInstrs. 
303 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) 304 EmittedInstrs.push_front(nullptr); 305 306 EmittedInstrs.push_front(CurrCycleInstr); 307 EmittedInstrs.resize(MaxLookAhead); 308 } 309 CurrCycleInstr = nullptr; 310 } 311 312 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) { 313 assert(IsHazardRecognizerMode); 314 315 unsigned NumPreNoops = PreEmitNoops(MI); 316 EmitNoops(NumPreNoops); 317 if (MI->isInsideBundle()) 318 insertNoopsInBundle(MI, TII, NumPreNoops); 319 else 320 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI), 321 NumPreNoops); 322 EmitInstruction(MI); 323 AdvanceCycle(); 324 } 325 326 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { 327 IsHazardRecognizerMode = true; 328 CurrCycleInstr = MI; 329 unsigned W = PreEmitNoopsCommon(MI); 330 fixHazards(MI); 331 CurrCycleInstr = nullptr; 332 return W; 333 } 334 335 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { 336 if (MI->isBundle()) 337 return 0; 338 339 int WaitStates = 0; 340 341 if (SIInstrInfo::isSMRD(*MI)) 342 return std::max(WaitStates, checkSMRDHazards(MI)); 343 344 if (ST.hasNSAtoVMEMBug()) 345 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); 346 347 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); 348 349 if (ST.hasNoDataDepHazard()) 350 return WaitStates; 351 352 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) 353 WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); 354 355 if (SIInstrInfo::isVALU(*MI)) 356 WaitStates = std::max(WaitStates, checkVALUHazards(MI)); 357 358 if (SIInstrInfo::isDPP(*MI)) 359 WaitStates = std::max(WaitStates, checkDPPHazards(MI)); 360 361 if (isDivFMas(MI->getOpcode())) 362 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); 363 364 if (isRWLane(MI->getOpcode())) 365 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); 366 367 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 368 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 369 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 370 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI)); 371 372 if (MI->isInlineAsm()) 373 return std::max(WaitStates, checkInlineAsmHazards(MI)); 374 375 if (isSGetReg(MI->getOpcode())) 376 return std::max(WaitStates, checkGetRegHazards(MI)); 377 378 if (isSSetReg(MI->getOpcode())) 379 return std::max(WaitStates, checkSetRegHazards(MI)); 380 381 if (isRFE(MI->getOpcode())) 382 return std::max(WaitStates, checkRFEHazards(MI)); 383 384 if ((ST.hasReadM0MovRelInterpHazard() && 385 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 386 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 387 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 388 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 389 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 390 (ST.hasReadM0LdsDirectHazard() && 391 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) 392 return std::max(WaitStates, checkReadM0Hazards(MI)); 393 394 if (SIInstrInfo::isMAI(*MI)) 395 return std::max(WaitStates, checkMAIHazards(MI)); 396 397 if (SIInstrInfo::isVMEM(*MI) || 398 SIInstrInfo::isFLAT(*MI) || 399 SIInstrInfo::isDS(*MI)) 400 return std::max(WaitStates, checkMAILdStHazards(MI)); 401 402 if (ST.hasGFX950Insts() && isPermlane(*MI)) 403 return std::max(WaitStates, checkPermlaneHazards(MI)); 404 405 return WaitStates; 406 } 407 408 void GCNHazardRecognizer::EmitNoop() { 409 EmittedInstrs.push_front(nullptr); 410 } 411 412 void 
GCNHazardRecognizer::AdvanceCycle() { 413 // When the scheduler detects a stall, it will call AdvanceCycle() without 414 // emitting any instructions. 415 if (!CurrCycleInstr) { 416 EmittedInstrs.push_front(nullptr); 417 return; 418 } 419 420 if (CurrCycleInstr->isBundle()) { 421 processBundle(); 422 return; 423 } 424 425 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); 426 if (!NumWaitStates) { 427 CurrCycleInstr = nullptr; 428 return; 429 } 430 431 // Keep track of emitted instructions 432 EmittedInstrs.push_front(CurrCycleInstr); 433 434 // Add a nullptr for each additional wait state after the first. Make sure 435 // not to add more than getMaxLookAhead() items to the list, since we 436 // truncate the list to that size right after this loop. 437 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); 438 i < e; ++i) { 439 EmittedInstrs.push_front(nullptr); 440 } 441 442 // getMaxLookahead() is the largest number of wait states we will ever need 443 // to insert, so there is no point in keeping track of more than that many 444 // wait states. 445 EmittedInstrs.resize(getMaxLookAhead()); 446 447 CurrCycleInstr = nullptr; 448 } 449 450 void GCNHazardRecognizer::RecedeCycle() { 451 llvm_unreachable("hazard recognizer does not support bottom-up scheduling."); 452 } 453 454 //===----------------------------------------------------------------------===// 455 // Helper Functions 456 //===----------------------------------------------------------------------===// 457 458 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; 459 460 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; 461 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; 462 463 // Search for a hazard in a block and its predecessors. 464 template <typename StateT> 465 static bool 466 hasHazard(StateT State, 467 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard, 468 function_ref<void(StateT &, const MachineInstr &)> UpdateState, 469 const MachineBasicBlock *MBB, 470 MachineBasicBlock::const_reverse_instr_iterator I, 471 DenseSet<const MachineBasicBlock *> &Visited) { 472 for (auto E = MBB->instr_rend(); I != E; ++I) { 473 // No need to look at parent BUNDLE instructions. 474 if (I->isBundle()) 475 continue; 476 477 switch (IsHazard(State, *I)) { 478 case HazardFound: 479 return true; 480 case HazardExpired: 481 return false; 482 default: 483 // Continue search 484 break; 485 } 486 487 if (I->isInlineAsm() || I->isMetaInstruction()) 488 continue; 489 490 UpdateState(State, *I); 491 } 492 493 for (MachineBasicBlock *Pred : MBB->predecessors()) { 494 if (!Visited.insert(Pred).second) 495 continue; 496 497 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(), 498 Visited)) 499 return true; 500 } 501 502 return false; 503 } 504 505 // Returns a minimum wait states since \p I walking all predecessors. 506 // Only scans until \p IsExpired does not return true. 507 // Can only be run in a hazard recognizer mode. 508 static int getWaitStatesSince( 509 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, 510 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, 511 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, 512 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { 513 for (auto E = MBB->instr_rend(); I != E; ++I) { 514 // Don't add WaitStates for parent BUNDLE instructions. 
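    // (The instructions inside a bundle are still visited individually by this
    //  walk; only the BUNDLE header itself is skipped here.)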
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
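  // (XNACK replay is what allows an instruction in a soft clause to issue more
  //  than once, so without XNACK there is no clause hazard to break.)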
632 if (!ST.isXNACKEnabled()) 633 return 0; 634 635 bool IsSMRD = TII.isSMRD(*MEM); 636 637 resetClause(); 638 639 // A soft-clause is any group of consecutive SMEM instructions. The 640 // instructions in this group may return out of order and/or may be 641 // replayed (i.e. the same instruction issued more than once). 642 // 643 // In order to handle these situations correctly we need to make sure that 644 // when a clause has more than one instruction, no instruction in the clause 645 // writes to a register that is read by another instruction in the clause 646 // (including itself). If we encounter this situation, we need to break the 647 // clause by inserting a non SMEM instruction. 648 649 for (MachineInstr *MI : EmittedInstrs) { 650 // When we hit a non-SMEM instruction then we have passed the start of the 651 // clause and we can stop. 652 if (!MI) 653 break; 654 655 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) 656 break; 657 658 addClauseInst(*MI); 659 } 660 661 if (ClauseDefs.none()) 662 return 0; 663 664 // We need to make sure not to put loads and stores in the same clause if they 665 // use the same address. For now, just start a new clause whenever we see a 666 // store. 667 if (MEM->mayStore()) 668 return 1; 669 670 addClauseInst(*MEM); 671 672 // If the set of defs and uses intersect then we cannot add this instruction 673 // to the clause, so we have a hazard. 674 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0; 675 } 676 677 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { 678 int WaitStatesNeeded = 0; 679 680 WaitStatesNeeded = checkSoftClauseHazards(SMRD); 681 682 // This SMRD hazard only affects SI. 683 if (!ST.hasSMRDReadVALUDefHazard()) 684 return WaitStatesNeeded; 685 686 // A read of an SGPR by SMRD instruction requires 4 wait states when the 687 // SGPR was written by a VALU instruction. 688 int SmrdSgprWaitStates = 4; 689 auto IsHazardDefFn = [this](const MachineInstr &MI) { 690 return TII.isVALU(MI); 691 }; 692 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) { 693 return TII.isSALU(MI); 694 }; 695 696 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); 697 698 for (const MachineOperand &Use : SMRD->uses()) { 699 if (!Use.isReg()) 700 continue; 701 int WaitStatesNeededForUse = 702 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 703 SmrdSgprWaitStates); 704 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 705 706 // This fixes what appears to be undocumented hardware behavior in SI where 707 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor 708 // needs some number of nops in between. We don't know how many we need, but 709 // let's use 4. This wasn't discovered before probably because the only 710 // case when this happens is when we expand a 64-bit pointer into a full 711 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was 712 // probably never encountered in the closed-source land. 
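    // Illustrative only (not taken from any documented erratum): a sequence of
    // the shape
    //   s_mov_b32 s7, s12                  ; writes part of the descriptor
    //   s_buffer_load_dword s0, s[4:7], 0x0
    // is the case this extra SALU check is guarding against.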
713 if (IsBufferSMRD) { 714 int WaitStatesNeededForUse = 715 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 716 IsBufferHazardDefFn, 717 SmrdSgprWaitStates); 718 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 719 } 720 } 721 722 return WaitStatesNeeded; 723 } 724 725 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { 726 if (!ST.hasVMEMReadSGPRVALUDefHazard()) 727 return 0; 728 729 int WaitStatesNeeded = checkSoftClauseHazards(VMEM); 730 731 // A read of an SGPR by a VMEM instruction requires 5 wait states when the 732 // SGPR was written by a VALU Instruction. 733 const int VmemSgprWaitStates = 5; 734 auto IsHazardDefFn = [this](const MachineInstr &MI) { 735 return TII.isVALU(MI); 736 }; 737 for (const MachineOperand &Use : VMEM->uses()) { 738 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg())) 739 continue; 740 741 int WaitStatesNeededForUse = 742 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 743 VmemSgprWaitStates); 744 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 745 } 746 return WaitStatesNeeded; 747 } 748 749 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { 750 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 751 const SIInstrInfo *TII = ST.getInstrInfo(); 752 753 // Check for DPP VGPR read after VALU VGPR write and EXEC write. 754 int DppVgprWaitStates = 2; 755 int DppExecWaitStates = 5; 756 int WaitStatesNeeded = 0; 757 auto IsHazardDefFn = [TII](const MachineInstr &MI) { 758 return TII->isVALU(MI); 759 }; 760 761 for (const MachineOperand &Use : DPP->uses()) { 762 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) 763 continue; 764 int WaitStatesNeededForUse = 765 DppVgprWaitStates - getWaitStatesSinceDef( 766 Use.getReg(), 767 [](const MachineInstr &) { return true; }, 768 DppVgprWaitStates); 769 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 770 } 771 772 WaitStatesNeeded = std::max( 773 WaitStatesNeeded, 774 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, 775 DppExecWaitStates)); 776 777 return WaitStatesNeeded; 778 } 779 780 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { 781 const SIInstrInfo *TII = ST.getInstrInfo(); 782 783 // v_div_fmas requires 4 wait states after a write to vcc from a VALU 784 // instruction. 
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into the correct bit position of the dest register.
/// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the \p
/// MI and \returns an operand with a dst forwarding issue, or nullptr if none
/// exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions which produce forwarded
  // dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 which write hi bits
  // (e.g. op_sel[3] == 1), and 3. FP8DstSelInst (instructions with dest byte
  // sel, e.g. CVT_SR_BF8_F32) and op_sel[3:2] != 0
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
        return nullptr;
  } else {
    // Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
    // with op_sel[3:2] != 0)
    if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
        !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
              SISrcMods::DST_OP_SEL ||
          (AMDGPU::isFP8DstSelInst(Opcode) &&
           (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
            SISrcMods::OP_SEL_0))))
      return nullptr;
  }

  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
951 // complete zeroesHigh16BitsOfDest) 952 for (auto &Operand : VALU->operands()) { 953 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { 954 return true; 955 } 956 } 957 return false; 958 } 959 960 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { 961 int WaitStatesNeeded = 0; 962 963 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) { 964 const int TransDefWaitstates = 1; 965 966 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) { 967 if (!SIInstrInfo::isTRANS(MI)) 968 return false; 969 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 970 const SIInstrInfo *TII = ST.getInstrInfo(); 971 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); 972 973 for (const MachineOperand &Use : VALU->explicit_uses()) { 974 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) 975 return true; 976 } 977 978 return false; 979 }; 980 981 int WaitStatesNeededForDef = 982 TransDefWaitstates - 983 getWaitStatesSince(IsTransDefFn, TransDefWaitstates); 984 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 985 } 986 987 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) { 988 const int Shift16DefWaitstates = 1; 989 990 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) { 991 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 992 const MachineOperand *ForwardedDst = 993 getDstSelForwardingOperand(ProducerMI, ST); 994 if (ForwardedDst) { 995 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI); 996 } 997 998 if (ProducerMI.isInlineAsm()) { 999 // Assume inline asm has dst forwarding hazard 1000 for (auto &Def : ProducerMI.all_defs()) { 1001 if (consumesDstSelForwardingOperand(VALU, &Def, TRI)) 1002 return true; 1003 } 1004 } 1005 1006 return false; 1007 }; 1008 1009 int WaitStatesNeededForDef = 1010 Shift16DefWaitstates - 1011 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); 1012 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1013 } 1014 1015 if (ST.hasVDecCoExecHazard()) { 1016 const int VALUWriteSGPRVALUReadWaitstates = 2; 1017 const int VALUWriteEXECRWLane = 4; 1018 const int VALUWriteVGPRReadlaneRead = 1; 1019 1020 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1021 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1022 Register UseReg; 1023 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) { 1024 if (!SIInstrInfo::isVALU(MI)) 1025 return false; 1026 return MI.modifiesRegister(UseReg, TRI); 1027 }; 1028 1029 for (const MachineOperand &Use : VALU->explicit_uses()) { 1030 if (!Use.isReg()) 1031 continue; 1032 1033 UseReg = Use.getReg(); 1034 if (TRI->isSGPRReg(MRI, UseReg)) { 1035 int WaitStatesNeededForDef = 1036 VALUWriteSGPRVALUReadWaitstates - 1037 getWaitStatesSince(IsVALUDefSGPRFn, 1038 VALUWriteSGPRVALUReadWaitstates); 1039 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1040 } 1041 } 1042 1043 if (VALU->readsRegister(AMDGPU::VCC, TRI)) { 1044 UseReg = AMDGPU::VCC; 1045 int WaitStatesNeededForDef = 1046 VALUWriteSGPRVALUReadWaitstates - 1047 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); 1048 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1049 } 1050 1051 switch (VALU->getOpcode()) { 1052 case AMDGPU::V_READLANE_B32: 1053 case AMDGPU::V_READFIRSTLANE_B32: { 1054 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0); 1055 UseReg = Src->getReg(); 1056 int WaitStatesNeededForDef = 1057 VALUWriteVGPRReadlaneRead - 1058 
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
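  // A VALU write to the lane-select SGPR within the previous RWLaneWaitStates
  // wait states is the hazard the walk below is looking for.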
auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); }; 1163 1164 const int RWLaneWaitStates = 4; 1165 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, 1166 RWLaneWaitStates); 1167 return RWLaneWaitStates - WaitStatesSince; 1168 } 1169 1170 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { 1171 if (!ST.hasRFEHazards()) 1172 return 0; 1173 1174 const SIInstrInfo *TII = ST.getInstrInfo(); 1175 1176 const int RFEWaitStates = 1; 1177 1178 auto IsHazardFn = [TII](const MachineInstr &MI) { 1179 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS; 1180 }; 1181 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); 1182 return RFEWaitStates - WaitStatesNeeded; 1183 } 1184 1185 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { 1186 const SIInstrInfo *TII = ST.getInstrInfo(); 1187 const int ReadM0WaitStates = 1; 1188 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; 1189 return ReadM0WaitStates - 1190 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); 1191 } 1192 1193 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { 1194 fixVMEMtoScalarWriteHazards(MI); 1195 fixVcmpxPermlaneHazards(MI); 1196 fixSMEMtoVectorWriteHazards(MI); 1197 fixVcmpxExecWARHazard(MI); 1198 fixLdsBranchVmemWARHazard(MI); 1199 if (ST.hasLdsDirect()) { 1200 fixLdsDirectVALUHazard(MI); 1201 fixLdsDirectVMEMHazard(MI); 1202 } 1203 fixVALUPartialForwardingHazard(MI); 1204 fixVALUTransUseHazard(MI); 1205 fixWMMAHazards(MI); 1206 fixShift64HighRegBug(MI); 1207 fixVALUMaskWriteHazard(MI); 1208 fixVALUReadSGPRHazard(MI); 1209 fixRequiredExportPriority(MI); 1210 } 1211 1212 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, 1213 const MachineInstr &MI) { 1214 return (TII.isVOPC(MI) || 1215 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) && 1216 MI.modifiesRegister(AMDGPU::EXEC, &TRI); 1217 } 1218 1219 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { 1220 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI)) 1221 return false; 1222 1223 const SIInstrInfo *TII = ST.getInstrInfo(); 1224 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1225 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { 1226 return isVCmpXWritesExec(*TII, *TRI, MI); 1227 }; 1228 1229 auto IsExpiredFn = [](const MachineInstr &MI, int) { 1230 unsigned Opc = MI.getOpcode(); 1231 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 && 1232 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa; 1233 }; 1234 1235 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1236 std::numeric_limits<int>::max()) 1237 return false; 1238 1239 // V_NOP will be discarded by SQ. 1240 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* 1241 // which is always a VGPR and available. 1242 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); 1243 Register Reg = Src0->getReg(); 1244 bool IsUndef = Src0->isUndef(); 1245 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1246 TII->get(AMDGPU::V_MOV_B32_e32)) 1247 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) 1248 .addReg(Reg, IsUndef ? 
RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
1358 return (Decoded.DsCnt == 0); 1359 } 1360 default: 1361 // SOPP instructions cannot mitigate the hazard. 1362 if (TII->isSOPP(MI)) 1363 return false; 1364 // At this point the SALU can be assumed to mitigate the hazard 1365 // because either: 1366 // (a) it is independent of the at risk SMEM (breaking chain), 1367 // or 1368 // (b) it is dependent on the SMEM, in which case an appropriate 1369 // s_waitcnt lgkmcnt _must_ exist between it and the at risk 1370 // SMEM instruction. 1371 return true; 1372 } 1373 } 1374 return false; 1375 }; 1376 1377 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1378 std::numeric_limits<int>::max()) 1379 return false; 1380 1381 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1382 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL) 1383 .addImm(0); 1384 return true; 1385 } 1386 1387 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { 1388 if (!ST.hasVcmpxExecWARHazard()) 1389 return false; 1390 assert(!ST.hasExtendedWaitCounts()); 1391 1392 if (!SIInstrInfo::isVALU(*MI)) 1393 return false; 1394 1395 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1396 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) 1397 return false; 1398 1399 auto IsHazardFn = [TRI](const MachineInstr &I) { 1400 if (SIInstrInfo::isVALU(I)) 1401 return false; 1402 return I.readsRegister(AMDGPU::EXEC, TRI); 1403 }; 1404 1405 const SIInstrInfo *TII = ST.getInstrInfo(); 1406 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) { 1407 if (SIInstrInfo::isVALU(MI)) { 1408 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) 1409 return true; 1410 for (auto MO : MI.implicit_operands()) 1411 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) 1412 return true; 1413 } 1414 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1415 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0) 1416 return true; 1417 return false; 1418 }; 1419 1420 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1421 std::numeric_limits<int>::max()) 1422 return false; 1423 1424 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1425 TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 1426 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 1427 return true; 1428 } 1429 1430 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, 1431 const GCNSubtarget &ST) { 1432 if (!ST.hasLdsBranchVmemWARHazard()) 1433 return false; 1434 1435 // Check if the necessary condition for the hazard is met: both LDS and VMEM 1436 // instructions need to appear in the same function. 
1437 bool HasLds = false; 1438 bool HasVmem = false; 1439 for (auto &MBB : MF) { 1440 for (auto &MI : MBB) { 1441 HasLds |= SIInstrInfo::isDS(MI); 1442 HasVmem |= 1443 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI); 1444 if (HasLds && HasVmem) 1445 return true; 1446 } 1447 } 1448 return false; 1449 } 1450 1451 static bool isStoreCountWaitZero(const MachineInstr &I) { 1452 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && 1453 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && 1454 !I.getOperand(1).getImm(); 1455 } 1456 1457 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { 1458 if (!RunLdsBranchVmemWARHazardFixup) 1459 return false; 1460 1461 assert(ST.hasLdsBranchVmemWARHazard()); 1462 assert(!ST.hasExtendedWaitCounts()); 1463 1464 auto IsHazardInst = [](const MachineInstr &MI) { 1465 if (SIInstrInfo::isDS(MI)) 1466 return 1; 1467 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) 1468 return 2; 1469 return 0; 1470 }; 1471 1472 auto InstType = IsHazardInst(*MI); 1473 if (!InstType) 1474 return false; 1475 1476 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { 1477 return IsHazardInst(I) || isStoreCountWaitZero(I); 1478 }; 1479 1480 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { 1481 if (!I.isBranch()) 1482 return false; 1483 1484 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { 1485 auto InstType2 = IsHazardInst(I); 1486 return InstType2 && InstType != InstType2; 1487 }; 1488 1489 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { 1490 auto InstType2 = IsHazardInst(I); 1491 if (InstType == InstType2) 1492 return true; 1493 1494 return isStoreCountWaitZero(I); 1495 }; 1496 1497 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) != 1498 std::numeric_limits<int>::max(); 1499 }; 1500 1501 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1502 std::numeric_limits<int>::max()) 1503 return false; 1504 1505 const SIInstrInfo *TII = ST.getInstrInfo(); 1506 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1507 TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1508 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1509 .addImm(0); 1510 1511 return true; 1512 } 1513 1514 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { 1515 if (!SIInstrInfo::isLDSDIR(*MI)) 1516 return false; 1517 1518 const int NoHazardWaitStates = 15; 1519 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1520 const Register VDSTReg = VDST->getReg(); 1521 1522 bool VisitedTrans = false; 1523 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { 1524 if (!SIInstrInfo::isVALU(I)) 1525 return false; 1526 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I); 1527 // Cover both WAR and WAW 1528 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1529 }; 1530 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { 1531 if (WaitStates >= NoHazardWaitStates) 1532 return true; 1533 // Instructions which cause va_vdst==0 expire hazard 1534 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1535 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I); 1536 }; 1537 auto GetWaitStatesFn = [](const MachineInstr &MI) { 1538 return SIInstrInfo::isVALU(MI) ? 
1 : 0; 1539 }; 1540 1541 DenseSet<const MachineBasicBlock *> Visited; 1542 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 1543 std::next(MI->getReverseIterator()), 0, 1544 IsExpiredFn, Visited, GetWaitStatesFn); 1545 1546 // Transcendentals can execute in parallel to other VALUs. 1547 // This makes va_vdst count unusable with a mixture of VALU and TRANS. 1548 if (VisitedTrans) 1549 Count = 0; 1550 1551 MachineOperand *WaitVdstOp = 1552 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst); 1553 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates)); 1554 1555 return true; 1556 } 1557 1558 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { 1559 if (!SIInstrInfo::isLDSDIR(*MI)) 1560 return false; 1561 1562 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1563 const Register VDSTReg = VDST->getReg(); 1564 1565 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { 1566 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) && 1567 !SIInstrInfo::isDS(I)) 1568 return false; 1569 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1570 }; 1571 bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); 1572 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT 1573 // according to the type of VMEM instruction. 1574 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { 1575 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || 1576 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || 1577 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1578 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) || 1579 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) && 1580 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm()); 1581 }; 1582 1583 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1584 std::numeric_limits<int>::max()) 1585 return false; 1586 1587 if (LdsdirCanWait) { 1588 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0); 1589 } else { 1590 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1591 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1592 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); 1593 } 1594 1595 return true; 1596 } 1597 1598 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { 1599 if (!ST.hasVALUPartialForwardingHazard()) 1600 return false; 1601 assert(!ST.hasExtendedWaitCounts()); 1602 1603 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI)) 1604 return false; 1605 1606 SmallSetVector<Register, 4> SrcVGPRs; 1607 1608 for (const MachineOperand &Use : MI->explicit_uses()) { 1609 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 1610 SrcVGPRs.insert(Use.getReg()); 1611 } 1612 1613 // Only applies with >= 2 unique VGPR sources 1614 if (SrcVGPRs.size() <= 1) 1615 return false; 1616 1617 // Look for the following pattern: 1618 // Va <- VALU [PreExecPos] 1619 // intv1 1620 // Exec <- SALU [ExecPos] 1621 // intv2 1622 // Vb <- VALU [PostExecPos] 1623 // intv3 1624 // MI Va, Vb (WaitState = 0) 1625 // 1626 // Where: 1627 // intv1 + intv2 <= 2 VALUs 1628 // intv3 <= 4 VALUs 1629 // 1630 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 
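  // Illustrative only (hypothetical wave64 sequence matching the pattern
  // above):
  //   v_mov_b32 v0, ...        ; Va written before the EXEC change
  //   s_mov_b64 exec, ...      ; SALU modifies EXEC
  //   v_mov_b32 v1, ...        ; Vb written after the EXEC change
  //   v_add_f32 v2, v0, v1     ; MI consumes both Va and Vb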
1631 1632 const int Intv1plus2MaxVALUs = 2; 1633 const int Intv3MaxVALUs = 4; 1634 const int IntvMaxVALUs = 6; 1635 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2; 1636 1637 struct StateType { 1638 SmallDenseMap<Register, int, 4> DefPos; 1639 int ExecPos = std::numeric_limits<int>::max(); 1640 int VALUs = 0; 1641 }; 1642 1643 StateType State; 1644 1645 // This overloads expiry testing with all the hazard detection 1646 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { 1647 // Too many VALU states have passed 1648 if (State.VALUs > NoHazardVALUWaitStates) 1649 return HazardExpired; 1650 1651 // Instructions which cause va_vdst==0 expire hazard 1652 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1653 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || 1654 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1655 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) 1656 return HazardExpired; 1657 1658 // Track registers writes 1659 bool Changed = false; 1660 if (SIInstrInfo::isVALU(I)) { 1661 for (Register Src : SrcVGPRs) { 1662 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) { 1663 State.DefPos[Src] = State.VALUs; 1664 Changed = true; 1665 } 1666 } 1667 } else if (SIInstrInfo::isSALU(I)) { 1668 if (State.ExecPos == std::numeric_limits<int>::max()) { 1669 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) { 1670 State.ExecPos = State.VALUs; 1671 Changed = true; 1672 } 1673 } 1674 } 1675 1676 // Early expiration: too many VALUs in intv3 1677 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty()) 1678 return HazardExpired; 1679 1680 // Only evaluate state if something changed 1681 if (!Changed) 1682 return NoHazardFound; 1683 1684 // Determine positions of VALUs pre/post exec change 1685 if (State.ExecPos == std::numeric_limits<int>::max()) 1686 return NoHazardFound; 1687 1688 int PreExecPos = std::numeric_limits<int>::max(); 1689 int PostExecPos = std::numeric_limits<int>::max(); 1690 1691 for (auto Entry : State.DefPos) { 1692 int DefVALUs = Entry.second; 1693 if (DefVALUs != std::numeric_limits<int>::max()) { 1694 if (DefVALUs >= State.ExecPos) 1695 PreExecPos = std::min(PreExecPos, DefVALUs); 1696 else 1697 PostExecPos = std::min(PostExecPos, DefVALUs); 1698 } 1699 } 1700 1701 // Need a VALUs post exec change 1702 if (PostExecPos == std::numeric_limits<int>::max()) 1703 return NoHazardFound; 1704 1705 // Too many VALUs in intv3? 1706 int Intv3VALUs = PostExecPos; 1707 if (Intv3VALUs > Intv3MaxVALUs) 1708 return HazardExpired; 1709 1710 // Too many VALUs in intv2? 1711 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1; 1712 if (Intv2VALUs > Intv1plus2MaxVALUs) 1713 return HazardExpired; 1714 1715 // Need a VALUs pre exec change 1716 if (PreExecPos == std::numeric_limits<int>::max()) 1717 return NoHazardFound; 1718 1719 // Too many VALUs in intv1? 
1720 int Intv1VALUs = PreExecPos - State.ExecPos; 1721 if (Intv1VALUs > Intv1plus2MaxVALUs) 1722 return HazardExpired; 1723 1724 // Too many VALUs in intv1 + intv2 1725 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs) 1726 return HazardExpired; 1727 1728 return HazardFound; 1729 }; 1730 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { 1731 if (SIInstrInfo::isVALU(MI)) 1732 State.VALUs += 1; 1733 }; 1734 1735 DenseSet<const MachineBasicBlock *> Visited; 1736 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(), 1737 std::next(MI->getReverseIterator()), Visited)) 1738 return false; 1739 1740 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1741 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1742 .addImm(0x0fff); 1743 1744 return true; 1745 } 1746 1747 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { 1748 if (!ST.hasVALUTransUseHazard()) 1749 return false; 1750 assert(!ST.hasExtendedWaitCounts()); 1751 1752 if (!SIInstrInfo::isVALU(*MI)) 1753 return false; 1754 1755 SmallSet<Register, 4> SrcVGPRs; 1756 1757 for (const MachineOperand &Use : MI->explicit_uses()) { 1758 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 1759 SrcVGPRs.insert(Use.getReg()); 1760 } 1761 1762 // Look for the following pattern: 1763 // Va <- TRANS VALU 1764 // intv 1765 // MI Va (WaitState = 0) 1766 // 1767 // Where: 1768 // intv <= 5 VALUs / 1 TRANS 1769 // 1770 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 1771 1772 const int IntvMaxVALUs = 5; 1773 const int IntvMaxTRANS = 1; 1774 1775 struct StateType { 1776 int VALUs = 0; 1777 int TRANS = 0; 1778 }; 1779 1780 StateType State; 1781 1782 // This overloads expiry testing with all the hazard detection 1783 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { 1784 // Too many VALU states have passed 1785 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS) 1786 return HazardExpired; 1787 1788 // Instructions which cause va_vdst==0 expire hazard 1789 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1790 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || 1791 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1792 I.getOperand(0).getImm() == 0x0fff)) 1793 return HazardExpired; 1794 1795 // Track registers writes 1796 if (SIInstrInfo::isTRANS(I)) { 1797 for (Register Src : SrcVGPRs) { 1798 if (I.modifiesRegister(Src, &TRI)) { 1799 return HazardFound; 1800 } 1801 } 1802 } 1803 1804 return NoHazardFound; 1805 }; 1806 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { 1807 if (SIInstrInfo::isVALU(MI)) 1808 State.VALUs += 1; 1809 if (SIInstrInfo::isTRANS(MI)) 1810 State.TRANS += 1; 1811 }; 1812 1813 DenseSet<const MachineBasicBlock *> Visited; 1814 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(), 1815 std::next(MI->getReverseIterator()), Visited)) 1816 return false; 1817 1818 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is 1819 // avoided. 
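  // Note: encodeFieldVaVdst(0) is assumed here to build an s_waitcnt_depctr
  // immediate in which only the va_vdst field is forced to zero, so the wait
  // applies to outstanding VALU vector-destination writes without imposing
  // extra waiting on the other dependency counters.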
1820 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1821 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1822 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); 1823 1824 return true; 1825 } 1826 1827 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { 1828 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) 1829 return false; 1830 1831 const SIInstrInfo *TII = ST.getInstrInfo(); 1832 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1833 1834 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { 1835 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) 1836 return false; 1837 1838 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps 1839 // with the dest(matrix D) of the previous wmma. 1840 const Register CurSrc0Reg = 1841 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); 1842 const Register CurSrc1Reg = 1843 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); 1844 1845 const Register PrevDstReg = 1846 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); 1847 1848 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) || 1849 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) { 1850 return true; 1851 } 1852 1853 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) 1854 // but Index can't overlap with PrevDstReg. 1855 if (AMDGPU::isGFX12Plus(ST)) { 1856 if (SIInstrInfo::isSWMMAC(*MI)) { 1857 const Register CurIndex = 1858 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); 1859 if (TRI->regsOverlap(PrevDstReg, CurIndex)) 1860 return true; 1861 } 1862 return false; 1863 } 1864 1865 return false; 1866 }; 1867 1868 auto IsExpiredFn = [](const MachineInstr &I, int) { 1869 return SIInstrInfo::isVALU(I); 1870 }; 1871 1872 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1873 std::numeric_limits<int>::max()) 1874 return false; 1875 1876 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); 1877 1878 return true; 1879 } 1880 1881 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { 1882 if (!ST.hasShift64HighRegBug()) 1883 return false; 1884 assert(!ST.hasExtendedWaitCounts()); 1885 1886 switch (MI->getOpcode()) { 1887 default: 1888 return false; 1889 case AMDGPU::V_LSHLREV_B64_e64: 1890 case AMDGPU::V_LSHRREV_B64_e64: 1891 case AMDGPU::V_ASHRREV_I64_e64: 1892 break; 1893 } 1894 1895 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0); 1896 if (!Amt->isReg()) 1897 return false; 1898 1899 Register AmtReg = Amt->getReg(); 1900 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1901 // Check if this is a last VGPR in the allocation block. 1902 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7) 1903 return false; 1904 1905 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) 1906 return false; 1907 1908 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); 1909 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); 1910 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); 1911 bool Overlapped = OverlappedSrc || OverlappedDst; 1912 1913 assert(!OverlappedDst || !OverlappedSrc || 1914 Src1->getReg() == MI->getOperand(0).getReg()); 1915 assert(ST.needsAlignedVGPRs()); 1916 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); 1917 1918 Register NewReg; 1919 for (MCRegister Reg : Overlapped ? 
AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  // Insert a full wait count because the found register might be pending a
  // wait.
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
      .addImm(0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by
  // the parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running the hazard recognizer on the modified instruction is not
  // necessary: the inserted V_SWAP_B32s have already both read and written the
  // new registers, so hazards related to these registers have already been
  // handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so the verifier may see it as undef.
1972 Amt->setIsUndef(); 1973 if (OverlappedDst) 1974 MI->getOperand(0).setReg(NewReg); 1975 if (OverlappedSrc) { 1976 Src1->setReg(NewReg); 1977 Src1->setIsKill(false); 1978 Src1->setIsUndef(); 1979 } 1980 1981 return true; 1982 } 1983 1984 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { 1985 int NSAtoVMEMWaitStates = 1; 1986 1987 if (!ST.hasNSAtoVMEMBug()) 1988 return 0; 1989 1990 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI)) 1991 return 0; 1992 1993 const SIInstrInfo *TII = ST.getInstrInfo(); 1994 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 1995 if (!Offset || (Offset->getImm() & 6) == 0) 1996 return 0; 1997 1998 auto IsHazardFn = [TII](const MachineInstr &I) { 1999 if (!SIInstrInfo::isMIMG(I)) 2000 return false; 2001 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode()); 2002 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && 2003 TII->getInstSizeInBytes(I) >= 16; 2004 }; 2005 2006 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); 2007 } 2008 2009 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { 2010 int FPAtomicToDenormModeWaitStates = 3; 2011 2012 if (!ST.hasFPAtomicToDenormModeHazard()) 2013 return 0; 2014 assert(!ST.hasExtendedWaitCounts()); 2015 2016 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) 2017 return 0; 2018 2019 auto IsHazardFn = [](const MachineInstr &I) { 2020 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I)) 2021 return false; 2022 return SIInstrInfo::isFPAtomic(I); 2023 }; 2024 2025 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { 2026 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) 2027 return true; 2028 2029 switch (MI.getOpcode()) { 2030 case AMDGPU::S_WAITCNT: 2031 case AMDGPU::S_WAITCNT_VSCNT: 2032 case AMDGPU::S_WAITCNT_VMCNT: 2033 case AMDGPU::S_WAITCNT_EXPCNT: 2034 case AMDGPU::S_WAITCNT_LGKMCNT: 2035 case AMDGPU::S_WAIT_IDLE: 2036 return true; 2037 default: 2038 break; 2039 } 2040 2041 return false; 2042 }; 2043 2044 return FPAtomicToDenormModeWaitStates - 2045 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); 2046 } 2047 2048 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { 2049 assert(SIInstrInfo::isMAI(*MI)); 2050 2051 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); 2052 } 2053 2054 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { 2055 // Early exit if no padding is requested. 
2056 if (MFMAPaddingRatio == 0) 2057 return 0; 2058 2059 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2060 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2) 2061 return 0; 2062 2063 int NeighborMFMALatency = 0; 2064 auto IsNeighboringMFMA = [&NeighborMFMALatency, 2065 this](const MachineInstr &MI) { 2066 if (!SIInstrInfo::isMFMA(MI)) 2067 return false; 2068 2069 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); 2070 return true; 2071 }; 2072 2073 const int MaxMFMAPipelineWaitStates = 16; 2074 int WaitStatesSinceNeighborMFMA = 2075 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates); 2076 2077 int NeighborMFMAPaddingNeeded = 2078 (NeighborMFMALatency * MFMAPaddingRatio / 100) - 2079 WaitStatesSinceNeighborMFMA; 2080 2081 return std::max(0, NeighborMFMAPaddingNeeded); 2082 } 2083 2084 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { 2085 int WaitStatesNeeded = 0; 2086 unsigned Opc = MI->getOpcode(); 2087 2088 auto IsVALUFn = [](const MachineInstr &MI) { 2089 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm(); 2090 }; 2091 2092 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write 2093 const int LegacyVALUWritesVGPRWaitStates = 2; 2094 const int VALUWritesExecWaitStates = 4; 2095 const int MaxWaitStates = 4; 2096 2097 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2098 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates); 2099 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2100 2101 if (WaitStatesNeeded < MaxWaitStates) { 2102 for (const MachineOperand &Use : MI->explicit_uses()) { 2103 const int MaxWaitStates = 2; 2104 2105 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 2106 continue; 2107 2108 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - 2109 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates); 2110 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2111 2112 if (WaitStatesNeeded == MaxWaitStates) 2113 break; 2114 } 2115 } 2116 } 2117 2118 for (const MachineOperand &Op : MI->explicit_operands()) { 2119 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) 2120 continue; 2121 2122 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2123 continue; 2124 2125 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; 2126 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; 2127 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; 2128 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; 2129 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; 2130 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; 2131 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; 2132 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; 2133 const int MaxWaitStates = 18; 2134 Register Reg = Op.getReg(); 2135 unsigned HazardDefLatency = 0; 2136 2137 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, 2138 this](const MachineInstr &MI) { 2139 if (!SIInstrInfo::isMFMA(MI)) 2140 return false; 2141 Register DstReg = MI.getOperand(0).getReg(); 2142 if (DstReg == Reg) 2143 return false; 2144 HazardDefLatency = 2145 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2146 return TRI.regsOverlap(DstReg, Reg); 2147 }; 2148 2149 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, 2150 MaxWaitStates); 2151 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; 2152 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2153 int OpNo = Op.getOperandNo(); 
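    // Select the requirement for this operand: src2 (matrix C, the
    // accumulator) tolerates a shorter gap than the A/B sources, while the
    // cases where MI itself is v_accvgpr_read/write are keyed off the
    // producing MFMA's latency (HazardDefLatency of 2/8/16 corresponds to the
    // 4x4/16x16/32x32 variants).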
2154 if (OpNo == SrcCIdx) { 2155 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; 2156 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) { 2157 switch (HazardDefLatency) { 2158 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; 2159 break; 2160 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; 2161 break; 2162 case 16: [[fallthrough]]; 2163 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; 2164 break; 2165 } 2166 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2167 switch (HazardDefLatency) { 2168 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; 2169 break; 2170 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; 2171 break; 2172 case 16: [[fallthrough]]; 2173 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; 2174 break; 2175 } 2176 } 2177 2178 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2179 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2180 2181 if (WaitStatesNeeded == MaxWaitStates) 2182 return WaitStatesNeeded; // Early exit. 2183 2184 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) { 2185 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2186 return false; 2187 Register DstReg = MI.getOperand(0).getReg(); 2188 return TRI.regsOverlap(Reg, DstReg); 2189 }; 2190 2191 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; 2192 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; 2193 const int AccVGPRWriteAccVgprReadWaitStates = 3; 2194 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; 2195 if (OpNo == SrcCIdx) 2196 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; 2197 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) 2198 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; 2199 2200 WaitStatesNeededForUse = NeedWaitStates - 2201 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates); 2202 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2203 2204 if (WaitStatesNeeded == MaxWaitStates) 2205 return WaitStatesNeeded; // Early exit. 2206 } 2207 2208 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2209 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; 2210 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; 2211 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; 2212 const int MaxWaitStates = 13; 2213 Register DstReg = MI->getOperand(0).getReg(); 2214 unsigned HazardDefLatency = 0; 2215 2216 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, 2217 this](const MachineInstr &MI) { 2218 if (!SIInstrInfo::isMFMA(MI)) 2219 return false; 2220 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); 2221 HazardDefLatency = 2222 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2223 return TRI.regsOverlap(Reg, DstReg); 2224 }; 2225 2226 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates); 2227 int NeedWaitStates; 2228 switch (HazardDefLatency) { 2229 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; 2230 break; 2231 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; 2232 break; 2233 case 16: [[fallthrough]]; 2234 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; 2235 break; 2236 } 2237 2238 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; 2239 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2240 } 2241 2242 // Pad neighboring MFMA with noops for better inter-wave performance. 
2243 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2244 2245 return WaitStatesNeeded; 2246 } 2247 2248 static int 2249 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, 2250 bool IsGFX950) { 2251 // xdl def cycles | gfx940 | gfx950 2252 // 2 pass | 3 4 2253 // 4 pass | 5 6 2254 // 8 pass | 9 10 2255 // 16 pass | 17 18 2256 return NumPasses + 1 + IsGFX950; 2257 } 2258 2259 static int 2260 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, 2261 bool IsGFX950) { 2262 // xdl def cycles | gfx940 | gfx950 2263 // 2 pass | 3 3 2264 // 4 pass | 5 6 2265 // 8 pass | 9 10 2266 // 16 pass | 17 18 2267 return NumPasses + 1 + (NumPasses != 2 && IsGFX950); 2268 } 2269 2270 static int 2271 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) { 2272 // 2 pass -> 2 2273 // 4 pass -> 4 2274 // 8 pass -> 8 2275 // 16 pass -> 16 2276 return NumPasses; 2277 } 2278 2279 static int 2280 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2281 // 2 pass -> 4 2282 // 4 pass -> 6 2283 // 8 pass -> 10 2284 // 16 pass -> 18 2285 return NumPasses + 2; 2286 } 2287 2288 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2289 // 2 pass -> 5 2290 // 4 pass -> 7 2291 // 8 pass -> 11 2292 // 16 pass -> 19 2293 return NumPasses + 3; 2294 } 2295 2296 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { 2297 int WaitStatesNeeded = 0; 2298 unsigned Opc = MI->getOpcode(); 2299 2300 auto IsLegacyVALUFn = [](const MachineInstr &MI) { 2301 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); 2302 }; 2303 2304 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { 2305 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && 2306 !SIInstrInfo::isDOT(MI); 2307 }; 2308 2309 if (!SIInstrInfo::isMFMA(*MI)) 2310 return WaitStatesNeeded; 2311 2312 const int VALUWritesExecWaitStates = 4; 2313 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2314 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn, 2315 VALUWritesExecWaitStates); 2316 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2317 2318 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2319 2320 // Loop for both DGEMM and S/HGEMM 2nd instruction. 
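  // For each register use, find the most recent MFMA whose result overlaps it
  // (IsOverlappedMFMAFn below), then derive the required separation from the
  // producer's opcode/pass count and from whether the consumer reads the
  // register as src2 (matrix C) or as one of the A/B sources.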
2321 for (const MachineOperand &Use : MI->explicit_uses()) { 2322 const int LegacyVALUNotDotWritesVGPRWaitStates = 2; 2323 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; 2324 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; 2325 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; 2326 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; 2327 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; 2328 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; 2329 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; 2330 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17; 2331 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; 2332 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; 2333 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; 2334 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; 2335 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; 2336 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; 2337 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19; 2338 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; 2339 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; 2340 const int MaxWaitStates = 19; 2341 2342 if (!Use.isReg()) 2343 continue; 2344 Register Reg = Use.getReg(); 2345 bool FullReg; 2346 const MachineInstr *MI1; 2347 2348 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, 2349 this](const MachineInstr &MI) { 2350 if (!SIInstrInfo::isMFMA(MI)) 2351 return false; 2352 Register DstReg = MI.getOperand(0).getReg(); 2353 FullReg = (DstReg == Reg); 2354 MI1 = &MI; 2355 return TRI.regsOverlap(DstReg, Reg); 2356 }; 2357 2358 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - 2359 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates); 2360 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2361 2362 int NumWaitStates = 2363 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates); 2364 if (NumWaitStates == std::numeric_limits<int>::max()) 2365 continue; 2366 2367 int OpNo = Use.getOperandNo(); 2368 unsigned Opc1 = MI1->getOpcode(); 2369 int NeedWaitStates = 0; 2370 if (OpNo == SrcCIdx) { 2371 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) { 2372 NeedWaitStates = 0; 2373 } else if (FullReg) { 2374 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2375 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && 2376 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2377 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) 2378 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; 2379 else if (ST.hasGFX940Insts() && 2380 TSchedModel.computeInstrLatency(MI1) == 2) 2381 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; 2382 } else { 2383 switch (Opc1) { 2384 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2385 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2386 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2387 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2388 if (!isXDL(ST, *MI)) 2389 NeedWaitStates = 2390 ST.hasGFX950Insts() 2391 ? 
GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates 2392 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates; 2393 break; 2394 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2395 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2396 if (!isXDL(ST, *MI)) 2397 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; 2398 break; 2399 default: 2400 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2401 if (ST.hasGFX940Insts()) { 2402 if (isXDL(ST, *MI) && !isXDL(ST, *MI1)) 2403 break; 2404 2405 NeedWaitStates = 2406 isXDL(ST, *MI1) 2407 ? (isXDL(ST, *MI) 2408 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates( 2409 NumPasses, ST.hasGFX950Insts()) 2410 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates( 2411 NumPasses, ST.hasGFX950Insts())) 2412 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( 2413 NumPasses); 2414 break; 2415 } 2416 2417 switch (NumPasses) { 2418 case 2: 2419 NeedWaitStates = 2420 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates 2421 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; 2422 break; 2423 case 8: 2424 NeedWaitStates = 2425 isDGEMM(Opc) 2426 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates 2427 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; 2428 break; 2429 case 16: 2430 NeedWaitStates = 2431 isDGEMM(Opc) 2432 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates 2433 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; 2434 break; 2435 default: 2436 llvm_unreachable("unexpected number of passes"); 2437 } 2438 } 2439 } 2440 } else { 2441 switch (Opc1) { 2442 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2443 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2444 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2445 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2446 NeedWaitStates = 2447 ST.hasGFX950Insts() 2448 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates 2449 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; 2450 break; 2451 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2452 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2453 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; 2454 break; 2455 default: 2456 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2457 2458 if (ST.hasGFX940Insts()) { 2459 NeedWaitStates = 2460 isXDL(ST, *MI1) 2461 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates( 2462 NumPasses) 2463 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates( 2464 NumPasses); 2465 break; 2466 } 2467 2468 switch (NumPasses) { 2469 case 2: 2470 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; 2471 break; 2472 case 4: 2473 llvm_unreachable("unexpected number of passes for mfma"); 2474 case 8: 2475 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; 2476 break; 2477 case 16: 2478 default: 2479 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; 2480 } 2481 } 2482 } 2483 if (WaitStatesNeeded >= NeedWaitStates) 2484 continue; 2485 2486 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; 2487 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2488 2489 if (WaitStatesNeeded == MaxWaitStates) 2490 break; 2491 } 2492 2493 // Pad neighboring MFMA with noops for better inter-wave performance. 
2494 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2495 2496 return WaitStatesNeeded; 2497 } 2498 2499 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { 2500 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards() 2501 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) 2502 return 0; 2503 2504 int WaitStatesNeeded = 0; 2505 2506 auto IsAccVgprReadFn = [](const MachineInstr &MI) { 2507 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; 2508 }; 2509 2510 for (const MachineOperand &Op : MI->explicit_uses()) { 2511 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) 2512 continue; 2513 2514 Register Reg = Op.getReg(); 2515 2516 const int AccVgprReadLdStWaitStates = 2; 2517 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1; 2518 const int MaxWaitStates = 2; 2519 2520 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - 2521 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates); 2522 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2523 2524 if (WaitStatesNeeded == MaxWaitStates) 2525 return WaitStatesNeeded; // Early exit. 2526 2527 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) { 2528 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && 2529 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2530 return false; 2531 auto IsVALUFn = [](const MachineInstr &MI) { 2532 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI); 2533 }; 2534 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) < 2535 std::numeric_limits<int>::max(); 2536 }; 2537 2538 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates - 2539 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates); 2540 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2541 } 2542 2543 return WaitStatesNeeded; 2544 } 2545 2546 int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) { 2547 assert(!ST.hasVcmpxPermlaneHazard() && 2548 "this is a different vcmpx+permlane hazard"); 2549 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 2550 const SIInstrInfo *TII = ST.getInstrInfo(); 2551 2552 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) { 2553 return isVCmpXWritesExec(*TII, *TRI, MI); 2554 }; 2555 2556 auto IsVALUFn = [](const MachineInstr &MI) { 2557 return SIInstrInfo::isVALU(MI); 2558 }; 2559 2560 const int VCmpXWritesExecWaitStates = 4; 2561 const int VALUWritesVDstWaitStates = 2; 2562 int WaitStatesNeeded = 0; 2563 2564 for (const MachineOperand &Op : MI->explicit_uses()) { 2565 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg())) 2566 continue; 2567 Register Reg = Op.getReg(); 2568 2569 int WaitStatesSinceDef = 2570 VALUWritesVDstWaitStates - 2571 getWaitStatesSinceDef(Reg, IsVALUFn, 2572 /*MaxWaitStates=*/VALUWritesVDstWaitStates); 2573 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef); 2574 if (WaitStatesNeeded >= VALUWritesVDstWaitStates) 2575 break; 2576 } 2577 2578 int VCmpXHazardWaits = 2579 VCmpXWritesExecWaitStates - 2580 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates); 2581 2582 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits); 2583 return WaitStatesNeeded; 2584 } 2585 2586 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2587 // 2 pass -> 4 2588 // 4 pass -> 6 2589 // 8 pass -> 10 2590 // 16 pass -> 18 2591 return NumPasses + 2; 2592 } 2593 2594 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2595 // 2 pass -> 5 2596 // 4 pass -> 7 2597 
// 8 pass -> 11 2598 // 16 pass -> 19 2599 return NumPasses + 3; 2600 } 2601 2602 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { 2603 // 2 pass -> 5 2604 // 4 pass -> 7 2605 // 8 pass -> 11 2606 // 16 pass -> 19 2607 return NumPasses + 3; 2608 } 2609 2610 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { 2611 // 2 pass -> 4 2612 // 4 pass -> 6 2613 // 8 pass -> 10 2614 // 16 pass -> 18 2615 return NumPasses + 2; 2616 } 2617 2618 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { 2619 if (!ST.hasGFX90AInsts()) 2620 return 0; 2621 2622 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { 2623 return isDGEMM(MI.getOpcode()); 2624 }; 2625 2626 // This is checked in checkMAIHazards90A() 2627 if (SIInstrInfo::isMFMA(*MI)) 2628 return 0; 2629 2630 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2631 2632 int WaitStatesNeeded = 0; 2633 2634 bool IsMem = SIInstrInfo::isVMEM(*MI) || 2635 SIInstrInfo::isFLAT(*MI) || 2636 SIInstrInfo::isDS(*MI); 2637 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI); 2638 bool IsVALU = SIInstrInfo::isVALU(*MI); 2639 2640 const MachineInstr *MFMA = nullptr; 2641 unsigned Reg; 2642 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) { 2643 if (!SIInstrInfo::isMFMA(MI) || 2644 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) 2645 return false; 2646 MFMA = &MI; 2647 return true; 2648 }; 2649 2650 const MachineInstr *DOT = nullptr; 2651 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) { 2652 if (!SIInstrInfo::isDOT(MI) || 2653 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) 2654 return false; 2655 DOT = &MI; 2656 return true; 2657 }; 2658 2659 bool DGEMMAfterVALUWrite = false; 2660 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) { 2661 // Found DGEMM on reverse traversal to def. 2662 if (isDGEMM(MI.getOpcode())) 2663 DGEMMAfterVALUWrite = true; 2664 2665 // Only hazard if register is defined by a VALU and a DGEMM is found after 2666 // after the def. 
2667 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite) 2668 return false; 2669 2670 return true; 2671 }; 2672 2673 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2674 AMDGPU::OpName::src2); 2675 2676 if (IsMemOrExport || IsVALU) { 2677 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; 2678 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; 2679 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; 2680 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; 2681 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; 2682 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; 2683 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11; 2684 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19; 2685 const int DotWriteSameDotReadSrcAB = 3; 2686 const int DotWriteDifferentVALURead = 3; 2687 const int DMFMABetweenVALUWriteVMEMRead = 2; 2688 const int MaxWaitStates = 19; 2689 2690 for (const MachineOperand &Use : MI->explicit_uses()) { 2691 if (!Use.isReg()) 2692 continue; 2693 Reg = Use.getReg(); 2694 2695 DOT = nullptr; 2696 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, 2697 MaxWaitStates); 2698 if (DOT) { 2699 int NeedWaitStates = 0; 2700 if (DOT->getOpcode() == MI->getOpcode()) { 2701 if (&Use - &MI->getOperand(0) != SrcCIdx) 2702 NeedWaitStates = DotWriteSameDotReadSrcAB; 2703 } else { 2704 NeedWaitStates = DotWriteDifferentVALURead; 2705 } 2706 2707 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2708 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2709 } 2710 2711 // Workaround for HW data hazard bug observed only in GFX90A. When there 2712 // is a DGEMM instruction in-between a VALU and a VMEM instruction it 2713 // causes the SQ to incorrectly not insert two wait states between the two 2714 // instructions needed to avoid data hazard. 2715 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) { 2716 DGEMMAfterVALUWrite = false; 2717 if (TRI.isVectorRegister(MRI, Reg)) { 2718 int WaitStatesNeededForUse = 2719 DMFMABetweenVALUWriteVMEMRead - 2720 getWaitStatesSinceDef(Reg, IsDGEMMHazard, 2721 DMFMABetweenVALUWriteVMEMRead); 2722 2723 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2724 } 2725 } 2726 2727 MFMA = nullptr; 2728 WaitStatesSinceDef = 2729 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); 2730 if (!MFMA) 2731 continue; 2732 2733 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2734 int NumPasses = HazardDefLatency; 2735 int NeedWaitStates = MaxWaitStates; 2736 2737 if (isDGEMM(MFMA->getOpcode())) { 2738 switch (HazardDefLatency) { 2739 case 4: 2740 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates 2741 : DMFMA4x4WriteVgprVALUReadWaitStates; 2742 break; 2743 case 8: 2744 case 16: 2745 NeedWaitStates = 2746 IsMemOrExport 2747 ? DMFMA16x16WriteVgprMemExpReadWaitStates 2748 : (ST.hasGFX950Insts() 2749 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates 2750 : DMFMA16x16WriteVgprVALUReadWaitStates); 2751 break; 2752 default: 2753 llvm_unreachable("unexpected dgemm"); 2754 } 2755 } else if (ST.hasGFX940Insts()) { 2756 NeedWaitStates = 2757 isXDL(ST, *MFMA) 2758 ? 
GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) 2759 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( 2760 NumPasses); 2761 } else { 2762 switch (HazardDefLatency) { 2763 case 2: 2764 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; 2765 break; 2766 case 8: 2767 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; 2768 break; 2769 case 16: 2770 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates; 2771 break; 2772 default: 2773 llvm_unreachable("unexpected number of passes for mfma"); 2774 } 2775 } 2776 2777 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2778 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2779 2780 if (WaitStatesNeeded == MaxWaitStates) 2781 break; 2782 } 2783 } 2784 2785 unsigned Opc = MI->getOpcode(); 2786 const int DMFMAToFMA64WaitStates = 2; 2787 if ((Opc == AMDGPU::V_FMA_F64_e64 || 2788 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || 2789 Opc == AMDGPU::V_FMAC_F64_dpp) && 2790 WaitStatesNeeded < DMFMAToFMA64WaitStates) { 2791 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - 2792 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates); 2793 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2794 } 2795 2796 if (!IsVALU && !IsMemOrExport) 2797 return WaitStatesNeeded; 2798 2799 for (const MachineOperand &Def : MI->defs()) { 2800 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; 2801 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; 2802 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; 2803 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; 2804 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; 2805 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; 2806 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; 2807 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; 2808 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; 2809 const int DotWriteDifferentVALUWrite = 3; 2810 const int MaxWaitStates = 19; 2811 const int MaxWarWaitStates = 15; 2812 2813 Reg = Def.getReg(); 2814 2815 DOT = nullptr; 2816 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, 2817 MaxWaitStates); 2818 if (DOT && DOT->getOpcode() != MI->getOpcode()) 2819 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite - 2820 WaitStatesSinceDef); 2821 2822 MFMA = nullptr; 2823 WaitStatesSinceDef = 2824 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); 2825 if (MFMA) { 2826 int NeedWaitStates = MaxWaitStates; 2827 int NumPasses = TSchedModel.computeInstrLatency(MFMA); 2828 2829 if (isDGEMM(MFMA->getOpcode())) { 2830 switch (NumPasses) { 2831 case 4: 2832 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; 2833 break; 2834 case 8: 2835 case 16: 2836 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates; 2837 break; 2838 default: 2839 llvm_unreachable("unexpected number of cycles for dgemm"); 2840 } 2841 } else if (ST.hasGFX940Insts()) { 2842 NeedWaitStates = 2843 isXDL(ST, *MFMA) 2844 ? 
GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses) 2845 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); 2846 } else { 2847 switch (NumPasses) { 2848 case 2: 2849 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; 2850 break; 2851 case 8: 2852 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; 2853 break; 2854 case 16: 2855 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates; 2856 break; 2857 default: 2858 llvm_unreachable("Unexpected number of passes for mfma"); 2859 } 2860 } 2861 2862 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2863 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2864 2865 if (WaitStatesNeeded == MaxWaitStates) 2866 break; 2867 } 2868 2869 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) { 2870 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) || 2871 !MI.readsRegister(Reg, &TRI)) 2872 return false; 2873 2874 if (ST.hasGFX940Insts() && !isXDL(ST, MI)) 2875 return false; 2876 2877 const MachineOperand *SrcC = 2878 TII.getNamedOperand(MI, AMDGPU::OpName::src2); 2879 assert(SrcC); 2880 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg)) 2881 return false; 2882 2883 MFMA = &MI; 2884 return true; 2885 }; 2886 2887 MFMA = nullptr; 2888 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn, 2889 MaxWarWaitStates); 2890 if (!MFMA) 2891 continue; 2892 2893 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2894 int NeedWaitStates = MaxWaitStates; 2895 switch (HazardDefLatency) { 2896 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; 2897 break; 2898 case 4: assert(ST.hasGFX940Insts()); 2899 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; 2900 break; 2901 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; 2902 break; 2903 case 16: [[fallthrough]]; 2904 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; 2905 break; 2906 } 2907 2908 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; 2909 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2910 } 2911 2912 return WaitStatesNeeded; 2913 } 2914 2915 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { 2916 if (!SU->isInstr()) 2917 return false; 2918 2919 const MachineInstr *MAI = nullptr; 2920 2921 auto IsMFMAFn = [&MAI](const MachineInstr &MI) { 2922 MAI = nullptr; 2923 if (SIInstrInfo::isMFMA(MI)) 2924 MAI = &MI; 2925 return MAI != nullptr; 2926 }; 2927 2928 MachineInstr *MI = SU->getInstr(); 2929 if (IsMFMAFn(*MI)) { 2930 int W = getWaitStatesSince(IsMFMAFn, 16); 2931 if (MAI) 2932 return W < (int)TSchedModel.computeInstrLatency(MAI); 2933 } 2934 2935 return false; 2936 } 2937 2938 // Adjust global offsets for instructions bundled with S_GETPC_B64 after 2939 // insertion of a new instruction. 2940 static void updateGetPCBundle(MachineInstr *NewMI) { 2941 if (!NewMI->isBundled()) 2942 return; 2943 2944 // Find start of bundle. 2945 auto I = NewMI->getIterator(); 2946 while (I->isBundledWithPred()) 2947 I--; 2948 if (I->isBundle()) 2949 I++; 2950 2951 // Bail if this is not an S_GETPC bundle. 2952 if (I->getOpcode() != AMDGPU::S_GETPC_B64) 2953 return; 2954 2955 // Update offsets of any references in the bundle. 
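  // The inserted s_waitcnt_depctr is a single 4-byte instruction, so every
  // global operand later in the bundle (addresses formed relative to the
  // s_getpc_b64 result) must be rebased by that many bytes.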
2956 const unsigned NewBytes = 4; 2957 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 2958 "Unexpected instruction insertion in bundle"); 2959 auto NextMI = std::next(NewMI->getIterator()); 2960 auto End = NewMI->getParent()->end(); 2961 while (NextMI != End && NextMI->isBundledWithPred()) { 2962 for (auto &Operand : NextMI->operands()) { 2963 if (Operand.isGlobal()) 2964 Operand.setOffset(Operand.getOffset() + NewBytes); 2965 } 2966 NextMI++; 2967 } 2968 } 2969 2970 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { 2971 if (!ST.hasVALUMaskWriteHazard()) 2972 return false; 2973 assert(!ST.hasExtendedWaitCounts()); 2974 2975 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) 2976 return false; 2977 2978 // The hazard sequence is three instructions: 2979 // 1. VALU reads SGPR as mask 2980 // 2. SALU writes SGPR 2981 // 3. SALU reads SGPR 2982 // The hazard can expire if the distance between 2 and 3 is sufficient. 2983 // In practice this happens <10% of the time, hence this always assumes 2984 // the hazard exists if 1 and 2 are present to avoid searching. 2985 2986 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); 2987 if (!SDSTOp || !SDSTOp->isReg()) 2988 return false; 2989 2990 const Register HazardReg = SDSTOp->getReg(); 2991 if (HazardReg == AMDGPU::EXEC || 2992 HazardReg == AMDGPU::EXEC_LO || 2993 HazardReg == AMDGPU::EXEC_HI || 2994 HazardReg == AMDGPU::M0) 2995 return false; 2996 2997 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { 2998 switch (I.getOpcode()) { 2999 case AMDGPU::V_ADDC_U32_e32: 3000 case AMDGPU::V_ADDC_U32_dpp: 3001 case AMDGPU::V_CNDMASK_B16_e32: 3002 case AMDGPU::V_CNDMASK_B16_dpp: 3003 case AMDGPU::V_CNDMASK_B32_e32: 3004 case AMDGPU::V_CNDMASK_B32_dpp: 3005 case AMDGPU::V_DIV_FMAS_F32_e64: 3006 case AMDGPU::V_DIV_FMAS_F64_e64: 3007 case AMDGPU::V_SUBB_U32_e32: 3008 case AMDGPU::V_SUBB_U32_dpp: 3009 case AMDGPU::V_SUBBREV_U32_e32: 3010 case AMDGPU::V_SUBBREV_U32_dpp: 3011 // These implicitly read VCC as mask source. 3012 return HazardReg == AMDGPU::VCC || 3013 HazardReg == AMDGPU::VCC_LO || 3014 HazardReg == AMDGPU::VCC_HI; 3015 case AMDGPU::V_ADDC_U32_e64: 3016 case AMDGPU::V_ADDC_U32_e64_dpp: 3017 case AMDGPU::V_CNDMASK_B16_e64: 3018 case AMDGPU::V_CNDMASK_B16_e64_dpp: 3019 case AMDGPU::V_CNDMASK_B32_e64: 3020 case AMDGPU::V_CNDMASK_B32_e64_dpp: 3021 case AMDGPU::V_SUBB_U32_e64: 3022 case AMDGPU::V_SUBB_U32_e64_dpp: 3023 case AMDGPU::V_SUBBREV_U32_e64: 3024 case AMDGPU::V_SUBBREV_U32_e64_dpp: { 3025 // Only check mask register overlaps. 3026 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); 3027 assert(SSRCOp); 3028 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); 3029 } 3030 default: 3031 return false; 3032 } 3033 }; 3034 3035 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3036 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { 3037 // s_waitcnt_depctr sa_sdst(0) mitigates hazard. 3038 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 3039 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 3040 return true; 3041 3042 // VALU access to any SGPR or literal constant other than HazardReg 3043 // mitigates hazard. No need to check HazardReg here as this will 3044 // only be called when !IsHazardFn. 
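    // Scan the operands: any explicit SGPR use, any implicit VCC use, or any
    // non-inline literal counts as a mitigating VALU access and expires the
    // hazard window.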
3045 if (!SIInstrInfo::isVALU(I)) 3046 return false; 3047 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { 3048 const MachineOperand &Op = I.getOperand(OpNo); 3049 if (Op.isReg()) { 3050 Register OpReg = Op.getReg(); 3051 // Only consider uses 3052 if (!Op.isUse()) 3053 continue; 3054 // Ignore EXEC 3055 if (OpReg == AMDGPU::EXEC || 3056 OpReg == AMDGPU::EXEC_LO || 3057 OpReg == AMDGPU::EXEC_HI) 3058 continue; 3059 // Ignore all implicit uses except VCC 3060 if (Op.isImplicit()) { 3061 if (OpReg == AMDGPU::VCC || 3062 OpReg == AMDGPU::VCC_LO || 3063 OpReg == AMDGPU::VCC_HI) 3064 return true; 3065 continue; 3066 } 3067 if (TRI.isSGPRReg(MRI, OpReg)) 3068 return true; 3069 } else { 3070 const MCInstrDesc &InstDesc = I.getDesc(); 3071 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; 3072 if (!TII.isInlineConstant(Op, OpInfo)) 3073 return true; 3074 } 3075 } 3076 return false; 3077 }; 3078 3079 // Check for hazard 3080 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 3081 std::numeric_limits<int>::max()) 3082 return false; 3083 3084 auto NextMI = std::next(MI->getIterator()); 3085 3086 // Add s_waitcnt_depctr sa_sdst(0) after SALU write. 3087 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), 3088 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3089 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3090 3091 // SALU write may be s_getpc in a bundle. 3092 updateGetPCBundle(NewMI); 3093 3094 return true; 3095 } 3096 3097 // Return the numeric ID 0-63 of an 64b SGPR pair for a given SGPR. 3098 // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc 3099 static std::optional<unsigned> sgprPairNumber(Register Reg, 3100 const SIRegisterInfo &TRI) { 3101 switch (Reg) { 3102 case AMDGPU::M0: 3103 case AMDGPU::EXEC: 3104 case AMDGPU::EXEC_LO: 3105 case AMDGPU::EXEC_HI: 3106 case AMDGPU::SGPR_NULL: 3107 case AMDGPU::SGPR_NULL64: 3108 return {}; 3109 default: 3110 break; 3111 } 3112 unsigned RegN = TRI.getEncodingValue(Reg); 3113 if (RegN > 127) 3114 return {}; 3115 return (RegN >> 1) & 0x3f; 3116 } 3117 3118 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. 3119 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { 3120 assert(MMF == &MF); 3121 3122 // Assume non-empty vector means it has already been computed. 3123 if (!VALUReadHazardSGPRs.empty()) 3124 return; 3125 3126 auto CallingConv = MF.getFunction().getCallingConv(); 3127 bool IsCallFree = 3128 AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); 3129 3130 // Exhaustive search is only viable in non-caller/callee functions where 3131 // VALUs will be exposed to the hazard recognizer. 3132 UseVALUReadHazardExhaustiveSearch = 3133 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && 3134 MF.getInstructionCount() <= MaxExhaustiveHazardSearch; 3135 3136 // Consider all SGPRs hazards if the shader uses function calls or is callee. 3137 bool UseVALUUseCache = 3138 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; 3139 VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); 3140 if (!UseVALUUseCache) 3141 return; 3142 3143 // Perform a post ordered reverse scan to find VALUs which read an SGPR 3144 // before a SALU write to the same SGPR. This provides a reduction in 3145 // hazard insertion when all VALU access to an SGPR occurs after its last 3146 // SALU write, when compared to a linear scan. 
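  // Illustrative example (hypothetical block):
  //   s_mov_b32 s4, 0        ; last SALU write of the s[4:5] pair
  //   v_add_f32 v0, s4, v1   ; VALU read of the pair
  // The bottom-up scan visits the VALU read before any SALU write of that
  // pair has been recorded, so the pair is not marked as a hazard; a pair is
  // only flagged when a VALU read is followed (in program order) by both a
  // SALU write and a further read. Blocks inside cycles are handled
  // conservatively: every VALU SGPR read there is marked.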
3147 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3148 BitVector SALUWriteSGPRs(64), ReadSGPRs(64); 3149 MachineCycleInfo CI; 3150 CI.compute(*MMF); 3151 3152 for (auto *MBB : post_order(&MF)) { 3153 bool InCycle = CI.getCycle(MBB) != nullptr; 3154 for (auto &MI : reverse(MBB->instrs())) { 3155 bool IsVALU = SIInstrInfo::isVALU(MI); 3156 bool IsSALU = SIInstrInfo::isSALU(MI); 3157 if (!IsVALU && !IsSALU) 3158 continue; 3159 3160 for (const MachineOperand &Op : MI.operands()) { 3161 if (!Op.isReg()) 3162 continue; 3163 Register Reg = Op.getReg(); 3164 assert(!Op.getSubReg()); 3165 // Only consider implicit operands of VCC. 3166 if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO || 3167 Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC)) 3168 continue; 3169 if (!TRI.isSGPRReg(MRI, Reg)) 3170 continue; 3171 auto RegN = sgprPairNumber(Reg, TRI); 3172 if (!RegN) 3173 continue; 3174 if (IsVALU && Op.isUse()) { 3175 // Note: any access within a cycle must be considered a hazard. 3176 if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN])) 3177 VALUReadHazardSGPRs.set(*RegN); 3178 ReadSGPRs.set(*RegN); 3179 } else if (IsSALU) { 3180 if (Op.isDef()) 3181 SALUWriteSGPRs.set(*RegN); 3182 else 3183 ReadSGPRs.set(*RegN); 3184 } 3185 } 3186 } 3187 } 3188 } 3189 3190 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) { 3191 if (!ST.hasVALUReadSGPRHazard()) 3192 return false; 3193 3194 // The hazard sequence is fundamentally three instructions: 3195 // 1. VALU reads SGPR 3196 // 2. SALU writes SGPR 3197 // 3. VALU/SALU reads SGPR 3198 // Try to avoid searching for (1) because the expiry point of the hazard is 3199 // indeterminate; however, the hazard between (2) and (3) can expire if the 3200 // gap contains sufficient SALU instructions with no usage of SGPR from (1). 3201 // Note: SGPRs must be considered as 64-bit pairs as hazard exists 3202 // even if individual SGPRs are accessed. 3203 3204 bool MIIsSALU = SIInstrInfo::isSALU(*MI); 3205 bool MIIsVALU = SIInstrInfo::isVALU(*MI); 3206 if (!(MIIsSALU || MIIsVALU)) 3207 return false; 3208 3209 // Avoid expensive search when compile time is priority by 3210 // mitigating every SALU which writes an SGPR. 3211 if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) { 3212 if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI)) 3213 return false; 3214 3215 const MachineOperand *SDSTOp = 3216 TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); 3217 if (!SDSTOp || !SDSTOp->isReg()) 3218 return false; 3219 3220 const Register HazardReg = SDSTOp->getReg(); 3221 if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO || 3222 HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0) 3223 return false; 3224 3225 // Add s_wait_alu sa_sdst(0) after SALU write. 3226 auto NextMI = std::next(MI->getIterator()); 3227 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), 3228 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3229 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3230 3231 // SALU write may be s_getpc in a bundle. 3232 updateGetPCBundle(NewMI); 3233 3234 return true; 3235 } 3236 3237 // Pre-compute set of SGPR pairs read by VALUs. 3238 // Note: pass mutable pointer to MachineFunction for CycleInfo. 3239 computeVALUHazardSGPRs(MI->getMF()); 3240 3241 // If no VALUs hazard SGPRs exist then nothing to do. 3242 if (VALUReadHazardSGPRs.none()) 3243 return false; 3244 3245 // All SGPR writes before a call/return must be flushed as the callee/caller 3246 // will not will not see the hazard chain, i.e. (2) to (3) described above. 
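  // S_ENDPGM and S_ENDPGM_SAVED terminate the wave rather than transfer
  // control, so nothing downstream can observe a stale SGPR and they are
  // excluded from the conservative flush below.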
3247 const bool IsSetPC = (MI->isCall() || MI->isReturn()) && 3248 !(MI->getOpcode() == AMDGPU::S_ENDPGM || 3249 MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED); 3250 3251 // Collect all SGPR sources for MI which are read by a VALU. 3252 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3253 SmallSet<Register, 4> SGPRsUsed; 3254 3255 if (!IsSetPC) { 3256 for (const MachineOperand &Op : MI->all_uses()) { 3257 Register OpReg = Op.getReg(); 3258 3259 // Only consider VCC implicit uses on VALUs. 3260 // The only expected SALU implicit access is SCC which is no hazard. 3261 if (MIIsSALU && Op.isImplicit()) 3262 continue; 3263 3264 if (!TRI.isSGPRReg(MRI, OpReg)) 3265 continue; 3266 3267 auto RegN = sgprPairNumber(OpReg, TRI); 3268 if (!RegN) 3269 continue; 3270 3271 if (!VALUReadHazardSGPRs[*RegN]) 3272 continue; 3273 3274 SGPRsUsed.insert(OpReg); 3275 } 3276 3277 // No SGPRs -> nothing to do. 3278 if (SGPRsUsed.empty()) 3279 return false; 3280 } 3281 3282 // A hazard is any SALU which writes one of the SGPRs read by MI. 3283 auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { 3284 if (!SIInstrInfo::isSALU(I)) 3285 return false; 3286 // Ensure SGPR flush before call/return by conservatively assuming every 3287 // SALU writes an SGPR. 3288 if (IsSetPC && I.getNumDefs() > 0) 3289 return true; 3290 // Check for any register writes. 3291 return any_of(SGPRsUsed, [this, &I](Register Reg) { 3292 return I.modifiesRegister(Reg, &TRI); 3293 }); 3294 }; 3295 3296 const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; 3297 auto IsExpiredFn = [&](const MachineInstr &I, int Count) { 3298 if (Count >= SALUExpiryCount) 3299 return true; 3300 // s_wait_alu sa_sdst(0) on path mitigates hazard. 3301 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 3302 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 3303 return true; 3304 return false; 3305 }; 3306 3307 auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { 3308 // Only count true SALUs as wait states. 3309 if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) 3310 return 0; 3311 // SALU must be unrelated to any hazard registers. 3312 if (any_of(SGPRsUsed, 3313 [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); })) 3314 return 0; 3315 return 1; 3316 }; 3317 3318 // Check for the hazard. 3319 DenseSet<const MachineBasicBlock *> Visited; 3320 int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 3321 std::next(MI->getReverseIterator()), 0, 3322 IsExpiredFn, Visited, WaitStatesFn); 3323 3324 if (WaitStates >= SALUExpiryCount) 3325 return false; 3326 3327 // Validate hazard through an exhaustive search. 3328 if (UseVALUReadHazardExhaustiveSearch) { 3329 // A hazard is any VALU which reads one of the paired SGPRs read by MI. 3330 // This is searching for (1) in the hazard description. 3331 auto hazardPair = [this](Register Reg) { 3332 if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) 3333 return Register(AMDGPU::VCC); 3334 auto RegN = sgprPairNumber(Reg, TRI); 3335 return Register(AMDGPU::SGPR0_SGPR1 + *RegN); 3336 }; 3337 auto SearchHazardFn = [this, hazardPair, 3338 &SGPRsUsed](const MachineInstr &I) { 3339 if (!SIInstrInfo::isVALU(I)) 3340 return false; 3341 // Check for any register reads. 
3342 return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { 3343 return I.readsRegister(hazardPair(Reg), &TRI); 3344 }); 3345 }; 3346 auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { 3347 return false; 3348 }; 3349 if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == 3350 std::numeric_limits<int>::max()) 3351 return false; 3352 } 3353 3354 // Add s_wait_alu sa_sdst(0) before SALU read. 3355 auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 3356 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3357 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3358 3359 // SALU read may be after s_getpc in a bundle. 3360 updateGetPCBundle(NewMI); 3361 3362 return true; 3363 } 3364 3365 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, 3366 const SIInstrInfo &TII) { 3367 MachineBasicBlock &EntryMBB = MF->front(); 3368 if (EntryMBB.begin() != EntryMBB.end()) { 3369 auto &EntryMI = *EntryMBB.begin(); 3370 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO && 3371 EntryMI.getOperand(0).getImm() >= Priority) 3372 return false; 3373 } 3374 3375 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO)) 3376 .addImm(Priority); 3377 return true; 3378 } 3379 3380 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { 3381 if (!ST.hasRequiredExportPriority()) 3382 return false; 3383 3384 // Assume the following shader types will never have exports, 3385 // and avoid adding or adjusting S_SETPRIO. 3386 MachineBasicBlock *MBB = MI->getParent(); 3387 MachineFunction *MF = MBB->getParent(); 3388 auto CC = MF->getFunction().getCallingConv(); 3389 switch (CC) { 3390 case CallingConv::AMDGPU_CS: 3391 case CallingConv::AMDGPU_CS_Chain: 3392 case CallingConv::AMDGPU_CS_ChainPreserve: 3393 case CallingConv::AMDGPU_KERNEL: 3394 return false; 3395 default: 3396 break; 3397 } 3398 3399 const int MaxPriority = 3; 3400 const int NormalPriority = 2; 3401 const int PostExportPriority = 0; 3402 3403 auto It = MI->getIterator(); 3404 switch (MI->getOpcode()) { 3405 case AMDGPU::S_ENDPGM: 3406 case AMDGPU::S_ENDPGM_SAVED: 3407 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE: 3408 case AMDGPU::SI_RETURN_TO_EPILOG: 3409 // Ensure shader with calls raises priority at entry. 3410 // This ensures correct priority if exports exist in callee. 3411 if (MF->getFrameInfo().hasCalls()) 3412 return ensureEntrySetPrio(MF, NormalPriority, TII); 3413 return false; 3414 case AMDGPU::S_SETPRIO: { 3415 // Raise minimum priority unless in workaround. 3416 auto &PrioOp = MI->getOperand(0); 3417 int Prio = PrioOp.getImm(); 3418 bool InWA = (Prio == PostExportPriority) && 3419 (It != MBB->begin() && TII.isEXP(*std::prev(It))); 3420 if (InWA || Prio >= NormalPriority) 3421 return false; 3422 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority)); 3423 return true; 3424 } 3425 default: 3426 if (!TII.isEXP(*MI)) 3427 return false; 3428 break; 3429 } 3430 3431 // Check entry priority at each export (as there will only be a few). 3432 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. 3433 bool Changed = false; 3434 if (CC != CallingConv::AMDGPU_Gfx) 3435 Changed = ensureEntrySetPrio(MF, NormalPriority, TII); 3436 3437 auto NextMI = std::next(It); 3438 bool EndOfShader = false; 3439 if (NextMI != MBB->end()) { 3440 // Only need WA at end of sequence of exports. 3441 if (TII.isEXP(*NextMI)) 3442 return Changed; 3443 // Assume appropriate S_SETPRIO after export means WA already applied. 
3444 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO && 3445 NextMI->getOperand(0).getImm() == PostExportPriority) 3446 return Changed; 3447 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM; 3448 } 3449 3450 const DebugLoc &DL = MI->getDebugLoc(); 3451 3452 // Lower priority. 3453 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3454 .addImm(PostExportPriority); 3455 3456 if (!EndOfShader) { 3457 // Wait for exports to complete. 3458 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT)) 3459 .addReg(AMDGPU::SGPR_NULL) 3460 .addImm(0); 3461 } 3462 3463 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3464 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3465 3466 if (!EndOfShader) { 3467 // Return to normal (higher) priority. 3468 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3469 .addImm(NormalPriority); 3470 } 3471 3472 return true; 3473 } 3474