//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

static cl::opt<unsigned> MaxExhaustiveHazardSearch(
    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
    cl::desc("Maximum function size for exhaustive hazard search"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      UseVALUReadHazardExhaustiveSearch(false),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ?
19 : 5; 66 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); 67 } 68 69 void GCNHazardRecognizer::Reset() { 70 EmittedInstrs.clear(); 71 } 72 73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { 74 EmitInstruction(SU->getInstr()); 75 } 76 77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { 78 CurrCycleInstr = MI; 79 } 80 81 static bool isDivFMas(unsigned Opcode) { 82 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64; 83 } 84 85 static bool isSGetReg(unsigned Opcode) { 86 return Opcode == AMDGPU::S_GETREG_B32; 87 } 88 89 static bool isSSetReg(unsigned Opcode) { 90 switch (Opcode) { 91 case AMDGPU::S_SETREG_B32: 92 case AMDGPU::S_SETREG_B32_mode: 93 case AMDGPU::S_SETREG_IMM32_B32: 94 case AMDGPU::S_SETREG_IMM32_B32_mode: 95 return true; 96 } 97 return false; 98 } 99 100 static bool isRWLane(unsigned Opcode) { 101 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; 102 } 103 104 static bool isRFE(unsigned Opcode) { 105 return Opcode == AMDGPU::S_RFE_B64; 106 } 107 108 static bool isSMovRel(unsigned Opcode) { 109 switch (Opcode) { 110 case AMDGPU::S_MOVRELS_B32: 111 case AMDGPU::S_MOVRELS_B64: 112 case AMDGPU::S_MOVRELD_B32: 113 case AMDGPU::S_MOVRELD_B64: 114 return true; 115 default: 116 return false; 117 } 118 } 119 120 static bool isDGEMM(unsigned Opcode) { 121 return AMDGPU::getMAIIsDGEMM(Opcode); 122 } 123 124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { 125 unsigned Opcode = MI.getOpcode(); 126 127 if (!SIInstrInfo::isMAI(MI) || 128 isDGEMM(Opcode) || 129 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || 130 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) 131 return false; 132 133 if (!ST.hasGFX940Insts()) 134 return true; 135 136 return AMDGPU::getMAIIsGFX940XDL(Opcode); 137 } 138 139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, 140 const MachineInstr &MI) { 141 if (TII.isAlwaysGDS(MI.getOpcode())) 142 return true; 143 144 switch (MI.getOpcode()) { 145 case AMDGPU::S_SENDMSG: 146 case AMDGPU::S_SENDMSGHALT: 147 case AMDGPU::S_TTRACEDATA: 148 return true; 149 // These DS opcodes don't support GDS. 
150 case AMDGPU::DS_NOP: 151 case AMDGPU::DS_PERMUTE_B32: 152 case AMDGPU::DS_BPERMUTE_B32: 153 return false; 154 default: 155 if (TII.isDS(MI.getOpcode())) { 156 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 157 AMDGPU::OpName::gds); 158 if (MI.getOperand(GDS).getImm()) 159 return true; 160 } 161 return false; 162 } 163 } 164 165 static bool isPermlane(const MachineInstr &MI) { 166 unsigned Opcode = MI.getOpcode(); 167 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || 168 Opcode == AMDGPU::V_PERMLANE64_B32 || 169 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || 170 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || 171 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 || 172 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 || 173 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 || 174 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 || 175 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64; 176 } 177 178 static bool isLdsDma(const MachineInstr &MI) { 179 return SIInstrInfo::isVALU(MI) && 180 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)); 181 } 182 183 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { 184 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, 185 AMDGPU::OpName::simm16); 186 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm())); 187 } 188 189 ScheduleHazardRecognizer::HazardType 190 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { 191 MachineInstr *MI = SU->getInstr(); 192 // If we are not in "HazardRecognizerMode" and therefore not being run from 193 // the scheduler, track possible stalls from hazards but don't insert noops. 194 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; 195 196 if (MI->isBundle()) 197 return NoHazard; 198 199 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) 200 return HazardType; 201 202 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) 203 return HazardType; 204 205 if (checkFPAtomicToDenormModeHazard(MI) > 0) 206 return HazardType; 207 208 if (ST.hasNoDataDepHazard()) 209 return NoHazard; 210 211 // FIXME: Should flat be considered vmem? 
212 if ((SIInstrInfo::isVMEM(*MI) || 213 SIInstrInfo::isFLAT(*MI)) 214 && checkVMEMHazards(MI) > 0) 215 return HazardType; 216 217 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) 218 return HazardType; 219 220 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) 221 return HazardType; 222 223 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) 224 return HazardType; 225 226 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) 227 return HazardType; 228 229 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 230 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 231 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 232 return HazardType; 233 234 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) 235 return HazardType; 236 237 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) 238 return HazardType; 239 240 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) 241 return HazardType; 242 243 if (((ST.hasReadM0MovRelInterpHazard() && 244 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 245 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 246 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 247 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 248 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 249 (ST.hasReadM0LdsDirectHazard() && 250 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) && 251 checkReadM0Hazards(MI) > 0) 252 return HazardType; 253 254 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) 255 return HazardType; 256 257 if ((SIInstrInfo::isVMEM(*MI) || 258 SIInstrInfo::isFLAT(*MI) || 259 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0) 260 return HazardType; 261 262 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) 263 return HazardType; 264 265 return NoHazard; 266 } 267 268 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, 269 unsigned Quantity) { 270 while (Quantity > 0) { 271 unsigned Arg = std::min(Quantity, 8u); 272 Quantity -= Arg; 273 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) 274 .addImm(Arg - 1); 275 } 276 } 277 278 unsigned 279 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { 280 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI); 281 assert(TSchedModel.getWriteProcResBegin(SC) != 282 TSchedModel.getWriteProcResEnd(SC)); 283 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle; 284 } 285 286 void GCNHazardRecognizer::processBundle() { 287 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); 288 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); 289 // Check bundled MachineInstr's for hazards. 290 for (; MI != E && MI->isInsideBundle(); ++MI) { 291 CurrCycleInstr = &*MI; 292 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); 293 294 if (IsHazardRecognizerMode) { 295 fixHazards(CurrCycleInstr); 296 297 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates); 298 } 299 300 // It’s unnecessary to track more than MaxLookAhead instructions. Since we 301 // include the bundled MI directly after, only add a maximum of 302 // (MaxLookAhead - 1) noops to EmittedInstrs. 
303 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) 304 EmittedInstrs.push_front(nullptr); 305 306 EmittedInstrs.push_front(CurrCycleInstr); 307 EmittedInstrs.resize(MaxLookAhead); 308 } 309 CurrCycleInstr = nullptr; 310 } 311 312 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) { 313 assert(IsHazardRecognizerMode); 314 315 unsigned NumPreNoops = PreEmitNoops(MI); 316 EmitNoops(NumPreNoops); 317 if (MI->isInsideBundle()) 318 insertNoopsInBundle(MI, TII, NumPreNoops); 319 else 320 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI), 321 NumPreNoops); 322 EmitInstruction(MI); 323 AdvanceCycle(); 324 } 325 326 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { 327 IsHazardRecognizerMode = true; 328 CurrCycleInstr = MI; 329 unsigned W = PreEmitNoopsCommon(MI); 330 fixHazards(MI); 331 CurrCycleInstr = nullptr; 332 return W; 333 } 334 335 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { 336 if (MI->isBundle()) 337 return 0; 338 339 int WaitStates = 0; 340 341 if (SIInstrInfo::isSMRD(*MI)) 342 return std::max(WaitStates, checkSMRDHazards(MI)); 343 344 if (ST.hasNSAtoVMEMBug()) 345 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); 346 347 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); 348 349 if (ST.hasNoDataDepHazard()) 350 return WaitStates; 351 352 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) 353 WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); 354 355 if (SIInstrInfo::isVALU(*MI)) 356 WaitStates = std::max(WaitStates, checkVALUHazards(MI)); 357 358 if (SIInstrInfo::isDPP(*MI)) 359 WaitStates = std::max(WaitStates, checkDPPHazards(MI)); 360 361 if (isDivFMas(MI->getOpcode())) 362 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); 363 364 if (isRWLane(MI->getOpcode())) 365 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); 366 367 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 368 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 369 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 370 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI)); 371 372 if (MI->isInlineAsm()) 373 return std::max(WaitStates, checkInlineAsmHazards(MI)); 374 375 if (isSGetReg(MI->getOpcode())) 376 return std::max(WaitStates, checkGetRegHazards(MI)); 377 378 if (isSSetReg(MI->getOpcode())) 379 return std::max(WaitStates, checkSetRegHazards(MI)); 380 381 if (isRFE(MI->getOpcode())) 382 return std::max(WaitStates, checkRFEHazards(MI)); 383 384 if ((ST.hasReadM0MovRelInterpHazard() && 385 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 386 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 387 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 388 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 389 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 390 (ST.hasReadM0LdsDirectHazard() && 391 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) 392 return std::max(WaitStates, checkReadM0Hazards(MI)); 393 394 if (SIInstrInfo::isMAI(*MI)) 395 return std::max(WaitStates, checkMAIHazards(MI)); 396 397 if (SIInstrInfo::isVMEM(*MI) || 398 SIInstrInfo::isFLAT(*MI) || 399 SIInstrInfo::isDS(*MI)) 400 return std::max(WaitStates, checkMAILdStHazards(MI)); 401 402 if (ST.hasGFX950Insts() && isPermlane(*MI)) 403 return std::max(WaitStates, checkPermlaneHazards(MI)); 404 405 return WaitStates; 406 } 407 408 void GCNHazardRecognizer::EmitNoop() { 409 EmittedInstrs.push_front(nullptr); 410 } 411 412 void 
GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
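    // (The instructions bundled under such a header appear as separate entries
    // in this reverse walk, so their wait states are still counted below.)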
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
632 if (!ST.isXNACKEnabled()) 633 return 0; 634 635 bool IsSMRD = TII.isSMRD(*MEM); 636 637 resetClause(); 638 639 // A soft-clause is any group of consecutive SMEM instructions. The 640 // instructions in this group may return out of order and/or may be 641 // replayed (i.e. the same instruction issued more than once). 642 // 643 // In order to handle these situations correctly we need to make sure that 644 // when a clause has more than one instruction, no instruction in the clause 645 // writes to a register that is read by another instruction in the clause 646 // (including itself). If we encounter this situation, we need to break the 647 // clause by inserting a non SMEM instruction. 648 649 for (MachineInstr *MI : EmittedInstrs) { 650 // When we hit a non-SMEM instruction then we have passed the start of the 651 // clause and we can stop. 652 if (!MI) 653 break; 654 655 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) 656 break; 657 658 addClauseInst(*MI); 659 } 660 661 if (ClauseDefs.none()) 662 return 0; 663 664 // We need to make sure not to put loads and stores in the same clause if they 665 // use the same address. For now, just start a new clause whenever we see a 666 // store. 667 if (MEM->mayStore()) 668 return 1; 669 670 addClauseInst(*MEM); 671 672 // If the set of defs and uses intersect then we cannot add this instruction 673 // to the clause, so we have a hazard. 674 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0; 675 } 676 677 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { 678 int WaitStatesNeeded = 0; 679 680 WaitStatesNeeded = checkSoftClauseHazards(SMRD); 681 682 // This SMRD hazard only affects SI. 683 if (!ST.hasSMRDReadVALUDefHazard()) 684 return WaitStatesNeeded; 685 686 // A read of an SGPR by SMRD instruction requires 4 wait states when the 687 // SGPR was written by a VALU instruction. 688 int SmrdSgprWaitStates = 4; 689 auto IsHazardDefFn = [this](const MachineInstr &MI) { 690 return TII.isVALU(MI); 691 }; 692 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) { 693 return TII.isSALU(MI); 694 }; 695 696 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); 697 698 for (const MachineOperand &Use : SMRD->uses()) { 699 if (!Use.isReg()) 700 continue; 701 int WaitStatesNeededForUse = 702 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 703 SmrdSgprWaitStates); 704 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 705 706 // This fixes what appears to be undocumented hardware behavior in SI where 707 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor 708 // needs some number of nops in between. We don't know how many we need, but 709 // let's use 4. This wasn't discovered before probably because the only 710 // case when this happens is when we expand a 64-bit pointer into a full 711 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was 712 // probably never encountered in the closed-source land. 
713 if (IsBufferSMRD) { 714 int WaitStatesNeededForUse = 715 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 716 IsBufferHazardDefFn, 717 SmrdSgprWaitStates); 718 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 719 } 720 } 721 722 return WaitStatesNeeded; 723 } 724 725 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { 726 if (!ST.hasVMEMReadSGPRVALUDefHazard()) 727 return 0; 728 729 int WaitStatesNeeded = checkSoftClauseHazards(VMEM); 730 731 // A read of an SGPR by a VMEM instruction requires 5 wait states when the 732 // SGPR was written by a VALU Instruction. 733 const int VmemSgprWaitStates = 5; 734 auto IsHazardDefFn = [this](const MachineInstr &MI) { 735 return TII.isVALU(MI); 736 }; 737 for (const MachineOperand &Use : VMEM->uses()) { 738 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg())) 739 continue; 740 741 int WaitStatesNeededForUse = 742 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 743 VmemSgprWaitStates); 744 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 745 } 746 return WaitStatesNeeded; 747 } 748 749 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { 750 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 751 const SIInstrInfo *TII = ST.getInstrInfo(); 752 753 // Check for DPP VGPR read after VALU VGPR write and EXEC write. 754 int DppVgprWaitStates = 2; 755 int DppExecWaitStates = 5; 756 int WaitStatesNeeded = 0; 757 auto IsHazardDefFn = [TII](const MachineInstr &MI) { 758 return TII->isVALU(MI); 759 }; 760 761 for (const MachineOperand &Use : DPP->uses()) { 762 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) 763 continue; 764 int WaitStatesNeededForUse = 765 DppVgprWaitStates - getWaitStatesSinceDef( 766 Use.getReg(), 767 [](const MachineInstr &) { return true; }, 768 DppVgprWaitStates); 769 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 770 } 771 772 WaitStatesNeeded = std::max( 773 WaitStatesNeeded, 774 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, 775 DppExecWaitStates)); 776 777 return WaitStatesNeeded; 778 } 779 780 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { 781 const SIInstrInfo *TII = ST.getInstrInfo(); 782 783 // v_div_fmas requires 4 wait states after a write to vcc from a VALU 784 // instruction. 
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into the correct bit position of the dest register.
/// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the \p
/// MI and \returns an operand with a dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions
  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
  // which write hi bits (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and
  // CVT_SR_BF8_F32 with op_sel[3:2]
  // != 0
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
        return nullptr;
  } else {
    // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
    // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
    if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
        !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
              SISrcMods::DST_OP_SEL ||
          (AMDGPU::isFP8DstSelInst(Opcode) &&
           (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
            SISrcMods::OP_SEL_0))))
      return nullptr;
  }

  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
950 // complete zeroesHigh16BitsOfDest) 951 for (auto &Operand : VALU->operands()) { 952 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { 953 return true; 954 } 955 } 956 return false; 957 } 958 959 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { 960 int WaitStatesNeeded = 0; 961 962 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) { 963 const int TransDefWaitstates = 1; 964 965 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) { 966 if (!SIInstrInfo::isTRANS(MI)) 967 return false; 968 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 969 const SIInstrInfo *TII = ST.getInstrInfo(); 970 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); 971 972 for (const MachineOperand &Use : VALU->explicit_uses()) { 973 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) 974 return true; 975 } 976 977 return false; 978 }; 979 980 int WaitStatesNeededForDef = 981 TransDefWaitstates - 982 getWaitStatesSince(IsTransDefFn, TransDefWaitstates); 983 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 984 } 985 986 if (ST.hasDstSelForwardingHazard()) { 987 const int Shift16DefWaitstates = 1; 988 989 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) { 990 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 991 const MachineOperand *ForwardedDst = 992 getDstSelForwardingOperand(ProducerMI, ST); 993 if (ForwardedDst) { 994 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI); 995 } 996 997 if (ProducerMI.isInlineAsm()) { 998 // Assume inline asm has dst forwarding hazard 999 for (auto &Def : ProducerMI.all_defs()) { 1000 if (consumesDstSelForwardingOperand(VALU, &Def, TRI)) 1001 return true; 1002 } 1003 } 1004 1005 return false; 1006 }; 1007 1008 int WaitStatesNeededForDef = 1009 Shift16DefWaitstates - 1010 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); 1011 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1012 } 1013 1014 if (ST.hasVDecCoExecHazard()) { 1015 const int VALUWriteSGPRVALUReadWaitstates = 2; 1016 const int VALUWriteEXECRWLane = 4; 1017 const int VALUWriteVGPRReadlaneRead = 1; 1018 1019 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1020 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1021 Register UseReg; 1022 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) { 1023 if (!SIInstrInfo::isVALU(MI)) 1024 return false; 1025 return MI.modifiesRegister(UseReg, TRI); 1026 }; 1027 1028 for (const MachineOperand &Use : VALU->explicit_uses()) { 1029 if (!Use.isReg()) 1030 continue; 1031 1032 UseReg = Use.getReg(); 1033 if (TRI->isSGPRReg(MRI, UseReg)) { 1034 int WaitStatesNeededForDef = 1035 VALUWriteSGPRVALUReadWaitstates - 1036 getWaitStatesSince(IsVALUDefSGPRFn, 1037 VALUWriteSGPRVALUReadWaitstates); 1038 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1039 } 1040 } 1041 1042 if (VALU->readsRegister(AMDGPU::VCC, TRI)) { 1043 UseReg = AMDGPU::VCC; 1044 int WaitStatesNeededForDef = 1045 VALUWriteSGPRVALUReadWaitstates - 1046 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); 1047 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 1048 } 1049 1050 switch (VALU->getOpcode()) { 1051 case AMDGPU::V_READLANE_B32: 1052 case AMDGPU::V_READFIRSTLANE_B32: { 1053 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0); 1054 UseReg = Src->getReg(); 1055 int WaitStatesNeededForDef = 1056 VALUWriteVGPRReadlaneRead - 1057 getWaitStatesSince(IsVALUDefSGPRFn, 
                             VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

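  // A VALU write to the lane-select SGPR must have landed before the lane
  // instruction can read it; count how many of the required wait states are
  // still outstanding.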
1162 const int RWLaneWaitStates = 4; 1163 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, 1164 RWLaneWaitStates); 1165 return RWLaneWaitStates - WaitStatesSince; 1166 } 1167 1168 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { 1169 if (!ST.hasRFEHazards()) 1170 return 0; 1171 1172 const SIInstrInfo *TII = ST.getInstrInfo(); 1173 1174 const int RFEWaitStates = 1; 1175 1176 auto IsHazardFn = [TII](const MachineInstr &MI) { 1177 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS; 1178 }; 1179 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); 1180 return RFEWaitStates - WaitStatesNeeded; 1181 } 1182 1183 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { 1184 const SIInstrInfo *TII = ST.getInstrInfo(); 1185 const int ReadM0WaitStates = 1; 1186 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; 1187 return ReadM0WaitStates - 1188 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); 1189 } 1190 1191 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { 1192 fixVMEMtoScalarWriteHazards(MI); 1193 fixVcmpxPermlaneHazards(MI); 1194 fixSMEMtoVectorWriteHazards(MI); 1195 fixVcmpxExecWARHazard(MI); 1196 fixLdsBranchVmemWARHazard(MI); 1197 if (ST.hasLdsDirect()) { 1198 fixLdsDirectVALUHazard(MI); 1199 fixLdsDirectVMEMHazard(MI); 1200 } 1201 fixVALUPartialForwardingHazard(MI); 1202 fixVALUTransUseHazard(MI); 1203 fixWMMAHazards(MI); 1204 fixShift64HighRegBug(MI); 1205 fixVALUMaskWriteHazard(MI); 1206 fixVALUReadSGPRHazard(MI); 1207 fixRequiredExportPriority(MI); 1208 } 1209 1210 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, 1211 const MachineInstr &MI) { 1212 return (TII.isVOPC(MI) || 1213 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) && 1214 MI.modifiesRegister(AMDGPU::EXEC, &TRI); 1215 } 1216 1217 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { 1218 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI)) 1219 return false; 1220 1221 const SIInstrInfo *TII = ST.getInstrInfo(); 1222 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1223 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { 1224 return isVCmpXWritesExec(*TII, *TRI, MI); 1225 }; 1226 1227 auto IsExpiredFn = [](const MachineInstr &MI, int) { 1228 unsigned Opc = MI.getOpcode(); 1229 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 && 1230 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa; 1231 }; 1232 1233 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1234 std::numeric_limits<int>::max()) 1235 return false; 1236 1237 // V_NOP will be discarded by SQ. 1238 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* 1239 // which is always a VGPR and available. 1240 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); 1241 Register Reg = Src0->getReg(); 1242 bool IsUndef = Src0->isUndef(); 1243 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1244 TII->get(AMDGPU::V_MOV_B32_e32)) 1245 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) 1246 .addReg(Reg, IsUndef ? 
                   RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
1356 return (Decoded.DsCnt == 0); 1357 } 1358 default: 1359 // SOPP instructions cannot mitigate the hazard. 1360 if (TII->isSOPP(MI)) 1361 return false; 1362 // At this point the SALU can be assumed to mitigate the hazard 1363 // because either: 1364 // (a) it is independent of the at risk SMEM (breaking chain), 1365 // or 1366 // (b) it is dependent on the SMEM, in which case an appropriate 1367 // s_waitcnt lgkmcnt _must_ exist between it and the at risk 1368 // SMEM instruction. 1369 return true; 1370 } 1371 } 1372 return false; 1373 }; 1374 1375 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1376 std::numeric_limits<int>::max()) 1377 return false; 1378 1379 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1380 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL) 1381 .addImm(0); 1382 return true; 1383 } 1384 1385 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { 1386 if (!ST.hasVcmpxExecWARHazard()) 1387 return false; 1388 assert(!ST.hasExtendedWaitCounts()); 1389 1390 if (!SIInstrInfo::isVALU(*MI)) 1391 return false; 1392 1393 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1394 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) 1395 return false; 1396 1397 auto IsHazardFn = [TRI](const MachineInstr &I) { 1398 if (SIInstrInfo::isVALU(I)) 1399 return false; 1400 return I.readsRegister(AMDGPU::EXEC, TRI); 1401 }; 1402 1403 const SIInstrInfo *TII = ST.getInstrInfo(); 1404 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) { 1405 if (SIInstrInfo::isVALU(MI)) { 1406 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) 1407 return true; 1408 for (auto MO : MI.implicit_operands()) 1409 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) 1410 return true; 1411 } 1412 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1413 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0) 1414 return true; 1415 return false; 1416 }; 1417 1418 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1419 std::numeric_limits<int>::max()) 1420 return false; 1421 1422 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1423 TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 1424 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 1425 return true; 1426 } 1427 1428 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, 1429 const GCNSubtarget &ST) { 1430 if (!ST.hasLdsBranchVmemWARHazard()) 1431 return false; 1432 1433 // Check if the necessary condition for the hazard is met: both LDS and VMEM 1434 // instructions need to appear in the same function. 
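  // This scan is done once per function (see the constructor) so that
  // fixLdsBranchVmemWARHazard can bail out early when the hazard can never
  // occur.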
1435 bool HasLds = false; 1436 bool HasVmem = false; 1437 for (auto &MBB : MF) { 1438 for (auto &MI : MBB) { 1439 HasLds |= SIInstrInfo::isDS(MI); 1440 HasVmem |= 1441 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI); 1442 if (HasLds && HasVmem) 1443 return true; 1444 } 1445 } 1446 return false; 1447 } 1448 1449 static bool isStoreCountWaitZero(const MachineInstr &I) { 1450 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && 1451 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && 1452 !I.getOperand(1).getImm(); 1453 } 1454 1455 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { 1456 if (!RunLdsBranchVmemWARHazardFixup) 1457 return false; 1458 1459 assert(ST.hasLdsBranchVmemWARHazard()); 1460 assert(!ST.hasExtendedWaitCounts()); 1461 1462 auto IsHazardInst = [](const MachineInstr &MI) { 1463 if (SIInstrInfo::isDS(MI)) 1464 return 1; 1465 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) 1466 return 2; 1467 return 0; 1468 }; 1469 1470 auto InstType = IsHazardInst(*MI); 1471 if (!InstType) 1472 return false; 1473 1474 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { 1475 return IsHazardInst(I) || isStoreCountWaitZero(I); 1476 }; 1477 1478 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { 1479 if (!I.isBranch()) 1480 return false; 1481 1482 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { 1483 auto InstType2 = IsHazardInst(I); 1484 return InstType2 && InstType != InstType2; 1485 }; 1486 1487 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { 1488 auto InstType2 = IsHazardInst(I); 1489 if (InstType == InstType2) 1490 return true; 1491 1492 return isStoreCountWaitZero(I); 1493 }; 1494 1495 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) != 1496 std::numeric_limits<int>::max(); 1497 }; 1498 1499 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1500 std::numeric_limits<int>::max()) 1501 return false; 1502 1503 const SIInstrInfo *TII = ST.getInstrInfo(); 1504 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1505 TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1506 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1507 .addImm(0); 1508 1509 return true; 1510 } 1511 1512 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { 1513 if (!SIInstrInfo::isLDSDIR(*MI)) 1514 return false; 1515 1516 const int NoHazardWaitStates = 15; 1517 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1518 const Register VDSTReg = VDST->getReg(); 1519 1520 bool VisitedTrans = false; 1521 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { 1522 if (!SIInstrInfo::isVALU(I)) 1523 return false; 1524 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I); 1525 // Cover both WAR and WAW 1526 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1527 }; 1528 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { 1529 if (WaitStates >= NoHazardWaitStates) 1530 return true; 1531 // Instructions which cause va_vdst==0 expire hazard 1532 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1533 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I); 1534 }; 1535 auto GetWaitStatesFn = [](const MachineInstr &MI) { 1536 return SIInstrInfo::isVALU(MI) ? 
1 : 0; 1537 }; 1538 1539 DenseSet<const MachineBasicBlock *> Visited; 1540 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 1541 std::next(MI->getReverseIterator()), 0, 1542 IsExpiredFn, Visited, GetWaitStatesFn); 1543 1544 // Transcendentals can execute in parallel to other VALUs. 1545 // This makes va_vdst count unusable with a mixture of VALU and TRANS. 1546 if (VisitedTrans) 1547 Count = 0; 1548 1549 MachineOperand *WaitVdstOp = 1550 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst); 1551 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates)); 1552 1553 return true; 1554 } 1555 1556 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { 1557 if (!SIInstrInfo::isLDSDIR(*MI)) 1558 return false; 1559 1560 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 1561 const Register VDSTReg = VDST->getReg(); 1562 1563 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { 1564 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) && 1565 !SIInstrInfo::isDS(I)) 1566 return false; 1567 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 1568 }; 1569 bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); 1570 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT 1571 // according to the type of VMEM instruction. 1572 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { 1573 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || 1574 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || 1575 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1576 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) || 1577 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) && 1578 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm()); 1579 }; 1580 1581 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1582 std::numeric_limits<int>::max()) 1583 return false; 1584 1585 if (LdsdirCanWait) { 1586 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0); 1587 } else { 1588 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1589 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1590 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); 1591 } 1592 1593 return true; 1594 } 1595 1596 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { 1597 if (!ST.hasVALUPartialForwardingHazard()) 1598 return false; 1599 assert(!ST.hasExtendedWaitCounts()); 1600 1601 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI)) 1602 return false; 1603 1604 SmallSetVector<Register, 4> SrcVGPRs; 1605 1606 for (const MachineOperand &Use : MI->explicit_uses()) { 1607 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 1608 SrcVGPRs.insert(Use.getReg()); 1609 } 1610 1611 // Only applies with >= 2 unique VGPR sources 1612 if (SrcVGPRs.size() <= 1) 1613 return false; 1614 1615 // Look for the following pattern: 1616 // Va <- VALU [PreExecPos] 1617 // intv1 1618 // Exec <- SALU [ExecPos] 1619 // intv2 1620 // Vb <- VALU [PostExecPos] 1621 // intv3 1622 // MI Va, Vb (WaitState = 0) 1623 // 1624 // Where: 1625 // intv1 + intv2 <= 2 VALUs 1626 // intv3 <= 4 VALUs 1627 // 1628 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 
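  //
  // Illustrative shape of the hazard (register choices are arbitrary):
  //   v_mov_b32 v0, ...        ; Va
  //   s_mov_b64 exec, ...      ; Exec <- SALU
  //   v_mov_b32 v1, ...        ; Vb
  //   v_add_f32 v2, v0, v1     ; MI reads both Va and Vb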
1629 1630 const int Intv1plus2MaxVALUs = 2; 1631 const int Intv3MaxVALUs = 4; 1632 const int IntvMaxVALUs = 6; 1633 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2; 1634 1635 struct StateType { 1636 SmallDenseMap<Register, int, 4> DefPos; 1637 int ExecPos = std::numeric_limits<int>::max(); 1638 int VALUs = 0; 1639 }; 1640 1641 StateType State; 1642 1643 // This overloads expiry testing with all the hazard detection 1644 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { 1645 // Too many VALU states have passed 1646 if (State.VALUs > NoHazardVALUWaitStates) 1647 return HazardExpired; 1648 1649 // Instructions which cause va_vdst==0 expire hazard 1650 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 1651 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || 1652 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1653 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) 1654 return HazardExpired; 1655 1656 // Track registers writes 1657 bool Changed = false; 1658 if (SIInstrInfo::isVALU(I)) { 1659 for (Register Src : SrcVGPRs) { 1660 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) { 1661 State.DefPos[Src] = State.VALUs; 1662 Changed = true; 1663 } 1664 } 1665 } else if (SIInstrInfo::isSALU(I)) { 1666 if (State.ExecPos == std::numeric_limits<int>::max()) { 1667 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) { 1668 State.ExecPos = State.VALUs; 1669 Changed = true; 1670 } 1671 } 1672 } 1673 1674 // Early expiration: too many VALUs in intv3 1675 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty()) 1676 return HazardExpired; 1677 1678 // Only evaluate state if something changed 1679 if (!Changed) 1680 return NoHazardFound; 1681 1682 // Determine positions of VALUs pre/post exec change 1683 if (State.ExecPos == std::numeric_limits<int>::max()) 1684 return NoHazardFound; 1685 1686 int PreExecPos = std::numeric_limits<int>::max(); 1687 int PostExecPos = std::numeric_limits<int>::max(); 1688 1689 for (auto Entry : State.DefPos) { 1690 int DefVALUs = Entry.second; 1691 if (DefVALUs != std::numeric_limits<int>::max()) { 1692 if (DefVALUs >= State.ExecPos) 1693 PreExecPos = std::min(PreExecPos, DefVALUs); 1694 else 1695 PostExecPos = std::min(PostExecPos, DefVALUs); 1696 } 1697 } 1698 1699 // Need a VALUs post exec change 1700 if (PostExecPos == std::numeric_limits<int>::max()) 1701 return NoHazardFound; 1702 1703 // Too many VALUs in intv3? 1704 int Intv3VALUs = PostExecPos; 1705 if (Intv3VALUs > Intv3MaxVALUs) 1706 return HazardExpired; 1707 1708 // Too many VALUs in intv2? 1709 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1; 1710 if (Intv2VALUs > Intv1plus2MaxVALUs) 1711 return HazardExpired; 1712 1713 // Need a VALUs pre exec change 1714 if (PreExecPos == std::numeric_limits<int>::max()) 1715 return NoHazardFound; 1716 1717 // Too many VALUs in intv1? 
1718 int Intv1VALUs = PreExecPos - State.ExecPos;
1719 if (Intv1VALUs > Intv1plus2MaxVALUs)
1720 return HazardExpired;
1721
1722 // Too many VALUs in intv1 + intv2?
1723 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1724 return HazardExpired;
1725
1726 return HazardFound;
1727 };
1728 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1729 if (SIInstrInfo::isVALU(MI))
1730 State.VALUs += 1;
1731 };
1732
1733 DenseSet<const MachineBasicBlock *> Visited;
1734 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1735 std::next(MI->getReverseIterator()), Visited))
1736 return false;
1737
1738 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1739 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1740 .addImm(0x0fff);
1741
1742 return true;
1743 }
1744
1745 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1746 if (!ST.hasVALUTransUseHazard())
1747 return false;
1748 assert(!ST.hasExtendedWaitCounts());
1749
1750 if (!SIInstrInfo::isVALU(*MI))
1751 return false;
1752
1753 SmallSet<Register, 4> SrcVGPRs;
1754
1755 for (const MachineOperand &Use : MI->explicit_uses()) {
1756 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1757 SrcVGPRs.insert(Use.getReg());
1758 }
1759
1760 // Look for the following pattern:
1761 // Va <- TRANS VALU
1762 // intv
1763 // MI Va (WaitState = 0)
1764 //
1765 // Where:
1766 // intv <= 5 VALUs / 1 TRANS
1767 //
1768 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1769
1770 const int IntvMaxVALUs = 5;
1771 const int IntvMaxTRANS = 1;
1772
1773 struct StateType {
1774 int VALUs = 0;
1775 int TRANS = 0;
1776 };
1777
1778 StateType State;
1779
1780 // This combines the expiry testing with all of the hazard detection.
1781 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1782 // Too many VALU states have passed
1783 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1784 return HazardExpired;
1785
1786 // Instructions which cause va_vdst==0 expire hazard
1787 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1788 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1789 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1790 I.getOperand(0).getImm() == 0x0fff))
1791 return HazardExpired;
1792
1793 // Hazard if a TRANS instruction writes one of MI's source VGPRs
1794 if (SIInstrInfo::isTRANS(I)) {
1795 for (Register Src : SrcVGPRs) {
1796 if (I.modifiesRegister(Src, &TRI)) {
1797 return HazardFound;
1798 }
1799 }
1800 }
1801
1802 return NoHazardFound;
1803 };
1804 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1805 if (SIInstrInfo::isVALU(MI))
1806 State.VALUs += 1;
1807 if (SIInstrInfo::isTRANS(MI))
1808 State.TRANS += 1;
1809 };
1810
1811 DenseSet<const MachineBasicBlock *> Visited;
1812 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1813 std::next(MI->getReverseIterator()), Visited))
1814 return false;
1815
1816 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1817 // hazard is avoided.
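// Note: encodeFieldVaVdst(0) below builds an S_WAITCNT_DEPCTR immediate whose
// intent is to hold the next instruction until all outstanding VALU (including
// TRANS) destination writes have completed.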
1818 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1819 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 1820 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); 1821 1822 return true; 1823 } 1824 1825 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { 1826 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) 1827 return false; 1828 1829 const SIInstrInfo *TII = ST.getInstrInfo(); 1830 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1831 1832 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { 1833 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) 1834 return false; 1835 1836 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps 1837 // with the dest(matrix D) of the previous wmma. 1838 const Register CurSrc0Reg = 1839 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); 1840 const Register CurSrc1Reg = 1841 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); 1842 1843 const Register PrevDstReg = 1844 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); 1845 1846 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) || 1847 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) { 1848 return true; 1849 } 1850 1851 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) 1852 // but Index can't overlap with PrevDstReg. 1853 if (AMDGPU::isGFX12Plus(ST)) { 1854 if (SIInstrInfo::isSWMMAC(*MI)) { 1855 const Register CurIndex = 1856 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); 1857 if (TRI->regsOverlap(PrevDstReg, CurIndex)) 1858 return true; 1859 } 1860 return false; 1861 } 1862 1863 return false; 1864 }; 1865 1866 auto IsExpiredFn = [](const MachineInstr &I, int) { 1867 return SIInstrInfo::isVALU(I); 1868 }; 1869 1870 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1871 std::numeric_limits<int>::max()) 1872 return false; 1873 1874 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); 1875 1876 return true; 1877 } 1878 1879 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { 1880 if (!ST.hasShift64HighRegBug()) 1881 return false; 1882 assert(!ST.hasExtendedWaitCounts()); 1883 1884 switch (MI->getOpcode()) { 1885 default: 1886 return false; 1887 case AMDGPU::V_LSHLREV_B64_e64: 1888 case AMDGPU::V_LSHRREV_B64_e64: 1889 case AMDGPU::V_ASHRREV_I64_e64: 1890 break; 1891 } 1892 1893 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0); 1894 if (!Amt->isReg()) 1895 return false; 1896 1897 Register AmtReg = Amt->getReg(); 1898 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1899 // Check if this is a last VGPR in the allocation block. 1900 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7) 1901 return false; 1902 1903 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) 1904 return false; 1905 1906 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); 1907 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); 1908 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); 1909 bool Overlapped = OverlappedSrc || OverlappedDst; 1910 1911 assert(!OverlappedDst || !OverlappedSrc || 1912 Src1->getReg() == MI->getOperand(0).getReg()); 1913 assert(ST.needsAlignedVGPRs()); 1914 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); 1915 1916 Register NewReg; 1917 for (MCRegister Reg : Overlapped ? 
AMDGPU::VReg_64_Align2RegClass
1918 : AMDGPU::VGPR_32RegClass) {
1919 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1920 NewReg = Reg;
1921 break;
1922 }
1923 }
1924
1925 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1926 : NewReg;
1927 Register NewAmtLo;
1928
1929 if (Overlapped)
1930 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1931
1932 DebugLoc DL = MI->getDebugLoc();
1933 MachineBasicBlock *MBB = MI->getParent();
1934 // Insert a full wait count because the found register might be pending a wait.
1935 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1936 .addImm(0);
1937
1938 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1939 if (Overlapped)
1940 runOnInstruction(
1941 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1942 .addDef(AmtReg - 1)
1943 .addReg(AmtReg - 1, RegState::Undef)
1944 .addReg(NewAmtLo, RegState::Undef));
1945 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1946 .addDef(AmtReg)
1947 .addReg(AmtReg, RegState::Undef)
1948 .addReg(NewAmt, RegState::Undef));
1949
1950 // Instructions emitted after the current instruction will be processed by the
1951 // parent loop of the hazard recognizer in a natural way.
1952 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1953 AmtReg)
1954 .addDef(NewAmt)
1955 .addReg(NewAmt)
1956 .addReg(AmtReg);
1957 if (Overlapped)
1958 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1959 AmtReg - 1)
1960 .addDef(NewAmtLo)
1961 .addReg(NewAmtLo)
1962 .addReg(AmtReg - 1);
1963
1964 // Re-running the hazard recognizer on the modified instruction is not necessary:
1965 // the inserted V_SWAP_B32 instructions have already both read and written the
1966 // new registers, so hazards related to these registers have already been handled.
1967 Amt->setReg(NewAmt);
1968 Amt->setIsKill(false);
1969 // We do not update liveness, so the verifier may see it as undef.
1970 Amt->setIsUndef(); 1971 if (OverlappedDst) 1972 MI->getOperand(0).setReg(NewReg); 1973 if (OverlappedSrc) { 1974 Src1->setReg(NewReg); 1975 Src1->setIsKill(false); 1976 Src1->setIsUndef(); 1977 } 1978 1979 return true; 1980 } 1981 1982 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { 1983 int NSAtoVMEMWaitStates = 1; 1984 1985 if (!ST.hasNSAtoVMEMBug()) 1986 return 0; 1987 1988 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI)) 1989 return 0; 1990 1991 const SIInstrInfo *TII = ST.getInstrInfo(); 1992 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 1993 if (!Offset || (Offset->getImm() & 6) == 0) 1994 return 0; 1995 1996 auto IsHazardFn = [TII](const MachineInstr &I) { 1997 if (!SIInstrInfo::isMIMG(I)) 1998 return false; 1999 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode()); 2000 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && 2001 TII->getInstSizeInBytes(I) >= 16; 2002 }; 2003 2004 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); 2005 } 2006 2007 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { 2008 int FPAtomicToDenormModeWaitStates = 3; 2009 2010 if (!ST.hasFPAtomicToDenormModeHazard()) 2011 return 0; 2012 assert(!ST.hasExtendedWaitCounts()); 2013 2014 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) 2015 return 0; 2016 2017 auto IsHazardFn = [](const MachineInstr &I) { 2018 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I)) 2019 return false; 2020 return SIInstrInfo::isFPAtomic(I); 2021 }; 2022 2023 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { 2024 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) 2025 return true; 2026 2027 switch (MI.getOpcode()) { 2028 case AMDGPU::S_WAITCNT: 2029 case AMDGPU::S_WAITCNT_VSCNT: 2030 case AMDGPU::S_WAITCNT_VMCNT: 2031 case AMDGPU::S_WAITCNT_EXPCNT: 2032 case AMDGPU::S_WAITCNT_LGKMCNT: 2033 case AMDGPU::S_WAIT_IDLE: 2034 return true; 2035 default: 2036 break; 2037 } 2038 2039 return false; 2040 }; 2041 2042 return FPAtomicToDenormModeWaitStates - 2043 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); 2044 } 2045 2046 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { 2047 assert(SIInstrInfo::isMAI(*MI)); 2048 2049 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); 2050 } 2051 2052 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { 2053 // Early exit if no padding is requested. 
2054 if (MFMAPaddingRatio == 0) 2055 return 0; 2056 2057 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2058 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2) 2059 return 0; 2060 2061 int NeighborMFMALatency = 0; 2062 auto IsNeighboringMFMA = [&NeighborMFMALatency, 2063 this](const MachineInstr &MI) { 2064 if (!SIInstrInfo::isMFMA(MI)) 2065 return false; 2066 2067 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); 2068 return true; 2069 }; 2070 2071 const int MaxMFMAPipelineWaitStates = 16; 2072 int WaitStatesSinceNeighborMFMA = 2073 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates); 2074 2075 int NeighborMFMAPaddingNeeded = 2076 (NeighborMFMALatency * MFMAPaddingRatio / 100) - 2077 WaitStatesSinceNeighborMFMA; 2078 2079 return std::max(0, NeighborMFMAPaddingNeeded); 2080 } 2081 2082 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { 2083 int WaitStatesNeeded = 0; 2084 unsigned Opc = MI->getOpcode(); 2085 2086 auto IsVALUFn = [](const MachineInstr &MI) { 2087 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm(); 2088 }; 2089 2090 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write 2091 const int LegacyVALUWritesVGPRWaitStates = 2; 2092 const int VALUWritesExecWaitStates = 4; 2093 const int MaxWaitStates = 4; 2094 2095 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2096 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates); 2097 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2098 2099 if (WaitStatesNeeded < MaxWaitStates) { 2100 for (const MachineOperand &Use : MI->explicit_uses()) { 2101 const int MaxWaitStates = 2; 2102 2103 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 2104 continue; 2105 2106 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - 2107 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates); 2108 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2109 2110 if (WaitStatesNeeded == MaxWaitStates) 2111 break; 2112 } 2113 } 2114 } 2115 2116 for (const MachineOperand &Op : MI->explicit_operands()) { 2117 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) 2118 continue; 2119 2120 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2121 continue; 2122 2123 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; 2124 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; 2125 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; 2126 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; 2127 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; 2128 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; 2129 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; 2130 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; 2131 const int MaxWaitStates = 18; 2132 Register Reg = Op.getReg(); 2133 unsigned HazardDefLatency = 0; 2134 2135 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, 2136 this](const MachineInstr &MI) { 2137 if (!SIInstrInfo::isMFMA(MI)) 2138 return false; 2139 Register DstReg = MI.getOperand(0).getReg(); 2140 if (DstReg == Reg) 2141 return false; 2142 HazardDefLatency = 2143 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2144 return TRI.regsOverlap(DstReg, Reg); 2145 }; 2146 2147 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, 2148 MaxWaitStates); 2149 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; 2150 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2151 int OpNo = Op.getOperandNo(); 
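// The wait needed depends on which MFMA operand this register feeds and on the
// latency of the producing MFMA; e.g. per the constants above, an overlapped
// write feeding src2 (SrcC) needs only 2 wait states, while a V_ACCVGPR_READ of
// a 32x32 (16-pass) MFMA result needs 18.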
2152 if (OpNo == SrcCIdx) { 2153 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; 2154 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) { 2155 switch (HazardDefLatency) { 2156 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; 2157 break; 2158 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; 2159 break; 2160 case 16: [[fallthrough]]; 2161 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; 2162 break; 2163 } 2164 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2165 switch (HazardDefLatency) { 2166 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; 2167 break; 2168 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; 2169 break; 2170 case 16: [[fallthrough]]; 2171 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; 2172 break; 2173 } 2174 } 2175 2176 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2177 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2178 2179 if (WaitStatesNeeded == MaxWaitStates) 2180 return WaitStatesNeeded; // Early exit. 2181 2182 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) { 2183 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2184 return false; 2185 Register DstReg = MI.getOperand(0).getReg(); 2186 return TRI.regsOverlap(Reg, DstReg); 2187 }; 2188 2189 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; 2190 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; 2191 const int AccVGPRWriteAccVgprReadWaitStates = 3; 2192 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; 2193 if (OpNo == SrcCIdx) 2194 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; 2195 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) 2196 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; 2197 2198 WaitStatesNeededForUse = NeedWaitStates - 2199 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates); 2200 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2201 2202 if (WaitStatesNeeded == MaxWaitStates) 2203 return WaitStatesNeeded; // Early exit. 2204 } 2205 2206 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 2207 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; 2208 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; 2209 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; 2210 const int MaxWaitStates = 13; 2211 Register DstReg = MI->getOperand(0).getReg(); 2212 unsigned HazardDefLatency = 0; 2213 2214 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, 2215 this](const MachineInstr &MI) { 2216 if (!SIInstrInfo::isMFMA(MI)) 2217 return false; 2218 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); 2219 HazardDefLatency = 2220 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); 2221 return TRI.regsOverlap(Reg, DstReg); 2222 }; 2223 2224 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates); 2225 int NeedWaitStates; 2226 switch (HazardDefLatency) { 2227 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; 2228 break; 2229 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; 2230 break; 2231 case 16: [[fallthrough]]; 2232 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; 2233 break; 2234 } 2235 2236 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; 2237 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2238 } 2239 2240 // Pad neighboring MFMA with noops for better inter-wave performance. 
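// For example, with amdgpu-mfma-padding-ratio=50 and a neighboring MFMA whose
// pipeline latency is 16, checkMFMAPadding() requests up to 16 * 50 / 100 = 8
// wait states, minus any that have already elapsed since that MFMA.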
2241 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2242 2243 return WaitStatesNeeded; 2244 } 2245 2246 static int 2247 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, 2248 bool IsGFX950) { 2249 // xdl def cycles | gfx940 | gfx950 2250 // 2 pass | 3 4 2251 // 4 pass | 5 6 2252 // 8 pass | 9 10 2253 // 16 pass | 17 18 2254 return NumPasses + 1 + IsGFX950; 2255 } 2256 2257 static int 2258 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, 2259 bool IsGFX950) { 2260 // xdl def cycles | gfx940 | gfx950 2261 // 2 pass | 3 3 2262 // 4 pass | 5 6 2263 // 8 pass | 9 10 2264 // 16 pass | 17 18 2265 return NumPasses + 1 + (NumPasses != 2 && IsGFX950); 2266 } 2267 2268 static int 2269 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) { 2270 // 2 pass -> 2 2271 // 4 pass -> 4 2272 // 8 pass -> 8 2273 // 16 pass -> 16 2274 return NumPasses; 2275 } 2276 2277 static int 2278 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2279 // 2 pass -> 4 2280 // 4 pass -> 6 2281 // 8 pass -> 10 2282 // 16 pass -> 18 2283 return NumPasses + 2; 2284 } 2285 2286 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { 2287 // 2 pass -> 5 2288 // 4 pass -> 7 2289 // 8 pass -> 11 2290 // 16 pass -> 19 2291 return NumPasses + 3; 2292 } 2293 2294 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { 2295 int WaitStatesNeeded = 0; 2296 unsigned Opc = MI->getOpcode(); 2297 2298 auto IsLegacyVALUFn = [](const MachineInstr &MI) { 2299 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); 2300 }; 2301 2302 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { 2303 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && 2304 !SIInstrInfo::isDOT(MI); 2305 }; 2306 2307 if (!SIInstrInfo::isMFMA(*MI)) 2308 return WaitStatesNeeded; 2309 2310 const int VALUWritesExecWaitStates = 4; 2311 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 2312 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn, 2313 VALUWritesExecWaitStates); 2314 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2315 2316 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 2317 2318 // Loop for both DGEMM and S/HGEMM 2nd instruction. 
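// The GFX940 helpers above derive the wait from the producer's pass count;
// e.g. an 8-pass XDL writing a VGPR that overlaps another XDL's SrcC needs
// 8 + 1 = 9 wait states on gfx940 and 10 on gfx950.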
2319 for (const MachineOperand &Use : MI->explicit_uses()) { 2320 const int LegacyVALUNotDotWritesVGPRWaitStates = 2; 2321 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; 2322 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; 2323 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; 2324 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; 2325 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; 2326 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; 2327 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; 2328 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17; 2329 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; 2330 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; 2331 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; 2332 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; 2333 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; 2334 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; 2335 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19; 2336 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; 2337 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; 2338 const int MaxWaitStates = 19; 2339 2340 if (!Use.isReg()) 2341 continue; 2342 Register Reg = Use.getReg(); 2343 bool FullReg; 2344 const MachineInstr *MI1; 2345 2346 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, 2347 this](const MachineInstr &MI) { 2348 if (!SIInstrInfo::isMFMA(MI)) 2349 return false; 2350 Register DstReg = MI.getOperand(0).getReg(); 2351 FullReg = (DstReg == Reg); 2352 MI1 = &MI; 2353 return TRI.regsOverlap(DstReg, Reg); 2354 }; 2355 2356 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - 2357 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates); 2358 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2359 2360 int NumWaitStates = 2361 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates); 2362 if (NumWaitStates == std::numeric_limits<int>::max()) 2363 continue; 2364 2365 int OpNo = Use.getOperandNo(); 2366 unsigned Opc1 = MI1->getOpcode(); 2367 int NeedWaitStates = 0; 2368 if (OpNo == SrcCIdx) { 2369 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) { 2370 NeedWaitStates = 0; 2371 } else if (FullReg) { 2372 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2373 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && 2374 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || 2375 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) 2376 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; 2377 else if (ST.hasGFX940Insts() && 2378 TSchedModel.computeInstrLatency(MI1) == 2) 2379 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; 2380 } else { 2381 switch (Opc1) { 2382 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2383 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2384 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2385 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2386 if (!isXDL(ST, *MI)) 2387 NeedWaitStates = 2388 ST.hasGFX950Insts() 2389 ? 
GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates 2390 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates; 2391 break; 2392 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2393 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2394 if (!isXDL(ST, *MI)) 2395 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; 2396 break; 2397 default: 2398 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2399 if (ST.hasGFX940Insts()) { 2400 if (isXDL(ST, *MI) && !isXDL(ST, *MI1)) 2401 break; 2402 2403 NeedWaitStates = 2404 isXDL(ST, *MI1) 2405 ? (isXDL(ST, *MI) 2406 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates( 2407 NumPasses, ST.hasGFX950Insts()) 2408 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates( 2409 NumPasses, ST.hasGFX950Insts())) 2410 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( 2411 NumPasses); 2412 break; 2413 } 2414 2415 switch (NumPasses) { 2416 case 2: 2417 NeedWaitStates = 2418 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates 2419 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; 2420 break; 2421 case 8: 2422 NeedWaitStates = 2423 isDGEMM(Opc) 2424 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates 2425 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; 2426 break; 2427 case 16: 2428 NeedWaitStates = 2429 isDGEMM(Opc) 2430 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates 2431 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; 2432 break; 2433 default: 2434 llvm_unreachable("unexpected number of passes"); 2435 } 2436 } 2437 } 2438 } else { 2439 switch (Opc1) { 2440 case AMDGPU::V_MFMA_F64_16X16X4F64_e64: 2441 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: 2442 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: 2443 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: 2444 NeedWaitStates = 2445 ST.hasGFX950Insts() 2446 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates 2447 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; 2448 break; 2449 case AMDGPU::V_MFMA_F64_4X4X4F64_e64: 2450 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: 2451 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; 2452 break; 2453 default: 2454 int NumPasses = TSchedModel.computeInstrLatency(MI1); 2455 2456 if (ST.hasGFX940Insts()) { 2457 NeedWaitStates = 2458 isXDL(ST, *MI1) 2459 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates( 2460 NumPasses) 2461 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates( 2462 NumPasses); 2463 break; 2464 } 2465 2466 switch (NumPasses) { 2467 case 2: 2468 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; 2469 break; 2470 case 4: 2471 llvm_unreachable("unexpected number of passes for mfma"); 2472 case 8: 2473 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; 2474 break; 2475 case 16: 2476 default: 2477 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; 2478 } 2479 } 2480 } 2481 if (WaitStatesNeeded >= NeedWaitStates) 2482 continue; 2483 2484 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; 2485 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2486 2487 if (WaitStatesNeeded == MaxWaitStates) 2488 break; 2489 } 2490 2491 // Pad neighboring MFMA with noops for better inter-wave performance. 
2492 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); 2493 2494 return WaitStatesNeeded; 2495 } 2496 2497 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { 2498 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards() 2499 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) 2500 return 0; 2501 2502 int WaitStatesNeeded = 0; 2503 2504 auto IsAccVgprReadFn = [](const MachineInstr &MI) { 2505 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; 2506 }; 2507 2508 for (const MachineOperand &Op : MI->explicit_uses()) { 2509 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) 2510 continue; 2511 2512 Register Reg = Op.getReg(); 2513 2514 const int AccVgprReadLdStWaitStates = 2; 2515 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1; 2516 const int MaxWaitStates = 2; 2517 2518 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - 2519 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates); 2520 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2521 2522 if (WaitStatesNeeded == MaxWaitStates) 2523 return WaitStatesNeeded; // Early exit. 2524 2525 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) { 2526 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && 2527 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) 2528 return false; 2529 auto IsVALUFn = [](const MachineInstr &MI) { 2530 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI); 2531 }; 2532 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) < 2533 std::numeric_limits<int>::max(); 2534 }; 2535 2536 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates - 2537 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates); 2538 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2539 } 2540 2541 return WaitStatesNeeded; 2542 } 2543 2544 int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) { 2545 assert(!ST.hasVcmpxPermlaneHazard() && 2546 "this is a different vcmpx+permlane hazard"); 2547 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 2548 const SIInstrInfo *TII = ST.getInstrInfo(); 2549 2550 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) { 2551 return isVCmpXWritesExec(*TII, *TRI, MI); 2552 }; 2553 2554 const int NumWaitStates = 4; 2555 return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates); 2556 } 2557 2558 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2559 // 2 pass -> 4 2560 // 4 pass -> 6 2561 // 8 pass -> 10 2562 // 16 pass -> 18 2563 return NumPasses + 2; 2564 } 2565 2566 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { 2567 // 2 pass -> 5 2568 // 4 pass -> 7 2569 // 8 pass -> 11 2570 // 16 pass -> 19 2571 return NumPasses + 3; 2572 } 2573 2574 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { 2575 // 2 pass -> 5 2576 // 4 pass -> 7 2577 // 8 pass -> 11 2578 // 16 pass -> 19 2579 return NumPasses + 3; 2580 } 2581 2582 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { 2583 // 2 pass -> 4 2584 // 4 pass -> 6 2585 // 8 pass -> 10 2586 // 16 pass -> 18 2587 return NumPasses + 2; 2588 } 2589 2590 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { 2591 if (!ST.hasGFX90AInsts()) 2592 return 0; 2593 2594 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { 2595 return isDGEMM(MI.getOpcode()); 2596 }; 2597 2598 // This is checked in checkMAIHazards90A() 2599 if (SIInstrInfo::isMFMA(*MI)) 2600 return 0; 2601 2602 const 
MachineRegisterInfo &MRI = MF.getRegInfo();
2603
2604 int WaitStatesNeeded = 0;
2605
2606 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2607 SIInstrInfo::isFLAT(*MI) ||
2608 SIInstrInfo::isDS(*MI);
2609 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2610 bool IsVALU = SIInstrInfo::isVALU(*MI);
2611
2612 const MachineInstr *MFMA = nullptr;
2613 unsigned Reg;
2614 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2615 if (!SIInstrInfo::isMFMA(MI) ||
2616 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2617 return false;
2618 MFMA = &MI;
2619 return true;
2620 };
2621
2622 const MachineInstr *DOT = nullptr;
2623 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2624 if (!SIInstrInfo::isDOT(MI) ||
2625 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2626 return false;
2627 DOT = &MI;
2628 return true;
2629 };
2630
2631 bool DGEMMAfterVALUWrite = false;
2632 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2633 // Found DGEMM on reverse traversal to def.
2634 if (isDGEMM(MI.getOpcode()))
2635 DGEMMAfterVALUWrite = true;
2636
2637 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2638 // after the def.
2639 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2640 return false;
2641
2642 return true;
2643 };
2644
2645 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2646 AMDGPU::OpName::src2);
2647
2648 if (IsMemOrExport || IsVALU) {
2649 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2650 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2651 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2652 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2653 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2654 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2655 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2656 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2657 const int DotWriteSameDotReadSrcAB = 3;
2658 const int DotWriteDifferentVALURead = 3;
2659 const int DMFMABetweenVALUWriteVMEMRead = 2;
2660 const int MaxWaitStates = 19;
2661
2662 for (const MachineOperand &Use : MI->explicit_uses()) {
2663 if (!Use.isReg())
2664 continue;
2665 Reg = Use.getReg();
2666
2667 DOT = nullptr;
2668 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2669 MaxWaitStates);
2670 if (DOT) {
2671 int NeedWaitStates = 0;
2672 if (DOT->getOpcode() == MI->getOpcode()) {
2673 if (&Use - &MI->getOperand(0) != SrcCIdx)
2674 NeedWaitStates = DotWriteSameDotReadSrcAB;
2675 } else {
2676 NeedWaitStates = DotWriteDifferentVALURead;
2677 }
2678
2679 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2680 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2681 }
2682
2683 // Workaround for a HW data hazard bug observed only in GFX90A. When there
2684 // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
2685 // causes the SQ to incorrectly omit the two wait states between the two
2686 // instructions that are needed to avoid the data hazard.
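// A hypothetical trigger (for illustration only): a VALU writes vX, a DGEMM
// issues, and then this VMEM instruction reads vX; the two wait states
// enforced below restore the spacing the hardware would normally provide.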
2687 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) { 2688 DGEMMAfterVALUWrite = false; 2689 if (TRI.isVectorRegister(MRI, Reg)) { 2690 int WaitStatesNeededForUse = 2691 DMFMABetweenVALUWriteVMEMRead - 2692 getWaitStatesSinceDef(Reg, IsDGEMMHazard, 2693 DMFMABetweenVALUWriteVMEMRead); 2694 2695 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2696 } 2697 } 2698 2699 MFMA = nullptr; 2700 WaitStatesSinceDef = 2701 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); 2702 if (!MFMA) 2703 continue; 2704 2705 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2706 int NumPasses = HazardDefLatency; 2707 int NeedWaitStates = MaxWaitStates; 2708 2709 if (isDGEMM(MFMA->getOpcode())) { 2710 switch (HazardDefLatency) { 2711 case 4: 2712 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates 2713 : DMFMA4x4WriteVgprVALUReadWaitStates; 2714 break; 2715 case 8: 2716 case 16: 2717 NeedWaitStates = 2718 IsMemOrExport 2719 ? DMFMA16x16WriteVgprMemExpReadWaitStates 2720 : (ST.hasGFX950Insts() 2721 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates 2722 : DMFMA16x16WriteVgprVALUReadWaitStates); 2723 break; 2724 default: 2725 llvm_unreachable("unexpected dgemm"); 2726 } 2727 } else if (ST.hasGFX940Insts()) { 2728 NeedWaitStates = 2729 isXDL(ST, *MFMA) 2730 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) 2731 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( 2732 NumPasses); 2733 } else { 2734 switch (HazardDefLatency) { 2735 case 2: 2736 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; 2737 break; 2738 case 8: 2739 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; 2740 break; 2741 case 16: 2742 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates; 2743 break; 2744 default: 2745 llvm_unreachable("unexpected number of passes for mfma"); 2746 } 2747 } 2748 2749 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2750 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2751 2752 if (WaitStatesNeeded == MaxWaitStates) 2753 break; 2754 } 2755 } 2756 2757 unsigned Opc = MI->getOpcode(); 2758 const int DMFMAToFMA64WaitStates = 2; 2759 if ((Opc == AMDGPU::V_FMA_F64_e64 || 2760 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || 2761 Opc == AMDGPU::V_FMAC_F64_dpp) && 2762 WaitStatesNeeded < DMFMAToFMA64WaitStates) { 2763 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - 2764 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates); 2765 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2766 } 2767 2768 if (!IsVALU && !IsMemOrExport) 2769 return WaitStatesNeeded; 2770 2771 for (const MachineOperand &Def : MI->defs()) { 2772 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; 2773 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; 2774 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; 2775 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; 2776 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; 2777 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; 2778 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; 2779 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; 2780 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; 2781 const int DotWriteDifferentVALUWrite = 3; 2782 const int MaxWaitStates = 19; 2783 const int MaxWarWaitStates = 15; 2784 2785 Reg = Def.getReg(); 2786 2787 DOT = nullptr; 2788 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, 2789 MaxWaitStates); 2790 if (DOT && DOT->getOpcode() != 
MI->getOpcode()) 2791 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite - 2792 WaitStatesSinceDef); 2793 2794 MFMA = nullptr; 2795 WaitStatesSinceDef = 2796 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); 2797 if (MFMA) { 2798 int NeedWaitStates = MaxWaitStates; 2799 int NumPasses = TSchedModel.computeInstrLatency(MFMA); 2800 2801 if (isDGEMM(MFMA->getOpcode())) { 2802 switch (NumPasses) { 2803 case 4: 2804 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; 2805 break; 2806 case 8: 2807 case 16: 2808 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates; 2809 break; 2810 default: 2811 llvm_unreachable("unexpected number of cycles for dgemm"); 2812 } 2813 } else if (ST.hasGFX940Insts()) { 2814 NeedWaitStates = 2815 isXDL(ST, *MFMA) 2816 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses) 2817 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); 2818 } else { 2819 switch (NumPasses) { 2820 case 2: 2821 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; 2822 break; 2823 case 8: 2824 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; 2825 break; 2826 case 16: 2827 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates; 2828 break; 2829 default: 2830 llvm_unreachable("Unexpected number of passes for mfma"); 2831 } 2832 } 2833 2834 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 2835 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2836 2837 if (WaitStatesNeeded == MaxWaitStates) 2838 break; 2839 } 2840 2841 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) { 2842 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) || 2843 !MI.readsRegister(Reg, &TRI)) 2844 return false; 2845 2846 if (ST.hasGFX940Insts() && !isXDL(ST, MI)) 2847 return false; 2848 2849 const MachineOperand *SrcC = 2850 TII.getNamedOperand(MI, AMDGPU::OpName::src2); 2851 assert(SrcC); 2852 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg)) 2853 return false; 2854 2855 MFMA = &MI; 2856 return true; 2857 }; 2858 2859 MFMA = nullptr; 2860 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn, 2861 MaxWarWaitStates); 2862 if (!MFMA) 2863 continue; 2864 2865 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2866 int NeedWaitStates = MaxWaitStates; 2867 switch (HazardDefLatency) { 2868 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; 2869 break; 2870 case 4: assert(ST.hasGFX940Insts()); 2871 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; 2872 break; 2873 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; 2874 break; 2875 case 16: [[fallthrough]]; 2876 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; 2877 break; 2878 } 2879 2880 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; 2881 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2882 } 2883 2884 return WaitStatesNeeded; 2885 } 2886 2887 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { 2888 if (!SU->isInstr()) 2889 return false; 2890 2891 const MachineInstr *MAI = nullptr; 2892 2893 auto IsMFMAFn = [&MAI](const MachineInstr &MI) { 2894 MAI = nullptr; 2895 if (SIInstrInfo::isMFMA(MI)) 2896 MAI = &MI; 2897 return MAI != nullptr; 2898 }; 2899 2900 MachineInstr *MI = SU->getInstr(); 2901 if (IsMFMAFn(*MI)) { 2902 int W = getWaitStatesSince(IsMFMAFn, 16); 2903 if (MAI) 2904 return W < (int)TSchedModel.computeInstrLatency(MAI); 2905 } 2906 2907 return false; 2908 } 2909 2910 // Adjust global offsets for instructions bundled with S_GETPC_B64 after 
2911 // insertion of a new instruction. 2912 static void updateGetPCBundle(MachineInstr *NewMI) { 2913 if (!NewMI->isBundled()) 2914 return; 2915 2916 // Find start of bundle. 2917 auto I = NewMI->getIterator(); 2918 while (I->isBundledWithPred()) 2919 I--; 2920 if (I->isBundle()) 2921 I++; 2922 2923 // Bail if this is not an S_GETPC bundle. 2924 if (I->getOpcode() != AMDGPU::S_GETPC_B64) 2925 return; 2926 2927 // Update offsets of any references in the bundle. 2928 const unsigned NewBytes = 4; 2929 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 2930 "Unexpected instruction insertion in bundle"); 2931 auto NextMI = std::next(NewMI->getIterator()); 2932 auto End = NewMI->getParent()->end(); 2933 while (NextMI != End && NextMI->isBundledWithPred()) { 2934 for (auto &Operand : NextMI->operands()) { 2935 if (Operand.isGlobal()) 2936 Operand.setOffset(Operand.getOffset() + NewBytes); 2937 } 2938 NextMI++; 2939 } 2940 } 2941 2942 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { 2943 if (!ST.hasVALUMaskWriteHazard()) 2944 return false; 2945 assert(!ST.hasExtendedWaitCounts()); 2946 2947 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) 2948 return false; 2949 2950 // The hazard sequence is three instructions: 2951 // 1. VALU reads SGPR as mask 2952 // 2. SALU writes SGPR 2953 // 3. SALU reads SGPR 2954 // The hazard can expire if the distance between 2 and 3 is sufficient. 2955 // In practice this happens <10% of the time, hence this always assumes 2956 // the hazard exists if 1 and 2 are present to avoid searching. 2957 2958 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); 2959 if (!SDSTOp || !SDSTOp->isReg()) 2960 return false; 2961 2962 const Register HazardReg = SDSTOp->getReg(); 2963 if (HazardReg == AMDGPU::EXEC || 2964 HazardReg == AMDGPU::EXEC_LO || 2965 HazardReg == AMDGPU::EXEC_HI || 2966 HazardReg == AMDGPU::M0) 2967 return false; 2968 2969 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { 2970 switch (I.getOpcode()) { 2971 case AMDGPU::V_ADDC_U32_e32: 2972 case AMDGPU::V_ADDC_U32_dpp: 2973 case AMDGPU::V_CNDMASK_B16_e32: 2974 case AMDGPU::V_CNDMASK_B16_dpp: 2975 case AMDGPU::V_CNDMASK_B32_e32: 2976 case AMDGPU::V_CNDMASK_B32_dpp: 2977 case AMDGPU::V_DIV_FMAS_F32_e64: 2978 case AMDGPU::V_DIV_FMAS_F64_e64: 2979 case AMDGPU::V_SUBB_U32_e32: 2980 case AMDGPU::V_SUBB_U32_dpp: 2981 case AMDGPU::V_SUBBREV_U32_e32: 2982 case AMDGPU::V_SUBBREV_U32_dpp: 2983 // These implicitly read VCC as mask source. 2984 return HazardReg == AMDGPU::VCC || 2985 HazardReg == AMDGPU::VCC_LO || 2986 HazardReg == AMDGPU::VCC_HI; 2987 case AMDGPU::V_ADDC_U32_e64: 2988 case AMDGPU::V_ADDC_U32_e64_dpp: 2989 case AMDGPU::V_CNDMASK_B16_e64: 2990 case AMDGPU::V_CNDMASK_B16_e64_dpp: 2991 case AMDGPU::V_CNDMASK_B32_e64: 2992 case AMDGPU::V_CNDMASK_B32_e64_dpp: 2993 case AMDGPU::V_SUBB_U32_e64: 2994 case AMDGPU::V_SUBB_U32_e64_dpp: 2995 case AMDGPU::V_SUBBREV_U32_e64: 2996 case AMDGPU::V_SUBBREV_U32_e64_dpp: { 2997 // Only check mask register overlaps. 2998 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); 2999 assert(SSRCOp); 3000 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); 3001 } 3002 default: 3003 return false; 3004 } 3005 }; 3006 3007 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3008 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { 3009 // s_waitcnt_depctr sa_sdst(0) mitigates hazard. 
3010 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 3011 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 3012 return true; 3013 3014 // VALU access to any SGPR or literal constant other than HazardReg 3015 // mitigates hazard. No need to check HazardReg here as this will 3016 // only be called when !IsHazardFn. 3017 if (!SIInstrInfo::isVALU(I)) 3018 return false; 3019 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { 3020 const MachineOperand &Op = I.getOperand(OpNo); 3021 if (Op.isReg()) { 3022 Register OpReg = Op.getReg(); 3023 // Only consider uses 3024 if (!Op.isUse()) 3025 continue; 3026 // Ignore EXEC 3027 if (OpReg == AMDGPU::EXEC || 3028 OpReg == AMDGPU::EXEC_LO || 3029 OpReg == AMDGPU::EXEC_HI) 3030 continue; 3031 // Ignore all implicit uses except VCC 3032 if (Op.isImplicit()) { 3033 if (OpReg == AMDGPU::VCC || 3034 OpReg == AMDGPU::VCC_LO || 3035 OpReg == AMDGPU::VCC_HI) 3036 return true; 3037 continue; 3038 } 3039 if (TRI.isSGPRReg(MRI, OpReg)) 3040 return true; 3041 } else { 3042 const MCInstrDesc &InstDesc = I.getDesc(); 3043 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; 3044 if (!TII.isInlineConstant(Op, OpInfo)) 3045 return true; 3046 } 3047 } 3048 return false; 3049 }; 3050 3051 // Check for hazard 3052 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 3053 std::numeric_limits<int>::max()) 3054 return false; 3055 3056 auto NextMI = std::next(MI->getIterator()); 3057 3058 // Add s_waitcnt_depctr sa_sdst(0) after SALU write. 3059 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), 3060 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3061 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3062 3063 // SALU write may be s_getpc in a bundle. 3064 updateGetPCBundle(NewMI); 3065 3066 return true; 3067 } 3068 3069 // Return the numeric ID 0-63 of an 64b SGPR pair for a given SGPR. 3070 // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc 3071 static std::optional<unsigned> sgprPairNumber(Register Reg, 3072 const SIRegisterInfo &TRI) { 3073 switch (Reg) { 3074 case AMDGPU::M0: 3075 case AMDGPU::EXEC: 3076 case AMDGPU::EXEC_LO: 3077 case AMDGPU::EXEC_HI: 3078 case AMDGPU::SGPR_NULL: 3079 case AMDGPU::SGPR_NULL64: 3080 return {}; 3081 default: 3082 break; 3083 } 3084 unsigned RegN = TRI.getEncodingValue(Reg); 3085 if (RegN > 127) 3086 return {}; 3087 return (RegN >> 1) & 0x3f; 3088 } 3089 3090 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. 3091 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { 3092 assert(MMF == &MF); 3093 3094 // Assume non-empty vector means it has already been computed. 3095 if (!VALUReadHazardSGPRs.empty()) 3096 return; 3097 3098 auto CallingConv = MF.getFunction().getCallingConv(); 3099 bool IsCallFree = 3100 AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); 3101 3102 // Exhaustive search is only viable in non-caller/callee functions where 3103 // VALUs will be exposed to the hazard recognizer. 3104 UseVALUReadHazardExhaustiveSearch = 3105 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && 3106 MF.getInstructionCount() <= MaxExhaustiveHazardSearch; 3107 3108 // Consider all SGPRs hazards if the shader uses function calls or is callee. 
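// (In that case UseVALUUseCache is false and the resize below fills
// VALUReadHazardSGPRs with ones, i.e. every SGPR pair is treated as a
// potential hazard.)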
3109 bool UseVALUUseCache = 3110 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; 3111 VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); 3112 if (!UseVALUUseCache) 3113 return; 3114 3115 // Perform a post ordered reverse scan to find VALUs which read an SGPR 3116 // before a SALU write to the same SGPR. This provides a reduction in 3117 // hazard insertion when all VALU access to an SGPR occurs after its last 3118 // SALU write, when compared to a linear scan. 3119 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3120 BitVector SALUWriteSGPRs(64), ReadSGPRs(64); 3121 MachineCycleInfo CI; 3122 CI.compute(*MMF); 3123 3124 for (auto *MBB : post_order(&MF)) { 3125 bool InCycle = CI.getCycle(MBB) != nullptr; 3126 for (auto &MI : reverse(MBB->instrs())) { 3127 bool IsVALU = SIInstrInfo::isVALU(MI); 3128 bool IsSALU = SIInstrInfo::isSALU(MI); 3129 if (!IsVALU && !IsSALU) 3130 continue; 3131 3132 for (const MachineOperand &Op : MI.operands()) { 3133 if (!Op.isReg()) 3134 continue; 3135 Register Reg = Op.getReg(); 3136 assert(!Op.getSubReg()); 3137 // Only consider implicit operands of VCC. 3138 if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO || 3139 Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC)) 3140 continue; 3141 if (!TRI.isSGPRReg(MRI, Reg)) 3142 continue; 3143 auto RegN = sgprPairNumber(Reg, TRI); 3144 if (!RegN) 3145 continue; 3146 if (IsVALU && Op.isUse()) { 3147 // Note: any access within a cycle must be considered a hazard. 3148 if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN])) 3149 VALUReadHazardSGPRs.set(*RegN); 3150 ReadSGPRs.set(*RegN); 3151 } else if (IsSALU) { 3152 if (Op.isDef()) 3153 SALUWriteSGPRs.set(*RegN); 3154 else 3155 ReadSGPRs.set(*RegN); 3156 } 3157 } 3158 } 3159 } 3160 } 3161 3162 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) { 3163 if (!ST.hasVALUReadSGPRHazard()) 3164 return false; 3165 3166 // The hazard sequence is fundamentally three instructions: 3167 // 1. VALU reads SGPR 3168 // 2. SALU writes SGPR 3169 // 3. VALU/SALU reads SGPR 3170 // Try to avoid searching for (1) because the expiry point of the hazard is 3171 // indeterminate; however, the hazard between (2) and (3) can expire if the 3172 // gap contains sufficient SALU instructions with no usage of SGPR from (1). 3173 // Note: SGPRs must be considered as 64-bit pairs as hazard exists 3174 // even if individual SGPRs are accessed. 3175 3176 bool MIIsSALU = SIInstrInfo::isSALU(*MI); 3177 bool MIIsVALU = SIInstrInfo::isVALU(*MI); 3178 if (!(MIIsSALU || MIIsVALU)) 3179 return false; 3180 3181 // Avoid expensive search when compile time is priority by 3182 // mitigating every SALU which writes an SGPR. 3183 if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) { 3184 if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI)) 3185 return false; 3186 3187 const MachineOperand *SDSTOp = 3188 TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); 3189 if (!SDSTOp || !SDSTOp->isReg()) 3190 return false; 3191 3192 const Register HazardReg = SDSTOp->getReg(); 3193 if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO || 3194 HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0) 3195 return false; 3196 3197 // Add s_wait_alu sa_sdst(0) after SALU write. 3198 auto NextMI = std::next(MI->getIterator()); 3199 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), 3200 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3201 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3202 3203 // SALU write may be s_getpc in a bundle. 
3204 updateGetPCBundle(NewMI);
3205
3206 return true;
3207 }
3208
3209 // Pre-compute set of SGPR pairs read by VALUs.
3210 // Note: pass mutable pointer to MachineFunction for CycleInfo.
3211 computeVALUHazardSGPRs(MI->getMF());
3212
3213 // If no VALU-hazard SGPRs exist then there is nothing to do.
3214 if (VALUReadHazardSGPRs.none())
3215 return false;
3216
3217 // All SGPR writes before a call/return must be flushed as the callee/caller
3218 // will not see the hazard chain, i.e. (2) to (3) described above.
3219 const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3220 !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3221 MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3222
3223 // Collect all SGPR sources for MI which are read by a VALU.
3224 const MachineRegisterInfo &MRI = MF.getRegInfo();
3225 SmallSet<Register, 4> SGPRsUsed;
3226
3227 if (!IsSetPC) {
3228 for (const MachineOperand &Op : MI->all_uses()) {
3229 Register OpReg = Op.getReg();
3230
3231 // Only consider VCC implicit uses on VALUs.
3232 // The only expected SALU implicit access is SCC, which is not a hazard.
3233 if (MIIsSALU && Op.isImplicit())
3234 continue;
3235
3236 if (!TRI.isSGPRReg(MRI, OpReg))
3237 continue;
3238
3239 auto RegN = sgprPairNumber(OpReg, TRI);
3240 if (!RegN)
3241 continue;
3242
3243 if (!VALUReadHazardSGPRs[*RegN])
3244 continue;
3245
3246 SGPRsUsed.insert(OpReg);
3247 }
3248
3249 // No SGPRs -> nothing to do.
3250 if (SGPRsUsed.empty())
3251 return false;
3252 }
3253
3254 // A hazard is any SALU which writes one of the SGPRs read by MI.
3255 auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3256 if (!SIInstrInfo::isSALU(I))
3257 return false;
3258 // Ensure SGPR flush before call/return by conservatively assuming every
3259 // SALU writes an SGPR.
3260 if (IsSetPC && I.getNumDefs() > 0)
3261 return true;
3262 // Check for any register writes.
3263 return any_of(SGPRsUsed, [this, &I](Register Reg) {
3264 return I.modifiesRegister(Reg, &TRI);
3265 });
3266 };
3267
3268 const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3269 auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3270 if (Count >= SALUExpiryCount)
3271 return true;
3272 // s_wait_alu sa_sdst(0) on path mitigates hazard.
3273 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3274 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3275 return true;
3276 return false;
3277 };
3278
3279 auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3280 // Only count true SALUs as wait states.
3281 if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3282 return 0;
3283 // SALU must be unrelated to any hazard registers.
3284 if (any_of(SGPRsUsed,
3285 [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3286 return 0;
3287 return 1;
3288 };
3289
3290 // Check for the hazard.
3291 DenseSet<const MachineBasicBlock *> Visited;
3292 int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3293 std::next(MI->getReverseIterator()), 0,
3294 IsExpiredFn, Visited, WaitStatesFn);
3295
3296 if (WaitStates >= SALUExpiryCount)
3297 return false;
3298
3299 // Validate hazard through an exhaustive search.
3300 if (UseVALUReadHazardExhaustiveSearch) {
3301 // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3302 // This is searching for (1) in the hazard description.
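// hazardPair maps an SGPR to the aligned 64-bit pair register covering it
// (e.g. SGPR5 -> SGPR4_SGPR5, via sgprPairNumber above), so readsRegister also
// catches a VALU read of the other half of the pair; VCC is kept as a single
// special case.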
3303 auto hazardPair = [this](Register Reg) { 3304 if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) 3305 return Register(AMDGPU::VCC); 3306 auto RegN = sgprPairNumber(Reg, TRI); 3307 return Register(AMDGPU::SGPR0_SGPR1 + *RegN); 3308 }; 3309 auto SearchHazardFn = [this, hazardPair, 3310 &SGPRsUsed](const MachineInstr &I) { 3311 if (!SIInstrInfo::isVALU(I)) 3312 return false; 3313 // Check for any register reads. 3314 return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { 3315 return I.readsRegister(hazardPair(Reg), &TRI); 3316 }); 3317 }; 3318 auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { 3319 return false; 3320 }; 3321 if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == 3322 std::numeric_limits<int>::max()) 3323 return false; 3324 } 3325 3326 // Add s_wait_alu sa_sdst(0) before SALU read. 3327 auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 3328 TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 3329 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 3330 3331 // SALU read may be after s_getpc in a bundle. 3332 updateGetPCBundle(NewMI); 3333 3334 return true; 3335 } 3336 3337 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, 3338 const SIInstrInfo &TII) { 3339 MachineBasicBlock &EntryMBB = MF->front(); 3340 if (EntryMBB.begin() != EntryMBB.end()) { 3341 auto &EntryMI = *EntryMBB.begin(); 3342 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO && 3343 EntryMI.getOperand(0).getImm() >= Priority) 3344 return false; 3345 } 3346 3347 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO)) 3348 .addImm(Priority); 3349 return true; 3350 } 3351 3352 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { 3353 if (!ST.hasRequiredExportPriority()) 3354 return false; 3355 3356 // Assume the following shader types will never have exports, 3357 // and avoid adding or adjusting S_SETPRIO. 3358 MachineBasicBlock *MBB = MI->getParent(); 3359 MachineFunction *MF = MBB->getParent(); 3360 auto CC = MF->getFunction().getCallingConv(); 3361 switch (CC) { 3362 case CallingConv::AMDGPU_CS: 3363 case CallingConv::AMDGPU_CS_Chain: 3364 case CallingConv::AMDGPU_CS_ChainPreserve: 3365 case CallingConv::AMDGPU_KERNEL: 3366 return false; 3367 default: 3368 break; 3369 } 3370 3371 const int MaxPriority = 3; 3372 const int NormalPriority = 2; 3373 const int PostExportPriority = 0; 3374 3375 auto It = MI->getIterator(); 3376 switch (MI->getOpcode()) { 3377 case AMDGPU::S_ENDPGM: 3378 case AMDGPU::S_ENDPGM_SAVED: 3379 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE: 3380 case AMDGPU::SI_RETURN_TO_EPILOG: 3381 // Ensure shader with calls raises priority at entry. 3382 // This ensures correct priority if exports exist in callee. 3383 if (MF->getFrameInfo().hasCalls()) 3384 return ensureEntrySetPrio(MF, NormalPriority, TII); 3385 return false; 3386 case AMDGPU::S_SETPRIO: { 3387 // Raise minimum priority unless in workaround. 3388 auto &PrioOp = MI->getOperand(0); 3389 int Prio = PrioOp.getImm(); 3390 bool InWA = (Prio == PostExportPriority) && 3391 (It != MBB->begin() && TII.isEXP(*std::prev(It))); 3392 if (InWA || Prio >= NormalPriority) 3393 return false; 3394 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority)); 3395 return true; 3396 } 3397 default: 3398 if (!TII.isEXP(*MI)) 3399 return false; 3400 break; 3401 } 3402 3403 // Check entry priority at each export (as there will only be a few). 3404 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. 
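// The workaround emitted below lowers priority with s_setprio 0, waits for
// outstanding exports (s_waitcnt_expcnt null, 0) unless the shader is about to
// end, pads with two s_nop 0, and then restores normal priority with
// s_setprio 2 when more work follows.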
3405 bool Changed = false; 3406 if (CC != CallingConv::AMDGPU_Gfx) 3407 Changed = ensureEntrySetPrio(MF, NormalPriority, TII); 3408 3409 auto NextMI = std::next(It); 3410 bool EndOfShader = false; 3411 if (NextMI != MBB->end()) { 3412 // Only need WA at end of sequence of exports. 3413 if (TII.isEXP(*NextMI)) 3414 return Changed; 3415 // Assume appropriate S_SETPRIO after export means WA already applied. 3416 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO && 3417 NextMI->getOperand(0).getImm() == PostExportPriority) 3418 return Changed; 3419 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM; 3420 } 3421 3422 const DebugLoc &DL = MI->getDebugLoc(); 3423 3424 // Lower priority. 3425 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3426 .addImm(PostExportPriority); 3427 3428 if (!EndOfShader) { 3429 // Wait for exports to complete. 3430 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT)) 3431 .addReg(AMDGPU::SGPR_NULL) 3432 .addImm(0); 3433 } 3434 3435 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3436 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3437 3438 if (!EndOfShader) { 3439 // Return to normal (higher) priority. 3440 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3441 .addImm(NormalPriority); 3442 } 3443 3444 return true; 3445 } 3446