1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements hazard recognizers for scheduling on GCN processors. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "GCNHazardRecognizer.h" 14 #include "AMDGPUSubtarget.h" 15 #include "llvm/CodeGen/MachineFunction.h" 16 #include "llvm/CodeGen/ScheduleDAG.h" 17 #include "llvm/Support/TargetParser.h" 18 19 using namespace llvm; 20 21 //===----------------------------------------------------------------------===// 22 // Hazard Recoginizer Implementation 23 //===----------------------------------------------------------------------===// 24 25 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : 26 IsHazardRecognizerMode(false), 27 CurrCycleInstr(nullptr), 28 MF(MF), 29 ST(MF.getSubtarget<GCNSubtarget>()), 30 TII(*ST.getInstrInfo()), 31 TRI(TII.getRegisterInfo()), 32 ClauseUses(TRI.getNumRegUnits()), 33 ClauseDefs(TRI.getNumRegUnits()) { 34 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 
18 : 5; 35 TSchedModel.init(&ST); 36 } 37 38 void GCNHazardRecognizer::Reset() { 39 EmittedInstrs.clear(); 40 } 41 42 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { 43 EmitInstruction(SU->getInstr()); 44 } 45 46 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { 47 CurrCycleInstr = MI; 48 } 49 50 static bool isDivFMas(unsigned Opcode) { 51 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64; 52 } 53 54 static bool isSGetReg(unsigned Opcode) { 55 return Opcode == AMDGPU::S_GETREG_B32; 56 } 57 58 static bool isSSetReg(unsigned Opcode) { 59 switch (Opcode) { 60 case AMDGPU::S_SETREG_B32: 61 case AMDGPU::S_SETREG_B32_mode: 62 case AMDGPU::S_SETREG_IMM32_B32: 63 case AMDGPU::S_SETREG_IMM32_B32_mode: 64 return true; 65 } 66 return false; 67 } 68 69 static bool isRWLane(unsigned Opcode) { 70 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; 71 } 72 73 static bool isRFE(unsigned Opcode) { 74 return Opcode == AMDGPU::S_RFE_B64; 75 } 76 77 static bool isSMovRel(unsigned Opcode) { 78 switch (Opcode) { 79 case AMDGPU::S_MOVRELS_B32: 80 case AMDGPU::S_MOVRELS_B64: 81 case AMDGPU::S_MOVRELD_B32: 82 case AMDGPU::S_MOVRELD_B64: 83 return true; 84 default: 85 return false; 86 } 87 } 88 89 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, 90 const MachineInstr &MI) { 91 if (TII.isAlwaysGDS(MI.getOpcode())) 92 return true; 93 94 switch (MI.getOpcode()) { 95 case AMDGPU::S_SENDMSG: 96 case AMDGPU::S_SENDMSGHALT: 97 case AMDGPU::S_TTRACEDATA: 98 return true; 99 // These DS opcodes don't support GDS. 
100 case AMDGPU::DS_NOP: 101 case AMDGPU::DS_PERMUTE_B32: 102 case AMDGPU::DS_BPERMUTE_B32: 103 return false; 104 default: 105 if (TII.isDS(MI.getOpcode())) { 106 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 107 AMDGPU::OpName::gds); 108 if (MI.getOperand(GDS).getImm()) 109 return true; 110 } 111 return false; 112 } 113 } 114 115 static bool isPermlane(const MachineInstr &MI) { 116 unsigned Opcode = MI.getOpcode(); 117 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || 118 Opcode == AMDGPU::V_PERMLANEX16_B32_e64; 119 } 120 121 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { 122 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, 123 AMDGPU::OpName::simm16); 124 return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_; 125 } 126 127 ScheduleHazardRecognizer::HazardType 128 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { 129 MachineInstr *MI = SU->getInstr(); 130 // If we are not in "HazardRecognizerMode" and therefore not being run from 131 // the scheduler, track possible stalls from hazards but don't insert noops. 132 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; 133 134 if (MI->isBundle()) 135 return NoHazard; 136 137 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) 138 return HazardType; 139 140 // FIXME: Should flat be considered vmem? 
141 if ((SIInstrInfo::isVMEM(*MI) || 142 SIInstrInfo::isFLAT(*MI)) 143 && checkVMEMHazards(MI) > 0) 144 return HazardType; 145 146 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) 147 return HazardType; 148 149 if (checkFPAtomicToDenormModeHazard(MI) > 0) 150 return HazardType; 151 152 if (ST.hasNoDataDepHazard()) 153 return NoHazard; 154 155 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) 156 return HazardType; 157 158 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) 159 return HazardType; 160 161 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) 162 return HazardType; 163 164 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) 165 return HazardType; 166 167 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) 168 return HazardType; 169 170 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) 171 return HazardType; 172 173 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) 174 return HazardType; 175 176 if (ST.hasReadM0MovRelInterpHazard() && 177 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && 178 checkReadM0Hazards(MI) > 0) 179 return HazardType; 180 181 if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) && 182 checkReadM0Hazards(MI) > 0) 183 return HazardType; 184 185 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) 186 return HazardType; 187 188 if ((SIInstrInfo::isVMEM(*MI) || 189 SIInstrInfo::isFLAT(*MI) || 190 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0) 191 return HazardType; 192 193 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) 194 return HazardType; 195 196 return NoHazard; 197 } 198 199 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, 200 unsigned Quantity) { 201 while (Quantity > 0) { 202 unsigned Arg = std::min(Quantity, 8u); 203 Quantity -= Arg; 204 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) 205 .addImm(Arg - 1); 206 } 207 } 208 209 void GCNHazardRecognizer::processBundle() { 
210 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); 211 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); 212 // Check bundled MachineInstr's for hazards. 213 for (; MI != E && MI->isInsideBundle(); ++MI) { 214 CurrCycleInstr = &*MI; 215 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); 216 217 if (IsHazardRecognizerMode) { 218 fixHazards(CurrCycleInstr); 219 220 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates); 221 } 222 223 // It’s unnecessary to track more than MaxLookAhead instructions. Since we 224 // include the bundled MI directly after, only add a maximum of 225 // (MaxLookAhead - 1) noops to EmittedInstrs. 226 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) 227 EmittedInstrs.push_front(nullptr); 228 229 EmittedInstrs.push_front(CurrCycleInstr); 230 EmittedInstrs.resize(MaxLookAhead); 231 } 232 CurrCycleInstr = nullptr; 233 } 234 235 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { 236 IsHazardRecognizerMode = true; 237 CurrCycleInstr = MI; 238 unsigned W = PreEmitNoopsCommon(MI); 239 fixHazards(MI); 240 CurrCycleInstr = nullptr; 241 return W; 242 } 243 244 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { 245 if (MI->isBundle()) 246 return 0; 247 248 int WaitStates = 0; 249 250 if (SIInstrInfo::isSMRD(*MI)) 251 return std::max(WaitStates, checkSMRDHazards(MI)); 252 253 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) 254 WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); 255 256 if (ST.hasNSAtoVMEMBug()) 257 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); 258 259 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); 260 261 if (ST.hasNoDataDepHazard()) 262 return WaitStates; 263 264 if (SIInstrInfo::isVALU(*MI)) 265 WaitStates = std::max(WaitStates, checkVALUHazards(MI)); 266 267 if (SIInstrInfo::isDPP(*MI)) 268 WaitStates = std::max(WaitStates, checkDPPHazards(MI)); 269 270 
if (isDivFMas(MI->getOpcode())) 271 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); 272 273 if (isRWLane(MI->getOpcode())) 274 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); 275 276 if (MI->isInlineAsm()) 277 return std::max(WaitStates, checkInlineAsmHazards(MI)); 278 279 if (isSGetReg(MI->getOpcode())) 280 return std::max(WaitStates, checkGetRegHazards(MI)); 281 282 if (isSSetReg(MI->getOpcode())) 283 return std::max(WaitStates, checkSetRegHazards(MI)); 284 285 if (isRFE(MI->getOpcode())) 286 return std::max(WaitStates, checkRFEHazards(MI)); 287 288 if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) || 289 isSMovRel(MI->getOpcode()))) 290 return std::max(WaitStates, checkReadM0Hazards(MI)); 291 292 if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) 293 return std::max(WaitStates, checkReadM0Hazards(MI)); 294 295 if (SIInstrInfo::isMAI(*MI)) 296 return std::max(WaitStates, checkMAIHazards(MI)); 297 298 if (SIInstrInfo::isVMEM(*MI) || 299 SIInstrInfo::isFLAT(*MI) || 300 SIInstrInfo::isDS(*MI)) 301 return std::max(WaitStates, checkMAILdStHazards(MI)); 302 303 return WaitStates; 304 } 305 306 void GCNHazardRecognizer::EmitNoop() { 307 EmittedInstrs.push_front(nullptr); 308 } 309 310 void GCNHazardRecognizer::AdvanceCycle() { 311 // When the scheduler detects a stall, it will call AdvanceCycle() without 312 // emitting any instructions. 313 if (!CurrCycleInstr) { 314 EmittedInstrs.push_front(nullptr); 315 return; 316 } 317 318 // Do not track non-instructions which do not affect the wait states. 319 // If included, these instructions can lead to buffer overflow such that 320 // detectable hazards are missed. 
321 if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() || 322 CurrCycleInstr->isKill()) { 323 CurrCycleInstr = nullptr; 324 return; 325 } 326 327 if (CurrCycleInstr->isBundle()) { 328 processBundle(); 329 return; 330 } 331 332 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); 333 334 // Keep track of emitted instructions 335 EmittedInstrs.push_front(CurrCycleInstr); 336 337 // Add a nullptr for each additional wait state after the first. Make sure 338 // not to add more than getMaxLookAhead() items to the list, since we 339 // truncate the list to that size right after this loop. 340 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); 341 i < e; ++i) { 342 EmittedInstrs.push_front(nullptr); 343 } 344 345 // getMaxLookahead() is the largest number of wait states we will ever need 346 // to insert, so there is no point in keeping track of more than that many 347 // wait states. 348 EmittedInstrs.resize(getMaxLookAhead()); 349 350 CurrCycleInstr = nullptr; 351 } 352 353 void GCNHazardRecognizer::RecedeCycle() { 354 llvm_unreachable("hazard recognizer does not support bottom-up scheduling."); 355 } 356 357 //===----------------------------------------------------------------------===// 358 // Helper Functions 359 //===----------------------------------------------------------------------===// 360 361 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn; 362 363 // Returns a minimum wait states since \p I walking all predecessors. 364 // Only scans until \p IsExpired does not return true. 365 // Can only be run in a hazard recognizer mode. 366 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, 367 MachineBasicBlock *MBB, 368 MachineBasicBlock::reverse_instr_iterator I, 369 int WaitStates, 370 IsExpiredFn IsExpired, 371 DenseSet<const MachineBasicBlock *> &Visited) { 372 for (auto E = MBB->instr_rend(); I != E; ++I) { 373 // Don't add WaitStates for parent BUNDLE instructions. 
374 if (I->isBundle()) 375 continue; 376 377 if (IsHazard(&*I)) 378 return WaitStates; 379 380 if (I->isInlineAsm() || I->isMetaInstruction()) 381 continue; 382 383 WaitStates += SIInstrInfo::getNumWaitStates(*I); 384 385 if (IsExpired(&*I, WaitStates)) 386 return std::numeric_limits<int>::max(); 387 } 388 389 int MinWaitStates = WaitStates; 390 bool Found = false; 391 for (MachineBasicBlock *Pred : MBB->predecessors()) { 392 if (!Visited.insert(Pred).second) 393 continue; 394 395 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), 396 WaitStates, IsExpired, Visited); 397 398 if (W == std::numeric_limits<int>::max()) 399 continue; 400 401 MinWaitStates = Found ? std::min(MinWaitStates, W) : W; 402 if (IsExpired(nullptr, MinWaitStates)) 403 return MinWaitStates; 404 405 Found = true; 406 } 407 408 if (Found) 409 return MinWaitStates; 410 411 return std::numeric_limits<int>::max(); 412 } 413 414 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, 415 MachineInstr *MI, 416 IsExpiredFn IsExpired) { 417 DenseSet<const MachineBasicBlock *> Visited; 418 return getWaitStatesSince(IsHazard, MI->getParent(), 419 std::next(MI->getReverseIterator()), 420 0, IsExpired, Visited); 421 } 422 423 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { 424 if (IsHazardRecognizerMode) { 425 auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) { 426 return WaitStates >= Limit; 427 }; 428 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); 429 } 430 431 int WaitStates = 0; 432 for (MachineInstr *MI : EmittedInstrs) { 433 if (MI) { 434 if (IsHazard(MI)) 435 return WaitStates; 436 437 if (MI->isInlineAsm()) 438 continue; 439 } 440 ++WaitStates; 441 442 if (WaitStates >= Limit) 443 break; 444 } 445 return std::numeric_limits<int>::max(); 446 } 447 448 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, 449 IsHazardFn IsHazardDef, 450 int Limit) { 451 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 452 
453 auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) { 454 return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI); 455 }; 456 457 return getWaitStatesSince(IsHazardFn, Limit); 458 } 459 460 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, 461 int Limit) { 462 auto IsHazardFn = [IsHazard] (MachineInstr *MI) { 463 return isSSetReg(MI->getOpcode()) && IsHazard(MI); 464 }; 465 466 return getWaitStatesSince(IsHazardFn, Limit); 467 } 468 469 //===----------------------------------------------------------------------===// 470 // No-op Hazard Detection 471 //===----------------------------------------------------------------------===// 472 473 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, 474 MCRegister Reg) { 475 for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) 476 BV.set(*RUI); 477 } 478 479 static void addRegsToSet(const SIRegisterInfo &TRI, 480 iterator_range<MachineInstr::const_mop_iterator> Ops, 481 BitVector &Set) { 482 for (const MachineOperand &Op : Ops) { 483 if (Op.isReg()) 484 addRegUnits(TRI, Set, Op.getReg().asMCReg()); 485 } 486 } 487 488 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { 489 // XXX: Do we need to worry about implicit operands 490 addRegsToSet(TRI, MI.defs(), ClauseDefs); 491 addRegsToSet(TRI, MI.uses(), ClauseUses); 492 } 493 494 static bool breaksSMEMSoftClause(MachineInstr *MI) { 495 return !SIInstrInfo::isSMRD(*MI); 496 } 497 498 static bool breaksVMEMSoftClause(MachineInstr *MI) { 499 return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI); 500 } 501 502 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { 503 // SMEM soft clause are only present on VI+, and only matter if xnack is 504 // enabled. 505 if (!ST.isXNACKEnabled()) 506 return 0; 507 508 bool IsSMRD = TII.isSMRD(*MEM); 509 510 resetClause(); 511 512 // A soft-clause is any group of consecutive SMEM instructions. 
The 513 // instructions in this group may return out of order and/or may be 514 // replayed (i.e. the same instruction issued more than once). 515 // 516 // In order to handle these situations correctly we need to make sure that 517 // when a clause has more than one instruction, no instruction in the clause 518 // writes to a register that is read by another instruction in the clause 519 // (including itself). If we encounter this situaion, we need to break the 520 // clause by inserting a non SMEM instruction. 521 522 for (MachineInstr *MI : EmittedInstrs) { 523 // When we hit a non-SMEM instruction then we have passed the start of the 524 // clause and we can stop. 525 if (!MI) 526 break; 527 528 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) 529 break; 530 531 addClauseInst(*MI); 532 } 533 534 if (ClauseDefs.none()) 535 return 0; 536 537 // We need to make sure not to put loads and stores in the same clause if they 538 // use the same address. For now, just start a new clause whenever we see a 539 // store. 540 if (MEM->mayStore()) 541 return 1; 542 543 addClauseInst(*MEM); 544 545 // If the set of defs and uses intersect then we cannot add this instruction 546 // to the clause, so we have a hazard. 547 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0; 548 } 549 550 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { 551 int WaitStatesNeeded = 0; 552 553 WaitStatesNeeded = checkSoftClauseHazards(SMRD); 554 555 // This SMRD hazard only affects SI. 556 if (!ST.hasSMRDReadVALUDefHazard()) 557 return WaitStatesNeeded; 558 559 // A read of an SGPR by SMRD instruction requires 4 wait states when the 560 // SGPR was written by a VALU instruction. 
561 int SmrdSgprWaitStates = 4; 562 auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; 563 auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); }; 564 565 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); 566 567 for (const MachineOperand &Use : SMRD->uses()) { 568 if (!Use.isReg()) 569 continue; 570 int WaitStatesNeededForUse = 571 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 572 SmrdSgprWaitStates); 573 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 574 575 // This fixes what appears to be undocumented hardware behavior in SI where 576 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor 577 // needs some number of nops in between. We don't know how many we need, but 578 // let's use 4. This wasn't discovered before probably because the only 579 // case when this happens is when we expand a 64-bit pointer into a full 580 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was 581 // probably never encountered in the closed-source land. 582 if (IsBufferSMRD) { 583 int WaitStatesNeededForUse = 584 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 585 IsBufferHazardDefFn, 586 SmrdSgprWaitStates); 587 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 588 } 589 } 590 591 return WaitStatesNeeded; 592 } 593 594 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { 595 if (!ST.hasVMEMReadSGPRVALUDefHazard()) 596 return 0; 597 598 int WaitStatesNeeded = checkSoftClauseHazards(VMEM); 599 600 // A read of an SGPR by a VMEM instruction requires 5 wait states when the 601 // SGPR was written by a VALU Instruction. 
602 const int VmemSgprWaitStates = 5; 603 auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; 604 for (const MachineOperand &Use : VMEM->uses()) { 605 if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 606 continue; 607 608 int WaitStatesNeededForUse = 609 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 610 VmemSgprWaitStates); 611 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 612 } 613 return WaitStatesNeeded; 614 } 615 616 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { 617 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 618 const SIInstrInfo *TII = ST.getInstrInfo(); 619 620 // Check for DPP VGPR read after VALU VGPR write and EXEC write. 621 int DppVgprWaitStates = 2; 622 int DppExecWaitStates = 5; 623 int WaitStatesNeeded = 0; 624 auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; 625 626 for (const MachineOperand &Use : DPP->uses()) { 627 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) 628 continue; 629 int WaitStatesNeededForUse = 630 DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 631 [](MachineInstr *) { return true; }, 632 DppVgprWaitStates); 633 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 634 } 635 636 WaitStatesNeeded = std::max( 637 WaitStatesNeeded, 638 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, 639 DppExecWaitStates)); 640 641 return WaitStatesNeeded; 642 } 643 644 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { 645 const SIInstrInfo *TII = ST.getInstrInfo(); 646 647 // v_div_fmas requires 4 wait states after a write to vcc from a VALU 648 // instruction. 
649 const int DivFMasWaitStates = 4; 650 auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; 651 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn, 652 DivFMasWaitStates); 653 654 return DivFMasWaitStates - WaitStatesNeeded; 655 } 656 657 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { 658 const SIInstrInfo *TII = ST.getInstrInfo(); 659 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr); 660 661 const int GetRegWaitStates = 2; 662 auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) { 663 return GetRegHWReg == getHWReg(TII, *MI); 664 }; 665 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates); 666 667 return GetRegWaitStates - WaitStatesNeeded; 668 } 669 670 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { 671 const SIInstrInfo *TII = ST.getInstrInfo(); 672 unsigned HWReg = getHWReg(TII, *SetRegInstr); 673 674 const int SetRegWaitStates = ST.getSetRegWaitStates(); 675 auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) { 676 return HWReg == getHWReg(TII, *MI); 677 }; 678 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates); 679 return SetRegWaitStates - WaitStatesNeeded; 680 } 681 682 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { 683 if (!MI.mayStore()) 684 return -1; 685 686 const SIInstrInfo *TII = ST.getInstrInfo(); 687 unsigned Opcode = MI.getOpcode(); 688 const MCInstrDesc &Desc = MI.getDesc(); 689 690 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); 691 int VDataRCID = -1; 692 if (VDataIdx != -1) 693 VDataRCID = Desc.OpInfo[VDataIdx].RegClass; 694 695 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) { 696 // There is no hazard if the instruction does not use vector regs 697 // (like wbinvl1) 698 if (VDataIdx == -1) 699 return -1; 700 // For MUBUF/MTBUF instructions this hazard only exists if the 701 // instruction is not using a register in the soffset field. 
702 const MachineOperand *SOffset = 703 TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 704 // If we have no soffset operand, then assume this field has been 705 // hardcoded to zero. 706 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 && 707 (!SOffset || !SOffset->isReg())) 708 return VDataIdx; 709 } 710 711 // MIMG instructions create a hazard if they don't use a 256-bit T# and 712 // the store size is greater than 8 bytes and they have more than two bits 713 // of their dmask set. 714 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them. 715 if (TII->isMIMG(MI)) { 716 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 717 assert(SRsrcIdx != -1 && 718 AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256); 719 (void)SRsrcIdx; 720 } 721 722 if (TII->isFLAT(MI)) { 723 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); 724 if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64) 725 return DataIdx; 726 } 727 728 return -1; 729 } 730 731 int 732 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, 733 const MachineRegisterInfo &MRI) { 734 // Helper to check for the hazard where VMEM instructions that store more than 735 // 8 bytes can have there store data over written by the next instruction. 
736 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 737 738 const int VALUWaitStates = 1; 739 int WaitStatesNeeded = 0; 740 741 if (!TRI->isVGPR(MRI, Def.getReg())) 742 return WaitStatesNeeded; 743 Register Reg = Def.getReg(); 744 auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) { 745 int DataIdx = createsVALUHazard(*MI); 746 return DataIdx >= 0 && 747 TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg); 748 }; 749 int WaitStatesNeededForDef = 750 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates); 751 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 752 753 return WaitStatesNeeded; 754 } 755 756 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { 757 // This checks for the hazard where VMEM instructions that store more than 758 // 8 bytes can have there store data over written by the next instruction. 759 if (!ST.has12DWordStoreHazard()) 760 return 0; 761 762 const MachineRegisterInfo &MRI = MF.getRegInfo(); 763 int WaitStatesNeeded = 0; 764 765 for (const MachineOperand &Def : VALU->defs()) { 766 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI)); 767 } 768 769 return WaitStatesNeeded; 770 } 771 772 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { 773 // This checks for hazards associated with inline asm statements. 774 // Since inline asms can contain just about anything, we use this 775 // to call/leverage other check*Hazard routines. Note that 776 // this function doesn't attempt to address all possible inline asm 777 // hazards (good luck), but is a collection of what has been 778 // problematic thus far. 
779 780 // see checkVALUHazards() 781 if (!ST.has12DWordStoreHazard()) 782 return 0; 783 784 const MachineRegisterInfo &MRI = MF.getRegInfo(); 785 int WaitStatesNeeded = 0; 786 787 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands(); 788 I != E; ++I) { 789 const MachineOperand &Op = IA->getOperand(I); 790 if (Op.isReg() && Op.isDef()) { 791 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); 792 } 793 } 794 795 return WaitStatesNeeded; 796 } 797 798 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { 799 const SIInstrInfo *TII = ST.getInstrInfo(); 800 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 801 const MachineRegisterInfo &MRI = MF.getRegInfo(); 802 803 const MachineOperand *LaneSelectOp = 804 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1); 805 806 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg())) 807 return 0; 808 809 Register LaneSelectReg = LaneSelectOp->getReg(); 810 auto IsHazardFn = [TII] (MachineInstr *MI) { 811 return TII->isVALU(*MI); 812 }; 813 814 const int RWLaneWaitStates = 4; 815 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, 816 RWLaneWaitStates); 817 return RWLaneWaitStates - WaitStatesSince; 818 } 819 820 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { 821 if (!ST.hasRFEHazards()) 822 return 0; 823 824 const SIInstrInfo *TII = ST.getInstrInfo(); 825 826 const int RFEWaitStates = 1; 827 828 auto IsHazardFn = [TII] (MachineInstr *MI) { 829 return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS; 830 }; 831 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); 832 return RFEWaitStates - WaitStatesNeeded; 833 } 834 835 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { 836 const SIInstrInfo *TII = ST.getInstrInfo(); 837 const int SMovRelWaitStates = 1; 838 auto IsHazardFn = [TII] (MachineInstr *MI) { 839 return TII->isSALU(*MI); 840 }; 841 return SMovRelWaitStates - 
getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, 842 SMovRelWaitStates); 843 } 844 845 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { 846 fixVMEMtoScalarWriteHazards(MI); 847 fixVcmpxPermlaneHazards(MI); 848 fixSMEMtoVectorWriteHazards(MI); 849 fixVcmpxExecWARHazard(MI); 850 fixLdsBranchVmemWARHazard(MI); 851 } 852 853 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { 854 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI)) 855 return false; 856 857 const SIInstrInfo *TII = ST.getInstrInfo(); 858 auto IsHazardFn = [TII] (MachineInstr *MI) { 859 return TII->isVOPC(*MI); 860 }; 861 862 auto IsExpiredFn = [] (MachineInstr *MI, int) { 863 if (!MI) 864 return false; 865 unsigned Opc = MI->getOpcode(); 866 return SIInstrInfo::isVALU(*MI) && 867 Opc != AMDGPU::V_NOP_e32 && 868 Opc != AMDGPU::V_NOP_e64 && 869 Opc != AMDGPU::V_NOP_sdwa; 870 }; 871 872 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 873 std::numeric_limits<int>::max()) 874 return false; 875 876 // V_NOP will be discarded by SQ. 877 // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* 878 // which is always a VGPR and available. 879 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); 880 Register Reg = Src0->getReg(); 881 bool IsUndef = Src0->isUndef(); 882 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 883 TII->get(AMDGPU::V_MOV_B32_e32)) 884 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) 885 .addReg(Reg, IsUndef ? 
RegState::Undef : RegState::Kill); 886 887 return true; 888 } 889 890 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { 891 if (!ST.hasVMEMtoScalarWriteHazard()) 892 return false; 893 894 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI)) 895 return false; 896 897 if (MI->getNumDefs() == 0) 898 return false; 899 900 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 901 902 auto IsHazardFn = [TRI, MI] (MachineInstr *I) { 903 if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) && 904 !SIInstrInfo::isFLAT(*I)) 905 return false; 906 907 for (const MachineOperand &Def : MI->defs()) { 908 MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI); 909 if (!Op) 910 continue; 911 return true; 912 } 913 return false; 914 }; 915 916 auto IsExpiredFn = [](MachineInstr *MI, int) { 917 return MI && (SIInstrInfo::isVALU(*MI) || 918 (MI->getOpcode() == AMDGPU::S_WAITCNT && 919 !MI->getOperand(0).getImm()) || 920 (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 921 MI->getOperand(0).getImm() == 0xffe3)); 922 }; 923 924 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 925 std::numeric_limits<int>::max()) 926 return false; 927 928 const SIInstrInfo *TII = ST.getInstrInfo(); 929 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 930 TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 931 .addImm(0xffe3); 932 return true; 933 } 934 935 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { 936 if (!ST.hasSMEMtoVectorWriteHazard()) 937 return false; 938 939 if (!SIInstrInfo::isVALU(*MI)) 940 return false; 941 942 unsigned SDSTName; 943 switch (MI->getOpcode()) { 944 case AMDGPU::V_READLANE_B32: 945 case AMDGPU::V_READFIRSTLANE_B32: 946 SDSTName = AMDGPU::OpName::vdst; 947 break; 948 default: 949 SDSTName = AMDGPU::OpName::sdst; 950 break; 951 } 952 953 const SIInstrInfo *TII = ST.getInstrInfo(); 954 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 955 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU()); 956 const 
MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName); 957 if (!SDST) { 958 for (const auto &MO : MI->implicit_operands()) { 959 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) { 960 SDST = &MO; 961 break; 962 } 963 } 964 } 965 966 if (!SDST) 967 return false; 968 969 const Register SDSTReg = SDST->getReg(); 970 auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) { 971 return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI); 972 }; 973 974 auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) { 975 if (MI) { 976 if (TII->isSALU(*MI)) { 977 switch (MI->getOpcode()) { 978 case AMDGPU::S_SETVSKIP: 979 case AMDGPU::S_VERSION: 980 case AMDGPU::S_WAITCNT_VSCNT: 981 case AMDGPU::S_WAITCNT_VMCNT: 982 case AMDGPU::S_WAITCNT_EXPCNT: 983 // These instructions cannot not mitigate the hazard. 984 return false; 985 case AMDGPU::S_WAITCNT_LGKMCNT: 986 // Reducing lgkmcnt count to 0 always mitigates the hazard. 987 return (MI->getOperand(1).getImm() == 0) && 988 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL); 989 case AMDGPU::S_WAITCNT: { 990 const int64_t Imm = MI->getOperand(0).getImm(); 991 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); 992 return (Decoded.LgkmCnt == 0); 993 } 994 default: 995 // SOPP instructions cannot mitigate the hazard. 996 if (TII->isSOPP(*MI)) 997 return false; 998 // At this point the SALU can be assumed to mitigate the hazard 999 // because either: 1000 // (a) it is independent of the at risk SMEM (breaking chain), 1001 // or 1002 // (b) it is dependent on the SMEM, in which case an appropriate 1003 // s_waitcnt lgkmcnt _must_ exist between it and the at risk 1004 // SMEM instruction. 
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // Mitigate by inserting a harmless SALU write to the null register, which
  // (per the comment above) breaks the dependency chain.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
    .addImm(0);
  return true;
}

/// On subtargets with the v_cmpx/EXEC WAR hazard, detect a VALU \p MI that
/// writes EXEC while a prior non-VALU instruction still reads EXEC, and
/// mitigate by inserting "s_waitcnt_depctr 0xfffe" before \p MI.
/// \return true if a mitigation instruction was emitted.
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  // Hazard: a prior non-VALU instruction reads EXEC.
  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      // A VALU that writes any SGPR (explicit sdst or implicit SGPR def)
      // closes the hazard window.
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    // So does an s_waitcnt_depctr whose immediate already covers 0xfffe.
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
    .addImm(0xfffe);
  return true;
}

/// On subtargets with the LDS/branch/VMEM WAR hazard, detect a DS or
/// VMEM/FLAT access \p MI that follows a branch behind which an access of
/// the *other* kind is still outstanding, and mitigate by inserting
/// "s_waitcnt_vscnt null, 0" (body continues below).
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Classify an instruction: 1 = DS (LDS), 2 = VMEM/segment-specific FLAT,
  // 0 = neither. The hazard only exists between the two distinct kinds.
  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  // The outer search stops at another DS/VMEM access, or at an
  // "s_waitcnt_vscnt null, 0" which resolves the hazard.
  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  // For each branch found while scanning backwards, run a nested scan beyond
  // the branch looking for an access of the other kind that is not already
  // separated by a zero vscnt wait.
  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    // Nested hazard: an access whose kind differs from MI's.
    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    // Nested expiry: a same-kind access, or "s_waitcnt_vscnt null, 0".
    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // Mitigate with "s_waitcnt_vscnt null, 0" in front of MI.
  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}

/// On subtargets with the NSA-to-VMEM bug, return the number of wait states
/// needed before a MUBUF/MTBUF \p MI whose offset has bits 1-2 set, when a
/// wide (>= 16 byte) gfx10 NSA-encoded MIMG instruction immediately
/// precedes it (body continues below).
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  // Only offsets with bit 1 or bit 2 set trigger the bug.
  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if
    (!SIInstrInfo::isMIMG(*I))
      return false;
    // Hazardous producer: a gfx10 NSA-encoded MIMG of at least 16 bytes.
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  // Only the single immediately-preceding slot matters (lookahead of 1).
  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

/// Return the number of wait states needed between an FP-atomic VMEM/FLAT
/// instruction and a following s_denorm_mode \p MI (3 wait states unless an
/// intervening VALU or wait-count instruction closes the window).
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  // Hazardous producer: an FP atomic going through VMEM or FLAT.
  auto IsHazardFn = [] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
      return false;
    return SIInstrInfo::isFPAtomic(*I);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
    // NOTE(review): unlike the sibling IsExpiredFn lambdas in this file,
    // this one dereferences MI without a null check; the WaitStates >= 3
    // test short-circuits first, but verify that getWaitStatesSince never
    // passes a null instruction here with WaitStates < 3.
    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
      return true;

    // Any wait-count instruction also closes the hazard window.
    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };


  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

/// Return the number of wait states needed before MAI (MFMA /
/// v_accvgpr_read / v_accvgpr_write) instruction \p MI, accounting for VALU
/// writes of EXEC and VGPR sources, overlapping AGPR defs of earlier MFMAs,
/// and accvgpr read/write latencies (body continues below).
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    // A VALU write of EXEC needs 4 wait states before this instruction.
    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if
        (WaitStatesNeeded < MaxWaitStates) {
      // A VALU write of any VGPR source needs 2 wait states.
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  // "MFMA" here means a MAI instruction other than accvgpr read/write.
  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  // For every AGPR operand (uses, plus the def only for v_accvgpr_write),
  // account for overlapping AGPR writes by earlier MFMAs / accvgpr writes.
  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    // Matches an earlier MFMA whose dst overlaps (but is not equal to) Reg;
    // records the largest producer latency seen as a side effect.
    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      // Producer latency selects the MFMA size class: 2 => 4x4,
      // 8 => 16x16, 16 (and anything else) => 32x32.
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Also account for an earlier v_accvgpr_write overlapping Reg.
    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  // For v_accvgpr_write: an earlier MFMA reading src2 overlapping our dst.
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    // Matches an earlier MFMA whose src2 overlaps our dst; records the
    // largest producer latency seen as a side effect.
    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

/// Return the number of wait states needed before load/store \p MI when one
/// of its VGPR sources was recently produced by v_accvgpr_read, or by an
/// accvgpr read/write that itself depended on a plain VALU write (body
/// continues below).
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    // 2 wait states after a v_accvgpr_read producing this source.
    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
    // 1 wait state after an accvgpr read/write that itself saw a non-MAI
    // VALU def of Reg within the last 2 wait states (nested scan).
    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

/// Scheduler hook: prefer another candidate over \p SU when SU is an MFMA
/// that would issue within the latency window of a previous MFMA (within the
/// last 16 wait states).
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  // Side effect: IsMFMAFn stores the matched MFMA into MAI so its latency
  // can be queried after the scan.
  MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(*MI) &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
      MAI = MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}