//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <set>
#include <vector>

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill()) {
    CurrCycleInstr = nullptr;
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI,
                        BitVector &BV, unsigned Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                   IsBufferHazardDefFn,
                                                   SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
                   MI->getOperand(0).getImm() == 0xffe3));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READLANE_B32_gfx10:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
          if (TII->isSOPP(*MI))
            return false;
          // At this point the SALU can be assumed to mitigate the hazard
          // because either:
          // (a) it is independent of the at risk SMEM (breaking chain),
          // or
          // (b) it is dependent on the SMEM, in which case an appropriate
          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
          //     SMEM instruction.
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
    .addImm(0xfffe);
  return true;
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if (!SIInstrInfo::isMIMG(*I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
      return false;
    return SIInstrInfo::isFPAtomic(*I);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
      return true;

    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAITCNT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };


  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32 &&
          MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(*MI) &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
      MAI = MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}