//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <set>
#include <vector>

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode", i.e. we are being run from the
  // scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
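  // In hazard recognizer mode, hazards between bundled instructions are also
  // fixed up here and the required s_nop padding is inserted directly into
  // the bundle.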
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
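  // Examples are IMPLICIT_DEF, KILL and debug instructions, which are not
  // real hardware instructions and take no wait states.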
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill()) {
    CurrCycleInstr = nullptr;
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ?
        std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // need some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx ==
        -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ?
                      RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
                   MI->getOperand(0).getImm() == 0xffe3));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing the lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
          if (TII->isSOPP(*MI))
            return false;
          // At this point the SALU can be assumed to mitigate the hazard
          // because either:
          // (a) it is independent of the at-risk SMEM (breaking chain),
          // or
          // (b) it is dependent on the SMEM, in which case an appropriate
          //     s_waitcnt lgkmcnt _must_ exist between it and the at-risk
          //     SMEM instruction.
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if (!SIInstrInfo::isMIMG(*I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
      return false;
    return SIInstrInfo::isFPAtomic(*I);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
      return true;

    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAITCNT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand
           &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
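
    // A VALU write of this VGPR followed by a v_accvgpr read or write also
    // requires a wait state before the VGPR is used by this load or store.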
    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32 &&
          MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(*MI) &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
      MAI = MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}