//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <set>
#include <vector>

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return NoopHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return NoopHazard;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return NoopHazard;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return NoopHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return NoopHazard;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return NoopHazard;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return NoopHazard;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return NoopHazard;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return NoopHazard;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return NoopHazard;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return NoopHazard;

  return NoHazard;
}

static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
      .addImm(0);
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode)
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0; i < WaitStates; ++i)
      insertNoopInBundle(CurrCycleInstr, TII);

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr)
    return;

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill())
    return;

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
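    // Passing a null instruction here exercises only the count-based part of
    // the expiration callback, so the predecessor walk can stop as soon as
    // the best count found so far already satisfies the caller's limit.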
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI,
                        BitVector &BV, unsigned Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
                   MI->getOperand(0).getImm() == 0xffe3));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READLANE_B32_gfx10:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
          if (TII->isSOPP(*MI))
            return false;
          // At this point the SALU can be assumed to mitigate the hazard
          // because either:
          // (a) it is independent of the at risk SMEM (breaking chain),
          // or
          // (b) it is dependent on the SMEM, in which case an appropriate
          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
          //     SMEM instruction.
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
    .addImm(0xfffe);
  return true;
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if (!SIInstrInfo::isMIMG(*I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
      return false;
    return SIInstrInfo::isFPAtomic(*I);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
      return true;

    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAITCNT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
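    // For each AGPR operand, determine how recently it was written by an
    // overlapping MFMA or v_accvgpr_write and require the number of wait
    // states implied by that producer's latency.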
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
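
    // A VALU write of this VGPR combined with a dependent v_accvgpr_read
    // also requires an extra wait state before the memory access (see
    // VALUWriteAccVgprReadLdStDepVALUWaitStates above).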
    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

// Ask the scheduler to prefer a different candidate if issuing this MFMA now
// would place it within the latency window of a recently issued MFMA.
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(*MI) &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
      MAI = MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}