//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <set>
#include <vector>

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
111 case AMDGPU::DS_NOP: 112 case AMDGPU::DS_PERMUTE_B32: 113 case AMDGPU::DS_BPERMUTE_B32: 114 return false; 115 default: 116 if (TII.isDS(MI.getOpcode())) { 117 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 118 AMDGPU::OpName::gds); 119 if (MI.getOperand(GDS).getImm()) 120 return true; 121 } 122 return false; 123 } 124 } 125 126 static bool isPermlane(const MachineInstr &MI) { 127 unsigned Opcode = MI.getOpcode(); 128 return Opcode == AMDGPU::V_PERMLANE16_B32 || 129 Opcode == AMDGPU::V_PERMLANEX16_B32; 130 } 131 132 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { 133 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, 134 AMDGPU::OpName::simm16); 135 return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_; 136 } 137 138 ScheduleHazardRecognizer::HazardType 139 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { 140 MachineInstr *MI = SU->getInstr(); 141 if (MI->isBundle()) 142 return NoHazard; 143 144 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) 145 return NoopHazard; 146 147 // FIXME: Should flat be considered vmem? 148 if ((SIInstrInfo::isVMEM(*MI) || 149 SIInstrInfo::isFLAT(*MI)) 150 && checkVMEMHazards(MI) > 0) 151 return NoopHazard; 152 153 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) 154 return NoopHazard; 155 156 if (checkFPAtomicToDenormModeHazard(MI) > 0) 157 return NoopHazard; 158 159 if (ST.hasNoDataDepHazard()) 160 return NoHazard; 161 162 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) 163 return NoopHazard; 164 165 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) 166 return NoopHazard; 167 168 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) 169 return NoopHazard; 170 171 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) 172 return NoopHazard; 173 174 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) 175 return NoopHazard; 176 177 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) 178 return NoopHazard; 179 180 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) 181 return NoopHazard; 182 183 if (ST.hasReadM0MovRelInterpHazard() && 184 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && 185 checkReadM0Hazards(MI) > 0) 186 return NoopHazard; 187 188 if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) && 189 checkReadM0Hazards(MI) > 0) 190 return NoopHazard; 191 192 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) 193 return NoopHazard; 194 195 if ((SIInstrInfo::isVMEM(*MI) || 196 SIInstrInfo::isFLAT(*MI) || 197 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0) 198 return NoopHazard; 199 200 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) 201 return NoopHazard; 202 203 return NoHazard; 204 } 205 206 static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) { 207 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) 208 .addImm(0); 209 } 210 211 void GCNHazardRecognizer::processBundle() { 212 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); 213 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); 214 // Check bundled MachineInstr's for hazards. 
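  // Each bundled instruction is handled like a top-level one: compute the
  // wait states it still requires, let the fix* routines rewrite it when we
  // are in hazard recognizer mode, and pad with s_nop instructions emitted
  // inside the bundle itself.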
215 for (; MI != E && MI->isInsideBundle(); ++MI) { 216 CurrCycleInstr = &*MI; 217 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); 218 219 if (IsHazardRecognizerMode) 220 fixHazards(CurrCycleInstr); 221 222 for (unsigned i = 0; i < WaitStates; ++i) 223 insertNoopInBundle(CurrCycleInstr, TII); 224 225 // It’s unnecessary to track more than MaxLookAhead instructions. Since we 226 // include the bundled MI directly after, only add a maximum of 227 // (MaxLookAhead - 1) noops to EmittedInstrs. 228 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) 229 EmittedInstrs.push_front(nullptr); 230 231 EmittedInstrs.push_front(CurrCycleInstr); 232 EmittedInstrs.resize(MaxLookAhead); 233 } 234 CurrCycleInstr = nullptr; 235 } 236 237 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { 238 IsHazardRecognizerMode = true; 239 CurrCycleInstr = MI; 240 unsigned W = PreEmitNoopsCommon(MI); 241 fixHazards(MI); 242 CurrCycleInstr = nullptr; 243 return W; 244 } 245 246 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { 247 if (MI->isBundle()) 248 return 0; 249 250 int WaitStates = 0; 251 252 if (SIInstrInfo::isSMRD(*MI)) 253 return std::max(WaitStates, checkSMRDHazards(MI)); 254 255 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) 256 WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); 257 258 if (ST.hasNSAtoVMEMBug()) 259 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); 260 261 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); 262 263 if (ST.hasNoDataDepHazard()) 264 return WaitStates; 265 266 if (SIInstrInfo::isVALU(*MI)) 267 WaitStates = std::max(WaitStates, checkVALUHazards(MI)); 268 269 if (SIInstrInfo::isDPP(*MI)) 270 WaitStates = std::max(WaitStates, checkDPPHazards(MI)); 271 272 if (isDivFMas(MI->getOpcode())) 273 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); 274 275 if (isRWLane(MI->getOpcode())) 276 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); 277 278 if (MI->isInlineAsm()) 279 return std::max(WaitStates, checkInlineAsmHazards(MI)); 280 281 if (isSGetReg(MI->getOpcode())) 282 return std::max(WaitStates, checkGetRegHazards(MI)); 283 284 if (isSSetReg(MI->getOpcode())) 285 return std::max(WaitStates, checkSetRegHazards(MI)); 286 287 if (isRFE(MI->getOpcode())) 288 return std::max(WaitStates, checkRFEHazards(MI)); 289 290 if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) || 291 isSMovRel(MI->getOpcode()))) 292 return std::max(WaitStates, checkReadM0Hazards(MI)); 293 294 if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) 295 return std::max(WaitStates, checkReadM0Hazards(MI)); 296 297 if (SIInstrInfo::isMAI(*MI)) 298 return std::max(WaitStates, checkMAIHazards(MI)); 299 300 if (SIInstrInfo::isVMEM(*MI) || 301 SIInstrInfo::isFLAT(*MI) || 302 SIInstrInfo::isDS(*MI)) 303 return std::max(WaitStates, checkMAILdStHazards(MI)); 304 305 return WaitStates; 306 } 307 308 void GCNHazardRecognizer::EmitNoop() { 309 EmittedInstrs.push_front(nullptr); 310 } 311 312 void GCNHazardRecognizer::AdvanceCycle() { 313 // When the scheduler detects a stall, it will call AdvanceCycle() without 314 // emitting any instructions. 315 if (!CurrCycleInstr) 316 return; 317 318 // Do not track non-instructions which do not affect the wait states. 319 // If included, these instructions can lead to buffer overflow such that 320 // detectable hazards are missed. 
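  // (The check below filters out IMPLICIT_DEF, KILL, and debug
  // pseudo-instructions, none of which consume a wait state.)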
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill())
    return;

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI,
                        BitVector &BV, unsigned Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they use the same address. For now, just start a new clause whenever we
  // see a store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) {
    return TII.isSALU(*MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the
    // descriptor needs some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before probably because
    // the only case when this happens is when we expand a 64-bit pointer into
    // a full descriptor and use s_buffer_load_dword instead of s_load_dword,
    // which was probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
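  // A hypothetical sequence (for illustration only, not taken from any real
  // test) that would trip this hazard and the padding it requires:
  //
  //   v_readfirstlane_b32 s4, v0               ; VALU writes s4
  //   s_nop 4                                  ; 5 wait states
  //   buffer_load_dword v1, off, s[8:11], s4   ; VMEM reads s4 as soffset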
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.
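  // At the moment this only feeds the inline asm register defs into the same
  // VMEM 12-dword-store check (checkVALUHazardsHelper) used for regular VALU
  // instructions; other hazards that could originate inside the asm block are
  // not modelled here.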
  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded =
          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
                   MI->getOperand(0).getImm() == 0xffe3));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READLANE_B32_gfx10:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
995 if (TII->isSOPP(*MI)) 996 return false; 997 // At this point the SALU can be assumed to mitigate the hazard 998 // because either: 999 // (a) it is independent of the at risk SMEM (breaking chain), 1000 // or 1001 // (b) it is dependent on the SMEM, in which case an appropriate 1002 // s_waitcnt lgkmcnt _must_ exist between it and the at risk 1003 // SMEM instruction. 1004 return true; 1005 } 1006 } 1007 } 1008 return false; 1009 }; 1010 1011 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1012 std::numeric_limits<int>::max()) 1013 return false; 1014 1015 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1016 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL) 1017 .addImm(0); 1018 return true; 1019 } 1020 1021 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { 1022 if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI)) 1023 return false; 1024 1025 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1026 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) 1027 return false; 1028 1029 auto IsHazardFn = [TRI] (MachineInstr *I) { 1030 if (SIInstrInfo::isVALU(*I)) 1031 return false; 1032 return I->readsRegister(AMDGPU::EXEC, TRI); 1033 }; 1034 1035 const SIInstrInfo *TII = ST.getInstrInfo(); 1036 auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) { 1037 if (!MI) 1038 return false; 1039 if (SIInstrInfo::isVALU(*MI)) { 1040 if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst)) 1041 return true; 1042 for (auto MO : MI->implicit_operands()) 1043 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) 1044 return true; 1045 } 1046 if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1047 (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe) 1048 return true; 1049 return false; 1050 }; 1051 1052 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1053 std::numeric_limits<int>::max()) 1054 return false; 1055 1056 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1057 TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 1058 .addImm(0xfffe); 1059 return true; 1060 } 1061 1062 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { 1063 if (!ST.hasLdsBranchVmemWARHazard()) 1064 return false; 1065 1066 auto IsHazardInst = [] (const MachineInstr *MI) { 1067 if (SIInstrInfo::isDS(*MI)) 1068 return 1; 1069 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI)) 1070 return 2; 1071 return 0; 1072 }; 1073 1074 auto InstType = IsHazardInst(MI); 1075 if (!InstType) 1076 return false; 1077 1078 auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) { 1079 return I && (IsHazardInst(I) || 1080 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT && 1081 I->getOperand(0).getReg() == AMDGPU::SGPR_NULL && 1082 !I->getOperand(1).getImm())); 1083 }; 1084 1085 auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) { 1086 if (!I->isBranch()) 1087 return false; 1088 1089 auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) { 1090 auto InstType2 = IsHazardInst(I); 1091 return InstType2 && InstType != InstType2; 1092 }; 1093 1094 auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) { 1095 if (!I) 1096 return false; 1097 1098 auto InstType2 = IsHazardInst(I); 1099 if (InstType == InstType2) 1100 return true; 1101 1102 return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT && 1103 I->getOperand(0).getReg() == AMDGPU::SGPR_NULL && 1104 !I->getOperand(1).getImm(); 1105 }; 1106 1107 return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) != 1108 std::numeric_limits<int>::max(); 1109 }; 1110 1111 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 1112 
std::numeric_limits<int>::max()) 1113 return false; 1114 1115 const SIInstrInfo *TII = ST.getInstrInfo(); 1116 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1117 TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1118 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1119 .addImm(0); 1120 1121 return true; 1122 } 1123 1124 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { 1125 int NSAtoVMEMWaitStates = 1; 1126 1127 if (!ST.hasNSAtoVMEMBug()) 1128 return 0; 1129 1130 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI)) 1131 return 0; 1132 1133 const SIInstrInfo *TII = ST.getInstrInfo(); 1134 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 1135 if (!Offset || (Offset->getImm() & 6) == 0) 1136 return 0; 1137 1138 auto IsHazardFn = [TII] (MachineInstr *I) { 1139 if (!SIInstrInfo::isMIMG(*I)) 1140 return false; 1141 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode()); 1142 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && 1143 TII->getInstSizeInBytes(*I) >= 16; 1144 }; 1145 1146 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); 1147 } 1148 1149 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { 1150 int FPAtomicToDenormModeWaitStates = 3; 1151 1152 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) 1153 return 0; 1154 1155 auto IsHazardFn = [] (MachineInstr *I) { 1156 if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I)) 1157 return false; 1158 return SIInstrInfo::isFPAtomic(*I); 1159 }; 1160 1161 auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) { 1162 if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI)) 1163 return true; 1164 1165 switch (MI->getOpcode()) { 1166 case AMDGPU::S_WAITCNT: 1167 case AMDGPU::S_WAITCNT_VSCNT: 1168 case AMDGPU::S_WAITCNT_VMCNT: 1169 case AMDGPU::S_WAITCNT_EXPCNT: 1170 case AMDGPU::S_WAITCNT_LGKMCNT: 1171 case AMDGPU::S_WAITCNT_IDLE: 1172 return true; 1173 default: 1174 break; 1175 } 1176 1177 return false; 1178 }; 1179 1180 1181 return FPAtomicToDenormModeWaitStates - 1182 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); 1183 } 1184 1185 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { 1186 assert(SIInstrInfo::isMAI(*MI)); 1187 1188 int WaitStatesNeeded = 0; 1189 unsigned Opc = MI->getOpcode(); 1190 1191 auto IsVALUFn = [] (MachineInstr *MI) { 1192 return SIInstrInfo::isVALU(*MI); 1193 }; 1194 1195 if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write 1196 const int LegacyVALUWritesVGPRWaitStates = 2; 1197 const int VALUWritesExecWaitStates = 4; 1198 const int MaxWaitStates = 4; 1199 1200 int WaitStatesNeededForUse = VALUWritesExecWaitStates - 1201 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates); 1202 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 1203 1204 if (WaitStatesNeeded < MaxWaitStates) { 1205 for (const MachineOperand &Use : MI->explicit_uses()) { 1206 const int MaxWaitStates = 2; 1207 1208 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 1209 continue; 1210 1211 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - 1212 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates); 1213 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 1214 1215 if (WaitStatesNeeded == MaxWaitStates) 1216 break; 1217 } 1218 } 1219 } 1220 1221 auto IsMFMAFn = [] (MachineInstr *MI) { 1222 return SIInstrInfo::isMAI(*MI) && 1223 MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 && 1224 MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32; 1225 }; 1226 1227 for (const MachineOperand &Op : 
MI->explicit_operands()) { 1228 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) 1229 continue; 1230 1231 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32) 1232 continue; 1233 1234 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; 1235 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; 1236 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; 1237 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; 1238 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; 1239 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; 1240 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; 1241 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; 1242 const int MaxWaitStates = 18; 1243 Register Reg = Op.getReg(); 1244 unsigned HazardDefLatency = 0; 1245 1246 auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this] 1247 (MachineInstr *MI) { 1248 if (!IsMFMAFn(MI)) 1249 return false; 1250 Register DstReg = MI->getOperand(0).getReg(); 1251 if (DstReg == Reg) 1252 return false; 1253 HazardDefLatency = std::max(HazardDefLatency, 1254 TSchedModel.computeInstrLatency(MI)); 1255 return TRI.regsOverlap(DstReg, Reg); 1256 }; 1257 1258 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, 1259 MaxWaitStates); 1260 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; 1261 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 1262 int OpNo = MI->getOperandNo(&Op); 1263 if (OpNo == SrcCIdx) { 1264 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; 1265 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) { 1266 switch (HazardDefLatency) { 1267 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; 1268 break; 1269 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; 1270 break; 1271 case 16: LLVM_FALLTHROUGH; 1272 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; 1273 break; 1274 } 1275 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) { 1276 switch (HazardDefLatency) { 1277 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; 1278 break; 1279 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; 1280 break; 1281 case 16: LLVM_FALLTHROUGH; 1282 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; 1283 break; 1284 } 1285 } 1286 1287 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; 1288 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 1289 1290 if (WaitStatesNeeded == MaxWaitStates) 1291 return WaitStatesNeeded; // Early exit. 1292 1293 auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) { 1294 if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32) 1295 return false; 1296 Register DstReg = MI->getOperand(0).getReg(); 1297 return TRI.regsOverlap(Reg, DstReg); 1298 }; 1299 1300 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; 1301 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; 1302 const int AccVGPRWriteAccVgprReadWaitStates = 3; 1303 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; 1304 if (OpNo == SrcCIdx) 1305 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; 1306 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) 1307 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; 1308 1309 WaitStatesNeededForUse = NeedWaitStates - 1310 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates); 1311 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 1312 1313 if (WaitStatesNeeded == MaxWaitStates) 1314 return WaitStatesNeeded; // Early exit. 
1315 } 1316 1317 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) { 1318 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; 1319 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; 1320 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; 1321 const int MaxWaitStates = 13; 1322 Register DstReg = MI->getOperand(0).getReg(); 1323 unsigned HazardDefLatency = 0; 1324 1325 auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this] 1326 (MachineInstr *MI) { 1327 if (!IsMFMAFn(MI)) 1328 return false; 1329 Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); 1330 HazardDefLatency = std::max(HazardDefLatency, 1331 TSchedModel.computeInstrLatency(MI)); 1332 return TRI.regsOverlap(Reg, DstReg); 1333 }; 1334 1335 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates); 1336 int NeedWaitStates; 1337 switch (HazardDefLatency) { 1338 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; 1339 break; 1340 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; 1341 break; 1342 case 16: LLVM_FALLTHROUGH; 1343 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; 1344 break; 1345 } 1346 1347 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; 1348 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 1349 } 1350 1351 return WaitStatesNeeded; 1352 } 1353 1354 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { 1355 if (!ST.hasMAIInsts()) 1356 return 0; 1357 1358 int WaitStatesNeeded = 0; 1359 1360 auto IsAccVgprReadFn = [] (MachineInstr *MI) { 1361 return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32; 1362 }; 1363 1364 for (const MachineOperand &Op : MI->explicit_uses()) { 1365 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) 1366 continue; 1367 1368 Register Reg = Op.getReg(); 1369 1370 const int AccVgprReadLdStWaitStates = 2; 1371 const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1; 1372 const int MaxWaitStates = 2; 1373 1374 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - 1375 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates); 1376 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 1377 1378 if (WaitStatesNeeded == MaxWaitStates) 1379 return WaitStatesNeeded; // Early exit. 
1380 1381 auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) { 1382 if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32) 1383 return false; 1384 auto IsVALUFn = [] (MachineInstr *MI) { 1385 return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI); 1386 }; 1387 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) < 1388 std::numeric_limits<int>::max(); 1389 }; 1390 1391 WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates - 1392 getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates); 1393 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 1394 } 1395 1396 return WaitStatesNeeded; 1397 } 1398 1399 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { 1400 if (!SU->isInstr()) 1401 return false; 1402 1403 MachineInstr *MAI = nullptr; 1404 auto IsMFMAFn = [&MAI] (MachineInstr *MI) { 1405 MAI = nullptr; 1406 if (SIInstrInfo::isMAI(*MI) && 1407 MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 && 1408 MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32) 1409 MAI = MI; 1410 return MAI != nullptr; 1411 }; 1412 1413 MachineInstr *MI = SU->getInstr(); 1414 if (IsMFMAFn(MI)) { 1415 int W = getWaitStatesSince(IsMFMAFn, 16); 1416 if (MAI) 1417 return W < (int)TSchedModel.computeInstrLatency(MAI); 1418 } 1419 1420 return false; 1421 } 1422