//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
}

static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  return true;
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

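// Scan the MachineInstrs inside the current BUNDLE, computing the wait states
// each bundled instruction needs and, when running in hazard recognizer mode,
// fixing hazards and materializing the required s_nop instructions in place.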
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

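// Record the instruction issued this cycle in EmittedInstrs, adding a nullptr
// placeholder for each additional wait state it requires, so that
// getWaitStatesSince() can later count elapsed wait states by walking the
// deque.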
void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isMetaInstruction()) {
    CurrCycleInstr = nullptr;
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scanning stops once \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineBasicBlock *MBB,
                              MachineBasicBlock::const_reverse_instr_iterator I,
                              int WaitStates, IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // need some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn =
      [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. The register must be alive, so use src0 of
  // V_PERMLANE*, which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            MI.getOperand(0).getImm() == 0xffe3);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand
      *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        return (Decoded.LgkmCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) ||
        SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                               I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                               !I.getOperand(1).getImm());
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I.getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  return ST.hasGFX90AInsts() ?
      checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}

int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
        break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:
        NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
        break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
      break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
      break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsMFMAFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
  };

  auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
  };

  if (!IsMFMAFn(*MI))
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    unsigned Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
                                     this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
                                              MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = MI->getOperandNo(&Use);
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          switch (TSchedModel.computeInstrLatency(MI1)) {
          case 2:
            NeedWaitStates = isDGEMM(Opc)
                ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates = isDGEMM(Opc)
                ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16: LLVM_FALLTHROUGH;
          default:
            NeedWaitStates = isDGEMM(Opc)
                ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
          }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        switch (TSchedModel.computeInstrLatency(MI1)) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16: LLVM_FALLTHROUGH;
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards().
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return isDGEMM(MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A().
  if (IsMFMAFn(*MI))
    return 0;

  int WaitStatesNeeded = 0;

  bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
                       SIInstrInfo::isFLAT(*MI) ||
                       SIInstrInfo::isDS(*MI) ||
                       SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
  auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
                              this](const MachineInstr &MI) {
    if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NeedWaitStates = MaxWaitStates;
      switch (HazardDefLatency) {
      case 2:
        NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
        break;
      case 4:
        assert(isDGEMM(MFMA->getOpcode()));
        NeedWaitStates =
            IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                          : DMFMA4x4WriteVgprVALUReadWaitStates;
        break;
      case 8:
        NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates =
            isDGEMM(MFMA->getOpcode())
                ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
                                : DMFMA16x16WriteVgprVALUReadWaitStates
                : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
        break;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
                                               MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      switch (TSchedModel.computeInstrLatency(MFMA)) {
      case 2:
        NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
        break;
      case 4:
        assert(isDGEMM(MFMA->getOpcode()));
        NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
        break;
      case 8:
        NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates = isDGEMM(MFMA->getOpcode())
                             ? DMFMA16x16WriteVgprVALUWriteWaitStates
                             : SMFMA32x32WriteVgprVALUWawWaitStates;
        break;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
                             this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(MI) &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}