//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
}

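// For hazard purposes, treat any MFMA other than the DGEMM variants and the
// V_ACCVGPR_READ/WRITE helpers as an XDL instruction.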
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  return true;
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}

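// Return the hardware register id encoded in the simm16 operand of an
// s_getreg/s_setreg-style instruction.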
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

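// Insert S_NOPs before \p MI covering \p Quantity wait states, in chunks of at
// most 8 (the S_NOP immediate encodes "count - 1").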
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

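// Run hazard detection over every instruction inside the current BUNDLE,
// inserting fixes and noops in hazard recognizer mode and updating
// EmittedInstrs as each bundled instruction is processed.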
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isMetaInstruction()) {
    CurrCycleInstr = nullptr;
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scanning stops once \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineBasicBlock *MBB,
                              MachineBasicBlock::const_reverse_instr_iterator I,
                              int WaitStates, IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

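// Walk backwards from the current instruction (or over EmittedInstrs when not
// in hazard recognizer mode) and return the number of wait states elapsed
// since \p IsHazard last matched, or INT_MAX if it did not match within Limit.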
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                   IsBufferHazardDefFn,
                                                   SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

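// If \p MI is a store whose data operand is subject to the large-store-data
// hazard (data wider than 64 bits that can be overwritten by a following VALU
// write), return the index of that data operand; otherwise return -1.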
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

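// Mitigate hazards by rewriting rather than stalling: each fix* routine below
// inserts an instruction (e.g. s_waitcnt_depctr, a dummy v_mov/s_mov, or
// s_waitcnt vscnt) when the corresponding hazard is detected.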
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            MI.getOperand(0).getImm() == 0xffe3);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        return (Decoded.LgkmCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                               I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                               !I.getOperand(1).getImm());
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I.getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}

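// gfx908 MFMA hazards: required wait states between MFMA / v_accvgpr_write
// producers and overlapping AGPR operands of the current instruction, scaled
// by the producer's scheduling-model latency.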
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
        break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
        break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
      break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
      break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsMFMAFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
  };

  auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
  };

  if (!IsMFMAFn(*MI))
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    unsigned Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
                                     this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
                                              MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = MI->getOperandNo(&Use);
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          switch (TSchedModel.computeInstrLatency(MI1)) {
          case 2:
            NeedWaitStates = isDGEMM(Opc)
              ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates = isDGEMM(Opc)
              ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16: LLVM_FALLTHROUGH;
          default:
            NeedWaitStates = isDGEMM(Opc)
              ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
          }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        switch (TSchedModel.computeInstrLatency(MI1)) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16: LLVM_FALLTHROUGH;
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // On gfx90a+ the relevant hazards are checked in checkMAIVALUHazards().
1526 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
1527 return 0;
1528
1529 int WaitStatesNeeded = 0;
1530
1531 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
1532 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
1533 };
1534
1535 for (const MachineOperand &Op : MI->explicit_uses()) {
1536 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1537 continue;
1538
1539 Register Reg = Op.getReg();
1540
1541 const int AccVgprReadLdStWaitStates = 2;
1542 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1543 const int MaxWaitStates = 2;
1544
1545 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1546 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1547 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1548
1549 if (WaitStatesNeeded == MaxWaitStates)
1550 return WaitStatesNeeded; // Early exit.
1551
1552 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
1553 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
1554 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1555 return false;
1556 auto IsVALUFn = [](const MachineInstr &MI) {
1557 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
1558 };
1559 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1560 std::numeric_limits<int>::max();
1561 };
1562
1563 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1564 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1565 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1566 }
1567
1568 return WaitStatesNeeded;
1569 }
1570
checkMAIVALUHazards(MachineInstr * MI)1571 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
1572 if (!ST.hasGFX90AInsts())
1573 return 0;
1574
1575 auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
1576 return SIInstrInfo::isMAI(MI) &&
1577 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1578 MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1579 };
1580
1581 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
1582 return isDGEMM(MI.getOpcode());
1583 };
1584
1585 // This is checked in checkMAIHazards90A()
1586 if (IsMFMAFn(*MI))
1587 return 0;
1588
1589 int WaitStatesNeeded = 0;
1590
1591 bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
1592 SIInstrInfo::isFLAT(*MI) ||
1593 SIInstrInfo::isDS(*MI) ||
1594 SIInstrInfo::isEXP(*MI);
1595 bool IsVALU = SIInstrInfo::isVALU(*MI);
1596
1597 const MachineInstr *MFMA = nullptr;
1598 unsigned Reg;
  auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
                              this](const MachineInstr &MI) {
    if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

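  // RAW hazards: MI reads a VGPR that a recent DOT or DGEMM/XDL MFMA wrote.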
  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int MaxWaitStates = 19;

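    // For MFMA producers the required gap is selected below from the
    // producer's scheduling-model latency; DOT producers use the fixed
    // counts above.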
    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

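      // Now check for a DGEMM/XDL MFMA write of the same register.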
1653
1654 MFMA = nullptr;
1655 WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
1656 MaxWaitStates);
1657 if (!MFMA)
1658 continue;
1659
1660 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1661 int NeedWaitStates = MaxWaitStates;
1662 switch (HazardDefLatency) {
1663 case 2:
1664 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
1665 break;
1666 case 4:
1667 assert(isDGEMM(MFMA->getOpcode()));
1668 NeedWaitStates =
1669 IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
1670 : DMFMA4x4WriteVgprVALUReadWaitStates;
1671 break;
1672 case 8:
1673 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
1674 break;
1675 case 16: LLVM_FALLTHROUGH;
1676 default:
1677 NeedWaitStates =
1678 isDGEMM(MFMA->getOpcode())
1679 ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
1680 : DMFMA16x16WriteVgprVALUReadWaitStates
1681 : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
1682 break;
1683 }
1684
1685 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1686 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1687
1688 if (WaitStatesNeeded == MaxWaitStates)
1689 break;
1690 }
1691 }
1692
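  // A double-precision FMA issued shortly after any DGEMM needs a small gap,
  // regardless of register overlap.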
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

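  // WAW/WAR hazards: MI overwrites a register that a recent DOT or MFMA
  // wrote, or one that an earlier SMFMA still reads as its src2 accumulator.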
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

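    // WAW against a DGEMM/XDL MFMA write of the same register; the gap again
    // depends on the producer's latency.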
    MFMA = nullptr;
    WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
                                               MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      switch (TSchedModel.computeInstrLatency(MFMA)) {
      case 2:
        NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
        break;
      case 4:
        assert(isDGEMM(MFMA->getOpcode()));
        NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
        break;
      case 8:
        NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates = isDGEMM(MFMA->getOpcode())
                             ? DMFMA16x16WriteVgprVALUWriteWaitStates
                             : SMFMA32x32WriteVgprVALUWawWaitStates;
        break;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

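    // WAR: an earlier non-DGEMM MFMA still reads this register as its src2
    // accumulator, so the def has to be held back by a latency-dependent
    // number of wait states.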
    auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
                             this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

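// Advise the scheduler to pick another candidate when issuing this MFMA now
// would still fall inside the previous MFMA's latency window.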
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(MI) &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}