1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks:
16 /// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
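///
/// A sketch of the expansion (see executeInWaterfallLoop below):
///
///   Save Execution Mask
///   For (Lane : Wavefront) {
///     Enable Lane, Disable all other lanes
///     SGPR = read SGPR value for current lane from VGPR
///     VGPRResult[Lane] = use_op SGPR
///   }
///   Restore Execution Mask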
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value; otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
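///
/// For example (an illustrative regbank-legal form, not a verbatim dump):
///
///   %scc_bool:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32)
///   %vcc_bool:vcc(s1)   = G_ICMP intpred(eq), %x:vgpr(s32), %y:vgpr(s32)
///
/// The scalar boolean is widened to a 32-bit SGPR value, while the vector
/// boolean remains an s1 value in the VCC bank.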
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank, which involves clearing the
46 /// high bits and inserting a compare.
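///
/// For example (a rough sketch; the actual lowering builds the constants as
/// separate G_CONSTANTs):
///
///   %masked:vgpr(s32) = G_AND %nonbool_src:vgpr(s32), 1  ; clear the high bits
///   %bool:vcc(s1)     = G_ICMP intpred(ne), %masked, 0   ; insert a compare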
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this increases to 2 on gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
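///
/// For example (illustrative, pre-gfx10 rules):
///
///   v_add_f32 v0, s0, s0   ; ok: only one unique SGPR is read
///   v_add_f32 v0, s0, s1   ; violates the restriction: two unique SGPRs, so
///                          ; one operand must first be copied to a VGPR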
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
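///
/// For example (illustrative), a VALU G_ADD with one SGPR input is made legal
/// by inserting a cross-bank copy rather than keeping the SGPR operand:
///
///   %2:vgpr(s32) = COPY %0:sgpr(s32)
///   %3:vgpr(s32) = G_ADD %2, %1:vgpr(s32)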
68 ///
69 //===----------------------------------------------------------------------===//
70
71 #include "AMDGPURegisterBankInfo.h"
72
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91
92 using namespace llvm;
93 using namespace MIPatternMatch;
94
95 namespace {
96
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100 const AMDGPURegisterBankInfo &RBI;
101 MachineRegisterInfo &MRI;
102 const RegisterBank *NewBank;
103 SmallVector<MachineInstr *, 4> NewInsts;
104
105 public:
106 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
107 MachineRegisterInfo &MRI_, const RegisterBank *RB)
108 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109
110 ~ApplyRegBankMapping() {
111 for (MachineInstr *MI : NewInsts)
112 applyBank(*MI);
113 }
114
115 /// Set any registers that don't have a set register class or bank to SALU.
116 void applyBank(MachineInstr &MI) {
117 const unsigned Opc = MI.getOpcode();
118 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
119 Opc == AMDGPU::G_SEXT) {
120 // LegalizerHelper wants to use the basic legalization artifacts when
121 // widening etc. We don't handle selection with vcc in artifact sources,
122 // so we need to use a select instead to handle these properly.
123 Register DstReg = MI.getOperand(0).getReg();
124 Register SrcReg = MI.getOperand(1).getReg();
125 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
126 if (SrcBank == &AMDGPU::VCCRegBank) {
127 const LLT S32 = LLT::scalar(32);
128 assert(MRI.getType(SrcReg) == LLT::scalar(1));
129 assert(MRI.getType(DstReg) == S32);
130 assert(NewBank == &AMDGPU::VGPRRegBank);
131
132 // Replace the extension with a select, which really uses the boolean
133 // source.
134 MachineIRBuilder B(MI);
135 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
136 auto False = B.buildConstant(S32, 0);
137 B.buildSelect(DstReg, SrcReg, True, False);
138 MRI.setRegBank(True.getReg(0), *NewBank);
139 MRI.setRegBank(False.getReg(0), *NewBank);
140 MI.eraseFromParent();
141 }
142
143 assert(!MRI.getRegClassOrRegBank(DstReg));
144 MRI.setRegBank(DstReg, *NewBank);
145 return;
146 }
147
148 #ifndef NDEBUG
149 if (Opc == AMDGPU::G_TRUNC) {
150 Register DstReg = MI.getOperand(0).getReg();
151 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
152 assert(DstBank != &AMDGPU::VCCRegBank);
153 }
154 #endif
155
156 for (MachineOperand &Op : MI.operands()) {
157 if (!Op.isReg())
158 continue;
159
160 // We may see physical registers if building a real MI
161 Register Reg = Op.getReg();
162 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
163 continue;
164
165 const RegisterBank *RB = NewBank;
166 if (MRI.getType(Reg) == LLT::scalar(1)) {
167 assert(NewBank == &AMDGPU::VGPRRegBank &&
168 "s1 operands should only be used for vector bools");
169 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
170 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
171 "not expecting legalization artifacts here");
172 RB = &AMDGPU::VCCRegBank;
173 }
174
175 MRI.setRegBank(Reg, *RB);
176 }
177 }
178
179 void erasingInstr(MachineInstr &MI) override {}
180
181 void createdInstr(MachineInstr &MI) override {
182 // At this point, the instruction was just inserted and has no operands.
183 NewInsts.push_back(&MI);
184 }
185
186 void changingInstr(MachineInstr &MI) override {}
187 void changedInstr(MachineInstr &MI) override {
188 // FIXME: In principle we should probably add the instruction to NewInsts,
189 // but the way the LegalizerHelper uses the observer, we will always see the
190 // registers we need to set the regbank on also referenced in a new
191 // instruction.
192 }
193 };
194
195 } // end anonymous namespace
196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
197 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
198 TII(Subtarget.getInstrInfo()) {
199
200 // HACK: Until this is fully tablegen'd.
201 static llvm::once_flag InitializeRegisterBankFlag;
202
203 static auto InitializeRegisterBankOnce = [this]() {
204 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207 (void)this;
208 };
209
210 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211 }
212
213 static bool isVectorRegisterBank(const RegisterBank &Bank) {
214 unsigned BankID = Bank.getID();
215 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216 }
217
218 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
219 const RegisterBank &Src,
220 unsigned Size) const {
221 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
222 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
223 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
224 return std::numeric_limits<unsigned>::max();
225 }
226
227 // Bool values are tricky, because the meaning is based on context. The SCC
228 // and VCC banks are for the natural scalar and vector conditions produced by
229 // a compare.
230 //
231 // Legalization doesn't know about the necessary context, so an s1 use may
232 // have been a truncate from an arbitrary value, in which case a copy (lowered
233 // as a compare with 0) needs to be inserted.
234 if (Size == 1 &&
235 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
236 (isVectorRegisterBank(Src) ||
237 Src.getID() == AMDGPU::SGPRRegBankID ||
238 Src.getID() == AMDGPU::VCCRegBankID))
239 return std::numeric_limits<unsigned>::max();
240
241 // There is no direct copy between AGPRs.
242 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
243 Src.getID() == AMDGPU::AGPRRegBankID)
244 return 4;
245
246 return RegisterBankInfo::copyCost(Dst, Src, Size);
247 }
248
249 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
250 const ValueMapping &ValMapping,
251 const RegisterBank *CurBank) const {
252 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
253 // VGPR.
254 // FIXME: Is there a better way to do this?
255 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
256 return 10; // This is expensive.
257
258 assert(ValMapping.NumBreakDowns == 2 &&
259 ValMapping.BreakDown[0].Length == 32 &&
260 ValMapping.BreakDown[0].StartIdx == 0 &&
261 ValMapping.BreakDown[1].Length == 32 &&
262 ValMapping.BreakDown[1].StartIdx == 32 &&
263 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
264
265 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
266 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
267 // want.
268
269 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
270 // alignment restrictions, but this probably isn't important.
271 return 1;
272 }
273
274 const RegisterBank &
275 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
276 LLT Ty) const {
277 if (&RC == &AMDGPU::SReg_1RegClass)
278 return AMDGPU::VCCRegBank;
279
280 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
281 // VCC-like use.
282 if (TRI->isSGPRClass(&RC)) {
283 // FIXME: This probably came from a copy from a physical register, which
284 // should be inferable from the copied to-type. We don't have many boolean
285 // physical register constraints so just assume a normal SGPR for now.
286 if (!Ty.isValid())
287 return AMDGPU::SGPRRegBank;
288
289 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
290 }
291
292 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
293 }
294
295 template <unsigned NumOps>
296 RegisterBankInfo::InstructionMappings
297 AMDGPURegisterBankInfo::addMappingFromTable(
298 const MachineInstr &MI, const MachineRegisterInfo &MRI,
299 const std::array<unsigned, NumOps> RegSrcOpIdx,
300 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
301
302 InstructionMappings AltMappings;
303
304 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
305
306 unsigned Sizes[NumOps];
307 for (unsigned I = 0; I < NumOps; ++I) {
308 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
309 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
310 }
311
312 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
313 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
314 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
315 }
316
317 // getInstrMapping's default mapping uses ID 1, so start at 2.
318 unsigned MappingID = 2;
319 for (const auto &Entry : Table) {
320 for (unsigned I = 0; I < NumOps; ++I) {
321 int OpIdx = RegSrcOpIdx[I];
322 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
323 }
324
325 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
326 getOperandsMapping(Operands),
327 Operands.size()));
328 }
329
330 return AltMappings;
331 }
332
333 RegisterBankInfo::InstructionMappings
334 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
335 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
336 switch (MI.getIntrinsicID()) {
337 case Intrinsic::amdgcn_readlane: {
338 static const OpRegBankEntry<3> Table[2] = {
339 // Perfectly legal.
340 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
341
342 // Need a readfirstlane for the index.
343 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
344 };
345
346 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
347 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
348 }
349 case Intrinsic::amdgcn_writelane: {
350 static const OpRegBankEntry<4> Table[4] = {
351 // Perfectly legal.
352 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
353
354 // Need readfirstlane of first op
355 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
356
357 // Need readfirstlane of second op
358 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
359
360 // Need readfirstlane of both ops
361 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
362 };
363
364 // dst, value, lane select, vdst.in
365 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
366 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
367 }
368 default:
369 return RegisterBankInfo::getInstrAlternativeMappings(MI);
370 }
371 }
372
373 RegisterBankInfo::InstructionMappings
374 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
375 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
376
377 switch (MI.getIntrinsicID()) {
378 case Intrinsic::amdgcn_s_buffer_load: {
379 static const OpRegBankEntry<2> Table[4] = {
380 // Perfectly legal.
381 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
382
383 // Only need 1 register in loop
384 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
385
386 // Have to waterfall the resource.
387 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
388
389 // Have to waterfall the resource, and the offset.
390 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
391 };
392
393 // rsrc, offset
394 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
395 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
396 }
397 case Intrinsic::amdgcn_ds_ordered_add:
398 case Intrinsic::amdgcn_ds_ordered_swap: {
399 // VGPR = M0, VGPR
400 static const OpRegBankEntry<3> Table[2] = {
401 // Perfectly legal.
402 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
403
404 // Need a readfirstlane for m0
405 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
406 };
407
408 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
409 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
410 }
411 case Intrinsic::amdgcn_s_sendmsg:
412 case Intrinsic::amdgcn_s_sendmsghalt: {
413 // FIXME: Should have no register for immediate
414 static const OpRegBankEntry<1> Table[2] = {
415 // Perfectly legal.
416 { { AMDGPU::SGPRRegBankID }, 1 },
417
418 // Need readlane
419 { { AMDGPU::VGPRRegBankID }, 3 }
420 };
421
422 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
423 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
424 }
425 default:
426 return RegisterBankInfo::getInstrAlternativeMappings(MI);
427 }
428 }
429
430 // FIXME: Returns uniform if there's no source value information. This is
431 // probably wrong.
432 static bool isScalarLoadLegal(const MachineInstr &MI) {
433 if (!MI.hasOneMemOperand())
434 return false;
435
436 const MachineMemOperand *MMO = *MI.memoperands_begin();
437 const unsigned AS = MMO->getAddrSpace();
438 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
439 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
440 // Require 4-byte alignment.
441 return MMO->getAlign() >= Align(4) &&
442 // Can't do a scalar atomic load.
443 !MMO->isAtomic() &&
444 // Don't use scalar loads for volatile accesses to non-constant address
445 // spaces.
446 (IsConst || !MMO->isVolatile()) &&
447 // Memory must be known constant, or not written before this load.
448 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
449 AMDGPUInstrInfo::isUniformMMO(MMO);
450 }
451
452 RegisterBankInfo::InstructionMappings
453 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
454 const MachineInstr &MI) const {
455
456 const MachineFunction &MF = *MI.getParent()->getParent();
457 const MachineRegisterInfo &MRI = MF.getRegInfo();
458
459
460 InstructionMappings AltMappings;
461 switch (MI.getOpcode()) {
462 case TargetOpcode::G_CONSTANT:
463 case TargetOpcode::G_IMPLICIT_DEF: {
464 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
465 if (Size == 1) {
466 static const OpRegBankEntry<1> Table[3] = {
467 { { AMDGPU::VGPRRegBankID }, 1 },
468 { { AMDGPU::SGPRRegBankID }, 1 },
469 { { AMDGPU::VCCRegBankID }, 1 }
470 };
471
472 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
473 }
474
475 [[fallthrough]];
476 }
477 case TargetOpcode::G_FCONSTANT:
478 case TargetOpcode::G_FRAME_INDEX:
479 case TargetOpcode::G_GLOBAL_VALUE: {
480 static const OpRegBankEntry<1> Table[2] = {
481 { { AMDGPU::VGPRRegBankID }, 1 },
482 { { AMDGPU::SGPRRegBankID }, 1 }
483 };
484
485 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
486 }
487 case TargetOpcode::G_AND:
488 case TargetOpcode::G_OR:
489 case TargetOpcode::G_XOR: {
490 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
491
492 if (Size == 1) {
493 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
494 const InstructionMapping &SCCMapping = getInstructionMapping(
495 1, 1, getOperandsMapping(
496 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
497 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
498 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
499 3); // Num Operands
500 AltMappings.push_back(&SCCMapping);
501
502 const InstructionMapping &VCCMapping0 = getInstructionMapping(
503 2, 1, getOperandsMapping(
504 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
505 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
506 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
507 3); // Num Operands
508 AltMappings.push_back(&VCCMapping0);
509 return AltMappings;
510 }
511
512 if (Size != 64)
513 break;
514
515 const InstructionMapping &SSMapping = getInstructionMapping(
516 1, 1, getOperandsMapping(
517 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
518 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
519 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
520 3); // Num Operands
521 AltMappings.push_back(&SSMapping);
522
523 const InstructionMapping &VVMapping = getInstructionMapping(
524 2, 2, getOperandsMapping(
525 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
526 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
527 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
528 3); // Num Operands
529 AltMappings.push_back(&VVMapping);
530 break;
531 }
532 case TargetOpcode::G_LOAD:
533 case TargetOpcode::G_ZEXTLOAD:
534 case TargetOpcode::G_SEXTLOAD: {
535 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
536 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
537 unsigned PtrSize = PtrTy.getSizeInBits();
538 unsigned AS = PtrTy.getAddressSpace();
539
540 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
541 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
542 isScalarLoadLegal(MI)) {
543 const InstructionMapping &SSMapping = getInstructionMapping(
544 1, 1, getOperandsMapping(
545 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
546 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
547 2); // Num Operands
548 AltMappings.push_back(&SSMapping);
549 }
550
551 const InstructionMapping &VVMapping = getInstructionMapping(
552 2, 1,
553 getOperandsMapping(
554 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
555 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
556 2); // Num Operands
557 AltMappings.push_back(&VVMapping);
558
559 // It may be possible to have a vgpr = load sgpr mapping here, because
560 // the mubuf instructions support this kind of load, but probably for only
561 // gfx7 and older. However, the addressing mode matching in the instruction
562 // selector should be able to do a better job of detecting and selecting
563 // these kinds of loads from the vgpr = load vgpr mapping.
564
565 return AltMappings;
566
567 }
568 case TargetOpcode::G_SELECT: {
569 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
570 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
571 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
572 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
573 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
574 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
575 4); // Num Operands
576 AltMappings.push_back(&SSMapping);
577
578 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
579 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
580 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
581 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
582 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
583 4); // Num Operands
584 AltMappings.push_back(&VVMapping);
585
586 return AltMappings;
587 }
588 case TargetOpcode::G_UADDE:
589 case TargetOpcode::G_USUBE:
590 case TargetOpcode::G_SADDE:
591 case TargetOpcode::G_SSUBE: {
592 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
593 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
594 getOperandsMapping(
595 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
596 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
597 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
598 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
599 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
600 5); // Num Operands
601 AltMappings.push_back(&SSMapping);
602
603 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
604 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
605 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
606 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
607 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
608 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
609 5); // Num Operands
610 AltMappings.push_back(&VVMapping);
611 return AltMappings;
612 }
613 case AMDGPU::G_BRCOND: {
614 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
615
616 // TODO: Change type to 32 for scalar
617 const InstructionMapping &SMapping = getInstructionMapping(
618 1, 1, getOperandsMapping(
619 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
620 2); // Num Operands
621 AltMappings.push_back(&SMapping);
622
623 const InstructionMapping &VMapping = getInstructionMapping(
624 1, 1, getOperandsMapping(
625 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
626 2); // Num Operands
627 AltMappings.push_back(&VMapping);
628 return AltMappings;
629 }
630 case AMDGPU::G_INTRINSIC:
631 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
632 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
633 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
634 default:
635 break;
636 }
637 return RegisterBankInfo::getInstrAlternativeMappings(MI);
638 }
639
640 void AMDGPURegisterBankInfo::split64BitValueForMapping(
641 MachineIRBuilder &B,
642 SmallVector<Register, 2> &Regs,
643 LLT HalfTy,
644 Register Reg) const {
645 assert(HalfTy.getSizeInBits() == 32);
646 MachineRegisterInfo *MRI = B.getMRI();
647 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
648 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
649 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
650 MRI->setRegBank(LoLHS, *Bank);
651 MRI->setRegBank(HiLHS, *Bank);
652
653 Regs.push_back(LoLHS);
654 Regs.push_back(HiLHS);
655
656 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
657 .addDef(LoLHS)
658 .addDef(HiLHS)
659 .addUse(Reg);
660 }
661
662 /// Replace the current type each register in \p Regs has with \p NewTy
663 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
664 LLT NewTy) {
665 for (Register Reg : Regs) {
666 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
667 MRI.setType(Reg, NewTy);
668 }
669 }
670
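// Return a type half as wide as Ty: halve the element count for vectors and
// halve the scalar width otherwise (e.g. <4 x s16> -> <2 x s16>, s64 -> s32).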
671 static LLT getHalfSizedType(LLT Ty) {
672 if (Ty.isVector()) {
673 assert(Ty.getElementCount().isKnownMultipleOf(2));
674 return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
675 Ty.getElementType());
676 }
677
678 assert(Ty.getScalarSizeInBits() % 2 == 0);
679 return LLT::scalar(Ty.getScalarSizeInBits() / 2);
680 }
681
682 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
683 // source value into a scalar register.
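// For example (illustrative), a 64-bit VGPR source expands to roughly:
//   %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %src:vgpr(s64)
//   %slo:sreg_32(s32) = V_READFIRSTLANE_B32 %lo
//   %shi:sreg_32(s32) = V_READFIRSTLANE_B32 %hi
//   %dst:sgpr(s64) = G_MERGE_VALUES %slo, %shi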
684 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
685 MachineRegisterInfo &MRI,
686 Register Src) const {
687 LLT Ty = MRI.getType(Src);
688 const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
689
690 if (Bank == &AMDGPU::SGPRRegBank)
691 return Src;
692
693 unsigned Bits = Ty.getSizeInBits();
694 assert(Bits % 32 == 0);
695
696 if (Bank != &AMDGPU::VGPRRegBank) {
697 // We need to copy from AGPR to VGPR
698 Src = B.buildCopy(Ty, Src).getReg(0);
699 MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
700 }
701
702 LLT S32 = LLT::scalar(32);
703 unsigned NumParts = Bits / 32;
704 SmallVector<Register, 8> SrcParts;
705 SmallVector<Register, 8> DstParts;
706
707 if (Bits == 32) {
708 SrcParts.push_back(Src);
709 } else {
710 auto Unmerge = B.buildUnmerge(S32, Src);
711 for (unsigned i = 0; i < NumParts; ++i)
712 SrcParts.push_back(Unmerge.getReg(i));
713 }
714
715 for (unsigned i = 0; i < NumParts; ++i) {
716 Register SrcPart = SrcParts[i];
717 Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
718 MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
719
720 const TargetRegisterClass *Constrained =
721 constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
722 (void)Constrained;
723 assert(Constrained && "Failed to constrain readfirstlane src reg");
724
725 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
726
727 DstParts.push_back(DstPart);
728 }
729
730 if (Bits == 32)
731 return DstParts[0];
732
733 Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
734 MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
735 return Dst;
736 }
737
738 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
739 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
740 /// execute the instruction for each unique combination of values in all lanes
741 /// in the wave. The block will be split such that the rest of the instructions
742 /// moved to a new block.
743 ///
744 /// Essentially performs this loop:
745 ///
746 /// Save Execution Mask
747 /// For (Lane : Wavefront) {
748 /// Enable Lane, Disable all other lanes
749 /// SGPR = read SGPR value for current lane from VGPR
750 /// VGPRResult[Lane] = use_op SGPR
751 /// }
752 /// Restore Execution Mask
753 ///
754 /// There is additional complexity in comparing the operand values to identify
755 /// the unique values actually used across the lanes.
756 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
757 MachineIRBuilder &B,
758 iterator_range<MachineBasicBlock::iterator> Range,
759 SmallSet<Register, 4> &SGPROperandRegs,
760 MachineRegisterInfo &MRI) const {
761
762 // Track use registers which have already been expanded with a readfirstlane
763 // sequence. This may have multiple uses if moving a sequence.
764 DenseMap<Register, Register> WaterfalledRegMap;
765
766 MachineBasicBlock &MBB = B.getMBB();
767 MachineFunction *MF = &B.getMF();
768
769 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
770 const unsigned MovExecOpc =
771 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
772 const unsigned MovExecTermOpc =
773 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
774
775 const unsigned XorTermOpc = Subtarget.isWave32() ?
776 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
777 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
778 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
779 const unsigned ExecReg = Subtarget.isWave32() ?
780 AMDGPU::EXEC_LO : AMDGPU::EXEC;
781
782 #ifndef NDEBUG
783 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
784 #endif
785
786 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
787 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
788
789 // Don't bother using generic instructions/registers for the exec mask.
790 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
791 .addDef(InitSaveExecReg);
792
793 Register PhiExec = MRI.createVirtualRegister(WaveRC);
794 Register NewExec = MRI.createVirtualRegister(WaveRC);
795
796 // To insert the loop we need to split the block. Move everything before this
797 // point to a new block, and insert a new empty block before this instruction.
798 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
799 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
800 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
801 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
802 MachineFunction::iterator MBBI(MBB);
803 ++MBBI;
804 MF->insert(MBBI, LoopBB);
805 MF->insert(MBBI, BodyBB);
806 MF->insert(MBBI, RestoreExecBB);
807 MF->insert(MBBI, RemainderBB);
808
809 LoopBB->addSuccessor(BodyBB);
810 BodyBB->addSuccessor(RestoreExecBB);
811 BodyBB->addSuccessor(LoopBB);
812
813 // Move the rest of the block into a new block.
814 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
815 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
816
817 MBB.addSuccessor(LoopBB);
818 RestoreExecBB->addSuccessor(RemainderBB);
819
820 B.setInsertPt(*LoopBB, LoopBB->end());
821
822 B.buildInstr(TargetOpcode::PHI)
823 .addDef(PhiExec)
824 .addReg(InitSaveExecReg)
825 .addMBB(&MBB)
826 .addReg(NewExec)
827 .addMBB(BodyBB);
828
829 const DebugLoc &DL = B.getDL();
830
831 MachineInstr &FirstInst = *Range.begin();
832
833 // Move the instruction into the loop body. Note we moved everything after
834 // Range.end() already into a new block, so Range.end() is no longer valid.
835 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
836
837 // Figure out the iterator range after splicing the instructions.
838 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
839 auto NewEnd = BodyBB->end();
840
841 B.setMBB(*LoopBB);
842
843 LLT S1 = LLT::scalar(1);
844 Register CondReg;
845
846 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
847
848 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
849 for (MachineOperand &Op : MI.uses()) {
850 if (!Op.isReg() || Op.isDef())
851 continue;
852
853 Register OldReg = Op.getReg();
854 if (!SGPROperandRegs.count(OldReg))
855 continue;
856
857 // See if we already processed this register in another instruction in the
858 // sequence.
859 auto OldVal = WaterfalledRegMap.find(OldReg);
860 if (OldVal != WaterfalledRegMap.end()) {
861 Op.setReg(OldVal->second);
862 continue;
863 }
864
865 Register OpReg = Op.getReg();
866 LLT OpTy = MRI.getType(OpReg);
867
868 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
869 if (OpBank != &AMDGPU::VGPRRegBank) {
870 // Insert copy from AGPR to VGPR before the loop.
871 B.setMBB(MBB);
872 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
873 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
874 B.setMBB(*LoopBB);
875 }
876
877 Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
878
879 // Build the comparison(s).
880 unsigned OpSize = OpTy.getSizeInBits();
881 bool Is64 = OpSize % 64 == 0;
882 unsigned PartSize = Is64 ? 64 : 32;
883 LLT PartTy = LLT::scalar(PartSize);
884 unsigned NumParts = OpSize / PartSize;
885 SmallVector<Register, 8> OpParts;
886 SmallVector<Register, 8> CurrentLaneParts;
887
888 if (NumParts == 1) {
889 OpParts.push_back(OpReg);
890 CurrentLaneParts.push_back(CurrentLaneReg);
891 } else {
892 auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
893 auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
894 for (unsigned i = 0; i < NumParts; ++i) {
895 OpParts.push_back(UnmergeOp.getReg(i));
896 CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
897 MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
898 MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
899 }
900 }
901
902 for (unsigned i = 0; i < NumParts; ++i) {
903 auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
904 OpParts[i]).getReg(0);
905 MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
906
907 if (!CondReg) {
908 CondReg = CmpReg;
909 } else {
910 CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
911 MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
912 }
913 }
914
915 Op.setReg(CurrentLaneReg);
916
917 // Make sure we don't re-process this register again.
918 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
919 }
920 }
921
922 // The ballot becomes a no-op during instruction selection.
923 CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
924 {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
925 false)
926 .addReg(CondReg)
927 .getReg(0);
928 MRI.setRegClass(CondReg, WaveRC);
929
930 // Update EXEC, save the original EXEC value to VCC.
931 B.buildInstr(AndSaveExecOpc)
932 .addDef(NewExec)
933 .addReg(CondReg, RegState::Kill);
934
935 MRI.setSimpleHint(NewExec, CondReg);
936
937 B.setInsertPt(*BodyBB, BodyBB->end());
938
939 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
940 B.buildInstr(XorTermOpc)
941 .addDef(ExecReg)
942 .addReg(ExecReg)
943 .addReg(NewExec);
944
945 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
946 // s_cbranch_scc0?
947
948 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
949 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
950
951 // Save the EXEC mask before the loop.
952 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
953 .addReg(ExecReg);
954
955 // Restore the EXEC mask after the loop.
956 B.setMBB(*RestoreExecBB);
957 B.buildInstr(MovExecTermOpc)
958 .addDef(ExecReg)
959 .addReg(SaveExecReg);
960
961 // Set the insert point after the original instruction, so any new
962 // instructions will be in the remainder.
963 B.setInsertPt(*RemainderBB, RemainderBB->begin());
964
965 return true;
966 }
967
968 // Return any unique registers used by \p MI at \p OpIndices that need to be
969 // handled in a waterfall loop. Returns these registers in \p
970 // SGPROperandRegs. Returns true if there are any operands to handle and a
971 // waterfall loop is necessary.
972 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
973 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
974 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
975 for (unsigned Op : OpIndices) {
976 assert(MI.getOperand(Op).isUse());
977 Register Reg = MI.getOperand(Op).getReg();
978 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
979 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
980 SGPROperandRegs.insert(Reg);
981 }
982
983 // No operands need to be replaced, so no need to loop.
984 return !SGPROperandRegs.empty();
985 }
986
987 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
988 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
989 ArrayRef<unsigned> OpIndices) const {
990 // Use a set to avoid extra readfirstlanes in the case where multiple operands
991 // are the same register.
992 SmallSet<Register, 4> SGPROperandRegs;
993
994 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
995 return false;
996
997 MachineBasicBlock::iterator I = MI.getIterator();
998 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
999 SGPROperandRegs, MRI);
1000 }
1001
1002 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1003 MachineInstr &MI, MachineRegisterInfo &MRI,
1004 ArrayRef<unsigned> OpIndices) const {
1005 MachineIRBuilder B(MI);
1006 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1007 }
1008
1009 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1010 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1011 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1012 Register Reg = MI.getOperand(OpIdx).getReg();
1013 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1014 if (Bank == &AMDGPU::SGPRRegBank)
1015 return;
1016
1017 MachineIRBuilder B(MI);
1018
1019 Reg = buildReadFirstLane(B, MRI, Reg);
1020 MI.getOperand(OpIdx).setReg(Reg);
1021 }
1022
1023 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1024 /// rest will be in the remainder.
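/// For example (illustrative): splitUnequalType(s96, 64) == {s64, s32} and
/// splitUnequalType(<3 x s32>, 64) == {<2 x s32>, s32}.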
1025 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1026 unsigned TotalSize = Ty.getSizeInBits();
1027 if (!Ty.isVector())
1028 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1029
1030 LLT EltTy = Ty.getElementType();
1031 unsigned EltSize = EltTy.getSizeInBits();
1032 assert(FirstSize % EltSize == 0);
1033
1034 unsigned FirstPartNumElts = FirstSize / EltSize;
1035 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1036
1037 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1038 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1039 }
1040
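// Widen a 96-bit type to 128 bits, e.g. (illustrative): s96 -> s128,
// <3 x s32> -> <4 x s32>, <6 x s16> -> <8 x s16>.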
1041 static LLT widen96To128(LLT Ty) {
1042 if (!Ty.isVector())
1043 return LLT::scalar(128);
1044
1045 LLT EltTy = Ty.getElementType();
1046 assert(128 % EltTy.getSizeInBits() == 0);
1047 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1048 }
1049
1050 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1051 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1052 MachineRegisterInfo &MRI) const {
1053 Register DstReg = MI.getOperand(0).getReg();
1054 const LLT LoadTy = MRI.getType(DstReg);
1055 unsigned LoadSize = LoadTy.getSizeInBits();
1056 const unsigned MaxNonSmrdLoadSize = 128;
1057
1058 const RegisterBank *DstBank =
1059 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1060 if (DstBank == &AMDGPU::SGPRRegBank) {
1061 // There are some special cases that we need to look at for 32 bit and 96
1062 // bit SGPR loads; otherwise, we have nothing to do.
1063 if (LoadSize != 32 && LoadSize != 96)
1064 return false;
1065
1066 MachineMemOperand *MMO = *MI.memoperands_begin();
1067 const unsigned MemSize = 8 * MMO->getSize();
1068 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1069 // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
1070 // scalar loads should have a load size of 32 but a memory access size of
1071 // less than 32.
1072 if (LoadSize == 32 &&
1073 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1074 return false;
1075
1076 Register PtrReg = MI.getOperand(1).getReg();
1077
1078 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1079 MachineIRBuilder B(MI, O);
1080
1081 if (LoadSize == 32) {
1082 // This is an extending load from a sub-dword size. Widen the memory
1083 // access size to 4 bytes and clear the extra high bits appropriately
1084 const LLT S32 = LLT::scalar(32);
1085 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1086 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1087 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1088 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1089 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1090 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1091 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1092 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1093 } else
1094 // We do not need to touch the higher bits for regular loads.
1095 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1096 } else {
1097 // 96-bit loads are only available for vector loads. We need to split this
1098 // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1099 if (MMO->getAlign() < Align(16)) {
1100 MachineFunction *MF = MI.getParent()->getParent();
1101 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1102 MachineIRBuilder B(MI, ApplyBank);
1103 LegalizerHelper Helper(*MF, ApplyBank, B);
1104 LLT Part64, Part32;
1105 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1106 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1107 LegalizerHelper::Legalized)
1108 return false;
1109 return true;
1110 } else {
1111 LLT WiderTy = widen96To128(LoadTy);
1112 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1113 if (WiderTy.isScalar())
1114 B.buildTrunc(MI.getOperand(0), WideLoad);
1115 else {
1116 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1117 WideLoad);
1118 }
1119 }
1120 }
1121
1122 MI.eraseFromParent();
1123 return true;
1124 }
1125
1126 // 128-bit loads are supported for all instruction types.
1127 if (LoadSize <= MaxNonSmrdLoadSize)
1128 return false;
1129
1130 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1131 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1132
1133 if (SrcRegs.empty())
1134 SrcRegs.push_back(MI.getOperand(1).getReg());
1135
1136 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1137
1138 // RegBankSelect only emits scalar types, so we need to reset the pointer
1139 // operand to a pointer type.
1140 Register BasePtrReg = SrcRegs[0];
1141 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1142 MRI.setType(BasePtrReg, PtrTy);
1143
1144 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1145 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1146 ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1147 MachineIRBuilder B(MI, Observer);
1148 LegalizerHelper Helper(B.getMF(), Observer, B);
1149
1150 if (LoadTy.isVector()) {
1151 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1152 return false;
1153 } else {
1154 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1155 return false;
1156 }
1157
1158 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1159 return true;
1160 }
1161
1162 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1163 MachineInstr &MI,
1164 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1165 MachineRegisterInfo &MRI) const {
1166 const MachineFunction &MF = *MI.getMF();
1167 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1168 const auto &TFI = *ST.getFrameLowering();
1169
1170 // Guard in case the stack growth direction ever changes with scratch
1171 // instructions.
1172 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1173 return false;
1174
1175 Register Dst = MI.getOperand(0).getReg();
1176 Register AllocSize = MI.getOperand(1).getReg();
1177 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1178
1179 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1180
1181 // TODO: Need to emit a wave reduction to get the maximum size.
1182 if (SizeBank != &AMDGPU::SGPRRegBank)
1183 return false;
1184
1185 LLT PtrTy = MRI.getType(Dst);
1186 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1187
1188 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1189 Register SPReg = Info->getStackPtrOffsetReg();
1190 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1191 MachineIRBuilder B(MI, ApplyBank);
1192
1193 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1194 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1195
1196 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1197 if (Alignment > TFI.getStackAlign()) {
1198 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1199 B.buildMaskLowPtrBits(Dst, PtrAdd,
1200 Log2(Alignment) + ST.getWavefrontSizeLog2());
1201 } else {
1202 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1203 }
1204
1205 MI.eraseFromParent();
1206 return true;
1207 }
1208
1209 bool AMDGPURegisterBankInfo::applyMappingImage(
1210 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1211 MachineRegisterInfo &MRI, int RsrcIdx) const {
1212 const int NumDefs = MI.getNumExplicitDefs();
1213
1214 // The reported argument index is relative to the IR intrinsic call arguments,
1215 // so we need to shift by the number of defs and the intrinsic ID.
1216 RsrcIdx += NumDefs + 1;
1217
1218 // Insert copies to VGPR arguments.
1219 applyDefaultMapping(OpdMapper);
1220
1221 // Fixup any SGPR arguments.
1222 SmallVector<unsigned, 4> SGPRIndexes;
1223 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1224 if (!MI.getOperand(I).isReg())
1225 continue;
1226
1227 // If this intrinsic has a sampler, it immediately follows rsrc.
1228 if (I == RsrcIdx || I == RsrcIdx + 1)
1229 SGPRIndexes.push_back(I);
1230 }
1231
1232 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1233 return true;
1234 }
1235
1236 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1237 Register Reg) {
1238 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1239 if (!Def)
1240 return Reg;
1241
1242 // TODO: Guard against this being an implicit def
1243 return Def->getOperand(0).getReg();
1244 }
1245
1246 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1247 // the three offsets (voffset, soffset and instoffset)
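// For example (illustrative), a uniform constant combined offset is split so
// that voffset is a zero constant, soffset holds the part that does not fit
// the immediate field, and instoffset holds the immediate part; a divergent
// offset instead ends up in voffset with soffset tied to zero.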
1248 static unsigned setBufferOffsets(MachineIRBuilder &B,
1249 const AMDGPURegisterBankInfo &RBI,
1250 Register CombinedOffset, Register &VOffsetReg,
1251 Register &SOffsetReg, int64_t &InstOffsetVal,
1252 Align Alignment) {
1253 const LLT S32 = LLT::scalar(32);
1254 MachineRegisterInfo *MRI = B.getMRI();
1255
1256 if (std::optional<int64_t> Imm =
1257 getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1258 uint32_t SOffset, ImmOffset;
1259 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1260 Alignment)) {
1261 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1262 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1263 InstOffsetVal = ImmOffset;
1264
1265 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1266 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1267 return SOffset + ImmOffset;
1268 }
1269 }
1270
1271 Register Base;
1272 unsigned Offset;
1273
1274 std::tie(Base, Offset) =
1275 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1276
1277 uint32_t SOffset, ImmOffset;
1278 if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1279 &RBI.Subtarget, Alignment)) {
1280 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1281 VOffsetReg = Base;
1282 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1283 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1284 InstOffsetVal = ImmOffset;
1285 return 0; // XXX - Why is this 0?
1286 }
1287
1288 // If we have SGPR base, we can use it for soffset.
1289 if (SOffset == 0) {
1290 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1291 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1292 SOffsetReg = Base;
1293 InstOffsetVal = ImmOffset;
1294 return 0; // XXX - Why is this 0?
1295 }
1296 }
1297
1298 // Handle the variable sgpr + vgpr case.
1299 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1300 if (Add && (int)Offset >= 0) {
1301 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1302 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1303
1304 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1305 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1306
1307 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1308 VOffsetReg = Src0;
1309 SOffsetReg = Src1;
1310 return 0;
1311 }
1312
1313 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1314 VOffsetReg = Src1;
1315 SOffsetReg = Src0;
1316 return 0;
1317 }
1318 }
1319
1320 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1321 // have an SGPR offset and a VGPR resource.
1322 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1323 VOffsetReg = CombinedOffset;
1324 } else {
1325 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1326 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1327 }
1328
1329 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1330 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1331 return 0;
1332 }
1333
1334 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1335 const OperandsMapper &OpdMapper) const {
1336 MachineInstr &MI = OpdMapper.getMI();
1337 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1338
1339 const LLT S32 = LLT::scalar(32);
1340 Register Dst = MI.getOperand(0).getReg();
1341 LLT Ty = MRI.getType(Dst);
1342
1343 const RegisterBank *RSrcBank =
1344 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1345 const RegisterBank *OffsetBank =
1346 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1347 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1348 OffsetBank == &AMDGPU::SGPRRegBank)
1349 return true; // Legal mapping
1350
1351 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1352 // here but don't have an MMO.
1353
1354 unsigned LoadSize = Ty.getSizeInBits();
1355 int NumLoads = 1;
1356 if (LoadSize == 256 || LoadSize == 512) {
1357 NumLoads = LoadSize / 128;
1358 Ty = Ty.divide(NumLoads);
1359 }
1360
1361 // Use the alignment to ensure that the required offsets will fit into the
1362 // immediate offsets.
1363 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1364
1365 MachineIRBuilder B(MI);
1366 MachineFunction &MF = B.getMF();
1367
1368 Register SOffset;
1369 Register VOffset;
1370 int64_t ImmOffset = 0;
1371
1372 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1373 VOffset, SOffset, ImmOffset, Alignment);
1374
1375 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1376 // can, but we need to track an MMO for that.
1377 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1378 const Align MemAlign(4); // FIXME: ABI type alignment?
1379 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1380 MachinePointerInfo(),
1381 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1382 MachineMemOperand::MOInvariant,
1383 MemSize, MemAlign);
1384 if (MMOOffset != 0)
1385 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1386
1387 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1388 // assume that the buffer is unswizzled.
1389
1390 Register RSrc = MI.getOperand(1).getReg();
1391 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1392 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1393
1394 SmallVector<Register, 4> LoadParts(NumLoads);
1395
1396 MachineBasicBlock::iterator MII = MI.getIterator();
1397 MachineInstrSpan Span(MII, &B.getMBB());
1398
1399 for (int i = 0; i < NumLoads; ++i) {
1400 if (NumLoads == 1) {
1401 LoadParts[i] = Dst;
1402 } else {
1403 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1404 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1405 }
1406
1407 MachineMemOperand *MMO = BaseMMO;
1408 if (i != 0)
1409 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1410
1411 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1412 .addDef(LoadParts[i]) // vdata
1413 .addUse(RSrc) // rsrc
1414 .addUse(VIndex) // vindex
1415 .addUse(VOffset) // voffset
1416 .addUse(SOffset) // soffset
1417 .addImm(ImmOffset + 16 * i) // offset(imm)
1418 .addImm(0) // cachepolicy, swizzled buffer(imm)
1419 .addImm(0) // idxen(imm)
1420 .addMemOperand(MMO);
1421 }
1422
1423 // TODO: If only the resource is a VGPR, it may be better to execute the
1424 // scalar load in the waterfall loop if the resource is expected to frequently
1425 // be dynamically uniform.
1426 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1427 // Remove the original instruction to avoid potentially confusing the
1428 // waterfall loop logic.
1429 B.setInstr(*Span.begin());
1430 MI.eraseFromParent();
1431
1432 SmallSet<Register, 4> OpsToWaterfall;
1433
1434 OpsToWaterfall.insert(RSrc);
1435 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1436 OpsToWaterfall, MRI);
1437 }
1438
1439 if (NumLoads != 1) {
1440 if (Ty.isVector())
1441 B.buildConcatVectors(Dst, LoadParts);
1442 else
1443 B.buildMergeLikeInstr(Dst, LoadParts);
1444 }
1445
1446 // We removed the instruction earlier with a waterfall loop.
1447 if (RSrcBank == &AMDGPU::SGPRRegBank)
1448 MI.eraseFromParent();
1449
1450 return true;
1451 }
1452
1453 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1454 bool Signed) const {
1455 MachineInstr &MI = OpdMapper.getMI();
1456 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1457
1458 // Insert basic copies
1459 applyDefaultMapping(OpdMapper);
1460
1461 Register DstReg = MI.getOperand(0).getReg();
1462 LLT Ty = MRI.getType(DstReg);
1463
1464 const LLT S32 = LLT::scalar(32);
1465
1466 unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1467 Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1468 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1469 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1470
1471 const RegisterBank *DstBank =
1472 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1473 if (DstBank == &AMDGPU::VGPRRegBank) {
1474 if (Ty == S32)
1475 return true;
1476
1477 // There is no 64-bit VGPR bitfield extract instruction, so the operation
1478 // is expanded to a sequence of instructions that implement it.
1479 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1480 MachineIRBuilder B(MI, ApplyBank);
1481
1482 const LLT S64 = LLT::scalar(64);
1483 // Shift the source operand so that extracted bits start at bit 0.
1484 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1485 : B.buildLShr(S64, SrcReg, OffsetReg);
1486 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1487
1488 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1489 // if the width is a constant.
1490 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1491 // Use the 32-bit bitfield extract instruction if the width is a constant.
1492 // Depending on the width size, use either the low or high 32-bits.
1493 auto Zero = B.buildConstant(S32, 0);
1494 auto WidthImm = ConstWidth->Value.getZExtValue();
1495 if (WidthImm <= 32) {
1496 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1497 // or clear the upper 32-bits.
1498 auto Extract =
1499 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1500 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1501 auto Extend =
1502 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1503 B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1504 } else {
1505 // Use bitfield extract on upper 32-bit source, and combine with lower
1506 // 32-bit source.
1507 auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1508 auto Extract =
1509 Signed
1510 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1511 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1512 B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1513 }
1514 MI.eraseFromParent();
1515 return true;
1516 }
1517
1518 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1519 // operations.
1520 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1521 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1522 if (Signed)
1523 B.buildAShr(S64, SignBit, ExtShift);
1524 else
1525 B.buildLShr(S64, SignBit, ExtShift);
1526 MI.eraseFromParent();
1527 return true;
1528 }
1529
1530 // The scalar form packs the offset and width in a single operand.
1531
1532 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1533 MachineIRBuilder B(MI, ApplyBank);
1534
1535 // Ensure the high bits are clear to insert the offset.
1536 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1537 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1538
1539 // Zeros out the low bits, so don't bother clamping the input value.
1540 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1541
1542 // Pack the offset and width of the BFE into the format expected by
1543 // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] hold the
1544 // offset and bits [22:16] hold the width.
1545 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1546
1547 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1548 // register class constraints.
1549 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1550 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1551
1552 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1553 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1554 llvm_unreachable("failed to constrain BFE");
1555
1556 MI.eraseFromParent();
1557 return true;
1558 }
1559
1560 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1561 const OperandsMapper &OpdMapper) const {
1562 MachineInstr &MI = OpdMapper.getMI();
1563 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1564
1565 // Insert basic copies.
1566 applyDefaultMapping(OpdMapper);
1567
1568 Register Dst0 = MI.getOperand(0).getReg();
1569 Register Dst1 = MI.getOperand(1).getReg();
1570 Register Src0 = MI.getOperand(2).getReg();
1571 Register Src1 = MI.getOperand(3).getReg();
1572 Register Src2 = MI.getOperand(4).getReg();
1573
1574 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1575 return true;
1576
1577 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1578 LLT S1 = LLT::scalar(1);
1579 LLT S32 = LLT::scalar(32);
1580
1581 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1582 bool Accumulate = true;
1583
1584 if (!DstOnValu) {
1585 if (mi_match(Src2, MRI, m_ZeroInt()))
1586 Accumulate = false;
1587 }
1588
1589 // Keep the multiplication on the SALU.
1590 MachineIRBuilder B(MI);
1591
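// Compute the 64-bit product as separate 32-bit low and high halves. The low
// half stays on the SALU; the high half may need the VALU if the subtarget
// has no scalar mul-hi.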
1592 Register DstHi;
1593 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1594 bool MulHiInVgpr = false;
1595
1596 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1597
1598 if (Subtarget.hasSMulHi()) {
1599 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1600 : B.buildSMulH(S32, Src0, Src1).getReg(0);
1601 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1602 } else {
1603 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1604 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1605
1606 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1607 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1608
1609 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1610 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1611 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1612
1613 if (!DstOnValu) {
1614 DstHi = buildReadFirstLane(B, MRI, DstHi);
1615 } else {
1616 MulHiInVgpr = true;
1617 }
1618 }
1619
1620 // Accumulate and produce the "carry-out" bit.
1621 //
1622 // The "carry-out" is defined as bit 64 of the result when computed as a
1623 // big integer. For unsigned multiply-add, this matches the usual definition
1624 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1625 // result, which is determined as:
1626 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1627 LLT CarryType = DstOnValu ? S1 : S32;
1628 const RegisterBank &CarryBank =
1629 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1630 const RegisterBank &DstBank =
1631 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1632 Register Carry;
1633 Register Zero;
1634
1635 if (!IsUnsigned) {
1636 Zero = B.buildConstant(S32, 0).getReg(0);
1637 MRI.setRegBank(Zero,
1638 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1639
1640 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1641 .getReg(0);
1642 MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1643 : AMDGPU::SGPRRegBank);
1644
1645 if (DstOnValu && !MulHiInVgpr) {
1646 Carry = B.buildTrunc(S1, Carry).getReg(0);
1647 MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1648 }
1649 }
1650
1651 if (Accumulate) {
1652 if (DstOnValu) {
1653 DstLo = B.buildCopy(S32, DstLo).getReg(0);
1654 DstHi = B.buildCopy(S32, DstHi).getReg(0);
1655 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1656 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1657 }
1658
1659 auto Unmerge = B.buildUnmerge(S32, Src2);
1660 Register Src2Lo = Unmerge.getReg(0);
1661 Register Src2Hi = Unmerge.getReg(1);
1662 MRI.setRegBank(Src2Lo, DstBank);
1663 MRI.setRegBank(Src2Hi, DstBank);
1664
1665 if (!IsUnsigned) {
1666 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1667 MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1668
1669 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1670 MRI.setRegBank(Carry, CarryBank);
1671 }
1672
1673 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1674 DstLo = AddLo.getReg(0);
1675 Register CarryLo = AddLo.getReg(1);
1676 MRI.setRegBank(DstLo, DstBank);
1677 MRI.setRegBank(CarryLo, CarryBank);
1678
1679 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1680 DstHi = AddHi.getReg(0);
1681 MRI.setRegBank(DstHi, DstBank);
1682
1683 Register CarryHi = AddHi.getReg(1);
1684 MRI.setRegBank(CarryHi, CarryBank);
1685
1686 if (IsUnsigned) {
1687 Carry = CarryHi;
1688 } else {
1689 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1690 MRI.setRegBank(Carry, CarryBank);
1691 }
1692 } else {
1693 if (IsUnsigned) {
1694 Carry = B.buildConstant(CarryType, 0).getReg(0);
1695 MRI.setRegBank(Carry, CarryBank);
1696 }
1697 }
1698
1699 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1700
1701 if (DstOnValu) {
1702 B.buildCopy(Dst1, Carry);
1703 } else {
1704 B.buildTrunc(Dst1, Carry);
1705 }
1706
1707 MI.eraseFromParent();
1708 return true;
1709 }
1710
1711 // Return a suitable opcode for extending the operands of Opc when widening.
1712 static unsigned getExtendOp(unsigned Opc) {
1713 switch (Opc) {
1714 case TargetOpcode::G_ASHR:
1715 case TargetOpcode::G_SMIN:
1716 case TargetOpcode::G_SMAX:
1717 return TargetOpcode::G_SEXT;
1718 case TargetOpcode::G_LSHR:
1719 case TargetOpcode::G_UMIN:
1720 case TargetOpcode::G_UMAX:
1721 return TargetOpcode::G_ZEXT;
1722 default:
1723 return TargetOpcode::G_ANYEXT;
1724 }
1725 }
1726
1727 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1728 // any illegal vector extend or unmerge operations.
1729 static std::pair<Register, Register>
1730 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1731 const LLT S32 = LLT::scalar(32);
1732 auto Bitcast = B.buildBitcast(S32, Src);
1733
1734 if (ExtOpcode == TargetOpcode::G_SEXT) {
1735 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1736 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1737 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1738 }
1739
1740 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1741 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1742 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1743 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1744 }
1745
1746 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1747 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1748 }
1749
1750 // For cases where only a single copy is inserted for matching register banks,
1751 // replace the register in the instruction operand with the copy.
1752 static bool substituteSimpleCopyRegs(
1753 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1754 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1755 if (!SrcReg.empty()) {
1756 assert(SrcReg.size() == 1);
1757 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1758 return true;
1759 }
1760
1761 return false;
1762 }
1763
1764 /// Handle register layout difference for f16 images for some subtargets.
1765 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1766 MachineRegisterInfo &MRI,
1767 Register Reg) const {
1768 if (!Subtarget.hasUnpackedD16VMem())
1769 return Reg;
1770
1771 const LLT S16 = LLT::scalar(16);
1772 LLT StoreVT = MRI.getType(Reg);
1773 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1774 return Reg;
1775
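// On unpacked subtargets each 16-bit element occupies the low half of a
// 32-bit register, so rewrite the value as a vector of s32 elements.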
1776 auto Unmerge = B.buildUnmerge(S16, Reg);
1777
1778
1779 SmallVector<Register, 4> WideRegs;
1780 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1781 WideRegs.push_back(Unmerge.getReg(I));
1782
1783 const LLT S32 = LLT::scalar(32);
1784 int NumElts = StoreVT.getNumElements();
1785
1786 return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1787 .getReg(0);
1788 }
1789
1790 static std::pair<Register, unsigned>
1791 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1792 int64_t Const;
1793 if (mi_match(Reg, MRI, m_ICst(Const)))
1794 return std::pair(Register(), Const);
1795
1796 Register Base;
1797 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1798 return std::pair(Base, Const);
1799
1800 // TODO: Handle G_OR used for add case
1801 return std::pair(Reg, 0);
1802 }
1803
1804 std::pair<Register, unsigned>
1805 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1806 Register OrigOffset) const {
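// MUBUF instructions encode a 12-bit unsigned immediate offset (0..4095);
// anything beyond that must be folded into the register offset.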
1807 const unsigned MaxImm = 4095;
1808 Register BaseReg;
1809 unsigned ImmOffset;
1810 const LLT S32 = LLT::scalar(32);
1811
1812 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1813 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1814 OrigOffset);
1815
1816 unsigned C1 = 0;
1817 if (ImmOffset != 0) {
1818 // If the immediate value is too big for the immoffset field, put the value
1819 // and -4096 into the immoffset field so that the value that is copied/added
1820 // for the voffset field is a multiple of 4096, and it stands more chance
1821 // of being CSEd with the copy/add for another similar load/store.
1822 // However, do not do that rounding down to a multiple of 4096 if that is a
1823 // negative number, as it appears to be illegal to have a negative offset
1824 // in the vgpr, even if adding the immediate offset makes it positive.
1825 unsigned Overflow = ImmOffset & ~MaxImm;
1826 ImmOffset -= Overflow;
1827 if ((int32_t)Overflow < 0) {
1828 Overflow += ImmOffset;
1829 ImmOffset = 0;
1830 }
1831
1832 C1 = ImmOffset;
1833 if (Overflow != 0) {
1834 if (!BaseReg)
1835 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1836 else {
1837 auto OverflowVal = B.buildConstant(S32, Overflow);
1838 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1839 }
1840 }
1841 }
1842
1843 if (!BaseReg)
1844 BaseReg = B.buildConstant(S32, 0).getReg(0);
1845
1846 return {BaseReg, C1};
1847 }
1848
1849 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1850 Register SrcReg) const {
1851 MachineRegisterInfo &MRI = *B.getMRI();
1852 LLT SrcTy = MRI.getType(SrcReg);
1853 if (SrcTy.getSizeInBits() == 32) {
1854 // Use a v_mov_b32 here to make the exec dependency explicit.
1855 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1856 .addDef(DstReg)
1857 .addUse(SrcReg);
1858 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1859 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1860 }
1861
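// 64-bit case: move each 32-bit half with a v_mov_b32 and recombine the
// halves with a REG_SEQUENCE.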
1862 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1863 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1864
1865 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1866 .addDef(TmpReg0)
1867 .addUse(SrcReg, 0, AMDGPU::sub0);
1868 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1869 .addDef(TmpReg1)
1870 .addUse(SrcReg, 0, AMDGPU::sub1);
1871 B.buildInstr(AMDGPU::REG_SEQUENCE)
1872 .addDef(DstReg)
1873 .addUse(TmpReg0)
1874 .addImm(AMDGPU::sub0)
1875 .addUse(TmpReg1)
1876 .addImm(AMDGPU::sub1);
1877
1878 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1879 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1880 }
1881
1882 /// Utility function for pushing dynamic vector indexes with a constant offset
1883 /// into waterfall loops.
1884 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1885 MachineInstr &IdxUseInstr,
1886 unsigned OpIdx,
1887 unsigned ConstOffset) {
1888 MachineRegisterInfo &MRI = *B.getMRI();
1889 const LLT S32 = LLT::scalar(32);
1890 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1891 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1892
1893 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1894
1895 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1896 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1897 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1898 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1899 }
1900
1901 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1902 /// original 32-bit source value (to be inserted in the low part of the combined
1903 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1904 /// value.
1905 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1906 Register Hi32Reg, Register Lo32Reg,
1907 unsigned ExtOpc,
1908 const RegisterBank &RegBank,
1909 bool IsBooleanSrc = false) {
1910 if (ExtOpc == AMDGPU::G_ZEXT) {
1911 B.buildConstant(Hi32Reg, 0);
1912 } else if (ExtOpc == AMDGPU::G_SEXT) {
1913 if (IsBooleanSrc) {
1914 // If we know the original source was an s1, the high half is the same as
1915 // the low.
1916 B.buildCopy(Hi32Reg, Lo32Reg);
1917 } else {
1918 // Replicate sign bit from 32-bit extended part.
1919 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1920 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1921 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1922 }
1923 } else {
1924 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1925 B.buildUndef(Hi32Reg);
1926 }
1927 }
1928
1929 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1930 MachineInstr &MI, MachineRegisterInfo &MRI,
1931 const OperandsMapper &OpdMapper) const {
1932
1933 Register VecReg = MI.getOperand(1).getReg();
1934 Register Idx = MI.getOperand(2).getReg();
1935
1936 const RegisterBank &IdxBank =
1937 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1938
1939 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1940
1941 LLT VecTy = MRI.getType(VecReg);
1942 unsigned EltSize = VecTy.getScalarSizeInBits();
1943 unsigned NumElem = VecTy.getNumElements();
1944
1945 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1946 IsDivergentIdx, &Subtarget))
1947 return false;
1948
1949 MachineIRBuilder B(MI);
1950 LLT S32 = LLT::scalar(32);
1951
1952 const RegisterBank &DstBank =
1953 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1954 const RegisterBank &SrcBank =
1955 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1956
1957 const RegisterBank &CCBank =
1958 (DstBank == AMDGPU::SGPRRegBank &&
1959 SrcBank == AMDGPU::SGPRRegBank &&
1960 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1961 : AMDGPU::VCCRegBank;
1962 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1963
1964 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1965 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1966 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1967 }
1968
1969 LLT EltTy = VecTy.getScalarType();
1970 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1971 unsigned NumLanes = DstRegs.size();
1972 if (!NumLanes)
1973 NumLanes = 1;
1974 else
1975 EltTy = MRI.getType(DstRegs[0]);
1976
1977 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1978 SmallVector<Register, 2> Res(NumLanes);
1979 for (unsigned L = 0; L < NumLanes; ++L)
1980 Res[L] = UnmergeToEltTy.getReg(L);
1981
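// Res starts as element 0; compare the index against each remaining element
// index and select the matching element.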
1982 for (unsigned I = 1; I < NumElem; ++I) {
1983 auto IC = B.buildConstant(S32, I);
1984 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1985 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1986 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1987
1988 for (unsigned L = 0; L < NumLanes; ++L) {
1989 auto S = B.buildSelect(EltTy, Cmp,
1990 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1991
1992 for (unsigned N : { 0, 2, 3 })
1993 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1994
1995 Res[L] = S->getOperand(0).getReg();
1996 }
1997 }
1998
1999 for (unsigned L = 0; L < NumLanes; ++L) {
2000 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2001 B.buildCopy(DstReg, Res[L]);
2002 MRI.setRegBank(DstReg, DstBank);
2003 }
2004
2005 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2006 MI.eraseFromParent();
2007
2008 return true;
2009 }
2010
2011 // Insert a cross regbank copy for a register if it already has a bank that
2012 // differs from the one we want to set.
2013 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2014 MachineIRBuilder &B, Register &Reg,
2015 const RegisterBank &Bank) {
2016 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2017 if (CurrBank && *CurrBank != Bank) {
2018 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2019 MRI.setRegBank(Copy, Bank);
2020 return Copy;
2021 }
2022
2023 MRI.setRegBank(Reg, Bank);
2024 return Reg;
2025 }
2026
2027 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2028 MachineInstr &MI, MachineRegisterInfo &MRI,
2029 const OperandsMapper &OpdMapper) const {
2030
2031 Register VecReg = MI.getOperand(1).getReg();
2032 Register Idx = MI.getOperand(3).getReg();
2033
2034 const RegisterBank &IdxBank =
2035 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2036
2037 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2038
2039 LLT VecTy = MRI.getType(VecReg);
2040 unsigned EltSize = VecTy.getScalarSizeInBits();
2041 unsigned NumElem = VecTy.getNumElements();
2042
2043 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2044 IsDivergentIdx, &Subtarget))
2045 return false;
2046
2047 MachineIRBuilder B(MI);
2048 LLT S32 = LLT::scalar(32);
2049
2050 const RegisterBank &DstBank =
2051 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2052 const RegisterBank &SrcBank =
2053 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2054 const RegisterBank &InsBank =
2055 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2056
2057 const RegisterBank &CCBank =
2058 (DstBank == AMDGPU::SGPRRegBank &&
2059 SrcBank == AMDGPU::SGPRRegBank &&
2060 InsBank == AMDGPU::SGPRRegBank &&
2061 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2062 : AMDGPU::VCCRegBank;
2063 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2064
2065 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2066 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2067 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2068 }
2069
2070 LLT EltTy = VecTy.getScalarType();
2071 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2072 unsigned NumLanes = InsRegs.size();
2073 if (!NumLanes) {
2074 NumLanes = 1;
2075 InsRegs.push_back(MI.getOperand(2).getReg());
2076 } else {
2077 EltTy = MRI.getType(InsRegs[0]);
2078 }
2079
2080 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2081 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2082
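// For each element, select between the value being inserted and the original
// element depending on whether the index matches; the vector is rebuilt below.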
2083 for (unsigned I = 0; I < NumElem; ++I) {
2084 auto IC = B.buildConstant(S32, I);
2085 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2086 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2087 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2088
2089 for (unsigned L = 0; L < NumLanes; ++L) {
2090 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2091 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2092 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2093
2094 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2095 MRI.setRegBank(Select, DstBank);
2096
2097 Ops[I * NumLanes + L] = Select;
2098 }
2099 }
2100
2101 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2102 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2103 B.buildBuildVector(MI.getOperand(0), Ops);
2104 } else {
2105 auto Vec = B.buildBuildVector(MergeTy, Ops);
2106 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2107 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2108 }
2109
2110 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2111 MI.eraseFromParent();
2112
2113 return true;
2114 }
2115
2116 void AMDGPURegisterBankInfo::applyMappingImpl(
2117 const OperandsMapper &OpdMapper) const {
2118 MachineInstr &MI = OpdMapper.getMI();
2119 unsigned Opc = MI.getOpcode();
2120 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2121 switch (Opc) {
2122 case AMDGPU::G_CONSTANT:
2123 case AMDGPU::G_IMPLICIT_DEF: {
2124 Register DstReg = MI.getOperand(0).getReg();
2125 LLT DstTy = MRI.getType(DstReg);
2126 if (DstTy != LLT::scalar(1))
2127 break;
2128
2129 const RegisterBank *DstBank =
2130 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2131 if (DstBank == &AMDGPU::VCCRegBank)
2132 break;
2133 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2134 if (DefRegs.empty())
2135 DefRegs.push_back(DstReg);
2136
2137 MachineIRBuilder B(MI);
2138 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2139
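// Replace the s1 def with an s32 def on the mapped bank, then truncate back
// to s1 for the original users.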
2140 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2141 LLVMContext &Ctx = B.getMF().getFunction().getContext();
2142
2143 MI.getOperand(0).setReg(NewDstReg);
2144 if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2145 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2146 MI.getOperand(1).setCImm(
2147 ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2148 }
2149
2150 MRI.setRegBank(NewDstReg, *DstBank);
2151 B.buildTrunc(DefRegs[0], NewDstReg);
2152 return;
2153 }
2154 case AMDGPU::G_PHI: {
2155 Register DstReg = MI.getOperand(0).getReg();
2156 LLT DstTy = MRI.getType(DstReg);
2157 if (DstTy != LLT::scalar(1))
2158 break;
2159
2160 const LLT S32 = LLT::scalar(32);
2161 const RegisterBank *DstBank =
2162 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2163 if (DstBank == &AMDGPU::VCCRegBank) {
2164 applyDefaultMapping(OpdMapper);
2165 // The standard handling only considers the result register bank for
2166 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2167 // produce an invalid copy. We can only copy with some kind of compare to
2168 // get a vector boolean result. Insert a register bank copy that will be
2169 // correctly lowered to a compare.
2170 MachineIRBuilder B(*MI.getParent()->getParent());
2171
2172 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2173 Register SrcReg = MI.getOperand(I).getReg();
2174 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2175
2176 if (SrcBank != &AMDGPU::VCCRegBank) {
2177 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2178 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2179
2180 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2181 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2182 MI.getOperand(I).setReg(Copy.getReg(0));
2183 }
2184 }
2185
2186 return;
2187 }
2188
2189 // Phi handling is strange and only considers the bank of the destination.
2190 substituteSimpleCopyRegs(OpdMapper, 0);
2191
2192 // Promote SGPR/VGPR booleans to s32
2193 MachineFunction *MF = MI.getParent()->getParent();
2194 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2195 MachineIRBuilder B(MI, ApplyBank);
2196 LegalizerHelper Helper(*MF, ApplyBank, B);
2197
2198 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2199 llvm_unreachable("widen scalar should have succeeded");
2200
2201 return;
2202 }
2203 case AMDGPU::G_ICMP:
2204 case AMDGPU::G_UADDO:
2205 case AMDGPU::G_USUBO:
2206 case AMDGPU::G_UADDE:
2207 case AMDGPU::G_SADDE:
2208 case AMDGPU::G_USUBE:
2209 case AMDGPU::G_SSUBE: {
2210 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2211 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2212
2213 const RegisterBank *DstBank =
2214 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2215 if (DstBank != &AMDGPU::SGPRRegBank)
2216 break;
2217
2218 const bool HasCarryIn = MI.getNumOperands() == 5;
2219
2220 // If this is a scalar compare, promote the result to s32, as the selection
2221 // will end up using a copy to a 32-bit vreg.
2222 const LLT S32 = LLT::scalar(32);
2223 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2224 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2225 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2226 MachineIRBuilder B(MI);
2227
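// The carry-in boolean also needs to be an s32 SGPR value, so zero-extend it.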
2228 if (HasCarryIn) {
2229 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2230 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2231 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2232 MI.getOperand(4).setReg(NewSrcReg);
2233 }
2234
2235 MachineBasicBlock *MBB = MI.getParent();
2236 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2237
2238 // If we had a constrained VCC result register, a copy was inserted to VCC
2239 // from SGPR.
2240 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2241 if (DefRegs.empty())
2242 DefRegs.push_back(DstReg);
2243 B.buildTrunc(DefRegs[0], NewDstReg);
2244 return;
2245 }
2246 case AMDGPU::G_SELECT: {
2247 Register DstReg = MI.getOperand(0).getReg();
2248 LLT DstTy = MRI.getType(DstReg);
2249
2250 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2251 if (CondRegs.empty())
2252 CondRegs.push_back(MI.getOperand(1).getReg());
2253 else {
2254 assert(CondRegs.size() == 1);
2255 }
2256
2257 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2258 if (CondBank == &AMDGPU::SGPRRegBank) {
2259 MachineIRBuilder B(MI);
2260 const LLT S32 = LLT::scalar(32);
2261 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2262 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2263
2264 MI.getOperand(1).setReg(NewCondReg);
2265 B.buildZExt(NewCondReg, CondRegs[0]);
2266 }
2267
2268 if (DstTy.getSizeInBits() != 64)
2269 break;
2270
2271 MachineIRBuilder B(MI);
2272 LLT HalfTy = getHalfSizedType(DstTy);
2273
2274 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2275 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2276 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2277
2278 // All inputs are SGPRs, nothing special to do.
2279 if (DefRegs.empty()) {
2280 assert(Src1Regs.empty() && Src2Regs.empty());
2281 break;
2282 }
2283
2284 if (Src1Regs.empty())
2285 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2286 else {
2287 setRegsToType(MRI, Src1Regs, HalfTy);
2288 }
2289
2290 if (Src2Regs.empty())
2291 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2292 else
2293 setRegsToType(MRI, Src2Regs, HalfTy);
2294
2295 setRegsToType(MRI, DefRegs, HalfTy);
2296
2297 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2298 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2299
2300 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2301 MI.eraseFromParent();
2302 return;
2303 }
2304 case AMDGPU::G_BRCOND: {
2305 Register CondReg = MI.getOperand(0).getReg();
2306 // FIXME: Should use legalizer helper, but should change bool ext type.
2307 const RegisterBank *CondBank =
2308 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2309
2310 if (CondBank == &AMDGPU::SGPRRegBank) {
2311 MachineIRBuilder B(MI);
2312 const LLT S32 = LLT::scalar(32);
2313 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2314 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2315
2316 MI.getOperand(0).setReg(NewCondReg);
2317 B.buildZExt(NewCondReg, CondReg);
2318 return;
2319 }
2320
2321 break;
2322 }
2323 case AMDGPU::G_AND:
2324 case AMDGPU::G_OR:
2325 case AMDGPU::G_XOR: {
2326 // 64-bit bitwise operations are only available on the SALU, so split into 2
2327 // 32-bit ops if there is a VGPR input.
2328 Register DstReg = MI.getOperand(0).getReg();
2329 LLT DstTy = MRI.getType(DstReg);
2330
2331 if (DstTy.getSizeInBits() == 1) {
2332 const RegisterBank *DstBank =
2333 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2334 if (DstBank == &AMDGPU::VCCRegBank)
2335 break;
2336
2337 MachineFunction *MF = MI.getParent()->getParent();
2338 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2339 MachineIRBuilder B(MI, ApplyBank);
2340 LegalizerHelper Helper(*MF, ApplyBank, B);
2341
2342 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2343 LegalizerHelper::Legalized)
2344 llvm_unreachable("widen scalar should have succeeded");
2345 return;
2346 }
2347
2348 if (DstTy.getSizeInBits() != 64)
2349 break;
2350
2351 LLT HalfTy = getHalfSizedType(DstTy);
2352 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2353 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2354 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2355
2356 // All inputs are SGPRs, nothing special to do.
2357 if (DefRegs.empty()) {
2358 assert(Src0Regs.empty() && Src1Regs.empty());
2359 break;
2360 }
2361
2362 assert(DefRegs.size() == 2);
2363 assert(Src0Regs.size() == Src1Regs.size() &&
2364 (Src0Regs.empty() || Src0Regs.size() == 2));
2365
2366 // Depending on where the source registers came from, the generic code may
2367 // have decided to split the inputs already or not. If not, we still need to
2368 // extract the values.
2369 MachineIRBuilder B(MI);
2370
2371 if (Src0Regs.empty())
2372 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2373 else
2374 setRegsToType(MRI, Src0Regs, HalfTy);
2375
2376 if (Src1Regs.empty())
2377 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2378 else
2379 setRegsToType(MRI, Src1Regs, HalfTy);
2380
2381 setRegsToType(MRI, DefRegs, HalfTy);
2382
2383 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2384 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2385
2386 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2387 MI.eraseFromParent();
2388 return;
2389 }
2390 case AMDGPU::G_ABS: {
2391 Register SrcReg = MI.getOperand(1).getReg();
2392 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2393
2394 // There is no VALU abs instruction so we need to replace it with a sub and
2395 // max combination.
2396 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2397 MachineFunction *MF = MI.getParent()->getParent();
2398 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2399 MachineIRBuilder B(MI, Apply);
2400 LegalizerHelper Helper(*MF, Apply, B);
2401
2402 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2403 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2404 return;
2405 }
2406 [[fallthrough]];
2407 }
2408 case AMDGPU::G_ADD:
2409 case AMDGPU::G_SUB:
2410 case AMDGPU::G_MUL:
2411 case AMDGPU::G_SHL:
2412 case AMDGPU::G_LSHR:
2413 case AMDGPU::G_ASHR:
2414 case AMDGPU::G_SMIN:
2415 case AMDGPU::G_SMAX:
2416 case AMDGPU::G_UMIN:
2417 case AMDGPU::G_UMAX: {
2418 Register DstReg = MI.getOperand(0).getReg();
2419 LLT DstTy = MRI.getType(DstReg);
2420
2421 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2422 // Packed 16-bit operations need to be scalarized and promoted.
2423 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2424 break;
2425
2426 const RegisterBank *DstBank =
2427 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2428 if (DstBank == &AMDGPU::VGPRRegBank)
2429 break;
2430
2431 const LLT S32 = LLT::scalar(32);
2432 MachineBasicBlock *MBB = MI.getParent();
2433 MachineFunction *MF = MBB->getParent();
2434 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2435 MachineIRBuilder B(MI, ApplySALU);
2436
2437 if (DstTy.isVector()) {
2438 Register WideSrc0Lo, WideSrc0Hi;
2439 Register WideSrc1Lo, WideSrc1Hi;
2440
2441 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2442 std::tie(WideSrc0Lo, WideSrc0Hi)
2443 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2444 std::tie(WideSrc1Lo, WideSrc1Hi)
2445 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2446 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2447 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2448 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2449 MI.eraseFromParent();
2450 } else {
2451 LegalizerHelper Helper(*MF, ApplySALU, B);
2452
2453 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2454 llvm_unreachable("widen scalar should have succeeded");
2455
2456 // FIXME: s16 shift amounts should be legal.
2457 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2458 Opc == AMDGPU::G_ASHR) {
2459 B.setInsertPt(*MBB, MI.getIterator());
2460 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2461 llvm_unreachable("widen scalar should have succeeded");
2462 }
2463 }
2464
2465 return;
2466 }
2467 case AMDGPU::G_SEXT_INREG: {
2468 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2469 if (SrcRegs.empty())
2470 break; // Nothing to repair
2471
2472 const LLT S32 = LLT::scalar(32);
2473 MachineIRBuilder B(MI);
2474 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2475 GISelObserverWrapper Observer(&O);
2476 B.setChangeObserver(Observer);
2477
2478 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2479 // we would need to further expand, and doesn't let us directly set the
2480 // result registers.
2481 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2482
2483 int Amt = MI.getOperand(2).getImm();
2484 if (Amt <= 32) {
2485 // Downstream users have expectations for the high bit behavior, so freeze
2486 // incoming undefined bits.
2487 if (Amt == 32) {
2488 // The low bits are unchanged.
2489 B.buildFreeze(DstRegs[0], SrcRegs[0]);
2490 } else {
2491 auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2492 // Extend in the low bits and propagate the sign bit to the high half.
2493 B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2494 }
2495
2496 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2497 } else {
2498 // The low bits are unchanged; the extension happens entirely in the high
2499 // bits, so no freeze is required.
2500 B.buildCopy(DstRegs[0], SrcRegs[0]);
2501 B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
2502 }
2503
2504 Register DstReg = MI.getOperand(0).getReg();
2505 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2506 MI.eraseFromParent();
2507 return;
2508 }
2509 case AMDGPU::G_CTPOP:
2510 case AMDGPU::G_BITREVERSE: {
2511 const RegisterBank *DstBank =
2512 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2513 if (DstBank == &AMDGPU::SGPRRegBank)
2514 break;
2515
2516 Register SrcReg = MI.getOperand(1).getReg();
2517 const LLT S32 = LLT::scalar(32);
2518 LLT Ty = MRI.getType(SrcReg);
2519 if (Ty == S32)
2520 break;
2521
2522 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2523 MachineIRBuilder B(MI, ApplyVALU);
2524
2525 MachineFunction &MF = B.getMF();
2526 LegalizerHelper Helper(MF, ApplyVALU, B);
2527
2528 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2529 llvm_unreachable("narrowScalar should have succeeded");
2530 return;
2531 }
2532 case AMDGPU::G_AMDGPU_FFBH_U32:
2533 case AMDGPU::G_AMDGPU_FFBL_B32:
2534 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2535 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2536 const RegisterBank *DstBank =
2537 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2538 if (DstBank == &AMDGPU::SGPRRegBank)
2539 break;
2540
2541 Register SrcReg = MI.getOperand(1).getReg();
2542 const LLT S32 = LLT::scalar(32);
2543 LLT Ty = MRI.getType(SrcReg);
2544 if (Ty == S32)
2545 break;
2546
2547 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2548 // which return -1 when the input is zero:
2549 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2550 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2551 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2552 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2553 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2554 MachineIRBuilder B(MI, ApplyVALU);
2555 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2556 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2557 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2558 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2559 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2560 : Opc;
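// For the leading-zero variants start from the high half, for the
// trailing-zero variants from the low half; the other half gets the +32
// adjustment before the umin.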
2561 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2562 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2563 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2564 unsigned AddOpc =
2565 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2566 ? AMDGPU::G_ADD
2567 : AMDGPU::G_UADDSAT;
2568 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2569 Register DstReg = MI.getOperand(0).getReg();
2570 B.buildUMin(DstReg, X, Y);
2571 MI.eraseFromParent();
2572 return;
2573 }
2574 case AMDGPU::G_SEXT:
2575 case AMDGPU::G_ZEXT:
2576 case AMDGPU::G_ANYEXT: {
2577 Register SrcReg = MI.getOperand(1).getReg();
2578 LLT SrcTy = MRI.getType(SrcReg);
2579 const bool Signed = Opc == AMDGPU::G_SEXT;
2580
2581 assert(OpdMapper.getVRegs(1).empty());
2582
2583 MachineIRBuilder B(MI);
2584 const RegisterBank *SrcBank =
2585 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2586
2587 Register DstReg = MI.getOperand(0).getReg();
2588 LLT DstTy = MRI.getType(DstReg);
2589 if (DstTy.isScalar() &&
2590 SrcBank != &AMDGPU::SGPRRegBank &&
2591 SrcBank != &AMDGPU::VCCRegBank &&
2592 // FIXME: Should handle any type that rounds to s64 when irregular
2593 // breakdowns are supported.
2594 DstTy.getSizeInBits() == 64 &&
2595 SrcTy.getSizeInBits() <= 32) {
2596 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2597
2598 // Extend to 32-bit, and then extend the low half.
2599 if (Signed) {
2600 // TODO: Should really be buildSExtOrCopy
2601 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2602 } else if (Opc == AMDGPU::G_ZEXT) {
2603 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2604 } else {
2605 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2606 }
2607
2608 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2609 MRI.setRegBank(DstReg, *SrcBank);
2610 MI.eraseFromParent();
2611 return;
2612 }
2613
2614 if (SrcTy != LLT::scalar(1))
2615 return;
2616
2617 // It is not legal to have a legalization artifact with a VCC source. Rather
2618 // than introducing a copy, insert the select we would have to select the
2619 // copy to.
2620 if (SrcBank == &AMDGPU::VCCRegBank) {
2621 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2622
2623 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2624
2625 unsigned DstSize = DstTy.getSizeInBits();
2626 // 64-bit select is SGPR only
2627 const bool UseSel64 = DstSize > 32 &&
2628 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2629
2630 // TODO: Should s16 select be legal?
2631 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2632 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2633 auto False = B.buildConstant(SelType, 0);
2634
2635 MRI.setRegBank(True.getReg(0), *DstBank);
2636 MRI.setRegBank(False.getReg(0), *DstBank);
2637 MRI.setRegBank(DstReg, *DstBank);
2638
2639 if (DstSize > 32) {
2640 B.buildSelect(DefRegs[0], SrcReg, True, False);
2641 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2642 } else if (DstSize < 32) {
2643 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2644 MRI.setRegBank(Sel.getReg(0), *DstBank);
2645 B.buildTrunc(DstReg, Sel);
2646 } else {
2647 B.buildSelect(DstReg, SrcReg, True, False);
2648 }
2649
2650 MI.eraseFromParent();
2651 return;
2652 }
2653
2654 break;
2655 }
2656 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2657 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2658
2659 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2660
2661 Register DstReg = MI.getOperand(0).getReg();
2662 Register SrcReg = MI.getOperand(1).getReg();
2663
2664 const LLT S32 = LLT::scalar(32);
2665 LLT DstTy = MRI.getType(DstReg);
2666 LLT SrcTy = MRI.getType(SrcReg);
2667
2668 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2669 return;
2670
2671 MachineIRBuilder B(MI);
2672
2673 const ValueMapping &DstMapping
2674 = OpdMapper.getInstrMapping().getOperandMapping(0);
2675 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2676 const RegisterBank *SrcBank =
2677 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2678 const RegisterBank *IdxBank =
2679 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2680
2681 Register BaseIdxReg;
2682 unsigned ConstOffset;
2683 std::tie(BaseIdxReg, ConstOffset) =
2684 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2685
2686 // See if the index is an add of a constant which will be foldable by moving
2687 // the base register of the index later if this is going to be executed in a
2688 // waterfall loop. This is essentially to reassociate the add of a constant
2689 // with the readfirstlane.
2690 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2691 ConstOffset > 0 &&
2692 ConstOffset < SrcTy.getNumElements();
2693
2694 // Move the base register. We'll re-insert the add later.
2695 if (ShouldMoveIndexIntoLoop)
2696 MI.getOperand(2).setReg(BaseIdxReg);
2697
2698 // If this is a VGPR result only because the index was a VGPR result, the
2699 // actual indexing will be done on the SGPR source vector, which will
2700 // produce a scalar result. We need to copy to the VGPR result inside the
2701 // waterfall loop.
2702 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2703 SrcBank == &AMDGPU::SGPRRegBank;
2704 if (DstRegs.empty()) {
2705 applyDefaultMapping(OpdMapper);
2706
2707 executeInWaterfallLoop(MI, MRI, { 2 });
2708
2709 if (NeedCopyToVGPR) {
2710 // We don't want a phi for this temporary reg.
2711 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2712 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2713 MI.getOperand(0).setReg(TmpReg);
2714 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2715
2716 // Use a v_mov_b32 here to make the exec dependency explicit.
2717 buildVCopy(B, DstReg, TmpReg);
2718 }
2719
2720 // Re-insert the constant offset add inside the waterfall loop.
2721 if (ShouldMoveIndexIntoLoop)
2722 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2723
2724 return;
2725 }
2726
2727 assert(DstTy.getSizeInBits() == 64);
2728
2729 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2730
2731 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2732 auto One = B.buildConstant(S32, 1);
2733
2734 MachineBasicBlock::iterator MII = MI.getIterator();
2735
2736 // Split the vector index into 32-bit pieces. Prepare to move all of the
2737 // new instructions into a waterfall loop if necessary.
2738 //
2739 // Don't put the bitcast or constant in the loop.
2740 MachineInstrSpan Span(MII, &B.getMBB());
2741
2742 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2743 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2744 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2745
2746 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2747 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2748
2749 MRI.setRegBank(DstReg, *DstBank);
2750 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2751 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2752 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2753 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2754
2755 SmallSet<Register, 4> OpsToWaterfall;
2756 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2757 MI.eraseFromParent();
2758 return;
2759 }
2760
2761 // Remove the original instruction to avoid potentially confusing the
2762 // waterfall loop logic.
2763 B.setInstr(*Span.begin());
2764 MI.eraseFromParent();
2765 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2766 OpsToWaterfall, MRI);
2767
2768 if (NeedCopyToVGPR) {
2769 MachineBasicBlock *LoopBB = Extract1->getParent();
2770 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2771 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2772 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2773 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2774
2775 Extract0->getOperand(0).setReg(TmpReg0);
2776 Extract1->getOperand(0).setReg(TmpReg1);
2777
2778 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2779
2780 buildVCopy(B, DstRegs[0], TmpReg0);
2781 buildVCopy(B, DstRegs[1], TmpReg1);
2782 }
2783
2784 if (ShouldMoveIndexIntoLoop)
2785 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2786
2787 return;
2788 }
2789 case AMDGPU::G_INSERT_VECTOR_ELT: {
2790 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2791
2792 Register DstReg = MI.getOperand(0).getReg();
2793 LLT VecTy = MRI.getType(DstReg);
2794
2795 assert(OpdMapper.getVRegs(0).empty());
2796 assert(OpdMapper.getVRegs(3).empty());
2797
2798 if (substituteSimpleCopyRegs(OpdMapper, 1))
2799 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2800
2801 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2802 return;
2803
2804 const RegisterBank *IdxBank =
2805 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2806
2807 Register SrcReg = MI.getOperand(1).getReg();
2808 Register InsReg = MI.getOperand(2).getReg();
2809 LLT InsTy = MRI.getType(InsReg);
2810 (void)InsTy;
2811
2812 Register BaseIdxReg;
2813 unsigned ConstOffset;
2814 std::tie(BaseIdxReg, ConstOffset) =
2815 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2816
2817 // See if the index is an add of a constant which will be foldable by moving
2818 // the base register of the index later if this is going to be executed in a
2819 // waterfall loop. This is essentially to reassociate the add of a constant
2820 // with the readfirstlane.
2821 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2822 ConstOffset > 0 &&
2823 ConstOffset < VecTy.getNumElements();
2824
2825 // Move the base register. We'll re-insert the add later.
2826 if (ShouldMoveIndexIntoLoop)
2827 MI.getOperand(3).setReg(BaseIdxReg);
2828
2829
2830 if (InsRegs.empty()) {
2831 executeInWaterfallLoop(MI, MRI, { 3 });
2832
2833 // Re-insert the constant offset add inside the waterfall loop.
2834 if (ShouldMoveIndexIntoLoop) {
2835 MachineIRBuilder B(MI);
2836 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2837 }
2838
2839 return;
2840 }
2841
2842
2843 assert(InsTy.getSizeInBits() == 64);
2844
2845 const LLT S32 = LLT::scalar(32);
2846 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2847
2848 MachineIRBuilder B(MI);
2849 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2850 auto One = B.buildConstant(S32, 1);
2851
2852 // Split the vector index into 32-bit pieces. Prepare to move all of the
2853 // new instructions into a waterfall loop if necessary.
2854 //
2855 // Don't put the bitcast or constant in the loop.
2856 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2857
2858 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2859 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2860 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2861
2862 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2863 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2864
2865 const RegisterBank *DstBank =
2866 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2867 const RegisterBank *SrcBank =
2868 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2869 const RegisterBank *InsSrcBank =
2870 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2871
2872 MRI.setRegBank(InsReg, *InsSrcBank);
2873 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2874 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2875 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2876 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2877 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2878 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2879
2880
2881 SmallSet<Register, 4> OpsToWaterfall;
2882 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2883 B.setInsertPt(B.getMBB(), MI);
2884 B.buildBitcast(DstReg, InsHi);
2885 MI.eraseFromParent();
2886 return;
2887 }
2888
2889 B.setInstr(*Span.begin());
2890 MI.eraseFromParent();
2891
2892 // Figure out the point after the waterfall loop before mangling the control
2893 // flow.
2894 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2895 OpsToWaterfall, MRI);
2896
2897 // The insertion point is now right after the original instruction.
2898 //
2899 // Keep the bitcast to the original vector type out of the loop. Doing this
2900 // saves an extra phi we don't need inside the loop.
2901 B.buildBitcast(DstReg, InsHi);
2902
2903 // Re-insert the constant offset add inside the waterfall loop.
2904 if (ShouldMoveIndexIntoLoop)
2905 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2906
2907 return;
2908 }
2909 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2910 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2911 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2912 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2913 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2914 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2915 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
2916 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2917 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2918 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2919 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2920 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2921 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2922 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2923 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2924 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2925 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2926 applyDefaultMapping(OpdMapper);
2927 executeInWaterfallLoop(MI, MRI, {1, 4});
2928 return;
2929 }
2930 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2931 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2932 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2933 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2934 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2935 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2936 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2937 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2938 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2939 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2940 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2941 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2942 applyDefaultMapping(OpdMapper);
2943 executeInWaterfallLoop(MI, MRI, {2, 5});
2944 return;
2945 }
2946 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2947 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2948 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2949 applyDefaultMapping(OpdMapper);
2950 executeInWaterfallLoop(MI, MRI, {2, 5});
2951 return;
2952 }
2953 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2954 applyDefaultMapping(OpdMapper);
2955 executeInWaterfallLoop(MI, MRI, {3, 6});
2956 return;
2957 }
2958 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2959 applyMappingSBufferLoad(OpdMapper);
2960 return;
2961 }
2962 case AMDGPU::G_INTRINSIC: {
2963 switch (MI.getIntrinsicID()) {
2964 case Intrinsic::amdgcn_readlane: {
2965 substituteSimpleCopyRegs(OpdMapper, 2);
2966
2967 assert(OpdMapper.getVRegs(0).empty());
2968 assert(OpdMapper.getVRegs(3).empty());
2969
2970 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2971 // waterfall loop, so assume it's a uniform value.
2972 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2973 return;
2974 }
2975 case Intrinsic::amdgcn_writelane: {
2976 assert(OpdMapper.getVRegs(0).empty());
2977 assert(OpdMapper.getVRegs(2).empty());
2978 assert(OpdMapper.getVRegs(3).empty());
2979
2980 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2981 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2982 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2983 return;
2984 }
2985 case Intrinsic::amdgcn_interp_p1:
2986 case Intrinsic::amdgcn_interp_p2:
2987 case Intrinsic::amdgcn_interp_mov:
2988 case Intrinsic::amdgcn_interp_p1_f16:
2989 case Intrinsic::amdgcn_interp_p2_f16:
2990 case Intrinsic::amdgcn_lds_param_load: {
2991 applyDefaultMapping(OpdMapper);
2992
2993       // Readfirstlane for the m0 value, which is always the last operand.
2994 // FIXME: Should this be a waterfall loop instead?
2995 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2996 return;
2997 }
2998 case Intrinsic::amdgcn_interp_inreg_p10:
2999 case Intrinsic::amdgcn_interp_inreg_p2:
3000 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3001 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3002 applyDefaultMapping(OpdMapper);
3003 return;
3004 case Intrinsic::amdgcn_permlane16:
3005 case Intrinsic::amdgcn_permlanex16: {
3006 // Doing a waterfall loop over these wouldn't make any sense.
3007 substituteSimpleCopyRegs(OpdMapper, 2);
3008 substituteSimpleCopyRegs(OpdMapper, 3);
3009 constrainOpWithReadfirstlane(MI, MRI, 4);
3010 constrainOpWithReadfirstlane(MI, MRI, 5);
3011 return;
3012 }
3013 case Intrinsic::amdgcn_sbfe:
3014 applyMappingBFE(OpdMapper, true);
3015 return;
3016 case Intrinsic::amdgcn_ubfe:
3017 applyMappingBFE(OpdMapper, false);
3018 return;
3019 case Intrinsic::amdgcn_ballot:
3020 // Use default handling and insert copy to vcc source.
3021 break;
3022 }
3023 break;
3024 }
3025 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3026 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3027 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3028 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3029 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3030 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3031 assert(RSrcIntrin && RSrcIntrin->IsImage);
3032 // Non-images can have complications from operands that allow both SGPR
3033 // and VGPR. For now it's too complicated to figure out the final opcode
3034 // to derive the register bank from the MCInstrDesc.
3035 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3036 return;
3037 }
3038 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3039 unsigned N = MI.getNumExplicitOperands() - 2;
3040 applyDefaultMapping(OpdMapper);
3041 executeInWaterfallLoop(MI, MRI, { N });
3042 return;
3043 }
3044 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3045 auto IntrID = MI.getIntrinsicID();
3046 switch (IntrID) {
3047 case Intrinsic::amdgcn_ds_ordered_add:
3048 case Intrinsic::amdgcn_ds_ordered_swap: {
3049 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3050 assert(OpdMapper.getVRegs(0).empty());
3051 substituteSimpleCopyRegs(OpdMapper, 3);
3052 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3053 return;
3054 }
3055 case Intrinsic::amdgcn_ds_gws_init:
3056 case Intrinsic::amdgcn_ds_gws_barrier:
3057 case Intrinsic::amdgcn_ds_gws_sema_br: {
3058       // Only the first lane is executed, so readfirstlane is safe.
3059 substituteSimpleCopyRegs(OpdMapper, 1);
3060 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3061 return;
3062 }
3063 case Intrinsic::amdgcn_ds_gws_sema_v:
3064 case Intrinsic::amdgcn_ds_gws_sema_p:
3065 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3066       // Only the first lane is executed, so readfirstlane is safe.
3067 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3068 return;
3069 }
3070 case Intrinsic::amdgcn_ds_append:
3071 case Intrinsic::amdgcn_ds_consume: {
3072 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3073 return;
3074 }
3075 case Intrinsic::amdgcn_s_sendmsg:
3076 case Intrinsic::amdgcn_s_sendmsghalt: {
3077 // FIXME: Should this use a waterfall loop?
3078 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3079 return;
3080 }
3081 case Intrinsic::amdgcn_s_setreg: {
3082 constrainOpWithReadfirstlane(MI, MRI, 2);
3083 return;
3084 }
3085 case Intrinsic::amdgcn_raw_buffer_load_lds: {
3086 applyDefaultMapping(OpdMapper);
3087 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3088 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3089 constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
3090 return;
3091 }
3092 case Intrinsic::amdgcn_struct_buffer_load_lds: {
3093 applyDefaultMapping(OpdMapper);
3094 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3095 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3096 constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
3097 return;
3098 }
3099 case Intrinsic::amdgcn_global_load_lds: {
3100 applyDefaultMapping(OpdMapper);
3101 constrainOpWithReadfirstlane(MI, MRI, 2);
3102 return;
3103 }
3104 case Intrinsic::amdgcn_lds_direct_load: {
3105 applyDefaultMapping(OpdMapper);
3106       // Readfirstlane for the m0 value, which is always the last operand.
3107 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3108 return;
3109 }
3110 case Intrinsic::amdgcn_exp_row:
3111 applyDefaultMapping(OpdMapper);
3112 constrainOpWithReadfirstlane(MI, MRI, 8); // M0
3113 return;
3114 default: {
3115 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3116 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3117 // Non-images can have complications from operands that allow both SGPR
3118 // and VGPR. For now it's too complicated to figure out the final opcode
3119 // to derive the register bank from the MCInstrDesc.
3120 if (RSrcIntrin->IsImage) {
3121 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3122 return;
3123 }
3124 }
3125
3126 break;
3127 }
3128 }
3129 break;
3130 }
3131 case AMDGPU::G_SI_CALL: {
3132 // Use a set to avoid extra readfirstlanes in the case where multiple
3133 // operands are the same register.
3134 SmallSet<Register, 4> SGPROperandRegs;
3135
3136 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3137 break;
3138
3139     // Move all copies to physical SGPRs that are used by the call instruction
3140     // into the loop block. Search backwards from the call for these copies,
3141     // stopping at the ADJCALLSTACKUP.
3142 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3143 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3144
3145 // Move all non-copies before the copies, so that a complete range can be
3146 // moved into the waterfall loop.
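    // Rough shape of the region being reorganized (illustrative only; the
    // actual registers depend on the calling convention):
    //   ADJCALLSTACKUP ...
    //   <non-copies>              ; hoisted above the copies below
    //   $sgprN = COPY %arg        ; argument copies kept adjacent to the call
    //   ... = G_SI_CALL %callee   ; waterfalled when %callee is divergent
    //   %ret = COPY $vgpr0        ; return-value copies also kept with the call
    //   ADJCALLSTACKDOWN ...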
3147 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3148 // Count of NonCopyInstrs found until the current LastCopy.
3149 unsigned NonCopyInstrsLen = 0;
3150 MachineBasicBlock::iterator Start(&MI);
3151 MachineBasicBlock::iterator LastCopy = Start;
3152 MachineBasicBlock *MBB = MI.getParent();
3153 const SIMachineFunctionInfo *Info =
3154 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3155 while (Start->getOpcode() != FrameSetupOpcode) {
3156 --Start;
3157 bool IsCopy = false;
3158 if (Start->getOpcode() == AMDGPU::COPY) {
3159 auto &Dst = Start->getOperand(0);
3160 if (Dst.isReg()) {
3161 Register Reg = Dst.getReg();
3162 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3163 IsCopy = true;
3164 } else {
3165 // Also move the copy from the scratch rsrc descriptor into the loop
3166 // to allow it to be optimized away.
3167 auto &Src = Start->getOperand(1);
3168 if (Src.isReg()) {
3169 Reg = Src.getReg();
3170 IsCopy = Info->getScratchRSrcReg() == Reg;
3171 }
3172 }
3173 }
3174 }
3175
3176 if (IsCopy) {
3177 LastCopy = Start;
3178 NonCopyInstrsLen = NonCopyInstrs.size();
3179 } else {
3180 NonCopyInstrs.push_back(&*Start);
3181 }
3182 }
3183 NonCopyInstrs.resize(NonCopyInstrsLen);
3184
3185 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3186 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3187 }
3188 Start = LastCopy;
3189
3190 // Do the same for copies after the loop
3191 NonCopyInstrs.clear();
3192 NonCopyInstrsLen = 0;
3193 MachineBasicBlock::iterator End(&MI);
3194 LastCopy = End;
3195 while (End->getOpcode() != FrameDestroyOpcode) {
3196 ++End;
3197 bool IsCopy = false;
3198 if (End->getOpcode() == AMDGPU::COPY) {
3199 auto &Src = End->getOperand(1);
3200 if (Src.isReg()) {
3201 Register Reg = Src.getReg();
3202 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3203 }
3204 }
3205
3206 if (IsCopy) {
3207 LastCopy = End;
3208 NonCopyInstrsLen = NonCopyInstrs.size();
3209 } else {
3210 NonCopyInstrs.push_back(&*End);
3211 }
3212 }
3213 NonCopyInstrs.resize(NonCopyInstrsLen);
3214
3215 End = LastCopy;
3216 ++LastCopy;
3217 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3218 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3219 }
3220
3221 ++End;
3222 MachineIRBuilder B(*Start);
3223 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3224 break;
3225 }
3226 case AMDGPU::G_LOAD:
3227 case AMDGPU::G_ZEXTLOAD:
3228 case AMDGPU::G_SEXTLOAD: {
3229 if (applyMappingLoad(MI, OpdMapper, MRI))
3230 return;
3231 break;
3232 }
3233 case AMDGPU::G_DYN_STACKALLOC:
3234 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3235 return;
3236 case AMDGPU::G_SBFX:
3237 applyMappingBFE(OpdMapper, /*Signed*/ true);
3238 return;
3239 case AMDGPU::G_UBFX:
3240 applyMappingBFE(OpdMapper, /*Signed*/ false);
3241 return;
3242 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3243 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3244 applyMappingMAD_64_32(OpdMapper);
3245 return;
3246 default:
3247 break;
3248 }
3249
3250 return applyDefaultMapping(OpdMapper);
3251 }
3252
3253 // vgpr, sgpr -> vgpr
3254 // vgpr, agpr -> vgpr
3255 // agpr, agpr -> agpr
3256 // agpr, sgpr -> vgpr
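// For completeness (implied by the code below, not an extra rule):
//   sgpr, sgpr -> sgpr
//   invalid, x -> x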
3257 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3258 if (RB0 == AMDGPU::InvalidRegBankID)
3259 return RB1;
3260 if (RB1 == AMDGPU::InvalidRegBankID)
3261 return RB0;
3262
3263 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3264 return AMDGPU::SGPRRegBankID;
3265
3266 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3267 return AMDGPU::AGPRRegBankID;
3268
3269 return AMDGPU::VGPRRegBankID;
3270 }
3271
3272 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3273 if (RB0 == AMDGPU::InvalidRegBankID)
3274 return RB1;
3275 if (RB1 == AMDGPU::InvalidRegBankID)
3276 return RB0;
3277
3278 // vcc, vcc -> vcc
3279 // vcc, sgpr -> vcc
3280 // vcc, vgpr -> vcc
3281 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3282 return AMDGPU::VCCRegBankID;
3283
3284   // Neither operand is vcc here, so fall back to the plain bank union.
3285 return regBankUnion(RB0, RB1);
3286 }
3287
3288 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3289 const MachineInstr &MI) const {
3290 unsigned RegBank = AMDGPU::InvalidRegBankID;
3291
3292 for (const MachineOperand &MO : MI.operands()) {
3293 if (!MO.isReg())
3294 continue;
3295 Register Reg = MO.getReg();
3296 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3297 RegBank = regBankUnion(RegBank, Bank->getID());
3298 if (RegBank == AMDGPU::VGPRRegBankID)
3299 break;
3300 }
3301 }
3302
3303 return RegBank;
3304 }
3305
3306 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3307 const MachineFunction &MF = *MI.getParent()->getParent();
3308 const MachineRegisterInfo &MRI = MF.getRegInfo();
3309 for (const MachineOperand &MO : MI.operands()) {
3310 if (!MO.isReg())
3311 continue;
3312 Register Reg = MO.getReg();
3313 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3314 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3315 return false;
3316 }
3317 }
3318 return true;
3319 }
3320
3321 const RegisterBankInfo::InstructionMapping &
3322 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3323 const MachineFunction &MF = *MI.getParent()->getParent();
3324 const MachineRegisterInfo &MRI = MF.getRegInfo();
3325 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3326
3327 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3328 const MachineOperand &SrcOp = MI.getOperand(i);
3329 if (!SrcOp.isReg())
3330 continue;
3331
3332 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3333 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3334 }
3335 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3336 MI.getNumOperands());
3337 }
3338
3339 const RegisterBankInfo::InstructionMapping &
3340 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3341 const MachineFunction &MF = *MI.getParent()->getParent();
3342 const MachineRegisterInfo &MRI = MF.getRegInfo();
3343 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3344
3345 // Even though we technically could use SGPRs, this would require knowledge of
3346 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3347 //
3348 // TODO: Unary ops are trivially OK, so accept SGPRs?
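  // Illustrative example (not part of the original comments): for
  //   %d:_(s32) = G_FADD %a:_(s32), %b:_(s32)
  // every register operand gets a 32-bit VGPR mapping, while an s1 operand
  // (e.g. a carry-out or compare result) is mapped to the VCC bank.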
3349 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3350 const MachineOperand &Src = MI.getOperand(i);
3351 if (!Src.isReg())
3352 continue;
3353
3354 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3355 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3356 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3357 }
3358
3359 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3360 MI.getNumOperands());
3361 }
3362
3363 const RegisterBankInfo::InstructionMapping &
3364 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3365 const MachineFunction &MF = *MI.getParent()->getParent();
3366 const MachineRegisterInfo &MRI = MF.getRegInfo();
3367 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3368
3369 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3370 const MachineOperand &Op = MI.getOperand(I);
3371 if (!Op.isReg())
3372 continue;
3373
3374 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3375 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3376 }
3377
3378 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3379 MI.getNumOperands());
3380 }
3381
3382 const RegisterBankInfo::InstructionMapping &
3383 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3384 const MachineInstr &MI,
3385 int RsrcIdx) const {
3386 // The reported argument index is relative to the IR intrinsic call arguments,
3387 // so we need to shift by the number of defs and the intrinsic ID.
3388 RsrcIdx += MI.getNumExplicitDefs() + 1;
3389
3390 const int NumOps = MI.getNumOperands();
3391 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3392
3393 // TODO: Should packed/unpacked D16 difference be reported here as part of
3394 // the value mapping?
3395 for (int I = 0; I != NumOps; ++I) {
3396 if (!MI.getOperand(I).isReg())
3397 continue;
3398
3399 Register OpReg = MI.getOperand(I).getReg();
3400 // We replace some dead address operands with $noreg
3401 if (!OpReg)
3402 continue;
3403
3404 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3405
3406 // FIXME: Probably need a new intrinsic register bank searchable table to
3407 // handle arbitrary intrinsics easily.
3408 //
3409 // If this has a sampler, it immediately follows rsrc.
3410 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3411
3412 if (MustBeSGPR) {
3413       // This must be an SGPR, so we must report whatever it is as legal.
3414 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3415 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3416 } else {
3417 // Some operands must be VGPR, and these are easy to copy to.
3418 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3419 }
3420 }
3421
3422 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3423 }
3424
3425 /// Return the mapping for a pointer argument.
3426 const RegisterBankInfo::ValueMapping *
3427 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3428 Register PtrReg) const {
3429 LLT PtrTy = MRI.getType(PtrReg);
3430 unsigned Size = PtrTy.getSizeInBits();
3431 if (Subtarget.useFlatForGlobal() ||
3432 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3433 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3434
3435 // If we're using MUBUF instructions for global memory, an SGPR base register
3436 // is possible. Otherwise this needs to be a VGPR.
3437 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3438 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3439 }
3440
3441 const RegisterBankInfo::InstructionMapping &
3442 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3443
3444 const MachineFunction &MF = *MI.getParent()->getParent();
3445 const MachineRegisterInfo &MRI = MF.getRegInfo();
3446 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3447 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3448 Register PtrReg = MI.getOperand(1).getReg();
3449 LLT PtrTy = MRI.getType(PtrReg);
3450 unsigned AS = PtrTy.getAddressSpace();
3451 unsigned PtrSize = PtrTy.getSizeInBits();
3452
3453 const ValueMapping *ValMapping;
3454 const ValueMapping *PtrMapping;
3455
3456 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3457
3458 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3459 if (isScalarLoadLegal(MI)) {
3460 // We have a uniform instruction so we want to use an SMRD load
3461 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3462 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3463 } else {
3464 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3465
3466 // If we're using MUBUF instructions for global memory, an SGPR base
3467 // register is possible. Otherwise this needs to be a VGPR.
3468 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3469 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3470
3471 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3472 }
3473 } else {
3474 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3475 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3476 }
3477
3478 OpdsMapping[0] = ValMapping;
3479 OpdsMapping[1] = PtrMapping;
3480 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3481 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3482 return Mapping;
3483
3484 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3485 // handle that during instruction selection?
3486 }
3487
3488 unsigned
3489 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3490 const MachineRegisterInfo &MRI,
3491 unsigned Default) const {
3492 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3493 return Bank ? Bank->getID() : Default;
3494 }
3495
3496 const RegisterBankInfo::ValueMapping *
3497 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3498 const MachineRegisterInfo &MRI,
3499 const TargetRegisterInfo &TRI) const {
3500   // Lie and claim anything is legal, even though this needs to be an SGPR.
3501 // applyMapping will have to deal with it as a waterfall loop.
3502 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3503 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3504 return AMDGPU::getValueMapping(Bank, Size);
3505 }
3506
3507 const RegisterBankInfo::ValueMapping *
3508 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3509 const MachineRegisterInfo &MRI,
3510 const TargetRegisterInfo &TRI) const {
3511 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3512 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3513 }
3514
3515 const RegisterBankInfo::ValueMapping *
3516 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3517 const MachineRegisterInfo &MRI,
3518 const TargetRegisterInfo &TRI) const {
3519 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3520 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3521 }
3522
3523 ///
3524 /// This function must return a legal mapping, because
3525 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3526 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a copy from
3527 /// VGPR to SGPR to be generated is illegal.
3528 ///
3529 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3530 // legal. These will be dealt with in applyMappingImpl.
3531 //
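// Example of this policy (sketch): an operand that should be uniform but has
// already been assigned the VGPR bank is still reported with its current bank
// here (see getSGPROpMapping), and applyMappingImpl later repairs it with a
// readfirstlane or a waterfall loop.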
3532 const RegisterBankInfo::InstructionMapping &
3533 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3534 const MachineFunction &MF = *MI.getParent()->getParent();
3535 const MachineRegisterInfo &MRI = MF.getRegInfo();
3536
3537 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3538 // The default logic bothers to analyze impossible alternative mappings. We
3539 // want the most straightforward mapping, so just directly handle this.
3540 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3541 *TRI);
3542 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3543 *TRI);
3544 assert(SrcBank && "src bank should have been assigned already");
3545 if (!DstBank)
3546 DstBank = SrcBank;
3547
3548 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3549 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3550 cannotCopy(*DstBank, *SrcBank, Size))
3551 return getInvalidInstructionMapping();
3552
3553 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3554 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3555 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3556 OpdsMapping[0] = &ValMap;
3557 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3558 OpdsMapping[1] = &ValMap;
3559
3560 return getInstructionMapping(
3561 1, /*Cost*/ 1,
3562 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3563 }
3564
3565 if (MI.isRegSequence()) {
3566 // If any input is a VGPR, the result must be a VGPR. The default handling
3567 // assumes any copy between banks is legal.
3568 unsigned BankID = AMDGPU::SGPRRegBankID;
3569
3570 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3571 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3572 // It doesn't make sense to use vcc or scc banks here, so just ignore
3573 // them.
3574 if (OpBank != AMDGPU::SGPRRegBankID) {
3575 BankID = AMDGPU::VGPRRegBankID;
3576 break;
3577 }
3578 }
3579 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3580
3581 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3582 return getInstructionMapping(
3583 1, /*Cost*/ 1,
3584 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3585 }
3586
3587 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3588 // properly.
3589 //
3590 // TODO: There are additional exec masking dependencies to analyze.
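  // Illustrative example (assumption, mirroring the loop below): a G_PHI with
  // one SGPR and one VGPR input gets a VGPR result, and an s1 phi with any
  // VCC input resolves to the VCC bank via regBankBoolUnion.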
3591 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3592 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3593 Register DstReg = MI.getOperand(0).getReg();
3594
3595 // Sometimes the result may have already been assigned a bank.
3596 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3597 ResultBank = DstBank->getID();
3598
3599 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3600 Register Reg = MI.getOperand(I).getReg();
3601 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3602
3603 // FIXME: Assuming VGPR for any undetermined inputs.
3604 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3605 ResultBank = AMDGPU::VGPRRegBankID;
3606 break;
3607 }
3608
3609 // FIXME: Need to promote SGPR case to s32
3610 unsigned OpBank = Bank->getID();
3611 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3612 }
3613
3614 assert(ResultBank != AMDGPU::InvalidRegBankID);
3615
3616 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3617
3618 const ValueMapping &ValMap =
3619 getValueMapping(0, Size, getRegBank(ResultBank));
3620 return getInstructionMapping(
3621 1, /*Cost*/ 1,
3622 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3623 }
3624
3625 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3626 if (Mapping.isValid())
3627 return Mapping;
3628
3629 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3630
3631 switch (MI.getOpcode()) {
3632 default:
3633 return getInvalidInstructionMapping();
3634
3635 case AMDGPU::G_AND:
3636 case AMDGPU::G_OR:
3637 case AMDGPU::G_XOR: {
3638 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3639 if (Size == 1) {
3640 const RegisterBank *DstBank
3641 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3642
3643 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3644 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3645 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3646 if (DstBank) {
3647 TargetBankID = DstBank->getID();
3648 if (DstBank == &AMDGPU::VCCRegBank) {
3649 TargetBankID = AMDGPU::VCCRegBankID;
3650 BankLHS = AMDGPU::VCCRegBankID;
3651 BankRHS = AMDGPU::VCCRegBankID;
3652 } else {
3653 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3654 AMDGPU::SGPRRegBankID);
3655 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3656 AMDGPU::SGPRRegBankID);
3657 }
3658 } else {
3659 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3660 AMDGPU::VCCRegBankID);
3661 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3662 AMDGPU::VCCRegBankID);
3663
3664 // Both inputs should be true booleans to produce a boolean result.
3665 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3666 TargetBankID = AMDGPU::VGPRRegBankID;
3667 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3668 TargetBankID = AMDGPU::VCCRegBankID;
3669 BankLHS = AMDGPU::VCCRegBankID;
3670 BankRHS = AMDGPU::VCCRegBankID;
3671 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3672 TargetBankID = AMDGPU::SGPRRegBankID;
3673 }
3674 }
3675
3676 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3677 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3678 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3679 break;
3680 }
3681
3682 if (Size == 64) {
3683
3684 if (isSALUMapping(MI)) {
3685 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3686 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3687 } else {
3688 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3689 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3690 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3691
3692 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3693 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3694 }
3695
3696 break;
3697 }
3698
3699 [[fallthrough]];
3700 }
3701 case AMDGPU::G_PTR_ADD:
3702 case AMDGPU::G_PTRMASK:
3703 case AMDGPU::G_ADD:
3704 case AMDGPU::G_SUB:
3705 case AMDGPU::G_MUL:
3706 case AMDGPU::G_SHL:
3707 case AMDGPU::G_LSHR:
3708 case AMDGPU::G_ASHR:
3709 case AMDGPU::G_UADDO:
3710 case AMDGPU::G_USUBO:
3711 case AMDGPU::G_UADDE:
3712 case AMDGPU::G_SADDE:
3713 case AMDGPU::G_USUBE:
3714 case AMDGPU::G_SSUBE:
3715 case AMDGPU::G_SMIN:
3716 case AMDGPU::G_SMAX:
3717 case AMDGPU::G_UMIN:
3718 case AMDGPU::G_UMAX:
3719 case AMDGPU::G_ABS:
3720 case AMDGPU::G_SHUFFLE_VECTOR:
3721 case AMDGPU::G_SBFX:
3722 case AMDGPU::G_UBFX:
3723 if (isSALUMapping(MI))
3724 return getDefaultMappingSOP(MI);
3725 [[fallthrough]];
3726
3727 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3728 case AMDGPU::G_SSUBSAT:
3729 case AMDGPU::G_UADDSAT:
3730 case AMDGPU::G_USUBSAT:
3731 case AMDGPU::G_FADD:
3732 case AMDGPU::G_FSUB:
3733 case AMDGPU::G_FPTOSI:
3734 case AMDGPU::G_FPTOUI:
3735 case AMDGPU::G_FMUL:
3736 case AMDGPU::G_FMA:
3737 case AMDGPU::G_FMAD:
3738 case AMDGPU::G_FSQRT:
3739 case AMDGPU::G_FFLOOR:
3740 case AMDGPU::G_FCEIL:
3741 case AMDGPU::G_FRINT:
3742 case AMDGPU::G_SITOFP:
3743 case AMDGPU::G_UITOFP:
3744 case AMDGPU::G_FPTRUNC:
3745 case AMDGPU::G_FPEXT:
3746 case AMDGPU::G_FEXP2:
3747 case AMDGPU::G_FLOG2:
3748 case AMDGPU::G_FMINNUM:
3749 case AMDGPU::G_FMAXNUM:
3750 case AMDGPU::G_FMINNUM_IEEE:
3751 case AMDGPU::G_FMAXNUM_IEEE:
3752 case AMDGPU::G_FCANONICALIZE:
3753 case AMDGPU::G_INTRINSIC_TRUNC:
3754 case AMDGPU::G_STRICT_FADD:
3755 case AMDGPU::G_STRICT_FSUB:
3756 case AMDGPU::G_STRICT_FMUL:
3757 case AMDGPU::G_STRICT_FMA:
3758 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3759 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3760 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3761 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3762 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3763 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3764 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3765 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3766 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3767 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3768 case AMDGPU::G_AMDGPU_SMED3:
3769 return getDefaultMappingVOP(MI);
3770 case AMDGPU::G_UMULH:
3771 case AMDGPU::G_SMULH: {
3772 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3773 return getDefaultMappingSOP(MI);
3774 return getDefaultMappingVOP(MI);
3775 }
3776 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3777 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
3778 // Three possible mappings:
3779 //
3780 // - Default SOP
3781 // - Default VOP
3782 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
3783 //
3784 // This allows instruction selection to keep the multiplication part of the
3785 // instruction on the SALU.
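    // Illustrative shape of the mixed mapping chosen further down (sketch;
    // operand order is dst, carry, src0, src1, src2):
    //   %dst:vgpr(s64), %carry:vcc(s1) =
    //       G_AMDGPU_MAD_U64_U32 %src0:sgpr(s32), %src1:sgpr(s32), %acc:vgpr(s64)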
3786 bool AllSalu = true;
3787 bool MulSalu = true;
3788 for (unsigned i = 0; i < 5; ++i) {
3789 Register Reg = MI.getOperand(i).getReg();
3790 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3791 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
3792 AllSalu = false;
3793 if (i == 2 || i == 3) {
3794 MulSalu = false;
3795 break;
3796 }
3797 }
3798 }
3799 }
3800
3801 if (AllSalu)
3802 return getDefaultMappingSOP(MI);
3803
3804 // If the multiply-add is full-rate in VALU, use that even if the
3805 // multiplication part is scalar. Accumulating separately on the VALU would
3806 // take two instructions.
3807 if (!MulSalu || Subtarget.hasFullRate64Ops())
3808 return getDefaultMappingVOP(MI);
3809
3810 // Keep the multiplication on the SALU, then accumulate on the VALU.
3811 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3812 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3813 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3814 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3815 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3816 break;
3817 }
3818 case AMDGPU::G_IMPLICIT_DEF: {
3819 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3820 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3821 break;
3822 }
3823 case AMDGPU::G_FCONSTANT:
3824 case AMDGPU::G_CONSTANT:
3825 case AMDGPU::G_GLOBAL_VALUE:
3826 case AMDGPU::G_BLOCK_ADDR:
3827 case AMDGPU::G_READCYCLECOUNTER: {
3828 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3829 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3830 break;
3831 }
3832 case AMDGPU::G_FRAME_INDEX: {
3833 // TODO: This should be the same as other constants, but eliminateFrameIndex
3834 // currently assumes VALU uses.
3835 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3836 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3837 break;
3838 }
3839 case AMDGPU::G_DYN_STACKALLOC: {
3840 // Result is always uniform, and a wave reduction is needed for the source.
3841 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3842 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3843 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3844 break;
3845 }
3846 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3847 // This case is weird because we expect a physical register in the source,
3848 // but need to set a bank anyway.
3849 //
3850 // We could select the result to SGPR or VGPR, but for the one current use
3851 // it's more practical to always use VGPR.
3852 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3853 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3854 break;
3855 }
3856 case AMDGPU::G_INSERT: {
3857 unsigned BankID = getMappingType(MRI, MI);
3858 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3859 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3860 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3861 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3862 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3863 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3864 OpdsMapping[3] = nullptr;
3865 break;
3866 }
3867 case AMDGPU::G_EXTRACT: {
3868 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3869 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3870 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3871 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3872 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3873 OpdsMapping[2] = nullptr;
3874 break;
3875 }
3876 case AMDGPU::G_BUILD_VECTOR:
3877 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3878 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3879 if (DstTy == LLT::fixed_vector(2, 16)) {
3880 unsigned DstSize = DstTy.getSizeInBits();
3881 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3882 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3883 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3884 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3885
3886 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3887 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3888 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3889 break;
3890 }
3891
3892 [[fallthrough]];
3893 }
3894 case AMDGPU::G_MERGE_VALUES:
3895 case AMDGPU::G_CONCAT_VECTORS: {
3896 unsigned Bank = getMappingType(MRI, MI);
3897 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3898 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3899
3900 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3901 // Op1 and Dst should use the same register bank.
3902 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3903 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3904 break;
3905 }
3906 case AMDGPU::G_BITREVERSE:
3907 case AMDGPU::G_BITCAST:
3908 case AMDGPU::G_INTTOPTR:
3909 case AMDGPU::G_PTRTOINT:
3910 case AMDGPU::G_FABS:
3911 case AMDGPU::G_FNEG: {
3912 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3913 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3914 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3915 break;
3916 }
3917 case AMDGPU::G_AMDGPU_FFBH_U32:
3918 case AMDGPU::G_AMDGPU_FFBL_B32:
3919 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3920 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3921 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3922 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3923 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3924 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3925 break;
3926 }
3927 case AMDGPU::G_CTPOP: {
3928 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3929 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3930 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3931
3932 // This should really be getValueMappingSGPR64Only, but allowing the generic
3933 // code to handle the register split just makes using LegalizerHelper more
3934 // difficult.
3935 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3936 break;
3937 }
3938 case AMDGPU::G_TRUNC: {
3939 Register Dst = MI.getOperand(0).getReg();
3940 Register Src = MI.getOperand(1).getReg();
3941 unsigned Bank = getRegBankID(Src, MRI);
3942 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3943 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3944 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3945 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3946 break;
3947 }
3948 case AMDGPU::G_ZEXT:
3949 case AMDGPU::G_SEXT:
3950 case AMDGPU::G_ANYEXT:
3951 case AMDGPU::G_SEXT_INREG: {
3952 Register Dst = MI.getOperand(0).getReg();
3953 Register Src = MI.getOperand(1).getReg();
3954 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3955 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3956
3957 unsigned DstBank;
3958 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3959 assert(SrcBank);
3960 switch (SrcBank->getID()) {
3961 case AMDGPU::SGPRRegBankID:
3962 DstBank = AMDGPU::SGPRRegBankID;
3963 break;
3964 default:
3965 DstBank = AMDGPU::VGPRRegBankID;
3966 break;
3967 }
3968
3969 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3970 // 32-bits, and then to 64.
3971 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3972 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3973 SrcSize);
3974 break;
3975 }
3976 case AMDGPU::G_FCMP: {
3977 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3978 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3979 OpdsMapping[1] = nullptr; // Predicate Operand.
3980 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3981 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3982 break;
3983 }
3984 case AMDGPU::G_IS_FPCLASS: {
3985 Register SrcReg = MI.getOperand(1).getReg();
3986 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
3987 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3988 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
3989 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3990 break;
3991 }
3992 case AMDGPU::G_STORE: {
3993 assert(MI.getOperand(0).isReg());
3994 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3995
3996 // FIXME: We need to specify a different reg bank once scalar stores are
3997 // supported.
3998 const ValueMapping *ValMapping =
3999 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4000 OpdsMapping[0] = ValMapping;
4001 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4002 break;
4003 }
4004 case AMDGPU::G_ICMP: {
4005 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4006 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4007
4008 // See if the result register has already been constrained to vcc, which may
4009 // happen due to control flow intrinsic lowering.
4010 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4011 AMDGPU::SGPRRegBankID);
4012 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4013 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4014
4015 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4016 Op2Bank == AMDGPU::SGPRRegBankID &&
4017 Op3Bank == AMDGPU::SGPRRegBankID &&
4018 (Size == 32 || (Size == 64 &&
4019 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4020 Subtarget.hasScalarCompareEq64()));
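    // Illustrative example (not from the original source): a fully uniform
    //   %c:_(s1) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32)
    // satisfies CanUseSCC and can select to an S_CMP_* writing SCC, while any
    // VGPR input forces a VCC-bank result produced by a V_CMP_*.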
4021
4022 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4023 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4024
4025 // TODO: Use 32-bit for scalar output size.
4026 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4027 const unsigned ResultSize = 1;
4028
4029 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4030 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4031 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4032 break;
4033 }
4034 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4035     // VGPR index can be used for waterfall when indexing an SGPR vector.
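    // Illustrative example (assumption): for
    //   %elt:_(s32) = G_EXTRACT_VECTOR_ELT %vec:sgpr(<8 x s32>), %idx:vgpr(s32)
    // the result bank is the union of the two (VGPR here); applyMappingImpl
    // handles the divergent-index case with a waterfall loop.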
4036 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4037 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4038 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4039 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4040 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4041 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4042
4043 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4044 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4045
4046     // The index can be either bank if the source vector is VGPR.
4047 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4048 break;
4049 }
4050 case AMDGPU::G_INSERT_VECTOR_ELT: {
4051 unsigned OutputBankID = isSALUMapping(MI) ?
4052 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4053
4054 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4055 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4056 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4057 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4058 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4059
4060 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4061 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4062
4063 // This is a weird case, because we need to break down the mapping based on
4064 // the register bank of a different operand.
4065 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4066 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4067 InsertSize);
4068 } else {
4069 assert(InsertSize == 32 || InsertSize == 64);
4070 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4071 }
4072
4073     // The index can be either bank if the source vector is VGPR.
4074 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4075 break;
4076 }
4077 case AMDGPU::G_UNMERGE_VALUES: {
4078 unsigned Bank = getMappingType(MRI, MI);
4079
4080 // Op1 and Dst should use the same register bank.
4081 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4082 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4083 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4084 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4085 }
4086 break;
4087 }
4088 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4090 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4091 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4092 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4093 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4094 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4095 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4096 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4097 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4098 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4099 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4100 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4101 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4102 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4103 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4104 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4105 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4106
4107 // rsrc
4108 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4109
4110 // vindex
4111 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4112
4113 // voffset
4114 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4115
4116 // soffset
4117 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4118
4119 // Any remaining operands are immediates and were correctly null
4120 // initialized.
4121 break;
4122 }
4123 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4124 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4125 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4126 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4127 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4128 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4129 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4130 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4131 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4132 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4133 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4134 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4135 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4136 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4137 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4138 // vdata_out
4139 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4140
4141 // vdata_in
4142 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4143
4144 // rsrc
4145 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4146
4147 // vindex
4148 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4149
4150 // voffset
4151 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4152
4153 // soffset
4154 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4155
4156 // Any remaining operands are immediates and were correctly null
4157 // initialized.
4158 break;
4159 }
4160 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4161 // vdata_out
4162 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4163
4164 // vdata_in
4165 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4166
4167 // cmp
4168 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4169
4170 // rsrc
4171 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4172
4173 // vindex
4174 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4175
4176 // voffset
4177 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4178
4179 // soffset
4180 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4181
4182 // Any remaining operands are immediates and were correctly null
4183 // initialized.
4184 break;
4185 }
4186 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4187 // Lie and claim everything is legal, even though some need to be
4188 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4189 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4190 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4191
4192     // We need to convert this to a MUBUF if either the resource or the offset
4193     // is a VGPR.
4194 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4195 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4196 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
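    // Illustrative consequence (sketch): an SGPR rsrc with a VGPR offset yields
    // a VGPR result bank here, and applyMappingSBufferLoad then rewrites the
    // load into the MUBUF form.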
4197
4198 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4199 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4200 break;
4201 }
4202 case AMDGPU::G_INTRINSIC: {
4203 switch (MI.getIntrinsicID()) {
4204 default:
4205 return getInvalidInstructionMapping();
4206 case Intrinsic::amdgcn_div_fmas:
4207 case Intrinsic::amdgcn_div_fixup:
4208 case Intrinsic::amdgcn_trig_preop:
4209 case Intrinsic::amdgcn_sin:
4210 case Intrinsic::amdgcn_cos:
4211 case Intrinsic::amdgcn_log_clamp:
4212 case Intrinsic::amdgcn_rcp:
4213 case Intrinsic::amdgcn_rcp_legacy:
4214 case Intrinsic::amdgcn_sqrt:
4215 case Intrinsic::amdgcn_rsq:
4216 case Intrinsic::amdgcn_rsq_legacy:
4217 case Intrinsic::amdgcn_rsq_clamp:
4218 case Intrinsic::amdgcn_fmul_legacy:
4219 case Intrinsic::amdgcn_fma_legacy:
4220 case Intrinsic::amdgcn_ldexp:
4221 case Intrinsic::amdgcn_frexp_mant:
4222 case Intrinsic::amdgcn_frexp_exp:
4223 case Intrinsic::amdgcn_fract:
4224 case Intrinsic::amdgcn_cvt_pkrtz:
4225 case Intrinsic::amdgcn_cvt_pknorm_i16:
4226 case Intrinsic::amdgcn_cvt_pknorm_u16:
4227 case Intrinsic::amdgcn_cvt_pk_i16:
4228 case Intrinsic::amdgcn_cvt_pk_u16:
4229 case Intrinsic::amdgcn_fmed3:
4230 case Intrinsic::amdgcn_cubeid:
4231 case Intrinsic::amdgcn_cubema:
4232 case Intrinsic::amdgcn_cubesc:
4233 case Intrinsic::amdgcn_cubetc:
4234 case Intrinsic::amdgcn_sffbh:
4235 case Intrinsic::amdgcn_fmad_ftz:
4236 case Intrinsic::amdgcn_mbcnt_lo:
4237 case Intrinsic::amdgcn_mbcnt_hi:
4238 case Intrinsic::amdgcn_mul_u24:
4239 case Intrinsic::amdgcn_mul_i24:
4240 case Intrinsic::amdgcn_mulhi_u24:
4241 case Intrinsic::amdgcn_mulhi_i24:
4242 case Intrinsic::amdgcn_lerp:
4243 case Intrinsic::amdgcn_sad_u8:
4244 case Intrinsic::amdgcn_msad_u8:
4245 case Intrinsic::amdgcn_sad_hi_u8:
4246 case Intrinsic::amdgcn_sad_u16:
4247 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4248 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4249 case Intrinsic::amdgcn_mqsad_u32_u8:
4250 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4251 case Intrinsic::amdgcn_alignbyte:
4252 case Intrinsic::amdgcn_perm:
4253 case Intrinsic::amdgcn_fdot2:
4254 case Intrinsic::amdgcn_sdot2:
4255 case Intrinsic::amdgcn_udot2:
4256 case Intrinsic::amdgcn_sdot4:
4257 case Intrinsic::amdgcn_udot4:
4258 case Intrinsic::amdgcn_sdot8:
4259 case Intrinsic::amdgcn_udot8:
4260 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4261 case Intrinsic::amdgcn_fdot2_f16_f16:
4262 case Intrinsic::amdgcn_fdot2_f32_bf16:
4263 case Intrinsic::amdgcn_sudot4:
4264 case Intrinsic::amdgcn_sudot8:
4265 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4266 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4267 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4268 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4269 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4270 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4271 return getDefaultMappingVOP(MI);
4272 case Intrinsic::amdgcn_sbfe:
4273 case Intrinsic::amdgcn_ubfe:
4274 if (isSALUMapping(MI))
4275 return getDefaultMappingSOP(MI);
4276 return getDefaultMappingVOP(MI);
4277 case Intrinsic::amdgcn_ds_swizzle:
4278 case Intrinsic::amdgcn_ds_permute:
4279 case Intrinsic::amdgcn_ds_bpermute:
4280 case Intrinsic::amdgcn_update_dpp:
4281 case Intrinsic::amdgcn_mov_dpp8:
4282 case Intrinsic::amdgcn_mov_dpp:
4283 case Intrinsic::amdgcn_strict_wwm:
4284 case Intrinsic::amdgcn_wwm:
4285 case Intrinsic::amdgcn_strict_wqm:
4286 case Intrinsic::amdgcn_wqm:
4287 case Intrinsic::amdgcn_softwqm:
4288 case Intrinsic::amdgcn_set_inactive:
4289 case Intrinsic::amdgcn_permlane64:
4290 return getDefaultMappingAllVGPR(MI);
4291 case Intrinsic::amdgcn_kernarg_segment_ptr:
4292 case Intrinsic::amdgcn_s_getpc:
4293 case Intrinsic::amdgcn_groupstaticsize:
4294 case Intrinsic::amdgcn_reloc_constant:
4295 case Intrinsic::returnaddress: {
4296 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4297 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4298 break;
4299 }
4300 case Intrinsic::amdgcn_wqm_vote: {
4301 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4302 OpdsMapping[0] = OpdsMapping[2]
4303 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4304 break;
4305 }
4306 case Intrinsic::amdgcn_ps_live: {
4307 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4308 break;
4309 }
4310 case Intrinsic::amdgcn_div_scale: {
4311 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4312 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4313 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4314 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4315
4316 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4317 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4318 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4319 break;
4320 }
4321 case Intrinsic::amdgcn_class: {
4322 Register Src0Reg = MI.getOperand(2).getReg();
4323 Register Src1Reg = MI.getOperand(3).getReg();
4324 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4325 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4326 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4327 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4328 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4329 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4330 break;
4331 }
4332 case Intrinsic::amdgcn_icmp:
4333 case Intrinsic::amdgcn_fcmp: {
4334 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4335 // This is not VCCRegBank because this is not used in boolean contexts.
4336 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4337 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4338 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4339 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4340 break;
4341 }
4342 case Intrinsic::amdgcn_readlane: {
4343 // This must be an SGPR, but accept a VGPR.
4344 Register IdxReg = MI.getOperand(3).getReg();
4345 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4346 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4347 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4348 [[fallthrough]];
4349 }
4350 case Intrinsic::amdgcn_readfirstlane: {
4351 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4352 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4353 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4354 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4355 break;
4356 }
4357 case Intrinsic::amdgcn_writelane: {
4358 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4359 Register SrcReg = MI.getOperand(2).getReg();
4360 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4361 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4362 Register IdxReg = MI.getOperand(3).getReg();
4363 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4364 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4365 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4366
4367 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4368 // to legalize.
4369 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4370 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4371 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4372 break;
4373 }
4374 case Intrinsic::amdgcn_if_break: {
4375 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4376 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4377 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4378 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4379 break;
4380 }
4381 case Intrinsic::amdgcn_permlane16:
4382 case Intrinsic::amdgcn_permlanex16: {
4383 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4384 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4385 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4386 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4387 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4388 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4389 break;
4390 }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
      OpdsMapping[0] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
      // vdst, srcA, srcB, srcC, idx
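      // The destination and accumulator (srcC) always use the AGPR bank here;
      // srcA, srcB and the sparsity index stay in VGPRs.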
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_lds_param_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16: {
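      // Unlike the legacy interp intrinsics above, these take all of their
      // register inputs as per-lane VGPR values; there is no M0 operand.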
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
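      // The result is the raw wave mask consumed as an ordinary scalar value
      // (not a boolean context), so it uses the SGPR bank; the input condition
      // is a divergent bool in VCC.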
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
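    // The result is always a 128-bit (4 x 32-bit) per-lane value; the trailing
    // descriptor operand (operand N) is the only one that must end up scalar.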
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    if (N == 3) {
      // Sequential form: all operands combined into VGPR256/VGPR512
      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      if (Size > 256)
        Size = 512;
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    } else {
      // NSA form
      for (unsigned I = 2; I < N; ++I) {
        unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
    case Intrinsic::amdgcn_s_sendmsg_rtn: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap:
    case Intrinsic::amdgcn_ds_fadd_v2bf16: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
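      // The compressed export writes two packed 32-bit (2 x 16-bit) sources.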
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
      OpdsMapping[0] =
          getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
      OpdsMapping[1] =
          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
      OpdsMapping[3] =
          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
      OpdsMapping[4] =
          getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
      OpdsMapping[5] =
          getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
      break;
    }

    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;
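    // A select can only stay on the SALU if both value inputs are already
    // scalar; in that case its condition is a scalar bool rather than a VCC
    // lane mask.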

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
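    // The value operands are always vector; the pointer only keeps an SGPR
    // bank when an SGPR base is actually usable for its address space (see
    // getValueMappingForPtr).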
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
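    // Only a condition that is already a plain SGPR bool can stay on the
    // scalar path; anything else is treated as a divergent branch on a VCC
    // mask.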
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
