//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions
// that allows it to have a 13-bit constant offset and then promotes the
// 13-bit offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently fails to fuse stores of constants because loading the
//   constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can
        // occur with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some
        // other non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI,
               const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI,
                               const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                    const CombineInfo &Paired);
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};

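// Return the number of elements accessed by \p MI: buffer-format elements for
// MUBUF/MTBUF, the number of enabled dmask channels for MIMG, and dwords for
// the opcodes listed below. Returns 0 for opcodes this pass does not handle.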
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isMIMG(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isMIMG(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions.
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  // For the purposes of this optimization SGPR variants of buffer loads
  // are considered to be zero-offsetted SGPR_IMM loads.
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isMIMG(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  // For the purposes of this optimization SGPR variants of buffer loads
  // are considered to be zero-offsetted SGPR_IMM loads.
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions, return the original unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

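// Determine which address operands (vaddr/saddr/sbase/srsrc/soffset/...) the
// opcode \p Opc carries, so the caller knows which operands to record and
// compare as addresses.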
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isMIMG(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

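// Initialize this CombineInfo from instruction \p MI: classify the opcode and
// record its element size, offset (or dmask for MIMG), width, cache policy
// and address operands.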
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

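// Collect the registers that \p MI defines into \p RegDefs and the registers
// it reads into \p RegUses.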
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

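// Return true if instruction \p B can safely be swapped with instruction
// \p A, where \p ARegDefs and \p ARegUses hold A's register defs and uses:
// there must be no aliasing memory access and no register dependence in
// either direction.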
bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations, produce a
// new MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize() + MMOb->getSize();

  // The base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL, set the address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

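// Return true if the dmasks and the other optional immediate operands of the
// two MIMG instructions \p CI and \p Paired allow them to be merged.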
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

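// Look up the buffer format that matches \p OldFormat but has
// \p ComponentCount components, or return 0 if no such format exists.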
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

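// Return true if the offsets of \p CI and \p Paired can be encoded in a
// single merged instruction. If \p Modify is set, also rewrite the offsets
// (and, for DS, BaseOff/UseST64) into the form the merged instruction needs.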
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width,
                                     STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift the base address to decrease the offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one
      // that is aligned to the highest power of two, to maximise the chance
      // that the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one
      // that is aligned to the highest power of two, to maximise the chance
      // that the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

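// Return true if the combined width CI.Width + Paired.Width is one the merged
// opcode can encode on subtarget \p STM.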
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    }
  }
}

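// Return the register class of \p MI's data operand
// (vdst/vdata/data0/sdst/sdata), or nullptr if it has none.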
const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

1083*d415bd75Srobert /// This function assumes that CI comes before Paired in a basic block. Return
1084*d415bd75Srobert /// an insertion point for the merged instruction or nullptr on failure.
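/// For loads the merge point is CI (Paired is hoisted up to it); for stores
/// it is Paired (CI is sunk down to it), matching the Where result computed
/// below.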
1085*d415bd75Srobert SILoadStoreOptimizer::CombineInfo *
1086*d415bd75Srobert SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1087*d415bd75Srobert CombineInfo &Paired) {
1088*d415bd75Srobert // If another instruction has already been merged into CI, it may now be a
1089*d415bd75Srobert // type that we can't do any further merging into.
1090*d415bd75Srobert if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1091*d415bd75Srobert return nullptr;
1092*d415bd75Srobert assert(CI.InstClass == Paired.InstClass);
1093*d415bd75Srobert
1094*d415bd75Srobert if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1095*d415bd75Srobert getInstSubclass(Paired.I->getOpcode(), *TII))
1096*d415bd75Srobert return nullptr;
1097097a140dSpatrick
1098097a140dSpatrick // Check both offsets (or masks for MIMG) can be combined and fit in the
1099097a140dSpatrick // reduced range.
1100*d415bd75Srobert if (CI.InstClass == MIMG) {
1101*d415bd75Srobert if (!dmasksCanBeCombined(CI, *TII, Paired))
1102*d415bd75Srobert return nullptr;
1103*d415bd75Srobert } else {
1104*d415bd75Srobert if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1105*d415bd75Srobert return nullptr;
1106097a140dSpatrick }
1107097a140dSpatrick
1108*d415bd75Srobert DenseSet<Register> RegDefs;
1109*d415bd75Srobert DenseSet<Register> RegUses;
1110*d415bd75Srobert CombineInfo *Where;
1111*d415bd75Srobert if (CI.I->mayLoad()) {
1112*d415bd75Srobert // Try to hoist Paired up to CI.
1113*d415bd75Srobert addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1114*d415bd75Srobert for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1115*d415bd75Srobert if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1116*d415bd75Srobert return nullptr;
111709467b48Spatrick }
1118*d415bd75Srobert Where = &CI;
1119*d415bd75Srobert } else {
1120*d415bd75Srobert // Try to sink CI down to Paired.
1121*d415bd75Srobert addDefsUsesToList(*CI.I, RegDefs, RegUses);
1122*d415bd75Srobert for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1123*d415bd75Srobert if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1124*d415bd75Srobert return nullptr;
112509467b48Spatrick }
1126*d415bd75Srobert Where = &Paired;
112709467b48Spatrick }
112809467b48Spatrick
1129097a140dSpatrick // Call offsetsCanBeCombined with modify = true so that the offsets are
1130097a140dSpatrick // correct for the new instruction. This should return true, because
1131097a140dSpatrick // this function should only be called on CombineInfo objects that
1132097a140dSpatrick // have already been confirmed to be mergeable.
1133*d415bd75Srobert if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1134097a140dSpatrick offsetsCanBeCombined(CI, *STM, Paired, true);
1135*d415bd75Srobert return Where;
113609467b48Spatrick }
113709467b48Spatrick
113809467b48Spatrick unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
113909467b48Spatrick if (STM->ldsRequiresM0Init())
114009467b48Spatrick return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
114109467b48Spatrick return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
114209467b48Spatrick }
114309467b48Spatrick
114409467b48Spatrick unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
114509467b48Spatrick if (STM->ldsRequiresM0Init())
114609467b48Spatrick return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
114709467b48Spatrick
114809467b48Spatrick return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
114909467b48Spatrick : AMDGPU::DS_READ2ST64_B64_gfx9;
115009467b48Spatrick }
115109467b48Spatrick
115209467b48Spatrick MachineBasicBlock::iterator
1153097a140dSpatrick SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1154*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
115509467b48Spatrick MachineBasicBlock *MBB = CI.I->getParent();
115609467b48Spatrick
115709467b48Spatrick // Be careful, since the addresses could be subregisters themselves in weird
115809467b48Spatrick // cases, like vectors of pointers.
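  // (Hypothetical example: an address taken from a vector-of-pointers value
  // may only occupy a sub0_sub1-style slice of a wider register, so the
  // subregister index on AddrReg must be carried through below.)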
115909467b48Spatrick const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
116009467b48Spatrick
116109467b48Spatrick const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
116209467b48Spatrick const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
116309467b48Spatrick
116409467b48Spatrick unsigned NewOffset0 = CI.Offset;
116509467b48Spatrick unsigned NewOffset1 = Paired.Offset;
116609467b48Spatrick unsigned Opc =
116709467b48Spatrick CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
116809467b48Spatrick
116909467b48Spatrick unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
117009467b48Spatrick unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
117109467b48Spatrick
117209467b48Spatrick if (NewOffset0 > NewOffset1) {
117309467b48Spatrick // Canonicalize the merged instruction so the smaller offset comes first.
117409467b48Spatrick std::swap(NewOffset0, NewOffset1);
117509467b48Spatrick std::swap(SubRegIdx0, SubRegIdx1);
117609467b48Spatrick }
117709467b48Spatrick
117809467b48Spatrick assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
117909467b48Spatrick (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
118009467b48Spatrick
118109467b48Spatrick const MCInstrDesc &Read2Desc = TII->get(Opc);
118209467b48Spatrick
118373471bf0Spatrick const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
118409467b48Spatrick Register DestReg = MRI->createVirtualRegister(SuperRC);
118509467b48Spatrick
118609467b48Spatrick DebugLoc DL = CI.I->getDebugLoc();
118709467b48Spatrick
118809467b48Spatrick Register BaseReg = AddrReg->getReg();
118909467b48Spatrick unsigned BaseSubReg = AddrReg->getSubReg();
119009467b48Spatrick unsigned BaseRegFlags = 0;
119109467b48Spatrick if (CI.BaseOff) {
119209467b48Spatrick Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1193*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
119409467b48Spatrick .addImm(CI.BaseOff);
119509467b48Spatrick
119609467b48Spatrick BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
119709467b48Spatrick BaseRegFlags = RegState::Kill;
119809467b48Spatrick
1199*d415bd75Srobert TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
120009467b48Spatrick .addReg(ImmReg)
120109467b48Spatrick .addReg(AddrReg->getReg(), 0, BaseSubReg)
120209467b48Spatrick .addImm(0); // clamp bit
120309467b48Spatrick BaseSubReg = 0;
120409467b48Spatrick }
120509467b48Spatrick
120609467b48Spatrick MachineInstrBuilder Read2 =
1207*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
120809467b48Spatrick .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
120909467b48Spatrick .addImm(NewOffset0) // offset0
121009467b48Spatrick .addImm(NewOffset1) // offset1
121109467b48Spatrick .addImm(0) // gds
121209467b48Spatrick .cloneMergedMemRefs({&*CI.I, &*Paired.I});
121309467b48Spatrick
121409467b48Spatrick (void)Read2;
121509467b48Spatrick
121609467b48Spatrick const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
121709467b48Spatrick
121809467b48Spatrick // Copy to the old destination registers.
1219*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
122009467b48Spatrick .add(*Dest0) // Copy to same destination including flags and sub reg.
122109467b48Spatrick .addReg(DestReg, 0, SubRegIdx0);
1222*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
122309467b48Spatrick .add(*Dest1)
122409467b48Spatrick .addReg(DestReg, RegState::Kill, SubRegIdx1);
122509467b48Spatrick
122609467b48Spatrick CI.I->eraseFromParent();
122709467b48Spatrick Paired.I->eraseFromParent();
122809467b48Spatrick
122909467b48Spatrick LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
123009467b48Spatrick return Read2;
123109467b48Spatrick }
123209467b48Spatrick
123309467b48Spatrick unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
123409467b48Spatrick if (STM->ldsRequiresM0Init())
123509467b48Spatrick return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
123609467b48Spatrick return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
123709467b48Spatrick : AMDGPU::DS_WRITE2_B64_gfx9;
123809467b48Spatrick }
123909467b48Spatrick
124009467b48Spatrick unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
124109467b48Spatrick if (STM->ldsRequiresM0Init())
124209467b48Spatrick return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
124309467b48Spatrick : AMDGPU::DS_WRITE2ST64_B64;
124409467b48Spatrick
124509467b48Spatrick return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
124609467b48Spatrick : AMDGPU::DS_WRITE2ST64_B64_gfx9;
124709467b48Spatrick }
124809467b48Spatrick
1249*d415bd75Srobert MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1250*d415bd75Srobert CombineInfo &CI, CombineInfo &Paired,
1251*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
125209467b48Spatrick MachineBasicBlock *MBB = CI.I->getParent();
125309467b48Spatrick
125409467b48Spatrick   // Be sure to use .add() with the whole MachineOperand, and not .addReg(),
125509467b48Spatrick   // so we preserve the subregister index and any register flags set on them.
125609467b48Spatrick const MachineOperand *AddrReg =
125709467b48Spatrick TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
125809467b48Spatrick const MachineOperand *Data0 =
125909467b48Spatrick TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
126009467b48Spatrick const MachineOperand *Data1 =
126109467b48Spatrick TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
126209467b48Spatrick
126309467b48Spatrick unsigned NewOffset0 = CI.Offset;
126409467b48Spatrick unsigned NewOffset1 = Paired.Offset;
126509467b48Spatrick unsigned Opc =
126609467b48Spatrick CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
126709467b48Spatrick
126809467b48Spatrick if (NewOffset0 > NewOffset1) {
126909467b48Spatrick // Canonicalize the merged instruction so the smaller offset comes first.
127009467b48Spatrick std::swap(NewOffset0, NewOffset1);
127109467b48Spatrick std::swap(Data0, Data1);
127209467b48Spatrick }
127309467b48Spatrick
127409467b48Spatrick assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
127509467b48Spatrick (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
127609467b48Spatrick
127709467b48Spatrick const MCInstrDesc &Write2Desc = TII->get(Opc);
127809467b48Spatrick DebugLoc DL = CI.I->getDebugLoc();
127909467b48Spatrick
128009467b48Spatrick Register BaseReg = AddrReg->getReg();
128109467b48Spatrick unsigned BaseSubReg = AddrReg->getSubReg();
128209467b48Spatrick unsigned BaseRegFlags = 0;
128309467b48Spatrick if (CI.BaseOff) {
128409467b48Spatrick Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1285*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
128609467b48Spatrick .addImm(CI.BaseOff);
128709467b48Spatrick
128809467b48Spatrick BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
128909467b48Spatrick BaseRegFlags = RegState::Kill;
129009467b48Spatrick
1291*d415bd75Srobert TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
129209467b48Spatrick .addReg(ImmReg)
129309467b48Spatrick .addReg(AddrReg->getReg(), 0, BaseSubReg)
129409467b48Spatrick .addImm(0); // clamp bit
129509467b48Spatrick BaseSubReg = 0;
129609467b48Spatrick }
129709467b48Spatrick
129809467b48Spatrick MachineInstrBuilder Write2 =
1299*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, Write2Desc)
130009467b48Spatrick .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
130109467b48Spatrick .add(*Data0) // data0
130209467b48Spatrick .add(*Data1) // data1
130309467b48Spatrick .addImm(NewOffset0) // offset0
130409467b48Spatrick .addImm(NewOffset1) // offset1
130509467b48Spatrick .addImm(0) // gds
130609467b48Spatrick .cloneMergedMemRefs({&*CI.I, &*Paired.I});
130709467b48Spatrick
130809467b48Spatrick CI.I->eraseFromParent();
130909467b48Spatrick Paired.I->eraseFromParent();
131009467b48Spatrick
131109467b48Spatrick LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
131209467b48Spatrick return Write2;
131309467b48Spatrick }
131409467b48Spatrick
131509467b48Spatrick MachineBasicBlock::iterator
1316097a140dSpatrick SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1317*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
131809467b48Spatrick MachineBasicBlock *MBB = CI.I->getParent();
131909467b48Spatrick DebugLoc DL = CI.I->getDebugLoc();
132009467b48Spatrick const unsigned Opcode = getNewOpcode(CI, Paired);
132109467b48Spatrick
132209467b48Spatrick const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
132309467b48Spatrick
132409467b48Spatrick Register DestReg = MRI->createVirtualRegister(SuperRC);
132509467b48Spatrick unsigned MergedDMask = CI.DMask | Paired.DMask;
132609467b48Spatrick unsigned DMaskIdx =
132709467b48Spatrick AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
132809467b48Spatrick
1329*d415bd75Srobert auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
133009467b48Spatrick for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
133109467b48Spatrick if (I == DMaskIdx)
133209467b48Spatrick MIB.addImm(MergedDMask);
133309467b48Spatrick else
133409467b48Spatrick MIB.add((*CI.I).getOperand(I));
133509467b48Spatrick }
133609467b48Spatrick
133709467b48Spatrick // It shouldn't be possible to get this far if the two instructions
133809467b48Spatrick // don't have a single memoperand, because MachineInstr::mayAlias()
133909467b48Spatrick // will return true if this is the case.
134009467b48Spatrick assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
134109467b48Spatrick
1342*d415bd75Srobert MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
134309467b48Spatrick
134409467b48Spatrick unsigned SubRegIdx0, SubRegIdx1;
134509467b48Spatrick std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
134609467b48Spatrick
134709467b48Spatrick // Copy to the old destination registers.
134809467b48Spatrick const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
134909467b48Spatrick const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
135009467b48Spatrick const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
135109467b48Spatrick
1352*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
135309467b48Spatrick .add(*Dest0) // Copy to same destination including flags and sub reg.
135409467b48Spatrick .addReg(DestReg, 0, SubRegIdx0);
1355*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
135609467b48Spatrick .add(*Dest1)
135709467b48Spatrick .addReg(DestReg, RegState::Kill, SubRegIdx1);
135809467b48Spatrick
135909467b48Spatrick CI.I->eraseFromParent();
136009467b48Spatrick Paired.I->eraseFromParent();
136109467b48Spatrick return New;
136209467b48Spatrick }
136309467b48Spatrick
1364*d415bd75Srobert MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1365097a140dSpatrick CombineInfo &CI, CombineInfo &Paired,
1366*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
136709467b48Spatrick MachineBasicBlock *MBB = CI.I->getParent();
136809467b48Spatrick DebugLoc DL = CI.I->getDebugLoc();
136909467b48Spatrick const unsigned Opcode = getNewOpcode(CI, Paired);
137009467b48Spatrick
137109467b48Spatrick const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
137209467b48Spatrick
137309467b48Spatrick Register DestReg = MRI->createVirtualRegister(SuperRC);
137409467b48Spatrick unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
137509467b48Spatrick
137609467b48Spatrick // It shouldn't be possible to get this far if the two instructions
137709467b48Spatrick // don't have a single memoperand, because MachineInstr::mayAlias()
137809467b48Spatrick // will return true if this is the case.
137909467b48Spatrick assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
138009467b48Spatrick
1381*d415bd75Srobert MachineInstrBuilder New =
1382*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1383*d415bd75Srobert .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1384*d415bd75Srobert if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1385*d415bd75Srobert New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1386*d415bd75Srobert // For convenience, when SGPR_IMM buffer loads are merged into a
1387*d415bd75Srobert // zero-offset load, we generate its SGPR variant.
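  // (getNewOpcode returns the plain _SGPR opcode when CI.Offset == 0, and
  // those forms carry no immediate offset operand, hence the guard below.)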
1388*d415bd75Srobert if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset))
1389*d415bd75Srobert New.addImm(MergedOffset);
1390*d415bd75Srobert New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
139109467b48Spatrick
139209467b48Spatrick std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
139309467b48Spatrick const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
139409467b48Spatrick const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
139509467b48Spatrick
139609467b48Spatrick // Copy to the old destination registers.
139709467b48Spatrick const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
139809467b48Spatrick const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
139909467b48Spatrick const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
140009467b48Spatrick
1401*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
140209467b48Spatrick .add(*Dest0) // Copy to same destination including flags and sub reg.
140309467b48Spatrick .addReg(DestReg, 0, SubRegIdx0);
1404*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
140509467b48Spatrick .add(*Dest1)
140609467b48Spatrick .addReg(DestReg, RegState::Kill, SubRegIdx1);
140709467b48Spatrick
140809467b48Spatrick CI.I->eraseFromParent();
140909467b48Spatrick Paired.I->eraseFromParent();
141009467b48Spatrick return New;
141109467b48Spatrick }
141209467b48Spatrick
1413097a140dSpatrick MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1414097a140dSpatrick CombineInfo &CI, CombineInfo &Paired,
1415*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
141609467b48Spatrick MachineBasicBlock *MBB = CI.I->getParent();
141709467b48Spatrick DebugLoc DL = CI.I->getDebugLoc();
141809467b48Spatrick
141909467b48Spatrick const unsigned Opcode = getNewOpcode(CI, Paired);
142009467b48Spatrick
142109467b48Spatrick const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
142209467b48Spatrick
142309467b48Spatrick // Copy to the new source register.
142409467b48Spatrick Register DestReg = MRI->createVirtualRegister(SuperRC);
142509467b48Spatrick unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
142609467b48Spatrick
1427*d415bd75Srobert auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
142809467b48Spatrick
1429097a140dSpatrick AddressRegs Regs = getRegs(Opcode, *TII);
143009467b48Spatrick
1431097a140dSpatrick if (Regs.VAddr)
143209467b48Spatrick MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
143309467b48Spatrick
143409467b48Spatrick // It shouldn't be possible to get this far if the two instructions
143509467b48Spatrick // don't have a single memoperand, because MachineInstr::mayAlias()
143609467b48Spatrick // will return true if this is the case.
143709467b48Spatrick assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
143809467b48Spatrick
143909467b48Spatrick MachineInstr *New =
144009467b48Spatrick MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
144109467b48Spatrick .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
144209467b48Spatrick .addImm(MergedOffset) // offset
144373471bf0Spatrick .addImm(CI.CPol) // cpol
144409467b48Spatrick .addImm(0) // swz
1445*d415bd75Srobert .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
144609467b48Spatrick
144709467b48Spatrick std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
144809467b48Spatrick const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
144909467b48Spatrick const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
145009467b48Spatrick
145109467b48Spatrick // Copy to the old destination registers.
145209467b48Spatrick const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
145309467b48Spatrick const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
145409467b48Spatrick const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
145509467b48Spatrick
1456*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
145709467b48Spatrick .add(*Dest0) // Copy to same destination including flags and sub reg.
145809467b48Spatrick .addReg(DestReg, 0, SubRegIdx0);
1459*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
146009467b48Spatrick .add(*Dest1)
146109467b48Spatrick .addReg(DestReg, RegState::Kill, SubRegIdx1);
146209467b48Spatrick
146309467b48Spatrick CI.I->eraseFromParent();
146409467b48Spatrick Paired.I->eraseFromParent();
146509467b48Spatrick return New;
146609467b48Spatrick }
146709467b48Spatrick
1468097a140dSpatrick MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1469097a140dSpatrick CombineInfo &CI, CombineInfo &Paired,
1470*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
147109467b48Spatrick MachineBasicBlock *MBB = CI.I->getParent();
147209467b48Spatrick DebugLoc DL = CI.I->getDebugLoc();
147309467b48Spatrick
147409467b48Spatrick const unsigned Opcode = getNewOpcode(CI, Paired);
147509467b48Spatrick
147609467b48Spatrick const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
147709467b48Spatrick
147809467b48Spatrick // Copy to the new source register.
147909467b48Spatrick Register DestReg = MRI->createVirtualRegister(SuperRC);
148009467b48Spatrick unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
148109467b48Spatrick
1482*d415bd75Srobert auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
148309467b48Spatrick
1484097a140dSpatrick AddressRegs Regs = getRegs(Opcode, *TII);
148509467b48Spatrick
1486097a140dSpatrick if (Regs.VAddr)
148709467b48Spatrick MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
148809467b48Spatrick
148909467b48Spatrick unsigned JoinedFormat =
1490097a140dSpatrick getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
149109467b48Spatrick
149209467b48Spatrick // It shouldn't be possible to get this far if the two instructions
149309467b48Spatrick // don't have a single memoperand, because MachineInstr::mayAlias()
149409467b48Spatrick // will return true if this is the case.
149509467b48Spatrick assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
149609467b48Spatrick
149709467b48Spatrick MachineInstr *New =
149809467b48Spatrick MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
149909467b48Spatrick .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
150009467b48Spatrick .addImm(MergedOffset) // offset
150109467b48Spatrick .addImm(JoinedFormat) // format
150273471bf0Spatrick .addImm(CI.CPol) // cpol
150309467b48Spatrick .addImm(0) // swz
1504*d415bd75Srobert .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
150509467b48Spatrick
150609467b48Spatrick std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
150709467b48Spatrick const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
150809467b48Spatrick const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
150909467b48Spatrick
151009467b48Spatrick // Copy to the old destination registers.
151109467b48Spatrick const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
151209467b48Spatrick const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
151309467b48Spatrick const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
151409467b48Spatrick
1515*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
151609467b48Spatrick .add(*Dest0) // Copy to same destination including flags and sub reg.
151709467b48Spatrick .addReg(DestReg, 0, SubRegIdx0);
1518*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
151909467b48Spatrick .add(*Dest1)
152009467b48Spatrick .addReg(DestReg, RegState::Kill, SubRegIdx1);
152109467b48Spatrick
152209467b48Spatrick CI.I->eraseFromParent();
152309467b48Spatrick Paired.I->eraseFromParent();
152409467b48Spatrick return New;
152509467b48Spatrick }
152609467b48Spatrick
1527097a140dSpatrick MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1528097a140dSpatrick CombineInfo &CI, CombineInfo &Paired,
1529*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
153009467b48Spatrick MachineBasicBlock *MBB = CI.I->getParent();
153109467b48Spatrick DebugLoc DL = CI.I->getDebugLoc();
153209467b48Spatrick
153309467b48Spatrick const unsigned Opcode = getNewOpcode(CI, Paired);
153409467b48Spatrick
153509467b48Spatrick std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
153609467b48Spatrick const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
153709467b48Spatrick const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
153809467b48Spatrick
153909467b48Spatrick // Copy to the new source register.
154009467b48Spatrick const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
154109467b48Spatrick Register SrcReg = MRI->createVirtualRegister(SuperRC);
154209467b48Spatrick
154309467b48Spatrick const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
154409467b48Spatrick const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
154509467b48Spatrick
1546*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
154709467b48Spatrick .add(*Src0)
154809467b48Spatrick .addImm(SubRegIdx0)
154909467b48Spatrick .add(*Src1)
155009467b48Spatrick .addImm(SubRegIdx1);
155109467b48Spatrick
1552*d415bd75Srobert auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
155309467b48Spatrick .addReg(SrcReg, RegState::Kill);
155409467b48Spatrick
1555097a140dSpatrick AddressRegs Regs = getRegs(Opcode, *TII);
155609467b48Spatrick
1557097a140dSpatrick if (Regs.VAddr)
155809467b48Spatrick MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
155909467b48Spatrick
156009467b48Spatrick unsigned JoinedFormat =
1561097a140dSpatrick getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
156209467b48Spatrick
156309467b48Spatrick // It shouldn't be possible to get this far if the two instructions
156409467b48Spatrick // don't have a single memoperand, because MachineInstr::mayAlias()
156509467b48Spatrick // will return true if this is the case.
156609467b48Spatrick assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
156709467b48Spatrick
156809467b48Spatrick MachineInstr *New =
156909467b48Spatrick MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
157009467b48Spatrick .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
157109467b48Spatrick .addImm(std::min(CI.Offset, Paired.Offset)) // offset
157209467b48Spatrick .addImm(JoinedFormat) // format
157373471bf0Spatrick .addImm(CI.CPol) // cpol
157409467b48Spatrick .addImm(0) // swz
1575*d415bd75Srobert .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
157609467b48Spatrick
1577*d415bd75Srobert CI.I->eraseFromParent();
1578*d415bd75Srobert Paired.I->eraseFromParent();
1579*d415bd75Srobert return New;
1580*d415bd75Srobert }
1581*d415bd75Srobert
1582*d415bd75Srobert MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1583*d415bd75Srobert CombineInfo &CI, CombineInfo &Paired,
1584*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
1585*d415bd75Srobert MachineBasicBlock *MBB = CI.I->getParent();
1586*d415bd75Srobert DebugLoc DL = CI.I->getDebugLoc();
1587*d415bd75Srobert
1588*d415bd75Srobert const unsigned Opcode = getNewOpcode(CI, Paired);
1589*d415bd75Srobert
1590*d415bd75Srobert const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1591*d415bd75Srobert Register DestReg = MRI->createVirtualRegister(SuperRC);
1592*d415bd75Srobert
1593*d415bd75Srobert auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1594*d415bd75Srobert
1595*d415bd75Srobert if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1596*d415bd75Srobert MIB.add(*SAddr);
1597*d415bd75Srobert
1598*d415bd75Srobert MachineInstr *New =
1599*d415bd75Srobert MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1600*d415bd75Srobert .addImm(std::min(CI.Offset, Paired.Offset))
1601*d415bd75Srobert .addImm(CI.CPol)
1602*d415bd75Srobert .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1603*d415bd75Srobert
1604*d415bd75Srobert std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1605*d415bd75Srobert const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1606*d415bd75Srobert const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1607*d415bd75Srobert
1608*d415bd75Srobert // Copy to the old destination registers.
1609*d415bd75Srobert const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1610*d415bd75Srobert const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1611*d415bd75Srobert const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1612*d415bd75Srobert
1613*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1614*d415bd75Srobert .add(*Dest0) // Copy to same destination including flags and sub reg.
1615*d415bd75Srobert .addReg(DestReg, 0, SubRegIdx0);
1616*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1617*d415bd75Srobert .add(*Dest1)
1618*d415bd75Srobert .addReg(DestReg, RegState::Kill, SubRegIdx1);
1619*d415bd75Srobert
1620*d415bd75Srobert CI.I->eraseFromParent();
1621*d415bd75Srobert Paired.I->eraseFromParent();
1622*d415bd75Srobert return New;
1623*d415bd75Srobert }
1624*d415bd75Srobert
1625*d415bd75Srobert MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1626*d415bd75Srobert CombineInfo &CI, CombineInfo &Paired,
1627*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
1628*d415bd75Srobert MachineBasicBlock *MBB = CI.I->getParent();
1629*d415bd75Srobert DebugLoc DL = CI.I->getDebugLoc();
1630*d415bd75Srobert
1631*d415bd75Srobert const unsigned Opcode = getNewOpcode(CI, Paired);
1632*d415bd75Srobert
1633*d415bd75Srobert std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1634*d415bd75Srobert const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1635*d415bd75Srobert const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1636*d415bd75Srobert
1637*d415bd75Srobert // Copy to the new source register.
1638*d415bd75Srobert const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1639*d415bd75Srobert Register SrcReg = MRI->createVirtualRegister(SuperRC);
1640*d415bd75Srobert
1641*d415bd75Srobert const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1642*d415bd75Srobert const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1643*d415bd75Srobert
1644*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1645*d415bd75Srobert .add(*Src0)
1646*d415bd75Srobert .addImm(SubRegIdx0)
1647*d415bd75Srobert .add(*Src1)
1648*d415bd75Srobert .addImm(SubRegIdx1);
1649*d415bd75Srobert
1650*d415bd75Srobert auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1651*d415bd75Srobert .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1652*d415bd75Srobert .addReg(SrcReg, RegState::Kill);
1653*d415bd75Srobert
1654*d415bd75Srobert if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1655*d415bd75Srobert MIB.add(*SAddr);
1656*d415bd75Srobert
1657*d415bd75Srobert MachineInstr *New =
1658*d415bd75Srobert MIB.addImm(std::min(CI.Offset, Paired.Offset))
1659*d415bd75Srobert .addImm(CI.CPol)
1660*d415bd75Srobert .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
166109467b48Spatrick
166209467b48Spatrick CI.I->eraseFromParent();
166309467b48Spatrick Paired.I->eraseFromParent();
166409467b48Spatrick return New;
166509467b48Spatrick }
166609467b48Spatrick
166709467b48Spatrick unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
166809467b48Spatrick const CombineInfo &Paired) {
166909467b48Spatrick const unsigned Width = CI.Width + Paired.Width;
167009467b48Spatrick
1671*d415bd75Srobert switch (getCommonInstClass(CI, Paired)) {
167209467b48Spatrick default:
167309467b48Spatrick assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
167409467b48Spatrick // FIXME: Handle d16 correctly
167509467b48Spatrick return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
167609467b48Spatrick Width);
167709467b48Spatrick case TBUFFER_LOAD:
167809467b48Spatrick case TBUFFER_STORE:
167909467b48Spatrick return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
168009467b48Spatrick Width);
168109467b48Spatrick
168209467b48Spatrick case UNKNOWN:
168309467b48Spatrick llvm_unreachable("Unknown instruction class");
168409467b48Spatrick case S_BUFFER_LOAD_IMM:
168509467b48Spatrick switch (Width) {
168609467b48Spatrick default:
168709467b48Spatrick return 0;
168809467b48Spatrick case 2:
168909467b48Spatrick return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
169009467b48Spatrick case 4:
169109467b48Spatrick return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1692*d415bd75Srobert case 8:
1693*d415bd75Srobert return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1694*d415bd75Srobert }
1695*d415bd75Srobert case S_BUFFER_LOAD_SGPR_IMM:
1696*d415bd75Srobert switch (Width) {
1697*d415bd75Srobert default:
1698*d415bd75Srobert return 0;
1699*d415bd75Srobert case 2:
1700*d415bd75Srobert return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
1701*d415bd75Srobert : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1702*d415bd75Srobert case 4:
1703*d415bd75Srobert return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
1704*d415bd75Srobert : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1705*d415bd75Srobert case 8:
1706*d415bd75Srobert return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
1707*d415bd75Srobert : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1708*d415bd75Srobert }
1709*d415bd75Srobert case S_LOAD_IMM:
1710*d415bd75Srobert switch (Width) {
1711*d415bd75Srobert default:
1712*d415bd75Srobert return 0;
1713*d415bd75Srobert case 2:
1714*d415bd75Srobert return AMDGPU::S_LOAD_DWORDX2_IMM;
1715*d415bd75Srobert case 4:
1716*d415bd75Srobert return AMDGPU::S_LOAD_DWORDX4_IMM;
1717*d415bd75Srobert case 8:
1718*d415bd75Srobert return AMDGPU::S_LOAD_DWORDX8_IMM;
1719*d415bd75Srobert }
1720*d415bd75Srobert case GLOBAL_LOAD:
1721*d415bd75Srobert switch (Width) {
1722*d415bd75Srobert default:
1723*d415bd75Srobert return 0;
1724*d415bd75Srobert case 2:
1725*d415bd75Srobert return AMDGPU::GLOBAL_LOAD_DWORDX2;
1726*d415bd75Srobert case 3:
1727*d415bd75Srobert return AMDGPU::GLOBAL_LOAD_DWORDX3;
1728*d415bd75Srobert case 4:
1729*d415bd75Srobert return AMDGPU::GLOBAL_LOAD_DWORDX4;
1730*d415bd75Srobert }
1731*d415bd75Srobert case GLOBAL_LOAD_SADDR:
1732*d415bd75Srobert switch (Width) {
1733*d415bd75Srobert default:
1734*d415bd75Srobert return 0;
1735*d415bd75Srobert case 2:
1736*d415bd75Srobert return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1737*d415bd75Srobert case 3:
1738*d415bd75Srobert return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1739*d415bd75Srobert case 4:
1740*d415bd75Srobert return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1741*d415bd75Srobert }
1742*d415bd75Srobert case GLOBAL_STORE:
1743*d415bd75Srobert switch (Width) {
1744*d415bd75Srobert default:
1745*d415bd75Srobert return 0;
1746*d415bd75Srobert case 2:
1747*d415bd75Srobert return AMDGPU::GLOBAL_STORE_DWORDX2;
1748*d415bd75Srobert case 3:
1749*d415bd75Srobert return AMDGPU::GLOBAL_STORE_DWORDX3;
1750*d415bd75Srobert case 4:
1751*d415bd75Srobert return AMDGPU::GLOBAL_STORE_DWORDX4;
1752*d415bd75Srobert }
1753*d415bd75Srobert case GLOBAL_STORE_SADDR:
1754*d415bd75Srobert switch (Width) {
1755*d415bd75Srobert default:
1756*d415bd75Srobert return 0;
1757*d415bd75Srobert case 2:
1758*d415bd75Srobert return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1759*d415bd75Srobert case 3:
1760*d415bd75Srobert return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1761*d415bd75Srobert case 4:
1762*d415bd75Srobert return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1763*d415bd75Srobert }
1764*d415bd75Srobert case FLAT_LOAD:
1765*d415bd75Srobert switch (Width) {
1766*d415bd75Srobert default:
1767*d415bd75Srobert return 0;
1768*d415bd75Srobert case 2:
1769*d415bd75Srobert return AMDGPU::FLAT_LOAD_DWORDX2;
1770*d415bd75Srobert case 3:
1771*d415bd75Srobert return AMDGPU::FLAT_LOAD_DWORDX3;
1772*d415bd75Srobert case 4:
1773*d415bd75Srobert return AMDGPU::FLAT_LOAD_DWORDX4;
1774*d415bd75Srobert }
1775*d415bd75Srobert case FLAT_STORE:
1776*d415bd75Srobert switch (Width) {
1777*d415bd75Srobert default:
1778*d415bd75Srobert return 0;
1779*d415bd75Srobert case 2:
1780*d415bd75Srobert return AMDGPU::FLAT_STORE_DWORDX2;
1781*d415bd75Srobert case 3:
1782*d415bd75Srobert return AMDGPU::FLAT_STORE_DWORDX3;
1783*d415bd75Srobert case 4:
1784*d415bd75Srobert return AMDGPU::FLAT_STORE_DWORDX4;
178509467b48Spatrick }
178609467b48Spatrick case MIMG:
1787*d415bd75Srobert assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1788*d415bd75Srobert "No overlaps");
178909467b48Spatrick return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
179009467b48Spatrick }
179109467b48Spatrick }
179209467b48Spatrick
179309467b48Spatrick std::pair<unsigned, unsigned>
1794*d415bd75Srobert SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1795*d415bd75Srobert const CombineInfo &Paired) {
1796*d415bd75Srobert assert((CI.InstClass != MIMG ||
1797*d415bd75Srobert ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1798*d415bd75Srobert CI.Width + Paired.Width)) &&
179909467b48Spatrick "No overlaps");
180009467b48Spatrick
180109467b48Spatrick unsigned Idx0;
180209467b48Spatrick unsigned Idx1;
180309467b48Spatrick
1804*d415bd75Srobert static const unsigned Idxs[5][4] = {
1805*d415bd75Srobert {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1806*d415bd75Srobert {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1807*d415bd75Srobert {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1808*d415bd75Srobert {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1809*d415bd75Srobert {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1810*d415bd75Srobert };
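  // Worked example (assuming CI has the lower offset): with CI.Width == 2 and
  // Paired.Width == 1 the else-branch below picks Idx0 = Idxs[0][1]
  // (sub0_sub1) and Idx1 = Idxs[2][0] (sub2): the first two dwords for CI,
  // then the third for Paired.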
181109467b48Spatrick
1812*d415bd75Srobert assert(CI.Width >= 1 && CI.Width <= 4);
1813*d415bd75Srobert assert(Paired.Width >= 1 && Paired.Width <= 4);
1814*d415bd75Srobert
1815*d415bd75Srobert if (Paired < CI) {
181609467b48Spatrick Idx1 = Idxs[0][Paired.Width - 1];
181709467b48Spatrick Idx0 = Idxs[Paired.Width][CI.Width - 1];
181809467b48Spatrick } else {
181909467b48Spatrick Idx0 = Idxs[0][CI.Width - 1];
182009467b48Spatrick Idx1 = Idxs[CI.Width][Paired.Width - 1];
182109467b48Spatrick }
182209467b48Spatrick
1823*d415bd75Srobert return std::pair(Idx0, Idx1);
182409467b48Spatrick }
182509467b48Spatrick
182609467b48Spatrick const TargetRegisterClass *
182709467b48Spatrick SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
182809467b48Spatrick const CombineInfo &Paired) {
1829*d415bd75Srobert if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1830*d415bd75Srobert CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
183109467b48Spatrick switch (CI.Width + Paired.Width) {
183209467b48Spatrick default:
183309467b48Spatrick return nullptr;
183409467b48Spatrick case 2:
183509467b48Spatrick return &AMDGPU::SReg_64_XEXECRegClass;
183609467b48Spatrick case 4:
183709467b48Spatrick return &AMDGPU::SGPR_128RegClass;
183809467b48Spatrick case 8:
1839097a140dSpatrick return &AMDGPU::SGPR_256RegClass;
184009467b48Spatrick case 16:
1841097a140dSpatrick return &AMDGPU::SGPR_512RegClass;
184209467b48Spatrick }
184309467b48Spatrick }
184473471bf0Spatrick
184573471bf0Spatrick unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1846*d415bd75Srobert return TRI->isAGPRClass(getDataRegClass(*CI.I))
184773471bf0Spatrick ? TRI->getAGPRClassForBitWidth(BitWidth)
184873471bf0Spatrick : TRI->getVGPRClassForBitWidth(BitWidth);
184909467b48Spatrick }
185009467b48Spatrick
1851097a140dSpatrick MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1852097a140dSpatrick CombineInfo &CI, CombineInfo &Paired,
1853*d415bd75Srobert MachineBasicBlock::iterator InsertBefore) {
185409467b48Spatrick MachineBasicBlock *MBB = CI.I->getParent();
185509467b48Spatrick DebugLoc DL = CI.I->getDebugLoc();
185609467b48Spatrick
185709467b48Spatrick const unsigned Opcode = getNewOpcode(CI, Paired);
185809467b48Spatrick
185909467b48Spatrick std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
186009467b48Spatrick const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
186109467b48Spatrick const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
186209467b48Spatrick
186309467b48Spatrick // Copy to the new source register.
186409467b48Spatrick const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
186509467b48Spatrick Register SrcReg = MRI->createVirtualRegister(SuperRC);
186609467b48Spatrick
186709467b48Spatrick const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
186809467b48Spatrick const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
186909467b48Spatrick
1870*d415bd75Srobert BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
187109467b48Spatrick .add(*Src0)
187209467b48Spatrick .addImm(SubRegIdx0)
187309467b48Spatrick .add(*Src1)
187409467b48Spatrick .addImm(SubRegIdx1);
187509467b48Spatrick
1876*d415bd75Srobert auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
187709467b48Spatrick .addReg(SrcReg, RegState::Kill);
187809467b48Spatrick
1879097a140dSpatrick AddressRegs Regs = getRegs(Opcode, *TII);
188009467b48Spatrick
1881097a140dSpatrick if (Regs.VAddr)
188209467b48Spatrick MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
188309467b48Spatrick
188509467b48Spatrick // It shouldn't be possible to get this far if the two instructions
188609467b48Spatrick // don't have a single memoperand, because MachineInstr::mayAlias()
188709467b48Spatrick // will return true if this is the case.
188809467b48Spatrick assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
188909467b48Spatrick
189009467b48Spatrick MachineInstr *New =
189109467b48Spatrick MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
189209467b48Spatrick .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
189309467b48Spatrick .addImm(std::min(CI.Offset, Paired.Offset)) // offset
189473471bf0Spatrick .addImm(CI.CPol) // cpol
189509467b48Spatrick .addImm(0) // swz
1896*d415bd75Srobert .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
189709467b48Spatrick
189809467b48Spatrick CI.I->eraseFromParent();
189909467b48Spatrick Paired.I->eraseFromParent();
190009467b48Spatrick return New;
190109467b48Spatrick }
190209467b48Spatrick
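// Wrap Val as an operand: inline-encodable constants stay immediates, other
// values are materialized through a fresh S_MOV_B32 (e.g., on this target 64
// is an inline constant while a value such as 0x12345 needs the mov).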
190309467b48Spatrick MachineOperand
190409467b48Spatrick SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
190509467b48Spatrick APInt V(32, Val, true);
190609467b48Spatrick if (TII->isInlineConstant(V))
190709467b48Spatrick return MachineOperand::CreateImm(Val);
190809467b48Spatrick
190909467b48Spatrick Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
191009467b48Spatrick MachineInstr *Mov =
191109467b48Spatrick BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
191209467b48Spatrick TII->get(AMDGPU::S_MOV_B32), Reg)
191309467b48Spatrick .addImm(Val);
191409467b48Spatrick (void)Mov;
191509467b48Spatrick LLVM_DEBUG(dbgs() << " "; Mov->dump());
191609467b48Spatrick return MachineOperand::CreateReg(Reg, false);
191709467b48Spatrick }
191809467b48Spatrick
191909467b48Spatrick // Compute base address using Addr and return the final register.
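// The 64-bit add below is split into the usual carry pair, roughly (sketch):
//   %lo, %c = V_ADD_CO_U32_e64 Addr.Base.LoReg, OffsetLo
//   %hi     = V_ADDC_U32_e64   Addr.Base.HiReg, OffsetHi, %c
//   %base   = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1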
1920097a140dSpatrick Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
192109467b48Spatrick const MemAddress &Addr) const {
192209467b48Spatrick MachineBasicBlock *MBB = MI.getParent();
192309467b48Spatrick MachineBasicBlock::iterator MBBI = MI.getIterator();
192409467b48Spatrick DebugLoc DL = MI.getDebugLoc();
192509467b48Spatrick
192609467b48Spatrick assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
192709467b48Spatrick Addr.Base.LoSubReg) &&
192809467b48Spatrick "Expected 32-bit Base-Register-Low!!");
192909467b48Spatrick
193009467b48Spatrick assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
193109467b48Spatrick Addr.Base.HiSubReg) &&
193209467b48Spatrick "Expected 32-bit Base-Register-Hi!!");
193309467b48Spatrick
193409467b48Spatrick LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
193509467b48Spatrick MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
193609467b48Spatrick MachineOperand OffsetHi =
193709467b48Spatrick createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
193809467b48Spatrick
193909467b48Spatrick const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
194009467b48Spatrick Register CarryReg = MRI->createVirtualRegister(CarryRC);
194109467b48Spatrick Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
194209467b48Spatrick
194309467b48Spatrick Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
194409467b48Spatrick Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
194509467b48Spatrick MachineInstr *LoHalf =
194673471bf0Spatrick BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
194709467b48Spatrick .addReg(CarryReg, RegState::Define)
194809467b48Spatrick .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
194909467b48Spatrick .add(OffsetLo)
195009467b48Spatrick .addImm(0); // clamp bit
195109467b48Spatrick (void)LoHalf;
195209467b48Spatrick LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
195309467b48Spatrick
195409467b48Spatrick MachineInstr *HiHalf =
195509467b48Spatrick BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
195609467b48Spatrick .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
195709467b48Spatrick .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
195809467b48Spatrick .add(OffsetHi)
195909467b48Spatrick .addReg(CarryReg, RegState::Kill)
196009467b48Spatrick .addImm(0); // clamp bit
196109467b48Spatrick (void)HiHalf;
196209467b48Spatrick LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
196309467b48Spatrick
196473471bf0Spatrick Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
196509467b48Spatrick MachineInstr *FullBase =
196609467b48Spatrick BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
196709467b48Spatrick .addReg(DestSub0)
196809467b48Spatrick .addImm(AMDGPU::sub0)
196909467b48Spatrick .addReg(DestSub1)
197009467b48Spatrick .addImm(AMDGPU::sub1);
197109467b48Spatrick (void)FullBase;
197209467b48Spatrick LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
197309467b48Spatrick
197409467b48Spatrick return FullDestReg;
197509467b48Spatrick }
197609467b48Spatrick
197709467b48Spatrick // Update base and offset with the NewBase and NewOffset in MI.
197809467b48Spatrick void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1979097a140dSpatrick Register NewBase,
198009467b48Spatrick int32_t NewOffset) const {
198109467b48Spatrick auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
198209467b48Spatrick Base->setReg(NewBase);
198309467b48Spatrick Base->setIsKill(false);
198409467b48Spatrick TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
198509467b48Spatrick }
198609467b48Spatrick
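// Return the constant behind Op: either an immediate operand directly, or
// the immediate of the unique S_MOV_B32 defining Op's register; otherwise
// std::nullopt.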
1987*d415bd75Srobert std::optional<int32_t>
198809467b48Spatrick SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
198909467b48Spatrick if (Op.isImm())
199009467b48Spatrick return Op.getImm();
199109467b48Spatrick
199209467b48Spatrick if (!Op.isReg())
1993*d415bd75Srobert return std::nullopt;
199409467b48Spatrick
199509467b48Spatrick MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
199609467b48Spatrick if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
199709467b48Spatrick !Def->getOperand(1).isImm())
1998*d415bd75Srobert return std::nullopt;
199909467b48Spatrick
200009467b48Spatrick return Def->getOperand(1).getImm();
200109467b48Spatrick }
200209467b48Spatrick
200309467b48Spatrick // Analyze Base and extract:
200409467b48Spatrick // - 32bit base registers, subregisters
200509467b48Spatrick // - 64bit constant offset
200609467b48Spatrick // Expecting base computation as:
200709467b48Spatrick // %OFFSET0:sgpr_32 = S_MOV_B32 8000
200809467b48Spatrick // %LO:vgpr_32, %c:sreg_64_xexec =
200973471bf0Spatrick // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
201009467b48Spatrick // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
201109467b48Spatrick // %Base:vreg_64 =
201209467b48Spatrick // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
201309467b48Spatrick void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
201409467b48Spatrick MemAddress &Addr) const {
201509467b48Spatrick if (!Base.isReg())
201609467b48Spatrick return;
201709467b48Spatrick
201809467b48Spatrick MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
201909467b48Spatrick if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
202009467b48Spatrick || Def->getNumOperands() != 5)
202109467b48Spatrick return;
202209467b48Spatrick
202309467b48Spatrick MachineOperand BaseLo = Def->getOperand(1);
202409467b48Spatrick MachineOperand BaseHi = Def->getOperand(3);
202509467b48Spatrick if (!BaseLo.isReg() || !BaseHi.isReg())
202609467b48Spatrick return;
202709467b48Spatrick
202809467b48Spatrick MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
202909467b48Spatrick MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
203009467b48Spatrick
203173471bf0Spatrick if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
203209467b48Spatrick !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
203309467b48Spatrick return;
203409467b48Spatrick
203509467b48Spatrick const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
203609467b48Spatrick const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
203709467b48Spatrick
203809467b48Spatrick auto Offset0P = extractConstOffset(*Src0);
203909467b48Spatrick if (Offset0P)
204009467b48Spatrick BaseLo = *Src1;
204109467b48Spatrick else {
204209467b48Spatrick if (!(Offset0P = extractConstOffset(*Src1)))
204309467b48Spatrick return;
204409467b48Spatrick BaseLo = *Src0;
204509467b48Spatrick }
204609467b48Spatrick
204709467b48Spatrick Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
204809467b48Spatrick Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
204909467b48Spatrick
205009467b48Spatrick if (Src0->isImm())
205109467b48Spatrick std::swap(Src0, Src1);
205209467b48Spatrick
205309467b48Spatrick if (!Src1->isImm())
205409467b48Spatrick return;
205509467b48Spatrick
205609467b48Spatrick uint64_t Offset1 = Src1->getImm();
205709467b48Spatrick BaseHi = *Src0;
205809467b48Spatrick
205909467b48Spatrick Addr.Base.LoReg = BaseLo.getReg();
206009467b48Spatrick Addr.Base.HiReg = BaseHi.getReg();
206109467b48Spatrick Addr.Base.LoSubReg = BaseLo.getSubReg();
206209467b48Spatrick Addr.Base.HiSubReg = BaseHi.getSubReg();
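  // Reassemble the 64-bit constant from its two halves: the low add supplied
  // bits [31:0] and the high add bits [63:32].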
206309467b48Spatrick Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
206409467b48Spatrick }
206509467b48Spatrick
206609467b48Spatrick bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
206709467b48Spatrick MachineInstr &MI,
206809467b48Spatrick MemInfoMap &Visited,
206909467b48Spatrick SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
207009467b48Spatrick
207109467b48Spatrick if (!(MI.mayLoad() ^ MI.mayStore()))
207209467b48Spatrick return false;
207309467b48Spatrick
207409467b48Spatrick // TODO: Support flat and scratch.
207509467b48Spatrick if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
207609467b48Spatrick return false;
207709467b48Spatrick
2078*d415bd75Srobert if (MI.mayLoad() &&
2079*d415bd75Srobert TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
208009467b48Spatrick return false;
208109467b48Spatrick
208209467b48Spatrick if (AnchorList.count(&MI))
208309467b48Spatrick return false;
208409467b48Spatrick
208509467b48Spatrick LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
208609467b48Spatrick
208709467b48Spatrick if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
208809467b48Spatrick LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
208909467b48Spatrick return false;
209009467b48Spatrick }
209109467b48Spatrick
209209467b48Spatrick // Step1: Find the base-registers and a 64bit constant offset.
209309467b48Spatrick MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
209409467b48Spatrick MemAddress MAddr;
209509467b48Spatrick if (Visited.find(&MI) == Visited.end()) {
209609467b48Spatrick processBaseWithConstOffset(Base, MAddr);
209709467b48Spatrick Visited[&MI] = MAddr;
209809467b48Spatrick } else
209909467b48Spatrick MAddr = Visited[&MI];
210009467b48Spatrick
210109467b48Spatrick if (MAddr.Offset == 0) {
210209467b48Spatrick LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
210309467b48Spatrick " constant offsets that can be promoted.\n";);
210409467b48Spatrick return false;
210509467b48Spatrick }
210609467b48Spatrick
210709467b48Spatrick LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
210809467b48Spatrick << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
210909467b48Spatrick
211009467b48Spatrick   // Step2: Traverse through MI's basic block and find an anchor (one that has
211109467b48Spatrick   // the same base registers) with the highest 13-bit distance from MI's offset.
211209467b48Spatrick // E.g. (64bit loads)
211309467b48Spatrick // bb:
211409467b48Spatrick // addr1 = &a + 4096; load1 = load(addr1, 0)
211509467b48Spatrick // addr2 = &a + 6144; load2 = load(addr2, 0)
211609467b48Spatrick // addr3 = &a + 8192; load3 = load(addr3, 0)
211709467b48Spatrick // addr4 = &a + 10240; load4 = load(addr4, 0)
211809467b48Spatrick // addr5 = &a + 12288; load5 = load(addr5, 0)
211909467b48Spatrick //
212009467b48Spatrick // Starting from the first load, the optimization will try to find a new base
212109467b48Spatrick   // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
212209467b48Spatrick   // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
212309467b48Spatrick   // as the new base (anchor) because the maximum distance can presumably
2124*d415bd75Srobert   // accommodate more intermediate bases.
212509467b48Spatrick //
212609467b48Spatrick // Step3: move (&a + 8192) above load1. Compute and promote offsets from
212709467b48Spatrick // (&a + 8192) for load1, load2, load4.
212809467b48Spatrick // addr = &a + 8192
212909467b48Spatrick // load1 = load(addr, -4096)
213009467b48Spatrick // load2 = load(addr, -2048)
213109467b48Spatrick // load3 = load(addr, 0)
213209467b48Spatrick // load4 = load(addr, 2048)
213309467b48Spatrick // addr5 = &a + 12288; load5 = load(addr5, 0)
213409467b48Spatrick //
213509467b48Spatrick MachineInstr *AnchorInst = nullptr;
213609467b48Spatrick MemAddress AnchorAddr;
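// Track the largest legal distance seen so far; min() of an unsigned type
// is simply 0.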
213709467b48Spatrick uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
213809467b48Spatrick SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
213909467b48Spatrick
214009467b48Spatrick MachineBasicBlock *MBB = MI.getParent();
214109467b48Spatrick MachineBasicBlock::iterator E = MBB->end();
214209467b48Spatrick MachineBasicBlock::iterator MBBI = MI.getIterator();
214309467b48Spatrick ++MBBI;
214409467b48Spatrick const SITargetLowering *TLI =
214509467b48Spatrick static_cast<const SITargetLowering *>(STM->getTargetLowering());
214609467b48Spatrick
214709467b48Spatrick for ( ; MBBI != E; ++MBBI) {
214809467b48Spatrick MachineInstr &MINext = *MBBI;
214909467b48Spatrick // TODO: Support finding an anchor (with the same base) among store
215009467b48Spatrick // addresses or other load addresses where the opcodes differ.
215109467b48Spatrick if (MINext.getOpcode() != MI.getOpcode() ||
215209467b48Spatrick TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
215309467b48Spatrick continue;
215409467b48Spatrick
215509467b48Spatrick const MachineOperand &BaseNext =
215609467b48Spatrick *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
215709467b48Spatrick MemAddress MAddrNext;
215809467b48Spatrick if (Visited.find(&MINext) == Visited.end()) {
215909467b48Spatrick processBaseWithConstOffset(BaseNext, MAddrNext);
216009467b48Spatrick Visited[&MINext] = MAddrNext;
216109467b48Spatrick } else
216209467b48Spatrick MAddrNext = Visited[&MINext];
216309467b48Spatrick
216409467b48Spatrick if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
216509467b48Spatrick MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
216609467b48Spatrick MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
216709467b48Spatrick MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
216809467b48Spatrick continue;
216909467b48Spatrick
2170*d415bd75Srobert InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
217109467b48Spatrick
217209467b48Spatrick int64_t Dist = MAddr.Offset - MAddrNext.Offset;
217309467b48Spatrick TargetLoweringBase::AddrMode AM;
217409467b48Spatrick AM.HasBaseReg = true;
217509467b48Spatrick AM.BaseOffs = Dist;
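// Of all the candidates whose distance from MI is a legal addressing-mode
// immediate, remember the farthest one as the anchor.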
217609467b48Spatrick if (TLI->isLegalGlobalAddressingMode(AM) &&
217709467b48Spatrick (uint32_t)std::abs(Dist) > MaxDist) {
217809467b48Spatrick MaxDist = std::abs(Dist);
217909467b48Spatrick
218009467b48Spatrick AnchorAddr = MAddrNext;
218109467b48Spatrick AnchorInst = &MINext;
218209467b48Spatrick }
218309467b48Spatrick }
218409467b48Spatrick
218509467b48Spatrick if (AnchorInst) {
218609467b48Spatrick LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
218709467b48Spatrick AnchorInst->dump());
218809467b48Spatrick LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
218909467b48Spatrick << AnchorAddr.Offset << "\n\n");
219009467b48Spatrick
219109467b48Spatrick // Instead of moving up, just re-compute anchor-instruction's base address.
2192097a140dSpatrick Register Base = computeBase(MI, AnchorAddr);
219309467b48Spatrick
219409467b48Spatrick updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
219509467b48Spatrick LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
219609467b48Spatrick
219709467b48Spatrick for (auto P : InstsWCommonBase) {
219809467b48Spatrick TargetLoweringBase::AddrMode AM;
219909467b48Spatrick AM.HasBaseReg = true;
220009467b48Spatrick AM.BaseOffs = P.second - AnchorAddr.Offset;
220109467b48Spatrick
220209467b48Spatrick if (TLI->isLegalGlobalAddressingMode(AM)) {
220309467b48Spatrick LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
220409467b48Spatrick dbgs() << ")"; P.first->dump());
220509467b48Spatrick updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
220609467b48Spatrick LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
220709467b48Spatrick }
220809467b48Spatrick }
220909467b48Spatrick AnchorList.insert(AnchorInst);
221009467b48Spatrick return true;
221109467b48Spatrick }
221209467b48Spatrick
221309467b48Spatrick return false;
221409467b48Spatrick }
221509467b48Spatrick
221609467b48Spatrick void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
221709467b48Spatrick std::list<std::list<CombineInfo> > &MergeableInsts) const {
221809467b48Spatrick for (std::list<CombineInfo> &AddrList : MergeableInsts) {
221909467b48Spatrick if (AddrList.front().InstClass == CI.InstClass &&
2220*d415bd75Srobert AddrList.front().IsAGPR == CI.IsAGPR &&
2221*d415bd75Srobert AddrList.front().hasSameBaseAddress(CI)) {
222209467b48Spatrick AddrList.emplace_back(CI);
222309467b48Spatrick return;
222409467b48Spatrick }
222509467b48Spatrick }
222609467b48Spatrick
222709467b48Spatrick // Base address not found, so add a new list.
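// std::list's fill constructor: create a new list holding a single copy
// of CI.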
222809467b48Spatrick MergeableInsts.emplace_back(1, CI);
222909467b48Spatrick }
223009467b48Spatrick
2231097a140dSpatrick std::pair<MachineBasicBlock::iterator, bool>
2232097a140dSpatrick SILoadStoreOptimizer::collectMergeableInsts(
2233097a140dSpatrick MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2234097a140dSpatrick MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
223509467b48Spatrick std::list<std::list<CombineInfo>> &MergeableInsts) const {
223609467b48Spatrick bool Modified = false;
223709467b48Spatrick
223809467b48Spatrick // Sort potential mergeable instructions into lists. One list per base address.
2239097a140dSpatrick unsigned Order = 0;
2240097a140dSpatrick MachineBasicBlock::iterator BlockI = Begin;
2241097a140dSpatrick for (; BlockI != End; ++BlockI) {
2242097a140dSpatrick MachineInstr &MI = *BlockI;
2243097a140dSpatrick
224409467b48Spatrick // We run this before checking if an address is mergeable, because it can produce
224509467b48Spatrick // better code even if the instructions aren't mergeable.
224609467b48Spatrick if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
224709467b48Spatrick Modified = true;
224809467b48Spatrick
2249*d415bd75Srobert // Treat volatile accesses, ordered accesses and unmodeled side effects as
2250*d415bd75Srobert // barriers. Separate merges may still be found past such a barrier.
2251*d415bd75Srobert if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2252*d415bd75Srobert LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2253097a140dSpatrick
2254097a140dSpatrick // Search will resume after this instruction in a separate merge list.
2255097a140dSpatrick ++BlockI;
2256097a140dSpatrick break;
2257097a140dSpatrick }
2258097a140dSpatrick
225909467b48Spatrick const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
226009467b48Spatrick if (InstClass == UNKNOWN)
226109467b48Spatrick continue;
226209467b48Spatrick
2263*d415bd75Srobert // Do not merge VMEM buffer instructions with "swizzled" bit set.
2264*d415bd75Srobert int Swizzled =
2265*d415bd75Srobert AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2266*d415bd75Srobert if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2267*d415bd75Srobert continue;
2268*d415bd75Srobert
226909467b48Spatrick CombineInfo CI;
2270*d415bd75Srobert CI.setMI(MI, *this);
2271097a140dSpatrick CI.Order = Order++;
227209467b48Spatrick
227309467b48Spatrick if (!CI.hasMergeableAddress(*MRI))
227409467b48Spatrick continue;
227509467b48Spatrick
2276*d415bd75Srobert if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2277*d415bd75Srobert // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2278*d415bd75Srobert // operands. However, we report that ds_write2 shall have only VGPR
2279*d415bd75Srobert // data so that machine copy propagation does not create an illegal
2280*d415bd75Srobert // instruction with mixed VGPR and AGPR sources. Consequently, if we
2281*d415bd75Srobert // were to create such an instruction here, the verifier would
2282*d415bd75Srobert // complain.
2283*d415bd75Srobert continue;
2284*d415bd75Srobert }
2285*d415bd75Srobert
2286097a140dSpatrick LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2287097a140dSpatrick
228809467b48Spatrick addInstToMergeableList(CI, MergeableInsts);
228909467b48Spatrick }
2290097a140dSpatrick
2291097a140dSpatrick // At this point we have lists of mergeable instructions.
2292097a140dSpatrick //
2293097a140dSpatrick // Part 2: Sort each list by offset so that mergeable candidates end up
2294097a140dSpatrick // adjacent, and discard lists with fewer than two instructions, since a
2295097a140dSpatrick // merge always needs at least a pair. The actual pairing happens later,
2296097a140dSpatrick // in optimizeInstsWithSameBaseAddr().
2297097a140dSpatrick
2298097a140dSpatrick for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2299097a140dSpatrick E = MergeableInsts.end(); I != E;) {
2300097a140dSpatrick
2301097a140dSpatrick std::list<CombineInfo> &MergeList = *I;
2302097a140dSpatrick if (MergeList.size() <= 1) {
2303097a140dSpatrick // This means we have found only one instruction with a given address
2304097a140dSpatrick // that can be merged, and we need at least 2 instructions to do a merge,
2305097a140dSpatrick // so this list can be discarded.
2306097a140dSpatrick I = MergeableInsts.erase(I);
2307097a140dSpatrick continue;
2308097a140dSpatrick }
2309097a140dSpatrick
2310097a140dSpatrick // Sort the lists by offsets, this way mergeable instructions will be
2311097a140dSpatrick // adjacent to each other in the list, which will make it easier to find
2312097a140dSpatrick // matches.
2313097a140dSpatrick MergeList.sort(
2314*d415bd75Srobert [] (const CombineInfo &A, const CombineInfo &B) {
2315097a140dSpatrick return A.Offset < B.Offset;
2316097a140dSpatrick });
2317097a140dSpatrick ++I;
2318097a140dSpatrick }
2319097a140dSpatrick
2320*d415bd75Srobert return std::pair(BlockI, Modified);
232109467b48Spatrick }
232209467b48Spatrick
232309467b48Spatrick // Scan through looking for adjacent LDS operations with constant offsets from
232409467b48Spatrick // the same base register. We rely on the scheduler to do the hard work of
232509467b48Spatrick // clustering nearby loads, and assume these are all adjacent.
232609467b48Spatrick bool SILoadStoreOptimizer::optimizeBlock(
232709467b48Spatrick std::list<std::list<CombineInfo> > &MergeableInsts) {
232809467b48Spatrick bool Modified = false;
232909467b48Spatrick
2330097a140dSpatrick for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2331097a140dSpatrick E = MergeableInsts.end(); I != E;) {
2332097a140dSpatrick std::list<CombineInfo> &MergeList = *I;
233309467b48Spatrick
233409467b48Spatrick bool OptimizeListAgain = false;
233509467b48Spatrick if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2336097a140dSpatrick // We weren't able to make any changes, so delete the list to avoid
233709467b48Spatrick // reprocessing the same instructions the next time we try to optimize
233809467b48Spatrick // this block.
2339097a140dSpatrick I = MergeableInsts.erase(I);
234009467b48Spatrick continue;
234109467b48Spatrick }
234209467b48Spatrick
2343097a140dSpatrick Modified = true;
2344097a140dSpatrick
234509467b48Spatrick // We made changes, but also determined that there were no more optimization
234609467b48Spatrick // opportunities, so we don't need to reprocess the list.
2347097a140dSpatrick if (!OptimizeListAgain) {
2348097a140dSpatrick I = MergeableInsts.erase(I);
2349097a140dSpatrick continue;
2350097a140dSpatrick }
2351097a140dSpatrick OptimizeAgain = true;
235209467b48Spatrick }
235309467b48Spatrick return Modified;
235409467b48Spatrick }
235509467b48Spatrick
235609467b48Spatrick bool
235709467b48Spatrick SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
235809467b48Spatrick std::list<CombineInfo> &MergeList,
235909467b48Spatrick bool &OptimizeListAgain) {
2360097a140dSpatrick if (MergeList.empty())
2361097a140dSpatrick return false;
2362097a140dSpatrick
236309467b48Spatrick bool Modified = false;
236409467b48Spatrick
2365097a140dSpatrick for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2366097a140dSpatrick Next = std::next(I)) {
2367097a140dSpatrick
2368097a140dSpatrick auto First = I;
2369097a140dSpatrick auto Second = Next;
2370097a140dSpatrick
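// Merge into whichever instruction appears earlier in program order; the
// Order index records each instruction's original position in the block.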
2371097a140dSpatrick if ((*First).Order > (*Second).Order)
2372097a140dSpatrick std::swap(First, Second);
2373097a140dSpatrick CombineInfo &CI = *First;
2374097a140dSpatrick CombineInfo &Paired = *Second;
2375097a140dSpatrick
2376*d415bd75Srobert CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2377*d415bd75Srobert if (!Where) {
2378097a140dSpatrick ++I;
237909467b48Spatrick continue;
2380097a140dSpatrick }
238109467b48Spatrick
238209467b48Spatrick Modified = true;
2383097a140dSpatrick
2384097a140dSpatrick LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
238509467b48Spatrick
2386*d415bd75Srobert MachineBasicBlock::iterator NewMI;
238709467b48Spatrick switch (CI.InstClass) {
238809467b48Spatrick default:
238909467b48Spatrick llvm_unreachable("unknown InstClass");
239009467b48Spatrick break;
2391*d415bd75Srobert case DS_READ:
2392*d415bd75Srobert NewMI = mergeRead2Pair(CI, Paired, Where->I);
2393*d415bd75Srobert break;
2394*d415bd75Srobert case DS_WRITE:
2395*d415bd75Srobert NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2396*d415bd75Srobert break;
2397*d415bd75Srobert case S_BUFFER_LOAD_IMM:
2398*d415bd75Srobert case S_BUFFER_LOAD_SGPR_IMM:
2399*d415bd75Srobert case S_LOAD_IMM:
2400*d415bd75Srobert NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
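// SMEM merges are formed up to a combined width of 8 dwords here, so a
// narrower result (e.g. two dwordx2 loads merged into a dwordx4) may still
// pair again on a later pass.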
2401*d415bd75Srobert OptimizeListAgain |= CI.Width + Paired.Width < 8;
2402*d415bd75Srobert break;
2403*d415bd75Srobert case BUFFER_LOAD:
2404*d415bd75Srobert NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
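// VMEM merges cap at a combined width of 4 dwords, so keep optimizing
// while the result is narrower than that.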
2405*d415bd75Srobert OptimizeListAgain |= CI.Width + Paired.Width < 4;
2406*d415bd75Srobert break;
2407*d415bd75Srobert case BUFFER_STORE:
2408*d415bd75Srobert NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2409*d415bd75Srobert OptimizeListAgain |= CI.Width + Paired.Width < 4;
2410*d415bd75Srobert break;
2411*d415bd75Srobert case MIMG:
2412*d415bd75Srobert NewMI = mergeImagePair(CI, Paired, Where->I);
2413*d415bd75Srobert OptimizeListAgain |= CI.Width + Paired.Width < 4;
2414*d415bd75Srobert break;
2415*d415bd75Srobert case TBUFFER_LOAD:
2416*d415bd75Srobert NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2417*d415bd75Srobert OptimizeListAgain |= CI.Width + Paired.Width < 4;
2418*d415bd75Srobert break;
2419*d415bd75Srobert case TBUFFER_STORE:
2420*d415bd75Srobert NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2421*d415bd75Srobert OptimizeListAgain |= CI.Width + Paired.Width < 4;
2422*d415bd75Srobert break;
2423*d415bd75Srobert case FLAT_LOAD:
2424*d415bd75Srobert case GLOBAL_LOAD:
2425*d415bd75Srobert case GLOBAL_LOAD_SADDR:
2426*d415bd75Srobert NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2427*d415bd75Srobert OptimizeListAgain |= CI.Width + Paired.Width < 4;
2428*d415bd75Srobert break;
2429*d415bd75Srobert case FLAT_STORE:
2430*d415bd75Srobert case GLOBAL_STORE:
2431*d415bd75Srobert case GLOBAL_STORE_SADDR:
2432*d415bd75Srobert NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2433*d415bd75Srobert OptimizeListAgain |= CI.Width + Paired.Width < 4;
243409467b48Spatrick break;
243509467b48Spatrick }
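// Fold the merged instruction back into CI, keeping Where's program-order
// slot, so it can participate in further pairings; the other list entry is
// erased below.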
2436*d415bd75Srobert CI.setMI(NewMI, *this);
2437*d415bd75Srobert CI.Order = Where->Order;
2438097a140dSpatrick if (I == Second)
2439097a140dSpatrick I = Next;
244009467b48Spatrick
2441097a140dSpatrick MergeList.erase(Second);
244209467b48Spatrick }
244309467b48Spatrick
244409467b48Spatrick return Modified;
244509467b48Spatrick }
244609467b48Spatrick
244709467b48Spatrick bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
244809467b48Spatrick if (skipFunction(MF.getFunction()))
244909467b48Spatrick return false;
245009467b48Spatrick
245109467b48Spatrick STM = &MF.getSubtarget<GCNSubtarget>();
245209467b48Spatrick if (!STM->loadStoreOptEnabled())
245309467b48Spatrick return false;
245409467b48Spatrick
245509467b48Spatrick TII = STM->getInstrInfo();
245609467b48Spatrick TRI = &TII->getRegisterInfo();
245709467b48Spatrick
245809467b48Spatrick MRI = &MF.getRegInfo();
245909467b48Spatrick AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
246009467b48Spatrick
246109467b48Spatrick LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
246209467b48Spatrick
246309467b48Spatrick bool Modified = false;
246409467b48Spatrick
2465097a140dSpatrick // Contains the list of instructions for which constant offsets are being
2466097a140dSpatrick // promoted to the IMM. This is tracked for an entire block at a time.
2467097a140dSpatrick SmallPtrSet<MachineInstr *, 4> AnchorList;
2468097a140dSpatrick MemInfoMap Visited;
246909467b48Spatrick
247009467b48Spatrick for (MachineBasicBlock &MBB : MF) {
2471097a140dSpatrick MachineBasicBlock::iterator SectionEnd;
2472097a140dSpatrick for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2473097a140dSpatrick I = SectionEnd) {
2474097a140dSpatrick bool CollectModified;
247509467b48Spatrick std::list<std::list<CombineInfo>> MergeableInsts;
2476097a140dSpatrick
2477097a140dSpatrick // First pass: Collect list of all instructions we know how to merge in a
2478097a140dSpatrick // subset of the block.
2479097a140dSpatrick std::tie(SectionEnd, CollectModified) =
2480097a140dSpatrick collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2481097a140dSpatrick
2482097a140dSpatrick Modified |= CollectModified;
2483097a140dSpatrick
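// Second pass: repeatedly merge pairs within the collected lists until no
// further merges are found.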
248409467b48Spatrick do {
248509467b48Spatrick OptimizeAgain = false;
248609467b48Spatrick Modified |= optimizeBlock(MergeableInsts);
248709467b48Spatrick } while (OptimizeAgain);
248809467b48Spatrick }
248909467b48Spatrick
2490097a140dSpatrick Visited.clear();
2491097a140dSpatrick AnchorList.clear();
2492097a140dSpatrick }
2493097a140dSpatrick
249409467b48Spatrick return Modified;
249509467b48Spatrick }
2496