//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from a nearby instruction that
// allows it to have a 13-bit constant offset and then promotes that offset
// to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, then recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset field but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }
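
    // Illustrative note (hypothetical operands): two ds_read_b32 instructions
    // whose single address operand is the same virtual register compare equal
    // here, while loads through two different registers do not match even if
    // the registers would hold the same value at run time; only register and
    // subregister identity and immediate equality are checked.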

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                    const CombineInfo &Paired);
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                  std::list<std::list<CombineInfo> > &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
      .set(MachineFunctionProperties::Property::IsSSA);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isMIMG(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:      [[fallthrough]];
  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:      [[fallthrough]];
  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isMIMG(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions.
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  // For the purposes of this optimization SGPR variants of buffer loads
  // are considered to be zero-offset SGPR_IMM loads.
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isMIMG(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  // For the purposes of this optimization SGPR variants of buffer loads
  // are considered to be zero-offset SGPR_IMM loads.
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT. For other instructions return the original
// unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isMIMG(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}
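
// Illustrative sketch of the checks above (hypothetical registers): if A
// defines %v0 and B reads %v0, B cannot be hoisted above A (read-after-write);
// if A reads %v1 and B defines %v1, the swap would clobber A's input
// (write-after-read); and two memory operations may only swap when at most
// one of them writes memory and alias analysis shows they do not overlap.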

// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize() + MMOb->getSize();

  // The base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}
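
// Worked example for the DMask overlap check (hypothetical masks): with
// DMask values 0b0011 and 0b0100, MaxMask = 0b0100 has two trailing zeros,
// so MinMask = 0b0011 < (1 << 2) and the pair can be combined. With 0b0011
// and 0b0110, MinMask = 0b0011 is not strictly below the lowest set bit of
// MaxMask = 0b0110, so the pair is rejected as overlapping.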

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}
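
// For illustration (hypothetical formats): merging two single-component
// 32-bit tbuffer accesses gives ComponentCount == 2, so this helper looks up
// the two-component buffer format with the same bits-per-component and
// numeric format; if the target has no such format it returns 0 and the
// merge is abandoned.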

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}
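
// Illustrative values (worked by hand, not taken from the source): in the
// range [5, 10] the most aligned value is 8 (a multiple of 8); in [65, 127]
// it is 96 (a multiple of 32, since no multiple of 64 or larger power of two
// lies inside the range).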

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
  }
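
  // Illustrative non-DS case (hypothetical offsets): a buffer_load_dwordx2 at
  // offset 0 (Width 2) and a buffer_load_dword at offset 8 have element
  // offsets 0 and 2 with EltSize 4, so EltOffset0 + Width == EltOffset1 and
  // the pair is contiguous, mergeable when the cache policy bits also match.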

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }
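
  // For example (hypothetical offsets): two ds_read_b32 at byte offsets 0 and
  // 25600 have element offsets 0 and 6400. Both are multiples of 64 and
  // 6400 / 64 == 100 fits in 8 bits, so the pair can become a single
  // ds_read2st64_b32 with offset0:0 offset1:100.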
99409467b48Spatrick 
99509467b48Spatrick   // Check if the new offsets fit in the reduced 8-bit range.
99609467b48Spatrick   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
997097a140dSpatrick     if (Modify) {
99809467b48Spatrick       CI.Offset = EltOffset0;
99909467b48Spatrick       Paired.Offset = EltOffset1;
1000097a140dSpatrick     }
100109467b48Spatrick     return true;
100209467b48Spatrick   }
100309467b48Spatrick 
100409467b48Spatrick   // Try to shift base address to decrease offsets.
100573471bf0Spatrick   uint32_t Min = std::min(EltOffset0, EltOffset1);
100673471bf0Spatrick   uint32_t Max = std::max(EltOffset0, EltOffset1);
100709467b48Spatrick 
100873471bf0Spatrick   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
100973471bf0Spatrick   if (((Max - Min) & ~Mask) == 0) {
1010097a140dSpatrick     if (Modify) {
101173471bf0Spatrick       // From the range of values we could use for BaseOff, choose the one that
101273471bf0Spatrick       // is aligned to the highest power of two, to maximise the chance that
101373471bf0Spatrick       // the same offset can be reused for other load/store pairs.
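      // Illustrative example (hypothetical values): with EltOffset0 = 0x2840
      // and EltOffset1 = 0x4840 the difference 0x2000 is a multiple of 64 and
      // in range, and a base such as 0x2000 (the most aligned candidate)
      // leaves element offsets 0x840 and 0x2840, which encode as 0x21 and
      // 0xa1 once divided by 64.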
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

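// Check that the combined width can actually be encoded: most classes support
// up to four dwords (three only when the target has dwordx3 load/stores),
// while the scalar load classes below only have x2, x4 and x8 variants.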
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
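/// Loads are merged at the position of the first instruction (Paired is
/// hoisted up to CI), so CI is returned for loads; stores are merged at the
/// position of the second instruction (CI is sunk down to Paired), so Paired
/// is returned for stores.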
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with Modify = true so that the offsets are
  // correct for the new instruction. This should return true, because this
  // function should only be called on CombineInfo objects that have already
  // been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
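  // E.g. for EltSize == 8 the merged b64 pair lands in a 128-bit register, so
  // the original destinations are copied out of sub0_sub1 and sub2_sub3.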

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

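// Merge two DS writes into a single write2 instruction, e.g. (illustrative):
//   ds_write_b32 v1, v2 offset:8
//   ds_write_b32 v1, v3 offset:12
// ==>
//   ds_write2_b32 v1, v2, v3 offset0:2 offset1:3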
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg() with these. We want to be sure we
  // preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
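  // E.g. (illustrative) dmask 0x1 merged with dmask 0x2 gives dmask 0x3, so
  // the merged image load fetches both components at once.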
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  // For convenience, when SGPR_IMM buffer loads are merged into a
  // zero-offset load, we generate its SGPR variant.
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset))
    New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Create the new destination register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Create the new destination register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

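// Select the opcode for the merged instruction from the combined width, e.g.
// (illustrative) two one-dword buffer loads merge into the corresponding
// DWORDX2 opcode, and a one-dword plus a two-dword load into DWORDX3.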
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case S_BUFFER_LOAD_SGPR_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
                            : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 4:
      return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
                            : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
      return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
                            : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }
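  // E.g. (illustrative) if Paired.Width == 1 and CI.Width == 2 with Paired at
  // the lower offset, Paired copies from sub0 and CI from sub1_sub2.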
182209467b48Spatrick 
1823*d415bd75Srobert   return std::pair(Idx0, Idx1);
182409467b48Spatrick }
182509467b48Spatrick 
182609467b48Spatrick const TargetRegisterClass *
getTargetRegisterClass(const CombineInfo & CI,const CombineInfo & Paired)182709467b48Spatrick SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
182809467b48Spatrick                                              const CombineInfo &Paired) {
1829*d415bd75Srobert   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1830*d415bd75Srobert       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
183109467b48Spatrick     switch (CI.Width + Paired.Width) {
183209467b48Spatrick     default:
183309467b48Spatrick       return nullptr;
183409467b48Spatrick     case 2:
183509467b48Spatrick       return &AMDGPU::SReg_64_XEXECRegClass;
183609467b48Spatrick     case 4:
183709467b48Spatrick       return &AMDGPU::SGPR_128RegClass;
183809467b48Spatrick     case 8:
1839097a140dSpatrick       return &AMDGPU::SGPR_256RegClass;
184009467b48Spatrick     case 16:
1841097a140dSpatrick       return &AMDGPU::SGPR_512RegClass;
184209467b48Spatrick     }
184309467b48Spatrick   }
184473471bf0Spatrick 
184573471bf0Spatrick   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1846*d415bd75Srobert   return TRI->isAGPRClass(getDataRegClass(*CI.I))
184773471bf0Spatrick              ? TRI->getAGPRClassForBitWidth(BitWidth)
184873471bf0Spatrick              : TRI->getVGPRClassForBitWidth(BitWidth);
184909467b48Spatrick }

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

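  // Values outside the inline-constant range (e.g., hypothetically, 0x1234)
  // have to be materialized with an S_MOV_B32 first.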
  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
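// The 64-bit add is expanded into a V_ADD_CO_U32 / V_ADDC_U32 pair whose two
// halves are recombined with a REG_SEQUENCE, mirroring the pattern that
// processBaseWithConstOffset() matches below.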
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
      .addReg(CarryReg, RegState::Define)
      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
      .add(OffsetLo)
      .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
    .add(OffsetHi)
    .addReg(CarryReg, RegState::Kill)
    .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

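// Return the constant offset carried by Op: either an immediate operand, or
// the immediate behind a register uniquely defined by S_MOV_B32. As an
// illustrative example (register name is a placeholder): given
//   %off:sgpr_32 = S_MOV_B32 8000
// passing %off here yields 8000; anything else yields std::nullopt.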
std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

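// Try to fold the constant part of MI's address computation into the
// instruction's immediate offset field, by borrowing the base of a nearby
// "anchor" access that uses the same base registers. See the worked example
// in Step2 below. Returns true if MI (and possibly other instructions
// sharing the base) was rewritten.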
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
  // Step2: Traverse through MI's basic block and find an anchor (one that
  // has the same base-registers) with the highest 13bit distance from MI's
  // offset. E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13bit distance. Both &a + 6144 and &a + 8192
  // have a 13bit distance from &a + 4096. The heuristic picks &a + 8192 as the
  // new base (anchor) because its distance is maximal, which presumably lets
  // it cover more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr,       -4096)
  //   load2 = load(addr,       -2048)
  //   load3 = load(addr,       0)
  //   load4 = load(addr,       2048)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
    static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               <<  AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

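// Append CI to the existing list whose head matches its instruction class,
// AGPR-ness, and base address; otherwise start a new single-element list.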
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

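// Walk [Begin, End), promoting constant offsets as we go, and bucket every
// instruction we know how to merge into per-base-address lists. The scan
// stops early at a memory barrier; the returned iterator tells the caller
// where to resume, and the bool reports whether anything was modified.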
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can resume the search for separate merges after such a
    // barrier.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we create such an instruction the verifier
      //        will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of Mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with it. If one is
  // found, it is stored in the Paired field. If no instructions are found,
  // then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

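// Greedily pair up adjacent entries of MergeList (kept sorted by offset) and
// replace each mergeable pair with its merged instruction. OptimizeListAgain
// is set when a merged result is still narrow enough that another round over
// the list might widen it further.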
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

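    // Width below counts dwords, so e.g. "CI.Width + Paired.Width < 4" asks
    // whether the merged access is still narrower than the widest available
    // form (dwordx4 for VMEM, dwordx8 for SMEM, going by the checks below)
    // and could therefore be merged again on a later pass over the list.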
    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}