xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This pass tries to fuse DS instructions with close by immediate offsets.
100b57cec5SDimitry Andric // This will fuse operations such as
110b57cec5SDimitry Andric //  ds_read_b32 v0, v2 offset:16
120b57cec5SDimitry Andric //  ds_read_b32 v1, v2 offset:32
130b57cec5SDimitry Andric // ==>
140b57cec5SDimitry Andric //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
150b57cec5SDimitry Andric //
160b57cec5SDimitry Andric // The same is done for certain SMEM and VMEM opcodes, e.g.:
170b57cec5SDimitry Andric //  s_buffer_load_dword s4, s[0:3], 4
180b57cec5SDimitry Andric //  s_buffer_load_dword s5, s[0:3], 8
190b57cec5SDimitry Andric // ==>
200b57cec5SDimitry Andric //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
210b57cec5SDimitry Andric //
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes that 13-bit
// offset to the immediate.
260b57cec5SDimitry Andric // E.g.
270b57cec5SDimitry Andric //  s_movk_i32 s0, 0x1800
280b57cec5SDimitry Andric //  v_add_co_u32_e32 v0, vcc, s0, v2
290b57cec5SDimitry Andric //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
300b57cec5SDimitry Andric //
310b57cec5SDimitry Andric //  s_movk_i32 s0, 0x1000
320b57cec5SDimitry Andric //  v_add_co_u32_e32 v5, vcc, s0, v2
330b57cec5SDimitry Andric //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
340b57cec5SDimitry Andric //  global_load_dwordx2 v[5:6], v[5:6], off
350b57cec5SDimitry Andric //  global_load_dwordx2 v[0:1], v[0:1], off
360b57cec5SDimitry Andric // =>
370b57cec5SDimitry Andric //  s_movk_i32 s0, 0x1000
380b57cec5SDimitry Andric //  v_add_co_u32_e32 v5, vcc, s0, v2
390b57cec5SDimitry Andric //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
400b57cec5SDimitry Andric //  global_load_dwordx2 v[5:6], v[5:6], off
410b57cec5SDimitry Andric //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
420b57cec5SDimitry Andric //
430b57cec5SDimitry Andric // Future improvements:
440b57cec5SDimitry Andric //
458bcb0991SDimitry Andric // - This is currently missing stores of constants because loading
460b57cec5SDimitry Andric //   the constant into the data register is placed between the stores, although
470b57cec5SDimitry Andric //   this is arguably a scheduling problem.
480b57cec5SDimitry Andric //
490b57cec5SDimitry Andric // - Live interval recomputing seems inefficient. This currently only matches
500b57cec5SDimitry Andric //   one pair, and recomputes live intervals and moves on to the next pair. It
510b57cec5SDimitry Andric //   would be better to compute a list of all merges that need to occur.
520b57cec5SDimitry Andric //
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but the offsets lie close enough together that their differences
//   fit in 8 bits, we can add to the base pointer and use the new, reduced
//   offsets.
570b57cec5SDimitry Andric //
580b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
590b57cec5SDimitry Andric 
600b57cec5SDimitry Andric #include "AMDGPU.h"
61e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
620b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
630b57cec5SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
640b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
65480093f4SDimitry Andric #include "llvm/InitializePasses.h"
660b57cec5SDimitry Andric 
670b57cec5SDimitry Andric using namespace llvm;
680b57cec5SDimitry Andric 
690b57cec5SDimitry Andric #define DEBUG_TYPE "si-load-store-opt"
700b57cec5SDimitry Andric 
710b57cec5SDimitry Andric namespace {
// Classes of memory instructions this pass knows how to pair up. Each
// mergeable candidate is classified into one of these (see getInstClass);
// only candidates of compatible classes are considered for merging.
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};
920b57cec5SDimitry Andric 
// Describes which named address operands an instruction class carries. Each
// flag records the presence of the corresponding machine operand; NumVAddrs
// counts vaddr operands (image instructions may have several).
struct AddressRegs {
  unsigned char NumVAddrs = 0; // Count of vaddr0..vaddrN operands.
  bool SBase = false;          // Has an 'sbase' operand.
  bool SRsrc = false;          // Has an 'srsrc' operand.
  bool SOffset = false;        // Has an 'soffset' operand.
  bool SAddr = false;          // Has an 'saddr' operand.
  bool VAddr = false;          // Has a single 'vaddr' operand.
  bool Addr = false;           // Has an 'addr' operand.
  bool SSamp = false;          // Has an 'ssamp' operand.
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;
1065ffd83dbSDimitry Andric 
// Machine-function pass that merges pairs of nearby DS/SMEM/VMEM memory
// accesses into wider accesses and promotes large constant address offsets
// into instruction immediates (see the file header comment for examples).
class SILoadStoreOptimizer : public MachineFunctionPass {
  // Everything the pass records about one candidate memory instruction in
  // order to decide whether (and how) it can be merged with another.
  struct CombineInfo {
    // The candidate instruction.
    MachineBasicBlock::iterator I;
    // Element size of the access. NOTE(review): presumably bytes, used for
    // DS offset scaling -- confirm against setMI.
    unsigned EltSize;
    // Immediate offset of the access.
    unsigned Offset;
    // Number of elements accessed (see getOpcodeWidth).
    unsigned Width;
    // Format immediate (MTBUF instructions). TODO confirm.
    unsigned Format;
    // Base offset applied when offsets are re-based to fit the encoding
    // (see offsetsCanBeCombined with Modify=true). TODO confirm.
    unsigned BaseOff;
    // dmask immediate (image instructions).
    unsigned DMask;
    // Classification of the instruction (see getInstClass).
    InstClassEnum InstClass;
    // Cache-policy bits of the instruction. TODO confirm semantics.
    unsigned CPol = 0;
    // Whether the data operand uses AGPRs. TODO confirm.
    bool IsAGPR;
    // Whether a DS pair should use the read2st64/write2st64 form (see
    // read2ST64Opcode/write2ST64Opcode).
    bool UseST64;
    // Parallel arrays (length NumAddresses) describing the address operands:
    // operand index within the instruction, and the operand itself.
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    // Relative program order of this candidate among the collected ones.
    // NOTE(review): inferred from the name -- confirm against setMI/callers.
    unsigned Order;

    // Returns true if this candidate and \p CI address memory through
    // pairwise-identical address operands: immediates must compare equal,
    // registers must match in both register and subregister.
    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        // If either side is an immediate, both must be immediates with the
        // same value.
        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
         return false;
        }
      }
      return true;
    }

    // Returns true if every address operand is a kind the pass knows how to
    // merge on: an immediate, or a register (virtual, or the SGPR_NULL
    // physical register) that has more than one non-debug use.
    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical reg
        // addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    // Populates the fields above from \p MI.
    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Ordering key for candidates: dmask for image instructions, immediate
    // offset for everything else.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  // Low/high 32-bit halves of a base address, each with an optional
  // subregister index.
  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  // A decomposed address: base registers plus a constant offset.
  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  // Cache of decomposed addresses, reused while promoting constant offsets
  // (see promoteConstantOffsetToImm).
  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  // Target/analysis handles, presumably initialized in runOnMachineFunction.
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  // Requests another optimization round over the block. TODO confirm.
  bool OptimizeAgain;

  // Legality/compatibility checks for a candidate pair.
  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  // Opcode/register-class selection for the merged instruction.
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  // Helpers for wiring the merged instruction's data registers to/from the
  // original pair's destination/source operands.
  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore, int OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           int OpName) const;

  // Per-instruction-class merge routines; each emits the combined
  // instruction before InsertBefore and returns an iterator to it.
  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  // Support for promoting constant offsets into instruction immediates.
  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                  std::list<std::list<CombineInfo> > &MergeableInsts) const;

  // Scans [Begin, End) and groups mergeable candidates into MergeableInsts.
  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // CFG is untouched; alias analysis is needed to prove loads/stores can
    // be reordered past each other.
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // The pass runs on SSA-form machine IR.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
      .set(MachineFunctionProperties::Property::IsSSA);
  }
};
3250b57cec5SDimitry Andric 
3268bcb0991SDimitry Andric static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
3278bcb0991SDimitry Andric   const unsigned Opc = MI.getOpcode();
3288bcb0991SDimitry Andric 
3298bcb0991SDimitry Andric   if (TII.isMUBUF(Opc)) {
3308bcb0991SDimitry Andric     // FIXME: Handle d16 correctly
3318bcb0991SDimitry Andric     return AMDGPU::getMUBUFElements(Opc);
3328bcb0991SDimitry Andric   }
3335f757f3fSDimitry Andric   if (TII.isImage(MI)) {
3348bcb0991SDimitry Andric     uint64_t DMaskImm =
3358bcb0991SDimitry Andric         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
336bdd1243dSDimitry Andric     return llvm::popcount(DMaskImm);
3378bcb0991SDimitry Andric   }
338480093f4SDimitry Andric   if (TII.isMTBUF(Opc)) {
339480093f4SDimitry Andric     return AMDGPU::getMTBUFElements(Opc);
340480093f4SDimitry Andric   }
3418bcb0991SDimitry Andric 
3428bcb0991SDimitry Andric   switch (Opc) {
3438bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
344bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
345bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORD_IMM:
34681ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD:
34781ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
34881ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD:
34981ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
35081ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORD:
35181ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORD:
3528bcb0991SDimitry Andric     return 1;
3538bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM:
356*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
35781ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2:
35881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
35981ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2:
36081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
36181ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX2:
36281ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX2:
3638bcb0991SDimitry Andric     return 2;
3645f757f3fSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
3655f757f3fSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
3665f757f3fSDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM:
367*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
36881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3:
36981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
37081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3:
37181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
37281ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX3:
37381ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX3:
37481ad6265SDimitry Andric     return 3;
3758bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
377bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM:
378*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
37981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4:
38081ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
38181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4:
38281ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
38381ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX4:
38481ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX4:
3858bcb0991SDimitry Andric     return 4;
386349cc55cSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
387bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
388bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM:
389*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
390349cc55cSDimitry Andric     return 8;
391*0fca6ea1SDimitry Andric   case AMDGPU::DS_READ_B32:
392*0fca6ea1SDimitry Andric   case AMDGPU::DS_READ_B32_gfx9:
393*0fca6ea1SDimitry Andric   case AMDGPU::DS_WRITE_B32:
394fe6060f1SDimitry Andric   case AMDGPU::DS_WRITE_B32_gfx9:
395fe6060f1SDimitry Andric     return 1;
396*0fca6ea1SDimitry Andric   case AMDGPU::DS_READ_B64:
397*0fca6ea1SDimitry Andric   case AMDGPU::DS_READ_B64_gfx9:
398*0fca6ea1SDimitry Andric   case AMDGPU::DS_WRITE_B64:
399fe6060f1SDimitry Andric   case AMDGPU::DS_WRITE_B64_gfx9:
400fe6060f1SDimitry Andric     return 2;
4018bcb0991SDimitry Andric   default:
4028bcb0991SDimitry Andric     return 0;
4038bcb0991SDimitry Andric   }
4048bcb0991SDimitry Andric }
4058bcb0991SDimitry Andric 
4068bcb0991SDimitry Andric /// Maps instruction opcode to enum InstClassEnum.
4078bcb0991SDimitry Andric static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
4088bcb0991SDimitry Andric   switch (Opc) {
4098bcb0991SDimitry Andric   default:
4108bcb0991SDimitry Andric     if (TII.isMUBUF(Opc)) {
4118bcb0991SDimitry Andric       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
4128bcb0991SDimitry Andric       default:
4138bcb0991SDimitry Andric         return UNKNOWN;
414*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
415*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
416*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
417*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
4188bcb0991SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
4198bcb0991SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
4208bcb0991SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
4218bcb0991SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
422*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
423*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
424*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
425*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
4265f757f3fSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
4275f757f3fSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
4285f757f3fSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
4295f757f3fSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
4308bcb0991SDimitry Andric         return BUFFER_LOAD;
431*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
432*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
433*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
434*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
4358bcb0991SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
4368bcb0991SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
4378bcb0991SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
4388bcb0991SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
439*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
440*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
441*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
442*0fca6ea1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
4435f757f3fSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
4445f757f3fSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
4455f757f3fSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
4465f757f3fSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
4478bcb0991SDimitry Andric         return BUFFER_STORE;
4488bcb0991SDimitry Andric       }
4498bcb0991SDimitry Andric     }
4505f757f3fSDimitry Andric     if (TII.isImage(Opc)) {
4518bcb0991SDimitry Andric       // Ignore instructions encoded without vaddr.
452bdd1243dSDimitry Andric       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
453bdd1243dSDimitry Andric           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
4548bcb0991SDimitry Andric         return UNKNOWN;
455349cc55cSDimitry Andric       // Ignore BVH instructions
456349cc55cSDimitry Andric       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
457349cc55cSDimitry Andric         return UNKNOWN;
4588bcb0991SDimitry Andric       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
459480093f4SDimitry Andric       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
460480093f4SDimitry Andric           TII.isGather4(Opc))
4618bcb0991SDimitry Andric         return UNKNOWN;
4628bcb0991SDimitry Andric       return MIMG;
4638bcb0991SDimitry Andric     }
464480093f4SDimitry Andric     if (TII.isMTBUF(Opc)) {
465480093f4SDimitry Andric       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
466480093f4SDimitry Andric       default:
467480093f4SDimitry Andric         return UNKNOWN;
4685f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
4695f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
4705f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
4715f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
472480093f4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
473480093f4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
474480093f4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
475480093f4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
4765f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
4775f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
4785f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
4795f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
4805f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
4815f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
4825f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
4835f757f3fSDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
484480093f4SDimitry Andric         return TBUFFER_LOAD;
485480093f4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
486480093f4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
487480093f4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
488480093f4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
4895f757f3fSDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
4905f757f3fSDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
4915f757f3fSDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
4925f757f3fSDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
493480093f4SDimitry Andric         return TBUFFER_STORE;
494480093f4SDimitry Andric       }
495480093f4SDimitry Andric     }
4968bcb0991SDimitry Andric     return UNKNOWN;
4978bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
4988bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
4995f757f3fSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
5008bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
501349cc55cSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
5028bcb0991SDimitry Andric     return S_BUFFER_LOAD_IMM;
503bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
504bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
5055f757f3fSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
506bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
507bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
508bdd1243dSDimitry Andric     return S_BUFFER_LOAD_SGPR_IMM;
509bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORD_IMM:
510bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM:
5115f757f3fSDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM:
512bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM:
513bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM:
514*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
515*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
516*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
517*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
518bdd1243dSDimitry Andric     return S_LOAD_IMM;
5198bcb0991SDimitry Andric   case AMDGPU::DS_READ_B32:
5208bcb0991SDimitry Andric   case AMDGPU::DS_READ_B32_gfx9:
5218bcb0991SDimitry Andric   case AMDGPU::DS_READ_B64:
5228bcb0991SDimitry Andric   case AMDGPU::DS_READ_B64_gfx9:
5238bcb0991SDimitry Andric     return DS_READ;
5248bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B32:
5258bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B32_gfx9:
5268bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B64:
5278bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B64_gfx9:
5288bcb0991SDimitry Andric     return DS_WRITE;
52981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD:
53081ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2:
53181ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3:
53281ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4:
53381ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORD:
53481ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX2:
53581ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX3:
53681ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX4:
53781ad6265SDimitry Andric     return FLAT_LOAD;
53881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
53981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
54081ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
54181ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
54281ad6265SDimitry Andric     return GLOBAL_LOAD_SADDR;
54381ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD:
54481ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2:
54581ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3:
54681ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4:
54781ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORD:
54881ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX2:
54981ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX3:
55081ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX4:
55181ad6265SDimitry Andric     return FLAT_STORE;
55281ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
55381ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
55481ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
55581ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
55681ad6265SDimitry Andric     return GLOBAL_STORE_SADDR;
5578bcb0991SDimitry Andric   }
5588bcb0991SDimitry Andric }
5598bcb0991SDimitry Andric 
5608bcb0991SDimitry Andric /// Determines instruction subclass from opcode. Only instructions
56181ad6265SDimitry Andric /// of the same subclass can be merged together. The merged instruction may have
56281ad6265SDimitry Andric /// a different subclass but must have the same class.
5638bcb0991SDimitry Andric static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
5648bcb0991SDimitry Andric   switch (Opc) {
5658bcb0991SDimitry Andric   default:
5668bcb0991SDimitry Andric     if (TII.isMUBUF(Opc))
5678bcb0991SDimitry Andric       return AMDGPU::getMUBUFBaseOpcode(Opc);
5685f757f3fSDimitry Andric     if (TII.isImage(Opc)) {
5698bcb0991SDimitry Andric       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
5708bcb0991SDimitry Andric       assert(Info);
5718bcb0991SDimitry Andric       return Info->BaseOpcode;
5728bcb0991SDimitry Andric     }
573480093f4SDimitry Andric     if (TII.isMTBUF(Opc))
574480093f4SDimitry Andric       return AMDGPU::getMTBUFBaseOpcode(Opc);
5758bcb0991SDimitry Andric     return -1;
5768bcb0991SDimitry Andric   case AMDGPU::DS_READ_B32:
5778bcb0991SDimitry Andric   case AMDGPU::DS_READ_B32_gfx9:
5788bcb0991SDimitry Andric   case AMDGPU::DS_READ_B64:
5798bcb0991SDimitry Andric   case AMDGPU::DS_READ_B64_gfx9:
5808bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B32:
5818bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B32_gfx9:
5828bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B64:
5838bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B64_gfx9:
5848bcb0991SDimitry Andric     return Opc;
5858bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
5868bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
5875f757f3fSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
5888bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
589349cc55cSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
5908bcb0991SDimitry Andric     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
591bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
592bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
5935f757f3fSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
594bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
595bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
596bdd1243dSDimitry Andric     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
597bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORD_IMM:
598bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM:
5995f757f3fSDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM:
600bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM:
601bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM:
602*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
603*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
604*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
605*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
606bdd1243dSDimitry Andric     return AMDGPU::S_LOAD_DWORD_IMM;
60781ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD:
60881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2:
60981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3:
61081ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4:
61181ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORD:
61281ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX2:
61381ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX3:
61481ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX4:
61581ad6265SDimitry Andric     return AMDGPU::FLAT_LOAD_DWORD;
61681ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
61781ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
61881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
61981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
62081ad6265SDimitry Andric     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
62181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD:
62281ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2:
62381ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3:
62481ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4:
62581ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORD:
62681ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX2:
62781ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX3:
62881ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX4:
62981ad6265SDimitry Andric     return AMDGPU::FLAT_STORE_DWORD;
63081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
63181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
63281ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
63381ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
63481ad6265SDimitry Andric     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
6358bcb0991SDimitry Andric   }
6368bcb0991SDimitry Andric }
6378bcb0991SDimitry Andric 
63881ad6265SDimitry Andric // GLOBAL loads and stores are classified as FLAT initially. If both combined
63981ad6265SDimitry Andric // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
64081ad6265SDimitry Andric // If either or both instructions are non segment specific FLAT the resulting
64181ad6265SDimitry Andric // combined operation will be FLAT, potentially promoting one of the GLOBAL
64281ad6265SDimitry Andric // operations to FLAT.
64381ad6265SDimitry Andric // For other instructions return the original unmodified class.
64481ad6265SDimitry Andric InstClassEnum
64581ad6265SDimitry Andric SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
64681ad6265SDimitry Andric                                          const CombineInfo &Paired) {
64781ad6265SDimitry Andric   assert(CI.InstClass == Paired.InstClass);
64881ad6265SDimitry Andric 
64981ad6265SDimitry Andric   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
65081ad6265SDimitry Andric       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
65181ad6265SDimitry Andric     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
65281ad6265SDimitry Andric 
65381ad6265SDimitry Andric   return CI.InstClass;
65481ad6265SDimitry Andric }
65581ad6265SDimitry Andric 
6565ffd83dbSDimitry Andric static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
6575ffd83dbSDimitry Andric   AddressRegs Result;
6585ffd83dbSDimitry Andric 
6598bcb0991SDimitry Andric   if (TII.isMUBUF(Opc)) {
6605ffd83dbSDimitry Andric     if (AMDGPU::getMUBUFHasVAddr(Opc))
6615ffd83dbSDimitry Andric       Result.VAddr = true;
6625ffd83dbSDimitry Andric     if (AMDGPU::getMUBUFHasSrsrc(Opc))
6635ffd83dbSDimitry Andric       Result.SRsrc = true;
6645ffd83dbSDimitry Andric     if (AMDGPU::getMUBUFHasSoffset(Opc))
6655ffd83dbSDimitry Andric       Result.SOffset = true;
6668bcb0991SDimitry Andric 
6675ffd83dbSDimitry Andric     return Result;
6688bcb0991SDimitry Andric   }
6698bcb0991SDimitry Andric 
6705f757f3fSDimitry Andric   if (TII.isImage(Opc)) {
6715ffd83dbSDimitry Andric     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
6725ffd83dbSDimitry Andric     if (VAddr0Idx >= 0) {
6735f757f3fSDimitry Andric       int RsrcName =
6745f757f3fSDimitry Andric           TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
6755f757f3fSDimitry Andric       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
6765f757f3fSDimitry Andric       Result.NumVAddrs = RsrcIdx - VAddr0Idx;
6775ffd83dbSDimitry Andric     } else {
6785ffd83dbSDimitry Andric       Result.VAddr = true;
6795ffd83dbSDimitry Andric     }
6805ffd83dbSDimitry Andric     Result.SRsrc = true;
6818bcb0991SDimitry Andric     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
6828bcb0991SDimitry Andric     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
6835ffd83dbSDimitry Andric       Result.SSamp = true;
684480093f4SDimitry Andric 
6855ffd83dbSDimitry Andric     return Result;
686480093f4SDimitry Andric   }
687480093f4SDimitry Andric   if (TII.isMTBUF(Opc)) {
6885ffd83dbSDimitry Andric     if (AMDGPU::getMTBUFHasVAddr(Opc))
6895ffd83dbSDimitry Andric       Result.VAddr = true;
6905ffd83dbSDimitry Andric     if (AMDGPU::getMTBUFHasSrsrc(Opc))
6915ffd83dbSDimitry Andric       Result.SRsrc = true;
6925ffd83dbSDimitry Andric     if (AMDGPU::getMTBUFHasSoffset(Opc))
6935ffd83dbSDimitry Andric       Result.SOffset = true;
694480093f4SDimitry Andric 
6955ffd83dbSDimitry Andric     return Result;
6968bcb0991SDimitry Andric   }
6978bcb0991SDimitry Andric 
6988bcb0991SDimitry Andric   switch (Opc) {
6998bcb0991SDimitry Andric   default:
7005ffd83dbSDimitry Andric     return Result;
701bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
702bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
7035f757f3fSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
704bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
705bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
706bdd1243dSDimitry Andric     Result.SOffset = true;
707bdd1243dSDimitry Andric     [[fallthrough]];
7088bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
7098bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
7105f757f3fSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
7118bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
712349cc55cSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
713bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORD_IMM:
714bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM:
7155f757f3fSDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM:
716bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM:
717bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM:
718*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
719*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
720*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
721*0fca6ea1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
7225ffd83dbSDimitry Andric     Result.SBase = true;
7235ffd83dbSDimitry Andric     return Result;
7248bcb0991SDimitry Andric   case AMDGPU::DS_READ_B32:
7258bcb0991SDimitry Andric   case AMDGPU::DS_READ_B64:
7268bcb0991SDimitry Andric   case AMDGPU::DS_READ_B32_gfx9:
7278bcb0991SDimitry Andric   case AMDGPU::DS_READ_B64_gfx9:
7288bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B32:
7298bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B64:
7308bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B32_gfx9:
7318bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B64_gfx9:
7325ffd83dbSDimitry Andric     Result.Addr = true;
7335ffd83dbSDimitry Andric     return Result;
73481ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
73581ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
73681ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
73781ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
73881ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
73981ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
74081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
74181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
74281ad6265SDimitry Andric     Result.SAddr = true;
743bdd1243dSDimitry Andric     [[fallthrough]];
74481ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD:
74581ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2:
74681ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3:
74781ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4:
74881ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD:
74981ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2:
75081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3:
75181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4:
75281ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORD:
75381ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX2:
75481ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX3:
75581ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX4:
75681ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORD:
75781ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX2:
75881ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX3:
75981ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX4:
76081ad6265SDimitry Andric     Result.VAddr = true;
76181ad6265SDimitry Andric     return Result;
7628bcb0991SDimitry Andric   }
7638bcb0991SDimitry Andric }
7648bcb0991SDimitry Andric 
7658bcb0991SDimitry Andric void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
76604eeddc0SDimitry Andric                                               const SILoadStoreOptimizer &LSO) {
7678bcb0991SDimitry Andric   I = MI;
7688bcb0991SDimitry Andric   unsigned Opc = MI->getOpcode();
76904eeddc0SDimitry Andric   InstClass = getInstClass(Opc, *LSO.TII);
7708bcb0991SDimitry Andric 
7718bcb0991SDimitry Andric   if (InstClass == UNKNOWN)
7728bcb0991SDimitry Andric     return;
7738bcb0991SDimitry Andric 
77404eeddc0SDimitry Andric   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
77504eeddc0SDimitry Andric 
7768bcb0991SDimitry Andric   switch (InstClass) {
7778bcb0991SDimitry Andric   case DS_READ:
7788bcb0991SDimitry Andric    EltSize =
7798bcb0991SDimitry Andric           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
7808bcb0991SDimitry Andric                                                                           : 4;
7818bcb0991SDimitry Andric    break;
7828bcb0991SDimitry Andric   case DS_WRITE:
7838bcb0991SDimitry Andric     EltSize =
7848bcb0991SDimitry Andric           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
7858bcb0991SDimitry Andric                                                                             : 4;
7868bcb0991SDimitry Andric     break;
7878bcb0991SDimitry Andric   case S_BUFFER_LOAD_IMM:
788bdd1243dSDimitry Andric   case S_BUFFER_LOAD_SGPR_IMM:
789bdd1243dSDimitry Andric   case S_LOAD_IMM:
79004eeddc0SDimitry Andric     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
7918bcb0991SDimitry Andric     break;
7928bcb0991SDimitry Andric   default:
7938bcb0991SDimitry Andric     EltSize = 4;
7948bcb0991SDimitry Andric     break;
7958bcb0991SDimitry Andric   }
7968bcb0991SDimitry Andric 
7978bcb0991SDimitry Andric   if (InstClass == MIMG) {
79804eeddc0SDimitry Andric     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
7995ffd83dbSDimitry Andric     // Offset is not considered for MIMG instructions.
8005ffd83dbSDimitry Andric     Offset = 0;
8018bcb0991SDimitry Andric   } else {
8028bcb0991SDimitry Andric     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
80306c3fb27SDimitry Andric     Offset = I->getOperand(OffsetIdx).getImm();
8048bcb0991SDimitry Andric   }
8058bcb0991SDimitry Andric 
806480093f4SDimitry Andric   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
80704eeddc0SDimitry Andric     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
808480093f4SDimitry Andric 
80904eeddc0SDimitry Andric   Width = getOpcodeWidth(*I, *LSO.TII);
8108bcb0991SDimitry Andric 
8118bcb0991SDimitry Andric   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
812480093f4SDimitry Andric     Offset &= 0xffff;
8138bcb0991SDimitry Andric   } else if (InstClass != MIMG) {
81404eeddc0SDimitry Andric     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
8158bcb0991SDimitry Andric   }
8168bcb0991SDimitry Andric 
81704eeddc0SDimitry Andric   AddressRegs Regs = getRegs(Opc, *LSO.TII);
8185f757f3fSDimitry Andric   bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
8195ffd83dbSDimitry Andric 
8208bcb0991SDimitry Andric   NumAddresses = 0;
8215ffd83dbSDimitry Andric   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
8225ffd83dbSDimitry Andric     AddrIdx[NumAddresses++] =
8235ffd83dbSDimitry Andric         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
8245ffd83dbSDimitry Andric   if (Regs.Addr)
8255ffd83dbSDimitry Andric     AddrIdx[NumAddresses++] =
8265ffd83dbSDimitry Andric         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
8275ffd83dbSDimitry Andric   if (Regs.SBase)
8285ffd83dbSDimitry Andric     AddrIdx[NumAddresses++] =
8295ffd83dbSDimitry Andric         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
8305ffd83dbSDimitry Andric   if (Regs.SRsrc)
8315f757f3fSDimitry Andric     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
8325f757f3fSDimitry Andric         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
8335ffd83dbSDimitry Andric   if (Regs.SOffset)
8345ffd83dbSDimitry Andric     AddrIdx[NumAddresses++] =
8355ffd83dbSDimitry Andric         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
83681ad6265SDimitry Andric   if (Regs.SAddr)
83781ad6265SDimitry Andric     AddrIdx[NumAddresses++] =
83881ad6265SDimitry Andric         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
8395ffd83dbSDimitry Andric   if (Regs.VAddr)
8405ffd83dbSDimitry Andric     AddrIdx[NumAddresses++] =
8415ffd83dbSDimitry Andric         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
8425ffd83dbSDimitry Andric   if (Regs.SSamp)
8435f757f3fSDimitry Andric     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
8445f757f3fSDimitry Andric         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
8455ffd83dbSDimitry Andric   assert(NumAddresses <= MaxAddressRegs);
8468bcb0991SDimitry Andric 
8475ffd83dbSDimitry Andric   for (unsigned J = 0; J < NumAddresses; J++)
8485ffd83dbSDimitry Andric     AddrReg[J] = &I->getOperand(AddrIdx[J]);
8498bcb0991SDimitry Andric }
8508bcb0991SDimitry Andric 
8510b57cec5SDimitry Andric } // end anonymous namespace.
8520b57cec5SDimitry Andric 
8530b57cec5SDimitry Andric INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
8540b57cec5SDimitry Andric                       "SI Load Store Optimizer", false, false)
8550b57cec5SDimitry Andric INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
8560b57cec5SDimitry Andric INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
8570b57cec5SDimitry Andric                     false, false)
8580b57cec5SDimitry Andric 
8590b57cec5SDimitry Andric char SILoadStoreOptimizer::ID = 0;
8600b57cec5SDimitry Andric 
8610b57cec5SDimitry Andric char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
8620b57cec5SDimitry Andric 
8630b57cec5SDimitry Andric FunctionPass *llvm::createSILoadStoreOptimizerPass() {
8640b57cec5SDimitry Andric   return new SILoadStoreOptimizer();
8650b57cec5SDimitry Andric }
8660b57cec5SDimitry Andric 
8670b57cec5SDimitry Andric static void addDefsUsesToList(const MachineInstr &MI,
8685ffd83dbSDimitry Andric                               DenseSet<Register> &RegDefs,
86981ad6265SDimitry Andric                               DenseSet<Register> &RegUses) {
87081ad6265SDimitry Andric   for (const auto &Op : MI.operands()) {
87181ad6265SDimitry Andric     if (!Op.isReg())
87281ad6265SDimitry Andric       continue;
8730b57cec5SDimitry Andric     if (Op.isDef())
8740b57cec5SDimitry Andric       RegDefs.insert(Op.getReg());
87581ad6265SDimitry Andric     if (Op.readsReg())
87681ad6265SDimitry Andric       RegUses.insert(Op.getReg());
8770b57cec5SDimitry Andric   }
8780b57cec5SDimitry Andric }
8790b57cec5SDimitry Andric 
88081ad6265SDimitry Andric bool SILoadStoreOptimizer::canSwapInstructions(
88181ad6265SDimitry Andric     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
88281ad6265SDimitry Andric     const MachineInstr &A, const MachineInstr &B) const {
88381ad6265SDimitry Andric   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
88481ad6265SDimitry Andric       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
8850b57cec5SDimitry Andric     return false;
88681ad6265SDimitry Andric   for (const auto &BOp : B.operands()) {
88781ad6265SDimitry Andric     if (!BOp.isReg())
8880b57cec5SDimitry Andric       continue;
88981ad6265SDimitry Andric     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
89081ad6265SDimitry Andric       return false;
89181ad6265SDimitry Andric     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
8920b57cec5SDimitry Andric       return false;
8930b57cec5SDimitry Andric   }
8940b57cec5SDimitry Andric   return true;
8950b57cec5SDimitry Andric }
8960b57cec5SDimitry Andric 
89781ad6265SDimitry Andric // Given that \p CI and \p Paired are adjacent memory operations produce a new
89881ad6265SDimitry Andric // MMO for the combined operation with a new access size.
89981ad6265SDimitry Andric MachineMemOperand *
90081ad6265SDimitry Andric SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
90181ad6265SDimitry Andric                                                const CombineInfo &Paired) {
90281ad6265SDimitry Andric   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
90381ad6265SDimitry Andric   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
90481ad6265SDimitry Andric 
905*0fca6ea1SDimitry Andric   unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
90681ad6265SDimitry Andric 
90781ad6265SDimitry Andric   // A base pointer for the combined operation is the same as the leading
90881ad6265SDimitry Andric   // operation's pointer.
90981ad6265SDimitry Andric   if (Paired < CI)
91081ad6265SDimitry Andric     std::swap(MMOa, MMOb);
91181ad6265SDimitry Andric 
91281ad6265SDimitry Andric   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
91381ad6265SDimitry Andric   // If merging FLAT and GLOBAL set address space to FLAT.
91481ad6265SDimitry Andric   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
91581ad6265SDimitry Andric     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
91681ad6265SDimitry Andric 
91781ad6265SDimitry Andric   MachineFunction *MF = CI.I->getMF();
91881ad6265SDimitry Andric   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
9198bcb0991SDimitry Andric }
9208bcb0991SDimitry Andric 
921480093f4SDimitry Andric bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
922480093f4SDimitry Andric                                                const SIInstrInfo &TII,
923480093f4SDimitry Andric                                                const CombineInfo &Paired) {
9248bcb0991SDimitry Andric   assert(CI.InstClass == MIMG);
9258bcb0991SDimitry Andric 
9268bcb0991SDimitry Andric   // Ignore instructions with tfe/lwe set.
9278bcb0991SDimitry Andric   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
9288bcb0991SDimitry Andric   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
9298bcb0991SDimitry Andric 
9308bcb0991SDimitry Andric   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
9318bcb0991SDimitry Andric     return false;
9328bcb0991SDimitry Andric 
9338bcb0991SDimitry Andric   // Check other optional immediate operands for equality.
934fe6060f1SDimitry Andric   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
935fe6060f1SDimitry Andric                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
936fe6060f1SDimitry Andric                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
9378bcb0991SDimitry Andric 
9388bcb0991SDimitry Andric   for (auto op : OperandsToMatch) {
9398bcb0991SDimitry Andric     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
940480093f4SDimitry Andric     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
9418bcb0991SDimitry Andric       return false;
9428bcb0991SDimitry Andric     if (Idx != -1 &&
943480093f4SDimitry Andric         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
9448bcb0991SDimitry Andric       return false;
9458bcb0991SDimitry Andric   }
9468bcb0991SDimitry Andric 
9478bcb0991SDimitry Andric   // Check DMask for overlaps.
948480093f4SDimitry Andric   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
949480093f4SDimitry Andric   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
9508bcb0991SDimitry Andric 
9515f757f3fSDimitry Andric   if (!MaxMask)
9525f757f3fSDimitry Andric     return false;
9535f757f3fSDimitry Andric 
95406c3fb27SDimitry Andric   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
9558bcb0991SDimitry Andric   if ((1u << AllowedBitsForMin) <= MinMask)
9568bcb0991SDimitry Andric     return false;
9578bcb0991SDimitry Andric 
9588bcb0991SDimitry Andric   return true;
9598bcb0991SDimitry Andric }
9608bcb0991SDimitry Andric 
961480093f4SDimitry Andric static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
962480093f4SDimitry Andric                                        unsigned ComponentCount,
9635ffd83dbSDimitry Andric                                        const GCNSubtarget &STI) {
964480093f4SDimitry Andric   if (ComponentCount > 4)
965480093f4SDimitry Andric     return 0;
966480093f4SDimitry Andric 
967480093f4SDimitry Andric   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
968480093f4SDimitry Andric       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
969480093f4SDimitry Andric   if (!OldFormatInfo)
970480093f4SDimitry Andric     return 0;
971480093f4SDimitry Andric 
972480093f4SDimitry Andric   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
973480093f4SDimitry Andric       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
974480093f4SDimitry Andric                                            ComponentCount,
975480093f4SDimitry Andric                                            OldFormatInfo->NumFormat, STI);
976480093f4SDimitry Andric 
977480093f4SDimitry Andric   if (!NewFormatInfo)
978480093f4SDimitry Andric     return 0;
979480093f4SDimitry Andric 
980480093f4SDimitry Andric   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
981480093f4SDimitry Andric          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
982480093f4SDimitry Andric 
983480093f4SDimitry Andric   return NewFormatInfo->Format;
984480093f4SDimitry Andric }
985480093f4SDimitry Andric 
986fe6060f1SDimitry Andric // Return the value in the inclusive range [Lo,Hi] that is aligned to the
987fe6060f1SDimitry Andric // highest power of two. Note that the result is well defined for all inputs
988fe6060f1SDimitry Andric // including corner cases like:
989fe6060f1SDimitry Andric // - if Lo == Hi, return that value
990fe6060f1SDimitry Andric // - if Lo == 0, return 0 (even though the "- 1" below underflows
991fe6060f1SDimitry Andric // - if Lo > Hi, return 0 (as if the range wrapped around)
992fe6060f1SDimitry Andric static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
99306c3fb27SDimitry Andric   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
994fe6060f1SDimitry Andric }
995fe6060f1SDimitry Andric 
// Return true if the offsets of CI and Paired allow the two accesses to be
// merged into one instruction. For DS instructions, when Modify is true this
// also rewrites CI.Offset/Paired.Offset (and sets CI.BaseOff/CI.UseST64) to
// the encoding the merged ds_read2/ds_write2 will use; for all other
// instruction classes it is a pure query.
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  // MIMG merging is keyed on dmasks, not offsets (see dmasksCanBeCombined).
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  // For tbuffer accesses, the two buffer formats must be compatible (same
  // numeric format and component size) and a format with the combined
  // component count must exist.
  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  // From here on, offsets are measured in units of EltSize.
  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    // The two accesses must be exactly adjacent (in either order).
    if (EltOffset0 + CI.Width != EltOffset1 &&
            EltOffset1 + Paired.Width != EltOffset0)
      return false;
    // Cache-policy bits must match to fold into one instruction.
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // DS instructions below: ds_read2/write2 encode two 8-bit element offsets.

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  // ST64 after rebasing: the offsets' difference must be a multiple of 64
  // elements no larger than 0xff * 64.
  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Plain (stride-1) form after rebasing: difference must fit in 8 bits.
  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}
11190b57cec5SDimitry Andric 
11200b57cec5SDimitry Andric bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1121480093f4SDimitry Andric                                      const CombineInfo &CI,
1122480093f4SDimitry Andric                                      const CombineInfo &Paired) {
1123480093f4SDimitry Andric   const unsigned Width = (CI.Width + Paired.Width);
11240b57cec5SDimitry Andric   switch (CI.InstClass) {
11250b57cec5SDimitry Andric   default:
11260b57cec5SDimitry Andric     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
11270b57cec5SDimitry Andric   case S_BUFFER_LOAD_IMM:
1128bdd1243dSDimitry Andric   case S_BUFFER_LOAD_SGPR_IMM:
1129bdd1243dSDimitry Andric   case S_LOAD_IMM:
11300b57cec5SDimitry Andric     switch (Width) {
11310b57cec5SDimitry Andric     default:
11320b57cec5SDimitry Andric       return false;
11330b57cec5SDimitry Andric     case 2:
11340b57cec5SDimitry Andric     case 4:
1135349cc55cSDimitry Andric     case 8:
11360b57cec5SDimitry Andric       return true;
11375f757f3fSDimitry Andric     case 3:
11385f757f3fSDimitry Andric       return STM.hasScalarDwordx3Loads();
11390b57cec5SDimitry Andric     }
11400b57cec5SDimitry Andric   }
11410b57cec5SDimitry Andric }
11420b57cec5SDimitry Andric 
1143fe6060f1SDimitry Andric const TargetRegisterClass *
1144fe6060f1SDimitry Andric SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1145fe6060f1SDimitry Andric   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1146fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1147fe6060f1SDimitry Andric   }
1148fe6060f1SDimitry Andric   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1149fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Src->getReg());
1150fe6060f1SDimitry Andric   }
1151fe6060f1SDimitry Andric   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1152fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Src->getReg());
1153fe6060f1SDimitry Andric   }
1154fe6060f1SDimitry Andric   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1155fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1156fe6060f1SDimitry Andric   }
1157fe6060f1SDimitry Andric   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1158fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Src->getReg());
1159fe6060f1SDimitry Andric   }
1160fe6060f1SDimitry Andric   return nullptr;
1161fe6060f1SDimitry Andric }
1162fe6060f1SDimitry Andric 
/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  // Same class is not enough; the exact opcode subclass must match too.
  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  // One of the two instructions must be movable next to the other: verify
  // that it can be swapped past every instruction strictly between them.
  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction.  This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}
12170b57cec5SDimitry Andric 
// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, int OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  // Subregister indices that select each original value out of DestReg.
  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  // The constrained sload instructions in S_LOAD_IMM class will have
  // `early-clobber` flag in the dst operand. Remove the flag before using the
  // MOs in copies.
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1); // Last use of DestReg.
}
1247*0fca6ea1SDimitry Andric 
1248*0fca6ea1SDimitry Andric // Return a register for the source of the merged store after copying the
1249*0fca6ea1SDimitry Andric // original source regs of CI and Paired into it.
1250*0fca6ea1SDimitry Andric Register
1251*0fca6ea1SDimitry Andric SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1252*0fca6ea1SDimitry Andric                                       MachineBasicBlock::iterator InsertBefore,
1253*0fca6ea1SDimitry Andric                                       int OpName) const {
1254*0fca6ea1SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1255*0fca6ea1SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1256*0fca6ea1SDimitry Andric 
1257*0fca6ea1SDimitry Andric   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1258*0fca6ea1SDimitry Andric 
1259*0fca6ea1SDimitry Andric   // Copy to the new source register.
1260*0fca6ea1SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1261*0fca6ea1SDimitry Andric   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1262*0fca6ea1SDimitry Andric 
1263*0fca6ea1SDimitry Andric   const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1264*0fca6ea1SDimitry Andric   const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1265*0fca6ea1SDimitry Andric 
1266*0fca6ea1SDimitry Andric   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1267*0fca6ea1SDimitry Andric       .add(*Src0)
1268*0fca6ea1SDimitry Andric       .addImm(SubRegIdx0)
1269*0fca6ea1SDimitry Andric       .add(*Src1)
1270*0fca6ea1SDimitry Andric       .addImm(SubRegIdx1);
1271*0fca6ea1SDimitry Andric 
1272*0fca6ea1SDimitry Andric   return SrcReg;
1273*0fca6ea1SDimitry Andric }
1274*0fca6ea1SDimitry Andric 
12750b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
12760b57cec5SDimitry Andric   if (STM->ldsRequiresM0Init())
12770b57cec5SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
12780b57cec5SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
12790b57cec5SDimitry Andric }
12800b57cec5SDimitry Andric 
12810b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
12820b57cec5SDimitry Andric   if (STM->ldsRequiresM0Init())
12830b57cec5SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
12840b57cec5SDimitry Andric 
12850b57cec5SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
12860b57cec5SDimitry Andric                         : AMDGPU::DS_READ2ST64_B64_gfx9;
12870b57cec5SDimitry Andric }
12880b57cec5SDimitry Andric 
// Merge two DS reads into one ds_read2/ds_read2st64, materializing a rebased
// address when CI.BaseOff is set, then copy the two lanes of the result back
// to the original destination registers and erase the originals. Returns the
// new instruction.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  // Canonicalize so the smaller offset is offset0.
  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // offsetsCanBeCombined rebased the offsets; add BaseOff to the address.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}
13470b57cec5SDimitry Andric 
13480b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
13490b57cec5SDimitry Andric   if (STM->ldsRequiresM0Init())
13500b57cec5SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
13510b57cec5SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
13520b57cec5SDimitry Andric                         : AMDGPU::DS_WRITE2_B64_gfx9;
13530b57cec5SDimitry Andric }
13540b57cec5SDimitry Andric 
13550b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
13560b57cec5SDimitry Andric   if (STM->ldsRequiresM0Init())
13570b57cec5SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
13580b57cec5SDimitry Andric                           : AMDGPU::DS_WRITE2ST64_B64;
13590b57cec5SDimitry Andric 
13600b57cec5SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
13610b57cec5SDimitry Andric                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
13620b57cec5SDimitry Andric }
13630b57cec5SDimitry Andric 
// Merge two DS writes into one ds_write2/ds_write2st64, materializing a
// rebased address when CI.BaseOff is set, then erase the originals. Returns
// the new instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // offsetsCanBeCombined rebased the offsets; add BaseOff to the address.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}
14290b57cec5SDimitry Andric 
// Merge two MIMG instructions into one whose dmask is the union of the
// originals' dmasks and whose result uses the combined register class, then
// copy the lanes back to the original destination registers and erase the
// originals. Returns the new instruction.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  // Clone all source operands from CI's instruction, substituting the merged
  // dmask in place of the original one.
  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14658bcb0991SDimitry Andric 
// Merge two scalar-memory loads (s_load / s_buffer_load) into one wider load
// at the smaller of the two offsets, then copy the halves back to the
// original destination registers and erase the originals. Returns the new
// instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  // Only the SGPR+IMM buffer-load form carries an soffset operand.
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14978bcb0991SDimitry Andric 
14985ffd83dbSDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
14995ffd83dbSDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
150081ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
15018bcb0991SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
15028bcb0991SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
15038bcb0991SDimitry Andric 
1504480093f4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
15058bcb0991SDimitry Andric 
1506480093f4SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
15078bcb0991SDimitry Andric 
15088bcb0991SDimitry Andric   // Copy to the new source register.
15098bcb0991SDimitry Andric   Register DestReg = MRI->createVirtualRegister(SuperRC);
1510480093f4SDimitry Andric   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
15118bcb0991SDimitry Andric 
151281ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
15138bcb0991SDimitry Andric 
15145ffd83dbSDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
15158bcb0991SDimitry Andric 
15165ffd83dbSDimitry Andric   if (Regs.VAddr)
15178bcb0991SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
15188bcb0991SDimitry Andric 
15198bcb0991SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
15208bcb0991SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
15218bcb0991SDimitry Andric   // will return true if this is the case.
1522480093f4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
15238bcb0991SDimitry Andric 
15248bcb0991SDimitry Andric   MachineInstr *New =
15258bcb0991SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
15268bcb0991SDimitry Andric         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
15278bcb0991SDimitry Andric         .addImm(MergedOffset) // offset
1528fe6060f1SDimitry Andric         .addImm(CI.CPol)      // cpol
15298bcb0991SDimitry Andric         .addImm(0)            // swz
153081ad6265SDimitry Andric         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
15318bcb0991SDimitry Andric 
1532*0fca6ea1SDimitry Andric   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
15338bcb0991SDimitry Andric 
15348bcb0991SDimitry Andric   CI.I->eraseFromParent();
1535480093f4SDimitry Andric   Paired.I->eraseFromParent();
15368bcb0991SDimitry Andric   return New;
15370b57cec5SDimitry Andric }
15380b57cec5SDimitry Andric 
15395ffd83dbSDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
15405ffd83dbSDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
154181ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
1542480093f4SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1543480093f4SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1544480093f4SDimitry Andric 
1545480093f4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
1546480093f4SDimitry Andric 
1547480093f4SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1548480093f4SDimitry Andric 
1549480093f4SDimitry Andric   // Copy to the new source register.
1550480093f4SDimitry Andric   Register DestReg = MRI->createVirtualRegister(SuperRC);
1551480093f4SDimitry Andric   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1552480093f4SDimitry Andric 
155381ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1554480093f4SDimitry Andric 
15555ffd83dbSDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
1556480093f4SDimitry Andric 
15575ffd83dbSDimitry Andric   if (Regs.VAddr)
1558480093f4SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1559480093f4SDimitry Andric 
1560480093f4SDimitry Andric   unsigned JoinedFormat =
15615ffd83dbSDimitry Andric       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1562480093f4SDimitry Andric 
1563480093f4SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
1564480093f4SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
1565480093f4SDimitry Andric   // will return true if this is the case.
1566480093f4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1567480093f4SDimitry Andric 
1568480093f4SDimitry Andric   MachineInstr *New =
1569480093f4SDimitry Andric       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1570480093f4SDimitry Andric           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1571480093f4SDimitry Andric           .addImm(MergedOffset) // offset
1572480093f4SDimitry Andric           .addImm(JoinedFormat) // format
1573fe6060f1SDimitry Andric           .addImm(CI.CPol)      // cpol
1574480093f4SDimitry Andric           .addImm(0)            // swz
157581ad6265SDimitry Andric           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1576480093f4SDimitry Andric 
1577*0fca6ea1SDimitry Andric   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1578480093f4SDimitry Andric 
1579480093f4SDimitry Andric   CI.I->eraseFromParent();
1580480093f4SDimitry Andric   Paired.I->eraseFromParent();
1581480093f4SDimitry Andric   return New;
1582480093f4SDimitry Andric }
1583480093f4SDimitry Andric 
15845ffd83dbSDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
15855ffd83dbSDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
158681ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
1587480093f4SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1588480093f4SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1589480093f4SDimitry Andric 
1590480093f4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
1591480093f4SDimitry Andric 
1592*0fca6ea1SDimitry Andric   Register SrcReg =
1593*0fca6ea1SDimitry Andric       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1594480093f4SDimitry Andric 
159581ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1596480093f4SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
1597480093f4SDimitry Andric 
15985ffd83dbSDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
1599480093f4SDimitry Andric 
16005ffd83dbSDimitry Andric   if (Regs.VAddr)
1601480093f4SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1602480093f4SDimitry Andric 
1603480093f4SDimitry Andric   unsigned JoinedFormat =
16045ffd83dbSDimitry Andric       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1605480093f4SDimitry Andric 
1606480093f4SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
1607480093f4SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
1608480093f4SDimitry Andric   // will return true if this is the case.
1609480093f4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1610480093f4SDimitry Andric 
1611480093f4SDimitry Andric   MachineInstr *New =
1612480093f4SDimitry Andric       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1613480093f4SDimitry Andric           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1614480093f4SDimitry Andric           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1615480093f4SDimitry Andric           .addImm(JoinedFormat)                     // format
1616fe6060f1SDimitry Andric           .addImm(CI.CPol)                          // cpol
1617480093f4SDimitry Andric           .addImm(0)                                // swz
161881ad6265SDimitry Andric           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1619480093f4SDimitry Andric 
162081ad6265SDimitry Andric   CI.I->eraseFromParent();
162181ad6265SDimitry Andric   Paired.I->eraseFromParent();
162281ad6265SDimitry Andric   return New;
162381ad6265SDimitry Andric }
162481ad6265SDimitry Andric 
162581ad6265SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
162681ad6265SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
162781ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
162881ad6265SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
162981ad6265SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
163081ad6265SDimitry Andric 
163181ad6265SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
163281ad6265SDimitry Andric 
163381ad6265SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
163481ad6265SDimitry Andric   Register DestReg = MRI->createVirtualRegister(SuperRC);
163581ad6265SDimitry Andric 
163681ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
163781ad6265SDimitry Andric 
163881ad6265SDimitry Andric   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
163981ad6265SDimitry Andric     MIB.add(*SAddr);
164081ad6265SDimitry Andric 
164181ad6265SDimitry Andric   MachineInstr *New =
164281ad6265SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
164381ad6265SDimitry Andric        .addImm(std::min(CI.Offset, Paired.Offset))
164481ad6265SDimitry Andric        .addImm(CI.CPol)
164581ad6265SDimitry Andric        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
164681ad6265SDimitry Andric 
1647*0fca6ea1SDimitry Andric   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
164881ad6265SDimitry Andric 
164981ad6265SDimitry Andric   CI.I->eraseFromParent();
165081ad6265SDimitry Andric   Paired.I->eraseFromParent();
165181ad6265SDimitry Andric   return New;
165281ad6265SDimitry Andric }
165381ad6265SDimitry Andric 
165481ad6265SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
165581ad6265SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
165681ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
165781ad6265SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
165881ad6265SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
165981ad6265SDimitry Andric 
166081ad6265SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
166181ad6265SDimitry Andric 
1662*0fca6ea1SDimitry Andric   Register SrcReg =
1663*0fca6ea1SDimitry Andric       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
166481ad6265SDimitry Andric 
166581ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
166681ad6265SDimitry Andric                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
166781ad6265SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
166881ad6265SDimitry Andric 
166981ad6265SDimitry Andric   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
167081ad6265SDimitry Andric     MIB.add(*SAddr);
167181ad6265SDimitry Andric 
167281ad6265SDimitry Andric   MachineInstr *New =
167381ad6265SDimitry Andric     MIB.addImm(std::min(CI.Offset, Paired.Offset))
167481ad6265SDimitry Andric        .addImm(CI.CPol)
167581ad6265SDimitry Andric        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1676480093f4SDimitry Andric 
1677480093f4SDimitry Andric   CI.I->eraseFromParent();
1678480093f4SDimitry Andric   Paired.I->eraseFromParent();
1679480093f4SDimitry Andric   return New;
1680480093f4SDimitry Andric }
1681480093f4SDimitry Andric 
1682480093f4SDimitry Andric unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1683480093f4SDimitry Andric                                             const CombineInfo &Paired) {
1684480093f4SDimitry Andric   const unsigned Width = CI.Width + Paired.Width;
16850b57cec5SDimitry Andric 
168681ad6265SDimitry Andric   switch (getCommonInstClass(CI, Paired)) {
16870b57cec5SDimitry Andric   default:
16888bcb0991SDimitry Andric     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
16898bcb0991SDimitry Andric     // FIXME: Handle d16 correctly
16908bcb0991SDimitry Andric     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
16918bcb0991SDimitry Andric                                   Width);
1692480093f4SDimitry Andric   case TBUFFER_LOAD:
1693480093f4SDimitry Andric   case TBUFFER_STORE:
1694480093f4SDimitry Andric     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1695480093f4SDimitry Andric                                   Width);
1696480093f4SDimitry Andric 
16970b57cec5SDimitry Andric   case UNKNOWN:
16980b57cec5SDimitry Andric     llvm_unreachable("Unknown instruction class");
16990b57cec5SDimitry Andric   case S_BUFFER_LOAD_IMM:
17000b57cec5SDimitry Andric     switch (Width) {
17010b57cec5SDimitry Andric     default:
17020b57cec5SDimitry Andric       return 0;
17030b57cec5SDimitry Andric     case 2:
17040b57cec5SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
17055f757f3fSDimitry Andric     case 3:
17065f757f3fSDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
17070b57cec5SDimitry Andric     case 4:
17080b57cec5SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1709349cc55cSDimitry Andric     case 8:
1710349cc55cSDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
17110b57cec5SDimitry Andric     }
1712bdd1243dSDimitry Andric   case S_BUFFER_LOAD_SGPR_IMM:
1713bdd1243dSDimitry Andric     switch (Width) {
1714bdd1243dSDimitry Andric     default:
1715bdd1243dSDimitry Andric       return 0;
1716bdd1243dSDimitry Andric     case 2:
171706c3fb27SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
17185f757f3fSDimitry Andric     case 3:
17195f757f3fSDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1720bdd1243dSDimitry Andric     case 4:
172106c3fb27SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1722bdd1243dSDimitry Andric     case 8:
172306c3fb27SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1724bdd1243dSDimitry Andric     }
1725*0fca6ea1SDimitry Andric   case S_LOAD_IMM: {
1726*0fca6ea1SDimitry Andric     // If XNACK is enabled, use the constrained opcodes when the first load is
1727*0fca6ea1SDimitry Andric     // under-aligned.
1728*0fca6ea1SDimitry Andric     const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1729*0fca6ea1SDimitry Andric     bool NeedsConstrainedOpc =
1730*0fca6ea1SDimitry Andric         STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
1731bdd1243dSDimitry Andric     switch (Width) {
1732bdd1243dSDimitry Andric     default:
1733bdd1243dSDimitry Andric       return 0;
1734bdd1243dSDimitry Andric     case 2:
1735*0fca6ea1SDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736*0fca6ea1SDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX2_IMM;
17375f757f3fSDimitry Andric     case 3:
1738*0fca6ea1SDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739*0fca6ea1SDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX3_IMM;
1740bdd1243dSDimitry Andric     case 4:
1741*0fca6ea1SDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742*0fca6ea1SDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX4_IMM;
1743bdd1243dSDimitry Andric     case 8:
1744*0fca6ea1SDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745*0fca6ea1SDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX8_IMM;
1746*0fca6ea1SDimitry Andric     }
1747bdd1243dSDimitry Andric   }
174881ad6265SDimitry Andric   case GLOBAL_LOAD:
174981ad6265SDimitry Andric     switch (Width) {
175081ad6265SDimitry Andric     default:
175181ad6265SDimitry Andric       return 0;
175281ad6265SDimitry Andric     case 2:
175381ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX2;
175481ad6265SDimitry Andric     case 3:
175581ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX3;
175681ad6265SDimitry Andric     case 4:
175781ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX4;
175881ad6265SDimitry Andric     }
175981ad6265SDimitry Andric   case GLOBAL_LOAD_SADDR:
176081ad6265SDimitry Andric     switch (Width) {
176181ad6265SDimitry Andric     default:
176281ad6265SDimitry Andric       return 0;
176381ad6265SDimitry Andric     case 2:
176481ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
176581ad6265SDimitry Andric     case 3:
176681ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
176781ad6265SDimitry Andric     case 4:
176881ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
176981ad6265SDimitry Andric     }
177081ad6265SDimitry Andric   case GLOBAL_STORE:
177181ad6265SDimitry Andric     switch (Width) {
177281ad6265SDimitry Andric     default:
177381ad6265SDimitry Andric       return 0;
177481ad6265SDimitry Andric     case 2:
177581ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX2;
177681ad6265SDimitry Andric     case 3:
177781ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX3;
177881ad6265SDimitry Andric     case 4:
177981ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX4;
178081ad6265SDimitry Andric     }
178181ad6265SDimitry Andric   case GLOBAL_STORE_SADDR:
178281ad6265SDimitry Andric     switch (Width) {
178381ad6265SDimitry Andric     default:
178481ad6265SDimitry Andric       return 0;
178581ad6265SDimitry Andric     case 2:
178681ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
178781ad6265SDimitry Andric     case 3:
178881ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
178981ad6265SDimitry Andric     case 4:
179081ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
179181ad6265SDimitry Andric     }
179281ad6265SDimitry Andric   case FLAT_LOAD:
179381ad6265SDimitry Andric     switch (Width) {
179481ad6265SDimitry Andric     default:
179581ad6265SDimitry Andric       return 0;
179681ad6265SDimitry Andric     case 2:
179781ad6265SDimitry Andric       return AMDGPU::FLAT_LOAD_DWORDX2;
179881ad6265SDimitry Andric     case 3:
179981ad6265SDimitry Andric       return AMDGPU::FLAT_LOAD_DWORDX3;
180081ad6265SDimitry Andric     case 4:
180181ad6265SDimitry Andric       return AMDGPU::FLAT_LOAD_DWORDX4;
180281ad6265SDimitry Andric     }
180381ad6265SDimitry Andric   case FLAT_STORE:
180481ad6265SDimitry Andric     switch (Width) {
180581ad6265SDimitry Andric     default:
180681ad6265SDimitry Andric       return 0;
180781ad6265SDimitry Andric     case 2:
180881ad6265SDimitry Andric       return AMDGPU::FLAT_STORE_DWORDX2;
180981ad6265SDimitry Andric     case 3:
181081ad6265SDimitry Andric       return AMDGPU::FLAT_STORE_DWORDX3;
181181ad6265SDimitry Andric     case 4:
181281ad6265SDimitry Andric       return AMDGPU::FLAT_STORE_DWORDX4;
181381ad6265SDimitry Andric     }
18148bcb0991SDimitry Andric   case MIMG:
1815bdd1243dSDimitry Andric     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1816349cc55cSDimitry Andric            "No overlaps");
18178bcb0991SDimitry Andric     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
18180b57cec5SDimitry Andric   }
18190b57cec5SDimitry Andric }
18200b57cec5SDimitry Andric 
18210b57cec5SDimitry Andric std::pair<unsigned, unsigned>
1822349cc55cSDimitry Andric SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1823349cc55cSDimitry Andric                                     const CombineInfo &Paired) {
1824bdd1243dSDimitry Andric   assert((CI.InstClass != MIMG ||
1825bdd1243dSDimitry Andric           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
182681ad6265SDimitry Andric            CI.Width + Paired.Width)) &&
18278bcb0991SDimitry Andric          "No overlaps");
18288bcb0991SDimitry Andric 
1829349cc55cSDimitry Andric   unsigned Idx0;
1830349cc55cSDimitry Andric   unsigned Idx1;
1831349cc55cSDimitry Andric 
183204eeddc0SDimitry Andric   static const unsigned Idxs[5][4] = {
18338bcb0991SDimitry Andric       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
183404eeddc0SDimitry Andric       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
183504eeddc0SDimitry Andric       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
183604eeddc0SDimitry Andric       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
183704eeddc0SDimitry Andric       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
18388bcb0991SDimitry Andric   };
18398bcb0991SDimitry Andric 
184004eeddc0SDimitry Andric   assert(CI.Width >= 1 && CI.Width <= 4);
184104eeddc0SDimitry Andric   assert(Paired.Width >= 1 && Paired.Width <= 4);
18428bcb0991SDimitry Andric 
184381ad6265SDimitry Andric   if (Paired < CI) {
1844480093f4SDimitry Andric     Idx1 = Idxs[0][Paired.Width - 1];
1845480093f4SDimitry Andric     Idx0 = Idxs[Paired.Width][CI.Width - 1];
18460b57cec5SDimitry Andric   } else {
1847480093f4SDimitry Andric     Idx0 = Idxs[0][CI.Width - 1];
1848480093f4SDimitry Andric     Idx1 = Idxs[CI.Width][Paired.Width - 1];
18490b57cec5SDimitry Andric   }
18508bcb0991SDimitry Andric 
1851*0fca6ea1SDimitry Andric   return {Idx0, Idx1};
18520b57cec5SDimitry Andric }
18530b57cec5SDimitry Andric 
18540b57cec5SDimitry Andric const TargetRegisterClass *
1855480093f4SDimitry Andric SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1856*0fca6ea1SDimitry Andric                                              const CombineInfo &Paired) const {
1857bdd1243dSDimitry Andric   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1858bdd1243dSDimitry Andric       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1859480093f4SDimitry Andric     switch (CI.Width + Paired.Width) {
18600b57cec5SDimitry Andric     default:
18610b57cec5SDimitry Andric       return nullptr;
18620b57cec5SDimitry Andric     case 2:
18630b57cec5SDimitry Andric       return &AMDGPU::SReg_64_XEXECRegClass;
18645f757f3fSDimitry Andric     case 3:
18655f757f3fSDimitry Andric       return &AMDGPU::SGPR_96RegClass;
18660b57cec5SDimitry Andric     case 4:
18678bcb0991SDimitry Andric       return &AMDGPU::SGPR_128RegClass;
18680b57cec5SDimitry Andric     case 8:
18695ffd83dbSDimitry Andric       return &AMDGPU::SGPR_256RegClass;
18700b57cec5SDimitry Andric     case 16:
18715ffd83dbSDimitry Andric       return &AMDGPU::SGPR_512RegClass;
18720b57cec5SDimitry Andric     }
18730b57cec5SDimitry Andric   }
1874fe6060f1SDimitry Andric 
1875fe6060f1SDimitry Andric   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
18764824e7fdSDimitry Andric   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1877fe6060f1SDimitry Andric              ? TRI->getAGPRClassForBitWidth(BitWidth)
1878fe6060f1SDimitry Andric              : TRI->getVGPRClassForBitWidth(BitWidth);
18790b57cec5SDimitry Andric }
18800b57cec5SDimitry Andric 
18815ffd83dbSDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
18825ffd83dbSDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
188381ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
18840b57cec5SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
18850b57cec5SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
18860b57cec5SDimitry Andric 
1887480093f4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
18880b57cec5SDimitry Andric 
1889*0fca6ea1SDimitry Andric   Register SrcReg =
1890*0fca6ea1SDimitry Andric       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
18910b57cec5SDimitry Andric 
189281ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
18930b57cec5SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
18940b57cec5SDimitry Andric 
18955ffd83dbSDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
18960b57cec5SDimitry Andric 
18975ffd83dbSDimitry Andric   if (Regs.VAddr)
18980b57cec5SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
18990b57cec5SDimitry Andric 
19008bcb0991SDimitry Andric 
19018bcb0991SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
19028bcb0991SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
19038bcb0991SDimitry Andric   // will return true if this is the case.
1904480093f4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
19058bcb0991SDimitry Andric 
19068bcb0991SDimitry Andric   MachineInstr *New =
19070b57cec5SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
19080b57cec5SDimitry Andric         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1909480093f4SDimitry Andric         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1910fe6060f1SDimitry Andric         .addImm(CI.CPol)      // cpol
19118bcb0991SDimitry Andric         .addImm(0)            // swz
191281ad6265SDimitry Andric         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
19130b57cec5SDimitry Andric 
19140b57cec5SDimitry Andric   CI.I->eraseFromParent();
1915480093f4SDimitry Andric   Paired.I->eraseFromParent();
19168bcb0991SDimitry Andric   return New;
19170b57cec5SDimitry Andric }
19180b57cec5SDimitry Andric 
19190b57cec5SDimitry Andric MachineOperand
19208bcb0991SDimitry Andric SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
19210b57cec5SDimitry Andric   APInt V(32, Val, true);
19220b57cec5SDimitry Andric   if (TII->isInlineConstant(V))
19230b57cec5SDimitry Andric     return MachineOperand::CreateImm(Val);
19240b57cec5SDimitry Andric 
19258bcb0991SDimitry Andric   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
19260b57cec5SDimitry Andric   MachineInstr *Mov =
19270b57cec5SDimitry Andric   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
19280b57cec5SDimitry Andric           TII->get(AMDGPU::S_MOV_B32), Reg)
19290b57cec5SDimitry Andric     .addImm(Val);
19300b57cec5SDimitry Andric   (void)Mov;
19310b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
19320b57cec5SDimitry Andric   return MachineOperand::CreateReg(Reg, false);
19330b57cec5SDimitry Andric }
19340b57cec5SDimitry Andric 
19350b57cec5SDimitry Andric // Compute base address using Addr and return the final register.
19365ffd83dbSDimitry Andric Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
19378bcb0991SDimitry Andric                                            const MemAddress &Addr) const {
19380b57cec5SDimitry Andric   MachineBasicBlock *MBB = MI.getParent();
19390b57cec5SDimitry Andric   MachineBasicBlock::iterator MBBI = MI.getIterator();
19400b57cec5SDimitry Andric   DebugLoc DL = MI.getDebugLoc();
19410b57cec5SDimitry Andric 
19420b57cec5SDimitry Andric   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
19430b57cec5SDimitry Andric           Addr.Base.LoSubReg) &&
19440b57cec5SDimitry Andric          "Expected 32-bit Base-Register-Low!!");
19450b57cec5SDimitry Andric 
19460b57cec5SDimitry Andric   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
19470b57cec5SDimitry Andric           Addr.Base.HiSubReg) &&
19480b57cec5SDimitry Andric          "Expected 32-bit Base-Register-Hi!!");
19490b57cec5SDimitry Andric 
19500b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
19510b57cec5SDimitry Andric   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
19520b57cec5SDimitry Andric   MachineOperand OffsetHi =
19530b57cec5SDimitry Andric     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
19540b57cec5SDimitry Andric 
19550b57cec5SDimitry Andric   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
19568bcb0991SDimitry Andric   Register CarryReg = MRI->createVirtualRegister(CarryRC);
19578bcb0991SDimitry Andric   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
19580b57cec5SDimitry Andric 
19598bcb0991SDimitry Andric   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
19608bcb0991SDimitry Andric   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
19610b57cec5SDimitry Andric   MachineInstr *LoHalf =
1962e8d8bef9SDimitry Andric     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
19630b57cec5SDimitry Andric       .addReg(CarryReg, RegState::Define)
19640b57cec5SDimitry Andric       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
19650b57cec5SDimitry Andric       .add(OffsetLo)
19660b57cec5SDimitry Andric       .addImm(0); // clamp bit
19670b57cec5SDimitry Andric   (void)LoHalf;
19680b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
19690b57cec5SDimitry Andric 
19700b57cec5SDimitry Andric   MachineInstr *HiHalf =
19710b57cec5SDimitry Andric   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
19720b57cec5SDimitry Andric     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
19730b57cec5SDimitry Andric     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
19740b57cec5SDimitry Andric     .add(OffsetHi)
19750b57cec5SDimitry Andric     .addReg(CarryReg, RegState::Kill)
19760b57cec5SDimitry Andric     .addImm(0); // clamp bit
19770b57cec5SDimitry Andric   (void)HiHalf;
19780b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
19790b57cec5SDimitry Andric 
1980fe6060f1SDimitry Andric   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
19810b57cec5SDimitry Andric   MachineInstr *FullBase =
19820b57cec5SDimitry Andric     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
19830b57cec5SDimitry Andric       .addReg(DestSub0)
19840b57cec5SDimitry Andric       .addImm(AMDGPU::sub0)
19850b57cec5SDimitry Andric       .addReg(DestSub1)
19860b57cec5SDimitry Andric       .addImm(AMDGPU::sub1);
19870b57cec5SDimitry Andric   (void)FullBase;
19880b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
19890b57cec5SDimitry Andric 
19900b57cec5SDimitry Andric   return FullDestReg;
19910b57cec5SDimitry Andric }
19920b57cec5SDimitry Andric 
19930b57cec5SDimitry Andric // Update base and offset with the NewBase and NewOffset in MI.
19940b57cec5SDimitry Andric void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
19955ffd83dbSDimitry Andric                                                Register NewBase,
19968bcb0991SDimitry Andric                                                int32_t NewOffset) const {
1997480093f4SDimitry Andric   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1998480093f4SDimitry Andric   Base->setReg(NewBase);
1999480093f4SDimitry Andric   Base->setIsKill(false);
20000b57cec5SDimitry Andric   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
20010b57cec5SDimitry Andric }
20020b57cec5SDimitry Andric 
2003bdd1243dSDimitry Andric std::optional<int32_t>
20048bcb0991SDimitry Andric SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
20050b57cec5SDimitry Andric   if (Op.isImm())
20060b57cec5SDimitry Andric     return Op.getImm();
20070b57cec5SDimitry Andric 
20080b57cec5SDimitry Andric   if (!Op.isReg())
2009bdd1243dSDimitry Andric     return std::nullopt;
20100b57cec5SDimitry Andric 
20110b57cec5SDimitry Andric   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
20120b57cec5SDimitry Andric   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
20130b57cec5SDimitry Andric       !Def->getOperand(1).isImm())
2014bdd1243dSDimitry Andric     return std::nullopt;
20150b57cec5SDimitry Andric 
20160b57cec5SDimitry Andric   return Def->getOperand(1).getImm();
20170b57cec5SDimitry Andric }
20180b57cec5SDimitry Andric 
20190b57cec5SDimitry Andric // Analyze Base and extracts:
20200b57cec5SDimitry Andric //  - 32bit base registers, subregisters
20210b57cec5SDimitry Andric //  - 64bit constant offset
20220b57cec5SDimitry Andric // Expecting base computation as:
20230b57cec5SDimitry Andric //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
20240b57cec5SDimitry Andric //   %LO:vgpr_32, %c:sreg_64_xexec =
2025e8d8bef9SDimitry Andric //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
20260b57cec5SDimitry Andric //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
20270b57cec5SDimitry Andric //   %Base:vreg_64 =
20280b57cec5SDimitry Andric //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
20290b57cec5SDimitry Andric void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
20308bcb0991SDimitry Andric                                                       MemAddress &Addr) const {
20310b57cec5SDimitry Andric   if (!Base.isReg())
20320b57cec5SDimitry Andric     return;
20330b57cec5SDimitry Andric 
20340b57cec5SDimitry Andric   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
20350b57cec5SDimitry Andric   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
20360b57cec5SDimitry Andric       || Def->getNumOperands() != 5)
20370b57cec5SDimitry Andric     return;
20380b57cec5SDimitry Andric 
20390b57cec5SDimitry Andric   MachineOperand BaseLo = Def->getOperand(1);
20400b57cec5SDimitry Andric   MachineOperand BaseHi = Def->getOperand(3);
20410b57cec5SDimitry Andric   if (!BaseLo.isReg() || !BaseHi.isReg())
20420b57cec5SDimitry Andric     return;
20430b57cec5SDimitry Andric 
20440b57cec5SDimitry Andric   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
20450b57cec5SDimitry Andric   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
20460b57cec5SDimitry Andric 
2047e8d8bef9SDimitry Andric   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
20480b57cec5SDimitry Andric       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
20490b57cec5SDimitry Andric     return;
20500b57cec5SDimitry Andric 
20510b57cec5SDimitry Andric   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
20520b57cec5SDimitry Andric   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
20530b57cec5SDimitry Andric 
20540b57cec5SDimitry Andric   auto Offset0P = extractConstOffset(*Src0);
20550b57cec5SDimitry Andric   if (Offset0P)
20560b57cec5SDimitry Andric     BaseLo = *Src1;
20570b57cec5SDimitry Andric   else {
20580b57cec5SDimitry Andric     if (!(Offset0P = extractConstOffset(*Src1)))
20590b57cec5SDimitry Andric       return;
20600b57cec5SDimitry Andric     BaseLo = *Src0;
20610b57cec5SDimitry Andric   }
20620b57cec5SDimitry Andric 
20630b57cec5SDimitry Andric   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
20640b57cec5SDimitry Andric   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
20650b57cec5SDimitry Andric 
20660b57cec5SDimitry Andric   if (Src0->isImm())
20670b57cec5SDimitry Andric     std::swap(Src0, Src1);
20680b57cec5SDimitry Andric 
2069*0fca6ea1SDimitry Andric   if (!Src1->isImm() || Src0->isImm())
20700b57cec5SDimitry Andric     return;
20710b57cec5SDimitry Andric 
20720b57cec5SDimitry Andric   uint64_t Offset1 = Src1->getImm();
20730b57cec5SDimitry Andric   BaseHi = *Src0;
20740b57cec5SDimitry Andric 
20750b57cec5SDimitry Andric   Addr.Base.LoReg = BaseLo.getReg();
20760b57cec5SDimitry Andric   Addr.Base.HiReg = BaseHi.getReg();
20770b57cec5SDimitry Andric   Addr.Base.LoSubReg = BaseLo.getSubReg();
20780b57cec5SDimitry Andric   Addr.Base.HiSubReg = BaseHi.getSubReg();
20790b57cec5SDimitry Andric   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
20800b57cec5SDimitry Andric }
20810b57cec5SDimitry Andric 
20820b57cec5SDimitry Andric bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
20830b57cec5SDimitry Andric     MachineInstr &MI,
20840b57cec5SDimitry Andric     MemInfoMap &Visited,
20858bcb0991SDimitry Andric     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
20860b57cec5SDimitry Andric 
2087*0fca6ea1SDimitry Andric   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
20880b57cec5SDimitry Andric     return false;
20890b57cec5SDimitry Andric 
2090*0fca6ea1SDimitry Andric   // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2091*0fca6ea1SDimitry Andric   if (SIInstrInfo::isFLATScratch(MI))
20928bcb0991SDimitry Andric     return false;
20938bcb0991SDimitry Andric 
2094*0fca6ea1SDimitry Andric   unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2095*0fca6ea1SDimitry Andric                                               : AMDGPUAS::FLAT_ADDRESS;
20960b57cec5SDimitry Andric 
20970b57cec5SDimitry Andric   if (AnchorList.count(&MI))
20980b57cec5SDimitry Andric     return false;
20990b57cec5SDimitry Andric 
21000b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
21010b57cec5SDimitry Andric 
21020b57cec5SDimitry Andric   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
21030b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
21040b57cec5SDimitry Andric     return false;
21050b57cec5SDimitry Andric   }
21060b57cec5SDimitry Andric 
21070b57cec5SDimitry Andric   // Step1: Find the base-registers and a 64bit constant offset.
21080b57cec5SDimitry Andric   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
21090b57cec5SDimitry Andric   MemAddress MAddr;
211006c3fb27SDimitry Andric   if (!Visited.contains(&MI)) {
21110b57cec5SDimitry Andric     processBaseWithConstOffset(Base, MAddr);
21120b57cec5SDimitry Andric     Visited[&MI] = MAddr;
21130b57cec5SDimitry Andric   } else
21140b57cec5SDimitry Andric     MAddr = Visited[&MI];
21150b57cec5SDimitry Andric 
21160b57cec5SDimitry Andric   if (MAddr.Offset == 0) {
21170b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
21180b57cec5SDimitry Andric                          " constant offsets that can be promoted.\n";);
21190b57cec5SDimitry Andric     return false;
21200b57cec5SDimitry Andric   }
21210b57cec5SDimitry Andric 
21220b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
21230b57cec5SDimitry Andric              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
21240b57cec5SDimitry Andric 
21250b57cec5SDimitry Andric   // Step2: Traverse through MI's basic block and find an anchor(that has the
21260b57cec5SDimitry Andric   // same base-registers) with the highest 13bit distance from MI's offset.
21270b57cec5SDimitry Andric   // E.g. (64bit loads)
21280b57cec5SDimitry Andric   // bb:
21290b57cec5SDimitry Andric   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
21300b57cec5SDimitry Andric   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
21310b57cec5SDimitry Andric   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
21320b57cec5SDimitry Andric   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
21330b57cec5SDimitry Andric   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
21340b57cec5SDimitry Andric   //
21350b57cec5SDimitry Andric   // Starting from the first load, the optimization will try to find a new base
21360b57cec5SDimitry Andric   // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
21370b57cec5SDimitry Andric   // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
21380b57cec5SDimitry Andric   // as the new-base(anchor) because of the maximum distance which can
213981ad6265SDimitry Andric   // accommodate more intermediate bases presumably.
21400b57cec5SDimitry Andric   //
21410b57cec5SDimitry Andric   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
21420b57cec5SDimitry Andric   // (&a + 8192) for load1, load2, load4.
21430b57cec5SDimitry Andric   //   addr = &a + 8192
21440b57cec5SDimitry Andric   //   load1 = load(addr,       -4096)
21450b57cec5SDimitry Andric   //   load2 = load(addr,       -2048)
21460b57cec5SDimitry Andric   //   load3 = load(addr,       0)
21470b57cec5SDimitry Andric   //   load4 = load(addr,       2048)
21480b57cec5SDimitry Andric   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
21490b57cec5SDimitry Andric   //
21500b57cec5SDimitry Andric   MachineInstr *AnchorInst = nullptr;
21510b57cec5SDimitry Andric   MemAddress AnchorAddr;
21520b57cec5SDimitry Andric   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
21530b57cec5SDimitry Andric   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
21540b57cec5SDimitry Andric 
21550b57cec5SDimitry Andric   MachineBasicBlock *MBB = MI.getParent();
21560b57cec5SDimitry Andric   MachineBasicBlock::iterator E = MBB->end();
21570b57cec5SDimitry Andric   MachineBasicBlock::iterator MBBI = MI.getIterator();
21580b57cec5SDimitry Andric   ++MBBI;
21590b57cec5SDimitry Andric   const SITargetLowering *TLI =
21600b57cec5SDimitry Andric     static_cast<const SITargetLowering *>(STM->getTargetLowering());
21610b57cec5SDimitry Andric 
21620b57cec5SDimitry Andric   for ( ; MBBI != E; ++MBBI) {
21630b57cec5SDimitry Andric     MachineInstr &MINext = *MBBI;
21640b57cec5SDimitry Andric     // TODO: Support finding an anchor(with same base) from store addresses or
21650b57cec5SDimitry Andric     // any other load addresses where the opcodes are different.
21660b57cec5SDimitry Andric     if (MINext.getOpcode() != MI.getOpcode() ||
21670b57cec5SDimitry Andric         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
21680b57cec5SDimitry Andric       continue;
21690b57cec5SDimitry Andric 
21700b57cec5SDimitry Andric     const MachineOperand &BaseNext =
21710b57cec5SDimitry Andric       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
21720b57cec5SDimitry Andric     MemAddress MAddrNext;
217306c3fb27SDimitry Andric     if (!Visited.contains(&MINext)) {
21740b57cec5SDimitry Andric       processBaseWithConstOffset(BaseNext, MAddrNext);
21750b57cec5SDimitry Andric       Visited[&MINext] = MAddrNext;
21760b57cec5SDimitry Andric     } else
21770b57cec5SDimitry Andric       MAddrNext = Visited[&MINext];
21780b57cec5SDimitry Andric 
21790b57cec5SDimitry Andric     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
21800b57cec5SDimitry Andric         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
21810b57cec5SDimitry Andric         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
21820b57cec5SDimitry Andric         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
21830b57cec5SDimitry Andric       continue;
21840b57cec5SDimitry Andric 
2185*0fca6ea1SDimitry Andric     InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
21860b57cec5SDimitry Andric 
21870b57cec5SDimitry Andric     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
21880b57cec5SDimitry Andric     TargetLoweringBase::AddrMode AM;
21890b57cec5SDimitry Andric     AM.HasBaseReg = true;
21900b57cec5SDimitry Andric     AM.BaseOffs = Dist;
2191*0fca6ea1SDimitry Andric     if (TLI->isLegalFlatAddressingMode(AM, AS) &&
21920b57cec5SDimitry Andric         (uint32_t)std::abs(Dist) > MaxDist) {
21930b57cec5SDimitry Andric       MaxDist = std::abs(Dist);
21940b57cec5SDimitry Andric 
21950b57cec5SDimitry Andric       AnchorAddr = MAddrNext;
21960b57cec5SDimitry Andric       AnchorInst = &MINext;
21970b57cec5SDimitry Andric     }
21980b57cec5SDimitry Andric   }
21990b57cec5SDimitry Andric 
22000b57cec5SDimitry Andric   if (AnchorInst) {
22010b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
22020b57cec5SDimitry Andric                AnchorInst->dump());
22030b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
22040b57cec5SDimitry Andric                <<  AnchorAddr.Offset << "\n\n");
22050b57cec5SDimitry Andric 
22060b57cec5SDimitry Andric     // Instead of moving up, just re-compute anchor-instruction's base address.
22075ffd83dbSDimitry Andric     Register Base = computeBase(MI, AnchorAddr);
22080b57cec5SDimitry Andric 
22090b57cec5SDimitry Andric     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
22100b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
22110b57cec5SDimitry Andric 
2212*0fca6ea1SDimitry Andric     for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
22130b57cec5SDimitry Andric       TargetLoweringBase::AddrMode AM;
22140b57cec5SDimitry Andric       AM.HasBaseReg = true;
2215*0fca6ea1SDimitry Andric       AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
22160b57cec5SDimitry Andric 
2217*0fca6ea1SDimitry Andric       if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2218*0fca6ea1SDimitry Andric         LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
2219*0fca6ea1SDimitry Andric                    OtherMI->dump());
2220*0fca6ea1SDimitry Andric         updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2221*0fca6ea1SDimitry Andric         LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
22220b57cec5SDimitry Andric       }
22230b57cec5SDimitry Andric     }
22240b57cec5SDimitry Andric     AnchorList.insert(AnchorInst);
22250b57cec5SDimitry Andric     return true;
22260b57cec5SDimitry Andric   }
22270b57cec5SDimitry Andric 
22280b57cec5SDimitry Andric   return false;
22290b57cec5SDimitry Andric }
22300b57cec5SDimitry Andric 
22318bcb0991SDimitry Andric void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
22328bcb0991SDimitry Andric                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
22338bcb0991SDimitry Andric   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2234480093f4SDimitry Andric     if (AddrList.front().InstClass == CI.InstClass &&
223504eeddc0SDimitry Andric         AddrList.front().IsAGPR == CI.IsAGPR &&
2236bdd1243dSDimitry Andric         AddrList.front().hasSameBaseAddress(CI)) {
22378bcb0991SDimitry Andric       AddrList.emplace_back(CI);
22388bcb0991SDimitry Andric       return;
22398bcb0991SDimitry Andric     }
22408bcb0991SDimitry Andric   }
22410b57cec5SDimitry Andric 
22428bcb0991SDimitry Andric   // Base address not found, so add a new list.
22438bcb0991SDimitry Andric   MergeableInsts.emplace_back(1, CI);
22448bcb0991SDimitry Andric }
22458bcb0991SDimitry Andric 
22465ffd83dbSDimitry Andric std::pair<MachineBasicBlock::iterator, bool>
22475ffd83dbSDimitry Andric SILoadStoreOptimizer::collectMergeableInsts(
22485ffd83dbSDimitry Andric     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
22495ffd83dbSDimitry Andric     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
22508bcb0991SDimitry Andric     std::list<std::list<CombineInfo>> &MergeableInsts) const {
22518bcb0991SDimitry Andric   bool Modified = false;
22520b57cec5SDimitry Andric 
22538bcb0991SDimitry Andric   // Sort potential mergeable instructions into lists.  One list per base address.
22545ffd83dbSDimitry Andric   unsigned Order = 0;
22555ffd83dbSDimitry Andric   MachineBasicBlock::iterator BlockI = Begin;
22565ffd83dbSDimitry Andric   for (; BlockI != End; ++BlockI) {
22575ffd83dbSDimitry Andric     MachineInstr &MI = *BlockI;
22585ffd83dbSDimitry Andric 
22598bcb0991SDimitry Andric     // We run this before checking if an address is mergeable, because it can produce
22608bcb0991SDimitry Andric     // better code even if the instructions aren't mergeable.
22610b57cec5SDimitry Andric     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
22620b57cec5SDimitry Andric       Modified = true;
22630b57cec5SDimitry Andric 
22641fd87a68SDimitry Andric     // Treat volatile accesses, ordered accesses and unmodeled side effects as
22651fd87a68SDimitry Andric     // barriers. We can look after this barrier for separate merges.
22661fd87a68SDimitry Andric     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
22671fd87a68SDimitry Andric       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
22685ffd83dbSDimitry Andric 
22695ffd83dbSDimitry Andric       // Search will resume after this instruction in a separate merge list.
22705ffd83dbSDimitry Andric       ++BlockI;
22715ffd83dbSDimitry Andric       break;
22725ffd83dbSDimitry Andric     }
22735ffd83dbSDimitry Andric 
22748bcb0991SDimitry Andric     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
22758bcb0991SDimitry Andric     if (InstClass == UNKNOWN)
22768bcb0991SDimitry Andric       continue;
22778bcb0991SDimitry Andric 
227804eeddc0SDimitry Andric     // Do not merge VMEM buffer instructions with "swizzled" bit set.
227904eeddc0SDimitry Andric     int Swizzled =
228004eeddc0SDimitry Andric         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
228104eeddc0SDimitry Andric     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
228204eeddc0SDimitry Andric       continue;
228304eeddc0SDimitry Andric 
22848bcb0991SDimitry Andric     CombineInfo CI;
228504eeddc0SDimitry Andric     CI.setMI(MI, *this);
22865ffd83dbSDimitry Andric     CI.Order = Order++;
22878bcb0991SDimitry Andric 
22888bcb0991SDimitry Andric     if (!CI.hasMergeableAddress(*MRI))
22898bcb0991SDimitry Andric       continue;
22908bcb0991SDimitry Andric 
229104eeddc0SDimitry Andric     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
229204eeddc0SDimitry Andric       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
229304eeddc0SDimitry Andric       //        operands. However we are reporting that ds_write2 shall have
229404eeddc0SDimitry Andric       //        only VGPR data so that machine copy propagation does not
229504eeddc0SDimitry Andric       //        create an illegal instruction with a VGPR and AGPR sources.
229604eeddc0SDimitry Andric       //        Consequenctially if we create such instruction the verifier
229704eeddc0SDimitry Andric       //        will complain.
229804eeddc0SDimitry Andric       continue;
229904eeddc0SDimitry Andric     }
230004eeddc0SDimitry Andric 
23015ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
23025ffd83dbSDimitry Andric 
23038bcb0991SDimitry Andric     addInstToMergeableList(CI, MergeableInsts);
23048bcb0991SDimitry Andric   }
23055ffd83dbSDimitry Andric 
23065ffd83dbSDimitry Andric   // At this point we have lists of Mergeable instructions.
23075ffd83dbSDimitry Andric   //
23085ffd83dbSDimitry Andric   // Part 2: Sort lists by offset and then for each CombineInfo object in the
23095ffd83dbSDimitry Andric   // list try to find an instruction that can be merged with I.  If an instruction
23105ffd83dbSDimitry Andric   // is found, it is stored in the Paired field.  If no instructions are found, then
23115ffd83dbSDimitry Andric   // the CombineInfo object is deleted from the list.
23125ffd83dbSDimitry Andric 
23135ffd83dbSDimitry Andric   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
23145ffd83dbSDimitry Andric                                                    E = MergeableInsts.end(); I != E;) {
23155ffd83dbSDimitry Andric 
23165ffd83dbSDimitry Andric     std::list<CombineInfo> &MergeList = *I;
23175ffd83dbSDimitry Andric     if (MergeList.size() <= 1) {
23185ffd83dbSDimitry Andric       // This means we have found only one instruction with a given address
23195ffd83dbSDimitry Andric       // that can be merged, and we need at least 2 instructions to do a merge,
23205ffd83dbSDimitry Andric       // so this list can be discarded.
23215ffd83dbSDimitry Andric       I = MergeableInsts.erase(I);
23225ffd83dbSDimitry Andric       continue;
23235ffd83dbSDimitry Andric     }
23245ffd83dbSDimitry Andric 
23255ffd83dbSDimitry Andric     // Sort the lists by offsets, this way mergeable instructions will be
23265ffd83dbSDimitry Andric     // adjacent to each other in the list, which will make it easier to find
23275ffd83dbSDimitry Andric     // matches.
23285ffd83dbSDimitry Andric     MergeList.sort(
2329349cc55cSDimitry Andric         [] (const CombineInfo &A, const CombineInfo &B) {
23305ffd83dbSDimitry Andric           return A.Offset < B.Offset;
23315ffd83dbSDimitry Andric         });
23325ffd83dbSDimitry Andric     ++I;
23335ffd83dbSDimitry Andric   }
23345ffd83dbSDimitry Andric 
2335*0fca6ea1SDimitry Andric   return {BlockI, Modified};
23368bcb0991SDimitry Andric }
23378bcb0991SDimitry Andric 
23388bcb0991SDimitry Andric // Scan through looking for adjacent LDS operations with constant offsets from
23398bcb0991SDimitry Andric // the same base register. We rely on the scheduler to do the hard work of
23408bcb0991SDimitry Andric // clustering nearby loads, and assume these are all adjacent.
23418bcb0991SDimitry Andric bool SILoadStoreOptimizer::optimizeBlock(
23428bcb0991SDimitry Andric                        std::list<std::list<CombineInfo> > &MergeableInsts) {
23438bcb0991SDimitry Andric   bool Modified = false;
23448bcb0991SDimitry Andric 
23455ffd83dbSDimitry Andric   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
23465ffd83dbSDimitry Andric                                                    E = MergeableInsts.end(); I != E;) {
23475ffd83dbSDimitry Andric     std::list<CombineInfo> &MergeList = *I;
23488bcb0991SDimitry Andric 
23498bcb0991SDimitry Andric     bool OptimizeListAgain = false;
23508bcb0991SDimitry Andric     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
23515ffd83dbSDimitry Andric       // We weren't able to make any changes, so delete the list so we don't
23528bcb0991SDimitry Andric       // process the same instructions the next time we try to optimize this
23538bcb0991SDimitry Andric       // block.
23545ffd83dbSDimitry Andric       I = MergeableInsts.erase(I);
23550b57cec5SDimitry Andric       continue;
23560b57cec5SDimitry Andric     }
23570b57cec5SDimitry Andric 
23585ffd83dbSDimitry Andric     Modified = true;
23595ffd83dbSDimitry Andric 
23608bcb0991SDimitry Andric     // We made changes, but also determined that there were no more optimization
23618bcb0991SDimitry Andric     // opportunities, so we don't need to reprocess the list
23625ffd83dbSDimitry Andric     if (!OptimizeListAgain) {
23635ffd83dbSDimitry Andric       I = MergeableInsts.erase(I);
23645ffd83dbSDimitry Andric       continue;
23655ffd83dbSDimitry Andric     }
23665ffd83dbSDimitry Andric     OptimizeAgain = true;
23678bcb0991SDimitry Andric   }
23688bcb0991SDimitry Andric   return Modified;
23698bcb0991SDimitry Andric }
23708bcb0991SDimitry Andric 
23718bcb0991SDimitry Andric bool
23728bcb0991SDimitry Andric SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
23738bcb0991SDimitry Andric                                           std::list<CombineInfo> &MergeList,
23748bcb0991SDimitry Andric                                           bool &OptimizeListAgain) {
23755ffd83dbSDimitry Andric   if (MergeList.empty())
23765ffd83dbSDimitry Andric     return false;
23775ffd83dbSDimitry Andric 
23788bcb0991SDimitry Andric   bool Modified = false;
2379480093f4SDimitry Andric 
23805ffd83dbSDimitry Andric   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
23815ffd83dbSDimitry Andric        Next = std::next(I)) {
23825ffd83dbSDimitry Andric 
23835ffd83dbSDimitry Andric     auto First = I;
23845ffd83dbSDimitry Andric     auto Second = Next;
23855ffd83dbSDimitry Andric 
23865ffd83dbSDimitry Andric     if ((*First).Order > (*Second).Order)
23875ffd83dbSDimitry Andric       std::swap(First, Second);
23885ffd83dbSDimitry Andric     CombineInfo &CI = *First;
23895ffd83dbSDimitry Andric     CombineInfo &Paired = *Second;
23905ffd83dbSDimitry Andric 
239181ad6265SDimitry Andric     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
239281ad6265SDimitry Andric     if (!Where) {
23935ffd83dbSDimitry Andric       ++I;
2394480093f4SDimitry Andric       continue;
23955ffd83dbSDimitry Andric     }
2396480093f4SDimitry Andric 
2397480093f4SDimitry Andric     Modified = true;
23985ffd83dbSDimitry Andric 
23995ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
24000b57cec5SDimitry Andric 
240181ad6265SDimitry Andric     MachineBasicBlock::iterator NewMI;
24020b57cec5SDimitry Andric     switch (CI.InstClass) {
24030b57cec5SDimitry Andric     default:
2404480093f4SDimitry Andric       llvm_unreachable("unknown InstClass");
24050b57cec5SDimitry Andric       break;
240681ad6265SDimitry Andric     case DS_READ:
240781ad6265SDimitry Andric       NewMI = mergeRead2Pair(CI, Paired, Where->I);
240881ad6265SDimitry Andric       break;
240981ad6265SDimitry Andric     case DS_WRITE:
241081ad6265SDimitry Andric       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
241181ad6265SDimitry Andric       break;
241281ad6265SDimitry Andric     case S_BUFFER_LOAD_IMM:
2413bdd1243dSDimitry Andric     case S_BUFFER_LOAD_SGPR_IMM:
2414bdd1243dSDimitry Andric     case S_LOAD_IMM:
2415bdd1243dSDimitry Andric       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
241681ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 8;
241781ad6265SDimitry Andric       break;
241881ad6265SDimitry Andric     case BUFFER_LOAD:
241981ad6265SDimitry Andric       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
242081ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
242181ad6265SDimitry Andric       break;
242281ad6265SDimitry Andric     case BUFFER_STORE:
242381ad6265SDimitry Andric       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
242481ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
242581ad6265SDimitry Andric       break;
242681ad6265SDimitry Andric     case MIMG:
242781ad6265SDimitry Andric       NewMI = mergeImagePair(CI, Paired, Where->I);
242881ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
242981ad6265SDimitry Andric       break;
243081ad6265SDimitry Andric     case TBUFFER_LOAD:
243181ad6265SDimitry Andric       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
243281ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
243381ad6265SDimitry Andric       break;
243481ad6265SDimitry Andric     case TBUFFER_STORE:
243581ad6265SDimitry Andric       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
243681ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
243781ad6265SDimitry Andric       break;
243881ad6265SDimitry Andric     case FLAT_LOAD:
243981ad6265SDimitry Andric     case GLOBAL_LOAD:
244081ad6265SDimitry Andric     case GLOBAL_LOAD_SADDR:
244181ad6265SDimitry Andric       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
244281ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
244381ad6265SDimitry Andric       break;
244481ad6265SDimitry Andric     case FLAT_STORE:
244581ad6265SDimitry Andric     case GLOBAL_STORE:
244681ad6265SDimitry Andric     case GLOBAL_STORE_SADDR:
244781ad6265SDimitry Andric       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
244881ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
24498bcb0991SDimitry Andric       break;
2450480093f4SDimitry Andric     }
245104eeddc0SDimitry Andric     CI.setMI(NewMI, *this);
245281ad6265SDimitry Andric     CI.Order = Where->Order;
24535ffd83dbSDimitry Andric     if (I == Second)
24545ffd83dbSDimitry Andric       I = Next;
2455480093f4SDimitry Andric 
24565ffd83dbSDimitry Andric     MergeList.erase(Second);
24570b57cec5SDimitry Andric   }
24580b57cec5SDimitry Andric 
24590b57cec5SDimitry Andric   return Modified;
24600b57cec5SDimitry Andric }
24610b57cec5SDimitry Andric 
24620b57cec5SDimitry Andric bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
24630b57cec5SDimitry Andric   if (skipFunction(MF.getFunction()))
24640b57cec5SDimitry Andric     return false;
24650b57cec5SDimitry Andric 
24660b57cec5SDimitry Andric   STM = &MF.getSubtarget<GCNSubtarget>();
24670b57cec5SDimitry Andric   if (!STM->loadStoreOptEnabled())
24680b57cec5SDimitry Andric     return false;
24690b57cec5SDimitry Andric 
24700b57cec5SDimitry Andric   TII = STM->getInstrInfo();
24710b57cec5SDimitry Andric   TRI = &TII->getRegisterInfo();
24720b57cec5SDimitry Andric 
24730b57cec5SDimitry Andric   MRI = &MF.getRegInfo();
24740b57cec5SDimitry Andric   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
24750b57cec5SDimitry Andric 
24760b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
24770b57cec5SDimitry Andric 
24780b57cec5SDimitry Andric   bool Modified = false;
24790b57cec5SDimitry Andric 
24805ffd83dbSDimitry Andric   // Contains the list of instructions for which constant offsets are being
24815ffd83dbSDimitry Andric   // promoted to the IMM. This is tracked for an entire block at time.
24825ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr *, 4> AnchorList;
24835ffd83dbSDimitry Andric   MemInfoMap Visited;
24848bcb0991SDimitry Andric 
24850b57cec5SDimitry Andric   for (MachineBasicBlock &MBB : MF) {
24865ffd83dbSDimitry Andric     MachineBasicBlock::iterator SectionEnd;
24875ffd83dbSDimitry Andric     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
24885ffd83dbSDimitry Andric          I = SectionEnd) {
24895ffd83dbSDimitry Andric       bool CollectModified;
24908bcb0991SDimitry Andric       std::list<std::list<CombineInfo>> MergeableInsts;
24915ffd83dbSDimitry Andric 
24925ffd83dbSDimitry Andric       // First pass: Collect list of all instructions we know how to merge in a
24935ffd83dbSDimitry Andric       // subset of the block.
24945ffd83dbSDimitry Andric       std::tie(SectionEnd, CollectModified) =
24955ffd83dbSDimitry Andric           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
24965ffd83dbSDimitry Andric 
24975ffd83dbSDimitry Andric       Modified |= CollectModified;
24985ffd83dbSDimitry Andric 
24990b57cec5SDimitry Andric       do {
25000b57cec5SDimitry Andric         OptimizeAgain = false;
25018bcb0991SDimitry Andric         Modified |= optimizeBlock(MergeableInsts);
25020b57cec5SDimitry Andric       } while (OptimizeAgain);
25030b57cec5SDimitry Andric     }
25040b57cec5SDimitry Andric 
25055ffd83dbSDimitry Andric     Visited.clear();
25065ffd83dbSDimitry Andric     AnchorList.clear();
25075ffd83dbSDimitry Andric   }
25085ffd83dbSDimitry Andric 
25090b57cec5SDimitry Andric   return Modified;
25100b57cec5SDimitry Andric }
2511