xref: /llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (revision 8d13e7b8c382499c1cf0c2a3184b483e760f266b)
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //  ds_read2_b32 v[0:1], v2 offset0:4 offset1:8
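//  (The read2 offsets are encoded in units of the element size, so the byte
//  offsets 16 and 32 above become offset0:4 and offset1:8 for b32 elements.)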
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from nearby instructions that
24 // allows a 13-bit constant offset, which is then promoted to the
25 // immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
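// Here the two base addresses differ by 0x1800 - 0x1000 = 0x800 = 2048 bytes,
// which fits in the immediate offset field, so the v_add sequence for the
// 0x1800 base can be removed and that load uses offset:2048 instead.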
42 //
43 // Future improvements:
44 //
45 // - This currently misses stores of constants because the instruction that
46 //   loads the constant into the data register is placed between the stores,
47 //   although this is arguably a scheduling problem.
48 //
49 // - Live interval recomputation seems inefficient. This currently matches
50 //   only one pair, recomputes live intervals, and moves on to the next pair.
51 //   It would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but are close enough together to fit within 8 bits, we
56 //   can add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "SILoadStoreOptimizer.h"
61 #include "AMDGPU.h"
62 #include "GCNSubtarget.h"
63 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
64 #include "llvm/Analysis/AliasAnalysis.h"
65 #include "llvm/CodeGen/MachineFunctionPass.h"
66 #include "llvm/InitializePasses.h"
67 
68 using namespace llvm;
69 
70 #define DEBUG_TYPE "si-load-store-opt"
71 
72 namespace {
73 enum InstClassEnum {
74   UNKNOWN,
75   DS_READ,
76   DS_WRITE,
77   S_BUFFER_LOAD_IMM,
78   S_BUFFER_LOAD_SGPR_IMM,
79   S_LOAD_IMM,
80   BUFFER_LOAD,
81   BUFFER_STORE,
82   MIMG,
83   TBUFFER_LOAD,
84   TBUFFER_STORE,
85   GLOBAL_LOAD_SADDR,
86   GLOBAL_STORE_SADDR,
87   FLAT_LOAD,
88   FLAT_STORE,
89   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
90   GLOBAL_STORE // any CombineInfo; they are only ever returned by
91                // getCommonInstClass.
92 };
93 
94 struct AddressRegs {
95   unsigned char NumVAddrs = 0;
96   bool SBase = false;
97   bool SRsrc = false;
98   bool SOffset = false;
99   bool SAddr = false;
100   bool VAddr = false;
101   bool Addr = false;
102   bool SSamp = false;
103 };
104 
105 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
106 const unsigned MaxAddressRegs = 12 + 1 + 1;
107 
108 class SILoadStoreOptimizer {
109   struct CombineInfo {
110     MachineBasicBlock::iterator I;
111     unsigned EltSize;
112     unsigned Offset;
113     unsigned Width;
114     unsigned Format;
115     unsigned BaseOff;
116     unsigned DMask;
117     InstClassEnum InstClass;
118     unsigned CPol = 0;
119     bool IsAGPR;
120     bool UseST64;
121     int AddrIdx[MaxAddressRegs];
122     const MachineOperand *AddrReg[MaxAddressRegs];
123     unsigned NumAddresses;
124     unsigned Order;
125 
126     bool hasSameBaseAddress(const CombineInfo &CI) {
127       if (NumAddresses != CI.NumAddresses)
128         return false;
129 
130       const MachineInstr &MI = *CI.I;
131       for (unsigned i = 0; i < NumAddresses; i++) {
132         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
133 
134         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
135           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
136               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
137             return false;
138           }
139           continue;
140         }
141 
142         // Check same base pointer. Be careful of subregisters, which can occur
143         // with vectors of pointers.
144         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
145             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
146           return false;
147         }
148       }
149       return true;
150     }
151 
152     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
153       for (unsigned i = 0; i < NumAddresses; ++i) {
154         const MachineOperand *AddrOp = AddrReg[i];
155         // Immediates are always OK.
156         if (AddrOp->isImm())
157           continue;
158 
159         // Don't try to merge addresses that aren't immediates or registers.
160         // TODO: Should be possible to merge FrameIndexes and maybe some
161         // other non-register operands.
162         if (!AddrOp->isReg())
163           return false;
164 
165         // TODO: We should be able to merge instructions with other physical reg
166         // addresses too.
167         if (AddrOp->getReg().isPhysical() &&
168             AddrOp->getReg() != AMDGPU::SGPR_NULL)
169           return false;
170 
171         // If an address has only one use then there will be no other
172         // instructions with the same address, so we can't merge this one.
173         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
174           return false;
175       }
176       return true;
177     }
178 
179     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
180 
181     // Compare by the memory address accessed: offset, or DMask for MIMG.
182     bool operator<(const CombineInfo& Other) const {
183       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
184     }
185   };
186 
187   struct BaseRegisters {
188     Register LoReg;
189     Register HiReg;
190 
191     unsigned LoSubReg = 0;
192     unsigned HiSubReg = 0;
193   };
194 
195   struct MemAddress {
196     BaseRegisters Base;
197     int64_t Offset = 0;
198   };
199 
200   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
201 
202 private:
203   const GCNSubtarget *STM = nullptr;
204   const SIInstrInfo *TII = nullptr;
205   const SIRegisterInfo *TRI = nullptr;
206   MachineRegisterInfo *MRI = nullptr;
207   AliasAnalysis *AA = nullptr;
208   bool OptimizeAgain;
209 
210   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
211                            const DenseSet<Register> &ARegUses,
212                            const MachineInstr &A, const MachineInstr &B) const;
213   static bool dmasksCanBeCombined(const CombineInfo &CI,
214                                   const SIInstrInfo &TII,
215                                   const CombineInfo &Paired);
216   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
217                                    CombineInfo &Paired, bool Modify = false);
218   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
219                         const CombineInfo &Paired);
220   unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
221   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
222                                                      const CombineInfo &Paired);
223   const TargetRegisterClass *
224   getTargetRegisterClass(const CombineInfo &CI,
225                          const CombineInfo &Paired) const;
226   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
227 
228   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
229 
230   void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
231                       MachineBasicBlock::iterator InsertBefore, int OpName,
232                       Register DestReg) const;
233   Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
234                            MachineBasicBlock::iterator InsertBefore,
235                            int OpName) const;
236 
237   unsigned read2Opcode(unsigned EltSize) const;
238   unsigned read2ST64Opcode(unsigned EltSize) const;
239   MachineBasicBlock::iterator
240   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
241                  MachineBasicBlock::iterator InsertBefore);
242 
243   unsigned write2Opcode(unsigned EltSize) const;
244   unsigned write2ST64Opcode(unsigned EltSize) const;
245   MachineBasicBlock::iterator
246   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
247                   MachineBasicBlock::iterator InsertBefore);
248   MachineBasicBlock::iterator
249   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
250                  MachineBasicBlock::iterator InsertBefore);
251   MachineBasicBlock::iterator
252   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
253                        MachineBasicBlock::iterator InsertBefore);
254   MachineBasicBlock::iterator
255   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
256                       MachineBasicBlock::iterator InsertBefore);
257   MachineBasicBlock::iterator
258   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
259                        MachineBasicBlock::iterator InsertBefore);
260   MachineBasicBlock::iterator
261   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
262                        MachineBasicBlock::iterator InsertBefore);
263   MachineBasicBlock::iterator
264   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
265                         MachineBasicBlock::iterator InsertBefore);
266   MachineBasicBlock::iterator
267   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
268                     MachineBasicBlock::iterator InsertBefore);
269   MachineBasicBlock::iterator
270   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
271                      MachineBasicBlock::iterator InsertBefore);
272 
273   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
274                            int32_t NewOffset) const;
275   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
276   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
277   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
278   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
279   /// Promotes constant offset to the immediate by adjusting the base. It
280   /// tries to use a base from the nearby instructions that allows it to have
281   /// a 13-bit constant offset which gets promoted to the immediate.
282   bool promoteConstantOffsetToImm(MachineInstr &CI,
283                                   MemInfoMap &Visited,
284                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
285   void addInstToMergeableList(const CombineInfo &CI,
286                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
287 
288   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
289       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
290       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
291       std::list<std::list<CombineInfo>> &MergeableInsts) const;
292 
293   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
294                                                      const CombineInfo &Paired);
295 
296   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
297                                           const CombineInfo &Paired);
298 
299   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
300                                      bool &OptimizeListAgain);
301   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
302 
303 public:
304   SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
305   bool run(MachineFunction &MF);
306 };
307 
308 class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
309 public:
310   static char ID;
311 
312   SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
313 
314   bool runOnMachineFunction(MachineFunction &MF) override;
315 
316   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
317 
318   void getAnalysisUsage(AnalysisUsage &AU) const override {
319     AU.setPreservesCFG();
320     AU.addRequired<AAResultsWrapperPass>();
321 
322     MachineFunctionPass::getAnalysisUsage(AU);
323   }
324 
325   MachineFunctionProperties getRequiredProperties() const override {
326     return MachineFunctionProperties()
327       .set(MachineFunctionProperties::Property::IsSSA);
328   }
329 };
330 
331 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
332   const unsigned Opc = MI.getOpcode();
333 
334   if (TII.isMUBUF(Opc)) {
335     // FIXME: Handle d16 correctly
336     return AMDGPU::getMUBUFElements(Opc);
337   }
338   if (TII.isImage(MI)) {
339     uint64_t DMaskImm =
340         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
341     return llvm::popcount(DMaskImm);
342   }
343   if (TII.isMTBUF(Opc)) {
344     return AMDGPU::getMTBUFElements(Opc);
345   }
346 
347   switch (Opc) {
348   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
349   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
350   case AMDGPU::S_LOAD_DWORD_IMM:
351   case AMDGPU::GLOBAL_LOAD_DWORD:
352   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
353   case AMDGPU::GLOBAL_STORE_DWORD:
354   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
355   case AMDGPU::FLAT_LOAD_DWORD:
356   case AMDGPU::FLAT_STORE_DWORD:
357     return 1;
358   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
359   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
360   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
361   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
362   case AMDGPU::S_LOAD_DWORDX2_IMM:
363   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
364   case AMDGPU::GLOBAL_LOAD_DWORDX2:
365   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
366   case AMDGPU::GLOBAL_STORE_DWORDX2:
367   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
368   case AMDGPU::FLAT_LOAD_DWORDX2:
369   case AMDGPU::FLAT_STORE_DWORDX2:
370     return 2;
371   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
372   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
373   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
374   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
375   case AMDGPU::S_LOAD_DWORDX3_IMM:
376   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
377   case AMDGPU::GLOBAL_LOAD_DWORDX3:
378   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
379   case AMDGPU::GLOBAL_STORE_DWORDX3:
380   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
381   case AMDGPU::FLAT_LOAD_DWORDX3:
382   case AMDGPU::FLAT_STORE_DWORDX3:
383     return 3;
384   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
385   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
386   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
387   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
388   case AMDGPU::S_LOAD_DWORDX4_IMM:
389   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
390   case AMDGPU::GLOBAL_LOAD_DWORDX4:
391   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
392   case AMDGPU::GLOBAL_STORE_DWORDX4:
393   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
394   case AMDGPU::FLAT_LOAD_DWORDX4:
395   case AMDGPU::FLAT_STORE_DWORDX4:
396     return 4;
397   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
398   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
399   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
400   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
401   case AMDGPU::S_LOAD_DWORDX8_IMM:
402   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
403     return 8;
404   case AMDGPU::DS_READ_B32:
405   case AMDGPU::DS_READ_B32_gfx9:
406   case AMDGPU::DS_WRITE_B32:
407   case AMDGPU::DS_WRITE_B32_gfx9:
408     return 1;
409   case AMDGPU::DS_READ_B64:
410   case AMDGPU::DS_READ_B64_gfx9:
411   case AMDGPU::DS_WRITE_B64:
412   case AMDGPU::DS_WRITE_B64_gfx9:
413     return 2;
414   default:
415     return 0;
416   }
417 }
418 
419 /// Maps instruction opcode to enum InstClassEnum.
420 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
421   switch (Opc) {
422   default:
423     if (TII.isMUBUF(Opc)) {
424       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
425       default:
426         return UNKNOWN;
427       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
428       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
429       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
430       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
431       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
432       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
433       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
434       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
435       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
436       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
437       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
438       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
439       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
440       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
441       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
442       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
443         return BUFFER_LOAD;
444       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
445       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
446       case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
447       case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
448       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
449       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
450       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
451       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
452       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
453       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
454       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
455       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
456       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
457       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
458       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
459       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
460         return BUFFER_STORE;
461       }
462     }
463     if (TII.isImage(Opc)) {
464       // Ignore instructions encoded without vaddr.
465       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
466           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
467         return UNKNOWN;
468       // Ignore BVH instructions
469       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
470         return UNKNOWN;
471       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
472       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
473           TII.isGather4(Opc))
474         return UNKNOWN;
475       return MIMG;
476     }
477     if (TII.isMTBUF(Opc)) {
478       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
479       default:
480         return UNKNOWN;
481       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
482       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
483       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
484       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
485       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
486       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
487       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
488       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
489       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
490       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
491       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
492       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
493       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
494       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
495       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
496       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
497         return TBUFFER_LOAD;
498       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
499       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
500       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
501       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
502       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
503       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
504       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
505       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
506         return TBUFFER_STORE;
507       }
508     }
509     return UNKNOWN;
510   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
511   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
512   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
513   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
514   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
515   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
516   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
517   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
518   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
519     return S_BUFFER_LOAD_IMM;
520   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
521   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
522   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
523   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
524   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
525   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
526   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
527   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
528   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
529     return S_BUFFER_LOAD_SGPR_IMM;
530   case AMDGPU::S_LOAD_DWORD_IMM:
531   case AMDGPU::S_LOAD_DWORDX2_IMM:
532   case AMDGPU::S_LOAD_DWORDX3_IMM:
533   case AMDGPU::S_LOAD_DWORDX4_IMM:
534   case AMDGPU::S_LOAD_DWORDX8_IMM:
535   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
536   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
537   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
538   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
539     return S_LOAD_IMM;
540   case AMDGPU::DS_READ_B32:
541   case AMDGPU::DS_READ_B32_gfx9:
542   case AMDGPU::DS_READ_B64:
543   case AMDGPU::DS_READ_B64_gfx9:
544     return DS_READ;
545   case AMDGPU::DS_WRITE_B32:
546   case AMDGPU::DS_WRITE_B32_gfx9:
547   case AMDGPU::DS_WRITE_B64:
548   case AMDGPU::DS_WRITE_B64_gfx9:
549     return DS_WRITE;
550   case AMDGPU::GLOBAL_LOAD_DWORD:
551   case AMDGPU::GLOBAL_LOAD_DWORDX2:
552   case AMDGPU::GLOBAL_LOAD_DWORDX3:
553   case AMDGPU::GLOBAL_LOAD_DWORDX4:
554   case AMDGPU::FLAT_LOAD_DWORD:
555   case AMDGPU::FLAT_LOAD_DWORDX2:
556   case AMDGPU::FLAT_LOAD_DWORDX3:
557   case AMDGPU::FLAT_LOAD_DWORDX4:
558     return FLAT_LOAD;
559   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
560   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
561   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
562   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
563     return GLOBAL_LOAD_SADDR;
564   case AMDGPU::GLOBAL_STORE_DWORD:
565   case AMDGPU::GLOBAL_STORE_DWORDX2:
566   case AMDGPU::GLOBAL_STORE_DWORDX3:
567   case AMDGPU::GLOBAL_STORE_DWORDX4:
568   case AMDGPU::FLAT_STORE_DWORD:
569   case AMDGPU::FLAT_STORE_DWORDX2:
570   case AMDGPU::FLAT_STORE_DWORDX3:
571   case AMDGPU::FLAT_STORE_DWORDX4:
572     return FLAT_STORE;
573   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
574   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
575   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
576   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
577     return GLOBAL_STORE_SADDR;
578   }
579 }
580 
581 /// Determines instruction subclass from opcode. Only instructions
582 /// of the same subclass can be merged together. The merged instruction may have
583 /// a different subclass but must have the same class.
584 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
585   switch (Opc) {
586   default:
587     if (TII.isMUBUF(Opc))
588       return AMDGPU::getMUBUFBaseOpcode(Opc);
589     if (TII.isImage(Opc)) {
590       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
591       assert(Info);
592       return Info->BaseOpcode;
593     }
594     if (TII.isMTBUF(Opc))
595       return AMDGPU::getMTBUFBaseOpcode(Opc);
596     return -1;
597   case AMDGPU::DS_READ_B32:
598   case AMDGPU::DS_READ_B32_gfx9:
599   case AMDGPU::DS_READ_B64:
600   case AMDGPU::DS_READ_B64_gfx9:
601   case AMDGPU::DS_WRITE_B32:
602   case AMDGPU::DS_WRITE_B32_gfx9:
603   case AMDGPU::DS_WRITE_B64:
604   case AMDGPU::DS_WRITE_B64_gfx9:
605     return Opc;
606   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
607   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
608   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
609   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
610   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
611   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
612   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
613   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
614   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
615     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
616   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
617   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
618   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
619   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
620   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
621   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
622   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
623   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
624   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
625     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
626   case AMDGPU::S_LOAD_DWORD_IMM:
627   case AMDGPU::S_LOAD_DWORDX2_IMM:
628   case AMDGPU::S_LOAD_DWORDX3_IMM:
629   case AMDGPU::S_LOAD_DWORDX4_IMM:
630   case AMDGPU::S_LOAD_DWORDX8_IMM:
631   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
632   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
633   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
634   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
635     return AMDGPU::S_LOAD_DWORD_IMM;
636   case AMDGPU::GLOBAL_LOAD_DWORD:
637   case AMDGPU::GLOBAL_LOAD_DWORDX2:
638   case AMDGPU::GLOBAL_LOAD_DWORDX3:
639   case AMDGPU::GLOBAL_LOAD_DWORDX4:
640   case AMDGPU::FLAT_LOAD_DWORD:
641   case AMDGPU::FLAT_LOAD_DWORDX2:
642   case AMDGPU::FLAT_LOAD_DWORDX3:
643   case AMDGPU::FLAT_LOAD_DWORDX4:
644     return AMDGPU::FLAT_LOAD_DWORD;
645   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
646   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
647   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
648   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
649     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
650   case AMDGPU::GLOBAL_STORE_DWORD:
651   case AMDGPU::GLOBAL_STORE_DWORDX2:
652   case AMDGPU::GLOBAL_STORE_DWORDX3:
653   case AMDGPU::GLOBAL_STORE_DWORDX4:
654   case AMDGPU::FLAT_STORE_DWORD:
655   case AMDGPU::FLAT_STORE_DWORDX2:
656   case AMDGPU::FLAT_STORE_DWORDX3:
657   case AMDGPU::FLAT_STORE_DWORDX4:
658     return AMDGPU::FLAT_STORE_DWORD;
659   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
660   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
661   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
662   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
663     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
664   }
665 }
666 
667 // GLOBAL loads and stores are classified as FLAT initially. If both combined
668 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
669 // GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
670 // the resulting combined operation will be FLAT, potentially promoting one of
671 // the GLOBAL operations to FLAT.
672 // For other instructions, return the original class unmodified.
673 InstClassEnum
674 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
675                                          const CombineInfo &Paired) {
676   assert(CI.InstClass == Paired.InstClass);
677 
678   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
679       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
680     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
681 
682   return CI.InstClass;
683 }
684 
685 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
686   AddressRegs Result;
687 
688   if (TII.isMUBUF(Opc)) {
689     if (AMDGPU::getMUBUFHasVAddr(Opc))
690       Result.VAddr = true;
691     if (AMDGPU::getMUBUFHasSrsrc(Opc))
692       Result.SRsrc = true;
693     if (AMDGPU::getMUBUFHasSoffset(Opc))
694       Result.SOffset = true;
695 
696     return Result;
697   }
698 
699   if (TII.isImage(Opc)) {
700     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
701     if (VAddr0Idx >= 0) {
702       int RsrcName =
703           TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
704       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
705       Result.NumVAddrs = RsrcIdx - VAddr0Idx;
706     } else {
707       Result.VAddr = true;
708     }
709     Result.SRsrc = true;
710     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
711     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
712       Result.SSamp = true;
713 
714     return Result;
715   }
716   if (TII.isMTBUF(Opc)) {
717     if (AMDGPU::getMTBUFHasVAddr(Opc))
718       Result.VAddr = true;
719     if (AMDGPU::getMTBUFHasSrsrc(Opc))
720       Result.SRsrc = true;
721     if (AMDGPU::getMTBUFHasSoffset(Opc))
722       Result.SOffset = true;
723 
724     return Result;
725   }
726 
727   switch (Opc) {
728   default:
729     return Result;
730   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
731   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
732   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
733   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
734   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
735   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
736   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
737   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
738   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
739     Result.SOffset = true;
740     [[fallthrough]];
741   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
742   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
743   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
744   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
745   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
746   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
747   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
748   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
749   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
750   case AMDGPU::S_LOAD_DWORD_IMM:
751   case AMDGPU::S_LOAD_DWORDX2_IMM:
752   case AMDGPU::S_LOAD_DWORDX3_IMM:
753   case AMDGPU::S_LOAD_DWORDX4_IMM:
754   case AMDGPU::S_LOAD_DWORDX8_IMM:
755   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
756   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
757   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
758   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
759     Result.SBase = true;
760     return Result;
761   case AMDGPU::DS_READ_B32:
762   case AMDGPU::DS_READ_B64:
763   case AMDGPU::DS_READ_B32_gfx9:
764   case AMDGPU::DS_READ_B64_gfx9:
765   case AMDGPU::DS_WRITE_B32:
766   case AMDGPU::DS_WRITE_B64:
767   case AMDGPU::DS_WRITE_B32_gfx9:
768   case AMDGPU::DS_WRITE_B64_gfx9:
769     Result.Addr = true;
770     return Result;
771   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
772   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
773   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
774   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
775   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
776   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
777   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
778   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
779     Result.SAddr = true;
780     [[fallthrough]];
781   case AMDGPU::GLOBAL_LOAD_DWORD:
782   case AMDGPU::GLOBAL_LOAD_DWORDX2:
783   case AMDGPU::GLOBAL_LOAD_DWORDX3:
784   case AMDGPU::GLOBAL_LOAD_DWORDX4:
785   case AMDGPU::GLOBAL_STORE_DWORD:
786   case AMDGPU::GLOBAL_STORE_DWORDX2:
787   case AMDGPU::GLOBAL_STORE_DWORDX3:
788   case AMDGPU::GLOBAL_STORE_DWORDX4:
789   case AMDGPU::FLAT_LOAD_DWORD:
790   case AMDGPU::FLAT_LOAD_DWORDX2:
791   case AMDGPU::FLAT_LOAD_DWORDX3:
792   case AMDGPU::FLAT_LOAD_DWORDX4:
793   case AMDGPU::FLAT_STORE_DWORD:
794   case AMDGPU::FLAT_STORE_DWORDX2:
795   case AMDGPU::FLAT_STORE_DWORDX3:
796   case AMDGPU::FLAT_STORE_DWORDX4:
797     Result.VAddr = true;
798     return Result;
799   }
800 }
801 
802 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
803                                               const SILoadStoreOptimizer &LSO) {
804   I = MI;
805   unsigned Opc = MI->getOpcode();
806   InstClass = getInstClass(Opc, *LSO.TII);
807 
808   if (InstClass == UNKNOWN)
809     return;
810 
811   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
812 
813   switch (InstClass) {
814   case DS_READ:
815     EltSize =
816           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
817                                                                           : 4;
818     break;
819   case DS_WRITE:
820     EltSize =
821           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
822                                                                             : 4;
823     break;
824   case S_BUFFER_LOAD_IMM:
825   case S_BUFFER_LOAD_SGPR_IMM:
826   case S_LOAD_IMM:
827     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
828     break;
829   default:
830     EltSize = 4;
831     break;
832   }
833 
834   if (InstClass == MIMG) {
835     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
836     // Offset is not considered for MIMG instructions.
837     Offset = 0;
838   } else {
839     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
840     Offset = I->getOperand(OffsetIdx).getImm();
841   }
842 
843   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
844     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
845 
846   Width = getOpcodeWidth(*I, *LSO.TII);
847 
848   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
849     Offset &= 0xffff;
850   } else if (InstClass != MIMG) {
851     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
852   }
853 
854   AddressRegs Regs = getRegs(Opc, *LSO.TII);
855   bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
856 
857   NumAddresses = 0;
858   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
859     AddrIdx[NumAddresses++] =
860         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
861   if (Regs.Addr)
862     AddrIdx[NumAddresses++] =
863         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
864   if (Regs.SBase)
865     AddrIdx[NumAddresses++] =
866         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
867   if (Regs.SRsrc)
868     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
869         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
870   if (Regs.SOffset)
871     AddrIdx[NumAddresses++] =
872         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
873   if (Regs.SAddr)
874     AddrIdx[NumAddresses++] =
875         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
876   if (Regs.VAddr)
877     AddrIdx[NumAddresses++] =
878         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
879   if (Regs.SSamp)
880     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
881         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
882   assert(NumAddresses <= MaxAddressRegs);
883 
884   for (unsigned J = 0; J < NumAddresses; J++)
885     AddrReg[J] = &I->getOperand(AddrIdx[J]);
886 }
887 
888 } // end anonymous namespace.
889 
890 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
891                       "SI Load Store Optimizer", false, false)
892 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
893 INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
894                     "SI Load Store Optimizer", false, false)
895 
896 char SILoadStoreOptimizerLegacy::ID = 0;
897 
898 char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
899 
900 FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
901   return new SILoadStoreOptimizerLegacy();
902 }
903 
904 static void addDefsUsesToList(const MachineInstr &MI,
905                               DenseSet<Register> &RegDefs,
906                               DenseSet<Register> &RegUses) {
907   for (const auto &Op : MI.operands()) {
908     if (!Op.isReg())
909       continue;
910     if (Op.isDef())
911       RegDefs.insert(Op.getReg());
912     if (Op.readsReg())
913       RegUses.insert(Op.getReg());
914   }
915 }
916 
917 bool SILoadStoreOptimizer::canSwapInstructions(
918     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
919     const MachineInstr &A, const MachineInstr &B) const {
920   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
921       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
922     return false;
923   for (const auto &BOp : B.operands()) {
924     if (!BOp.isReg())
925       continue;
926     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
927       return false;
928     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
929       return false;
930   }
931   return true;
932 }
933 
934 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
935 // MMO for the combined operation with a new access size.
936 MachineMemOperand *
937 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
938                                                const CombineInfo &Paired) {
939   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
940   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
941 
942   unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
943 
944   // A base pointer for the combined operation is the same as the leading
945   // operation's pointer.
946   if (Paired < CI)
947     std::swap(MMOa, MMOb);
948 
949   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
950   // If merging FLAT and GLOBAL set address space to FLAT.
951   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
952     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
953 
954   MachineFunction *MF = CI.I->getMF();
955   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
956 }
957 
958 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
959                                                const SIInstrInfo &TII,
960                                                const CombineInfo &Paired) {
961   assert(CI.InstClass == MIMG);
962 
963   // Ignore instructions with tfe/lwe set.
964   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
965   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
966 
967   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
968     return false;
969 
970   // Check other optional immediate operands for equality.
971   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
972                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
973                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
974 
975   for (auto op : OperandsToMatch) {
976     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
977     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
978       return false;
979     if (Idx != -1 &&
980         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
981       return false;
982   }
983 
984   // Check DMask for overlaps.
985   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
986   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
987 
988   if (!MaxMask)
989     return false;
990 
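  // The smaller dmask must select only components strictly below the lowest
  // component selected by the larger dmask, e.g. dmasks 0b0011 and 0b1100 can
  // be combined, but 0b0010 and 0b0101 cannot.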
991   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
992   if ((1u << AllowedBitsForMin) <= MinMask)
993     return false;
994 
995   return true;
996 }
997 
998 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
999                                        unsigned ComponentCount,
1000                                        const GCNSubtarget &STI) {
1001   if (ComponentCount > 4)
1002     return 0;
1003 
1004   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1005       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
1006   if (!OldFormatInfo)
1007     return 0;
1008 
1009   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1010       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
1011                                            ComponentCount,
1012                                            OldFormatInfo->NumFormat, STI);
1013 
1014   if (!NewFormatInfo)
1015     return 0;
1016 
1017   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1018          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1019 
1020   return NewFormatInfo->Format;
1021 }
1022 
1023 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
1024 // highest power of two. Note that the result is well defined for all inputs
1025 // including corner cases like:
1026 // - if Lo == Hi, return that value
1027 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
1028 // - if Lo > Hi, return 0 (as if the range wrapped around)
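// For example, mostAlignedValueInRange(0x81, 0x17f) == 0x100: 0x100 lies in
// the range and is the value there aligned to the highest power of two (256).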
1029 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1030   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
1031 }
1032 
1033 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1034                                                 const GCNSubtarget &STI,
1035                                                 CombineInfo &Paired,
1036                                                 bool Modify) {
1037   assert(CI.InstClass != MIMG);
1038 
1039   // XXX - Would the same offset be OK? Is there any reason this would happen or
1040   // be useful?
1041   if (CI.Offset == Paired.Offset)
1042     return false;
1043 
1044   // This won't be valid if the offset isn't aligned.
1045   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1046     return false;
1047 
1048   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1049 
1050     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1051         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1052     if (!Info0)
1053       return false;
1054     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1055         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1056     if (!Info1)
1057       return false;
1058 
1059     if (Info0->BitsPerComp != Info1->BitsPerComp ||
1060         Info0->NumFormat != Info1->NumFormat)
1061       return false;
1062 
1063     // TODO: Should be possible to support more formats, but if format loads
1064     // are not dword-aligned, the merged load might not be valid.
1065     if (Info0->BitsPerComp != 32)
1066       return false;
1067 
1068     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1069       return false;
1070   }
1071 
1072   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1073   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1074   CI.UseST64 = false;
1075   CI.BaseOff = 0;
1076 
1077   // Handle all non-DS instructions.
1078   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1079     if (EltOffset0 + CI.Width != EltOffset1 &&
1080             EltOffset1 + Paired.Width != EltOffset0)
1081       return false;
1082     if (CI.CPol != Paired.CPol)
1083       return false;
1084     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1085         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1086       // Reject cases like:
1087       //   dword + dwordx2 -> dwordx3
1088       //   dword + dwordx3 -> dwordx4
1089       // If we tried to combine these cases, we would fail to extract a subreg
1090       // for the result of the second load due to SGPR alignment requirements.
1091       if (CI.Width != Paired.Width &&
1092           (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1093         return false;
1094     }
1095     return true;
1096   }
1097 
1098   // If the offset in elements doesn't fit in 8-bits, we might be able to use
1099   // If the offset in elements doesn't fit in 8 bits, we might be able to use
1100   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
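  // E.g. b32 byte offsets 0x1000 and 0x1100 are element offsets 0x400 and
  // 0x440; both are multiples of 64, so they can be encoded as offset0:16 and
  // offset1:17 of a single ds_read2st64_b32.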
1101       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1102     if (Modify) {
1103       CI.Offset = EltOffset0 / 64;
1104       Paired.Offset = EltOffset1 / 64;
1105       CI.UseST64 = true;
1106     }
1107     return true;
1108   }
1109 
1110   // Check if the new offsets fit in the reduced 8-bit range.
1111   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1112     if (Modify) {
1113       CI.Offset = EltOffset0;
1114       Paired.Offset = EltOffset1;
1115     }
1116     return true;
1117   }
1118 
1119   // Try to shift base address to decrease offsets.
1120   uint32_t Min = std::min(EltOffset0, EltOffset1);
1121   uint32_t Max = std::max(EltOffset0, EltOffset1);
1122 
1123   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
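  // Mask == 0x3fc0: the ST64 form can be used after a base adjustment if the
  // two offsets differ by a multiple of 64 no larger than 0xff * 64.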
1124   if (((Max - Min) & ~Mask) == 0) {
1125     if (Modify) {
1126       // From the range of values we could use for BaseOff, choose the one that
1127       // is aligned to the highest power of two, to maximise the chance that
1128       // the same offset can be reused for other load/store pairs.
1129       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1130       // Copy the low bits of the offsets, so that when we adjust them by
1131       // subtracting BaseOff they will be multiples of 64.
1132       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1133       CI.BaseOff = BaseOff * CI.EltSize;
1134       CI.Offset = (EltOffset0 - BaseOff) / 64;
1135       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1136       CI.UseST64 = true;
1137     }
1138     return true;
1139   }
1140 
1141   if (isUInt<8>(Max - Min)) {
1142     if (Modify) {
1143       // From the range of values we could use for BaseOff, choose the one that
1144       // is aligned to the highest power of two, to maximise the chance that
1145       // the same offset can be reused for other load/store pairs.
1146       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1147       CI.BaseOff = BaseOff * CI.EltSize;
1148       CI.Offset = EltOffset0 - BaseOff;
1149       Paired.Offset = EltOffset1 - BaseOff;
1150     }
1151     return true;
1152   }
1153 
1154   return false;
1155 }
1156 
1157 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1158                                      const CombineInfo &CI,
1159                                      const CombineInfo &Paired) {
1160   const unsigned Width = (CI.Width + Paired.Width);
1161   switch (CI.InstClass) {
1162   default:
1163     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1164   case S_BUFFER_LOAD_IMM:
1165   case S_BUFFER_LOAD_SGPR_IMM:
1166   case S_LOAD_IMM:
1167     switch (Width) {
1168     default:
1169       return false;
1170     case 2:
1171     case 4:
1172     case 8:
1173       return true;
1174     case 3:
1175       return STM.hasScalarDwordx3Loads();
1176     }
1177   }
1178 }
1179 
1180 const TargetRegisterClass *
1181 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1182   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1183     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1184   }
1185   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1186     return TRI->getRegClassForReg(*MRI, Src->getReg());
1187   }
1188   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1189     return TRI->getRegClassForReg(*MRI, Src->getReg());
1190   }
1191   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1192     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1193   }
1194   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1195     return TRI->getRegClassForReg(*MRI, Src->getReg());
1196   }
1197   return nullptr;
1198 }
1199 
1200 /// This function assumes that CI comes before Paired in a basic block. Return
1201 /// an insertion point for the merged instruction or nullptr on failure.
1202 SILoadStoreOptimizer::CombineInfo *
1203 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1204                                            CombineInfo &Paired) {
1205   // If another instruction has already been merged into CI, it may now be a
1206   // type that we can't do any further merging into.
1207   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1208     return nullptr;
1209   assert(CI.InstClass == Paired.InstClass);
1210 
1211   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1212       getInstSubclass(Paired.I->getOpcode(), *TII))
1213     return nullptr;
1214 
1215   // Check both offsets (or masks for MIMG) can be combined and fit in the
1216   // reduced range.
1217   if (CI.InstClass == MIMG) {
1218     if (!dmasksCanBeCombined(CI, *TII, Paired))
1219       return nullptr;
1220   } else {
1221     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1222       return nullptr;
1223   }
1224 
1225   DenseSet<Register> RegDefs;
1226   DenseSet<Register> RegUses;
1227   CombineInfo *Where;
1228   if (CI.I->mayLoad()) {
1229     // Try to hoist Paired up to CI.
1230     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1231     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1232       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1233         return nullptr;
1234     }
1235     Where = &CI;
1236   } else {
1237     // Try to sink CI down to Paired.
1238     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1239     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1240       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1241         return nullptr;
1242     }
1243     Where = &Paired;
1244   }
1245 
1246   // Call offsetsCanBeCombined with Modify = true so that the offsets are
1247   // correct for the new instruction. This should return true, because
1248   // this function should only be called on CombineInfo objects that
1249   // have already been confirmed to be mergeable.
1250   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1251     offsetsCanBeCombined(CI, *STM, Paired, true);
1252   return Where;
1253 }
1254 
1255 // Copy the merged load result from DestReg to the original dest regs of CI and
1256 // Paired.
1257 void SILoadStoreOptimizer::copyToDestRegs(
1258     CombineInfo &CI, CombineInfo &Paired,
1259     MachineBasicBlock::iterator InsertBefore, int OpName,
1260     Register DestReg) const {
1261   MachineBasicBlock *MBB = CI.I->getParent();
1262   DebugLoc DL = CI.I->getDebugLoc();
1263 
1264   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1265 
1266   // Copy to the old destination registers.
1267   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1268   auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1269   auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1270 
1271   // The constrained sload instructions in the S_LOAD_IMM class will have the
1272   // early-clobber flag set on the dst operand. Remove the flag before using
1273   // the MOs in copies.
1274   Dest0->setIsEarlyClobber(false);
1275   Dest1->setIsEarlyClobber(false);
1276 
1277   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1278       .add(*Dest0) // Copy to same destination including flags and sub reg.
1279       .addReg(DestReg, 0, SubRegIdx0);
1280   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1281       .add(*Dest1)
1282       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1283 }
1284 
1285 // Return a register for the source of the merged store after copying the
1286 // original source regs of CI and Paired into it.
1287 Register
1288 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1289                                       MachineBasicBlock::iterator InsertBefore,
1290                                       int OpName) const {
1291   MachineBasicBlock *MBB = CI.I->getParent();
1292   DebugLoc DL = CI.I->getDebugLoc();
1293 
1294   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1295 
1296   // Copy to the new source register.
1297   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1298   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1299 
1300   const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1301   const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1302 
1303   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1304       .add(*Src0)
1305       .addImm(SubRegIdx0)
1306       .add(*Src1)
1307       .addImm(SubRegIdx1);
1308 
1309   return SrcReg;
1310 }
1311 
1312 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1313   if (STM->ldsRequiresM0Init())
1314     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1315   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1316 }
1317 
1318 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1319   if (STM->ldsRequiresM0Init())
1320     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1321 
1322   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1323                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1324 }
1325 
1326 MachineBasicBlock::iterator
1327 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1328                                      MachineBasicBlock::iterator InsertBefore) {
1329   MachineBasicBlock *MBB = CI.I->getParent();
1330 
1331   // Be careful, since the addresses could be subregisters themselves in weird
1332   // cases, like vectors of pointers.
1333   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1334 
1335   unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1336   unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1337   unsigned Opc =
1338       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1339 
1340   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1341          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1342 
1343   const MCInstrDesc &Read2Desc = TII->get(Opc);
1344 
1345   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1346   Register DestReg = MRI->createVirtualRegister(SuperRC);
1347 
1348   DebugLoc DL = CI.I->getDebugLoc();
1349 
1350   Register BaseReg = AddrReg->getReg();
1351   unsigned BaseSubReg = AddrReg->getSubReg();
1352   unsigned BaseRegFlags = 0;
1353   if (CI.BaseOff) {
1354     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1355     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1356         .addImm(CI.BaseOff);
1357 
1358     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1359     BaseRegFlags = RegState::Kill;
1360 
1361     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1362         .addReg(ImmReg)
1363         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1364         .addImm(0); // clamp bit
1365     BaseSubReg = 0;
1366   }
1367 
1368   MachineInstrBuilder Read2 =
1369       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1370           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1371           .addImm(NewOffset0)                        // offset0
1372           .addImm(NewOffset1)                        // offset1
1373           .addImm(0)                                 // gds
1374           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1375 
1376   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1377 
1378   CI.I->eraseFromParent();
1379   Paired.I->eraseFromParent();
1380 
1381   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1382   return Read2;
1383 }
1384 
1385 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1386   if (STM->ldsRequiresM0Init())
1387     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1388   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1389                         : AMDGPU::DS_WRITE2_B64_gfx9;
1390 }
1391 
1392 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1393   if (STM->ldsRequiresM0Init())
1394     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1395                           : AMDGPU::DS_WRITE2ST64_B64;
1396 
1397   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1398                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1399 }
1400 
1401 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1402     CombineInfo &CI, CombineInfo &Paired,
1403     MachineBasicBlock::iterator InsertBefore) {
1404   MachineBasicBlock *MBB = CI.I->getParent();
1405 
1406   // Be sure to use .add(), and not .addReg(), with these. We want to be
1407   // sure we preserve the subregister index and any register flags set on them.
1408   const MachineOperand *AddrReg =
1409       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1410   const MachineOperand *Data0 =
1411       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1412   const MachineOperand *Data1 =
1413       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1414 
1415   unsigned NewOffset0 = CI.Offset;
1416   unsigned NewOffset1 = Paired.Offset;
1417   unsigned Opc =
1418       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1419 
1420   if (NewOffset0 > NewOffset1) {
1421     // Canonicalize the merged instruction so the smaller offset comes first.
1422     std::swap(NewOffset0, NewOffset1);
1423     std::swap(Data0, Data1);
1424   }
1425 
1426   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1427          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1428 
1429   const MCInstrDesc &Write2Desc = TII->get(Opc);
1430   DebugLoc DL = CI.I->getDebugLoc();
1431 
1432   Register BaseReg = AddrReg->getReg();
1433   unsigned BaseSubReg = AddrReg->getSubReg();
1434   unsigned BaseRegFlags = 0;
1435   if (CI.BaseOff) {
1436     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1437     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1438         .addImm(CI.BaseOff);
1439 
1440     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1441     BaseRegFlags = RegState::Kill;
1442 
1443     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1444         .addReg(ImmReg)
1445         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1446         .addImm(0); // clamp bit
1447     BaseSubReg = 0;
1448   }
1449 
1450   MachineInstrBuilder Write2 =
1451       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1452           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1453           .add(*Data0)                               // data0
1454           .add(*Data1)                               // data1
1455           .addImm(NewOffset0)                        // offset0
1456           .addImm(NewOffset1)                        // offset1
1457           .addImm(0)                                 // gds
1458           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1459 
1460   CI.I->eraseFromParent();
1461   Paired.I->eraseFromParent();
1462 
1463   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1464   return Write2;
1465 }
1466 
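// Image loads are merged by OR-ing their dmasks: e.g. a load with dmask:0x1
// and one with dmask:0x2 become a single load with dmask:0x3, and each
// original result is then copied out of the wider destination register.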
1467 MachineBasicBlock::iterator
1468 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1469                                      MachineBasicBlock::iterator InsertBefore) {
1470   MachineBasicBlock *MBB = CI.I->getParent();
1471   DebugLoc DL = CI.I->getDebugLoc();
1472   const unsigned Opcode = getNewOpcode(CI, Paired);
1473 
1474   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1475 
1476   Register DestReg = MRI->createVirtualRegister(SuperRC);
1477   unsigned MergedDMask = CI.DMask | Paired.DMask;
1478   unsigned DMaskIdx =
1479       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1480 
1481   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1482   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1483     if (I == DMaskIdx)
1484       MIB.addImm(MergedDMask);
1485     else
1486       MIB.add((*CI.I).getOperand(I));
1487   }
1488 
1489   // It shouldn't be possible to get this far if the two instructions
1490   // don't have a single memoperand, because MachineInstr::mayAlias()
1491   // will return true if this is the case.
1492   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1493 
1494   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1495 
1496   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1497 
1498   CI.I->eraseFromParent();
1499   Paired.I->eraseFromParent();
1500   return New;
1501 }
1502 
1503 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1504     CombineInfo &CI, CombineInfo &Paired,
1505     MachineBasicBlock::iterator InsertBefore) {
1506   MachineBasicBlock *MBB = CI.I->getParent();
1507   DebugLoc DL = CI.I->getDebugLoc();
1508   const unsigned Opcode = getNewOpcode(CI, Paired);
1509 
1510   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1511 
1512   Register DestReg = MRI->createVirtualRegister(SuperRC);
1513   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1514 
1515   // It shouldn't be possible to get this far if the two instructions
1516   // don't have a single memoperand, because MachineInstr::mayAlias()
1517   // will return true if this is the case.
1518   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1519 
1520   MachineInstrBuilder New =
1521       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1522           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1523   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1524     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1525   New.addImm(MergedOffset);
1526   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1527 
1528   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1529 
1530   CI.I->eraseFromParent();
1531   Paired.I->eraseFromParent();
1532   return New;
1533 }
1534 
1535 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1536     CombineInfo &CI, CombineInfo &Paired,
1537     MachineBasicBlock::iterator InsertBefore) {
1538   MachineBasicBlock *MBB = CI.I->getParent();
1539   DebugLoc DL = CI.I->getDebugLoc();
1540 
1541   const unsigned Opcode = getNewOpcode(CI, Paired);
1542 
1543   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1544 
1545   // Create the new destination register for the merged load.
1546   Register DestReg = MRI->createVirtualRegister(SuperRC);
1547   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1548 
1549   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1550 
1551   AddressRegs Regs = getRegs(Opcode, *TII);
1552 
1553   if (Regs.VAddr)
1554     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1555 
1556   // It shouldn't be possible to get this far if the two instructions
1557   // don't have a single memoperand, because MachineInstr::mayAlias()
1558   // will return true if this is the case.
1559   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1560 
1561   MachineInstr *New =
1562     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1563         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1564         .addImm(MergedOffset) // offset
1565         .addImm(CI.CPol)      // cpol
1566         .addImm(0)            // swz
1567         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1568 
1569   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1570 
1571   CI.I->eraseFromParent();
1572   Paired.I->eraseFromParent();
1573   return New;
1574 }
1575 
1576 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1577     CombineInfo &CI, CombineInfo &Paired,
1578     MachineBasicBlock::iterator InsertBefore) {
1579   MachineBasicBlock *MBB = CI.I->getParent();
1580   DebugLoc DL = CI.I->getDebugLoc();
1581 
1582   const unsigned Opcode = getNewOpcode(CI, Paired);
1583 
1584   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1585 
1586   // Create the new destination register for the merged load.
1587   Register DestReg = MRI->createVirtualRegister(SuperRC);
1588   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1589 
1590   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1591 
1592   AddressRegs Regs = getRegs(Opcode, *TII);
1593 
1594   if (Regs.VAddr)
1595     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1596 
1597   unsigned JoinedFormat =
1598       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1599 
1600   // It shouldn't be possible to get this far if the two instructions
1601   // don't have a single memoperand, because MachineInstr::mayAlias()
1602   // will return true if this is the case.
1603   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1604 
1605   MachineInstr *New =
1606       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1607           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1608           .addImm(MergedOffset) // offset
1609           .addImm(JoinedFormat) // format
1610           .addImm(CI.CPol)      // cpol
1611           .addImm(0)            // swz
1612           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1613 
1614   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1615 
1616   CI.I->eraseFromParent();
1617   Paired.I->eraseFromParent();
1618   return New;
1619 }
1620 
1621 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1622     CombineInfo &CI, CombineInfo &Paired,
1623     MachineBasicBlock::iterator InsertBefore) {
1624   MachineBasicBlock *MBB = CI.I->getParent();
1625   DebugLoc DL = CI.I->getDebugLoc();
1626 
1627   const unsigned Opcode = getNewOpcode(CI, Paired);
1628 
1629   Register SrcReg =
1630       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1631 
1632   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1633                  .addReg(SrcReg, RegState::Kill);
1634 
1635   AddressRegs Regs = getRegs(Opcode, *TII);
1636 
1637   if (Regs.VAddr)
1638     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1639 
1640   unsigned JoinedFormat =
1641       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1642 
1643   // It shouldn't be possible to get this far if the two instructions
1644   // don't have a single memoperand, because MachineInstr::mayAlias()
1645   // will return true if this is the case.
1646   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1647 
1648   MachineInstr *New =
1649       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1650           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1651           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1652           .addImm(JoinedFormat)                       // format
1653           .addImm(CI.CPol)                            // cpol
1654           .addImm(0)                                  // swz
1655           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1656 
1657   CI.I->eraseFromParent();
1658   Paired.I->eraseFromParent();
1659   return New;
1660 }
1661 
1662 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1663     CombineInfo &CI, CombineInfo &Paired,
1664     MachineBasicBlock::iterator InsertBefore) {
1665   MachineBasicBlock *MBB = CI.I->getParent();
1666   DebugLoc DL = CI.I->getDebugLoc();
1667 
1668   const unsigned Opcode = getNewOpcode(CI, Paired);
1669 
1670   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1671   Register DestReg = MRI->createVirtualRegister(SuperRC);
1672 
1673   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1674 
1675   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1676     MIB.add(*SAddr);
1677 
1678   MachineInstr *New =
1679     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1680        .addImm(std::min(CI.Offset, Paired.Offset))
1681        .addImm(CI.CPol)
1682        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1683 
1684   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1685 
1686   CI.I->eraseFromParent();
1687   Paired.I->eraseFromParent();
1688   return New;
1689 }
1690 
1691 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1692     CombineInfo &CI, CombineInfo &Paired,
1693     MachineBasicBlock::iterator InsertBefore) {
1694   MachineBasicBlock *MBB = CI.I->getParent();
1695   DebugLoc DL = CI.I->getDebugLoc();
1696 
1697   const unsigned Opcode = getNewOpcode(CI, Paired);
1698 
1699   Register SrcReg =
1700       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1701 
1702   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1703                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1704                  .addReg(SrcReg, RegState::Kill);
1705 
1706   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1707     MIB.add(*SAddr);
1708 
1709   MachineInstr *New =
1710     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1711        .addImm(CI.CPol)
1712        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1713 
1714   CI.I->eraseFromParent();
1715   Paired.I->eraseFromParent();
1716   return New;
1717 }
1718 
1719 static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1720                                    ArrayRef<MachineMemOperand *> MMOs,
1721                                    unsigned Width) {
1722   // Conservatively returns true if the MMO was not found.
1723   return STM.isXNACKEnabled() &&
1724          (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1725 }
1726 
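// Pick the opcode of the merged access for the combined width
// CI.Width + Paired.Width (in dwords).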
1727 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1728                                             const CombineInfo &Paired) {
1729   const unsigned Width = CI.Width + Paired.Width;
1730 
1731   switch (getCommonInstClass(CI, Paired)) {
1732   default:
1733     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1734     // FIXME: Handle d16 correctly
1735     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1736                                   Width);
1737   case TBUFFER_LOAD:
1738   case TBUFFER_STORE:
1739     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1740                                   Width);
1741 
1742   case UNKNOWN:
1743     llvm_unreachable("Unknown instruction class");
1744   case S_BUFFER_LOAD_IMM: {
1745     // If XNACK is enabled, use the constrained opcodes when the first load is
1746     // under-aligned.
1747     bool NeedsConstrainedOpc =
1748         needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1749     switch (Width) {
1750     default:
1751       return 0;
1752     case 2:
1753       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1754                                  : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1755     case 3:
1756       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1757                                  : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1758     case 4:
1759       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1760                                  : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1761     case 8:
1762       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1763                                  : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1764     }
1765   }
1766   case S_BUFFER_LOAD_SGPR_IMM: {
1767     // If XNACK is enabled, use the constrained opcodes when the first load is
1768     // under-aligned.
1769     bool NeedsConstrainedOpc =
1770         needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1771     switch (Width) {
1772     default:
1773       return 0;
1774     case 2:
1775       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1776                                  : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1777     case 3:
1778       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1779                                  : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1780     case 4:
1781       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1782                                  : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1783     case 8:
1784       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1785                                  : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1786     }
1787   }
1788   case S_LOAD_IMM: {
1789     // If XNACK is enabled, use the constrained opcodes when the first load is
1790     // under-aligned.
1791     bool NeedsConstrainedOpc =
1792         needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1793     switch (Width) {
1794     default:
1795       return 0;
1796     case 2:
1797       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1798                                  : AMDGPU::S_LOAD_DWORDX2_IMM;
1799     case 3:
1800       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1801                                  : AMDGPU::S_LOAD_DWORDX3_IMM;
1802     case 4:
1803       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1804                                  : AMDGPU::S_LOAD_DWORDX4_IMM;
1805     case 8:
1806       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1807                                  : AMDGPU::S_LOAD_DWORDX8_IMM;
1808     }
1809   }
1810   case GLOBAL_LOAD:
1811     switch (Width) {
1812     default:
1813       return 0;
1814     case 2:
1815       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1816     case 3:
1817       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1818     case 4:
1819       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1820     }
1821   case GLOBAL_LOAD_SADDR:
1822     switch (Width) {
1823     default:
1824       return 0;
1825     case 2:
1826       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1827     case 3:
1828       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1829     case 4:
1830       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1831     }
1832   case GLOBAL_STORE:
1833     switch (Width) {
1834     default:
1835       return 0;
1836     case 2:
1837       return AMDGPU::GLOBAL_STORE_DWORDX2;
1838     case 3:
1839       return AMDGPU::GLOBAL_STORE_DWORDX3;
1840     case 4:
1841       return AMDGPU::GLOBAL_STORE_DWORDX4;
1842     }
1843   case GLOBAL_STORE_SADDR:
1844     switch (Width) {
1845     default:
1846       return 0;
1847     case 2:
1848       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1849     case 3:
1850       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1851     case 4:
1852       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1853     }
1854   case FLAT_LOAD:
1855     switch (Width) {
1856     default:
1857       return 0;
1858     case 2:
1859       return AMDGPU::FLAT_LOAD_DWORDX2;
1860     case 3:
1861       return AMDGPU::FLAT_LOAD_DWORDX3;
1862     case 4:
1863       return AMDGPU::FLAT_LOAD_DWORDX4;
1864     }
1865   case FLAT_STORE:
1866     switch (Width) {
1867     default:
1868       return 0;
1869     case 2:
1870       return AMDGPU::FLAT_STORE_DWORDX2;
1871     case 3:
1872       return AMDGPU::FLAT_STORE_DWORDX3;
1873     case 4:
1874       return AMDGPU::FLAT_STORE_DWORDX4;
1875     }
1876   case MIMG:
1877     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1878            "No overlaps");
1879     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1880   }
1881 }
1882 
1883 std::pair<unsigned, unsigned>
1884 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1885                                     const CombineInfo &Paired) {
1886   assert((CI.InstClass != MIMG ||
1887           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1888            CI.Width + Paired.Width)) &&
1889          "No overlaps");
1890 
1891   unsigned Idx0;
1892   unsigned Idx1;
1893 
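  // Idxs[Start][Width - 1] is the subregister index covering Width dwords that
  // start at dword Start of the merged value; whichever instruction comes
  // first takes row 0, the other begins where the first one's width ends.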
1894   static const unsigned Idxs[5][4] = {
1895       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1896       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1897       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1898       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1899       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1900   };
1901 
1902   assert(CI.Width >= 1 && CI.Width <= 4);
1903   assert(Paired.Width >= 1 && Paired.Width <= 4);
1904 
1905   if (Paired < CI) {
1906     Idx1 = Idxs[0][Paired.Width - 1];
1907     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1908   } else {
1909     Idx0 = Idxs[0][CI.Width - 1];
1910     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1911   }
1912 
1913   return {Idx0, Idx1};
1914 }
1915 
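// Scalar (SMEM) results get an SGPR class sized for the combined width;
// everything else gets an AGPR or VGPR class of that width, matching the
// register class of the original data operand.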
1916 const TargetRegisterClass *
1917 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1918                                              const CombineInfo &Paired) const {
1919   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1920       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1921     switch (CI.Width + Paired.Width) {
1922     default:
1923       return nullptr;
1924     case 2:
1925       return &AMDGPU::SReg_64_XEXECRegClass;
1926     case 3:
1927       return &AMDGPU::SGPR_96RegClass;
1928     case 4:
1929       return &AMDGPU::SGPR_128RegClass;
1930     case 8:
1931       return &AMDGPU::SGPR_256RegClass;
1932     case 16:
1933       return &AMDGPU::SGPR_512RegClass;
1934     }
1935   }
1936 
1937   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1938   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1939              ? TRI->getAGPRClassForBitWidth(BitWidth)
1940              : TRI->getVGPRClassForBitWidth(BitWidth);
1941 }
1942 
1943 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1944     CombineInfo &CI, CombineInfo &Paired,
1945     MachineBasicBlock::iterator InsertBefore) {
1946   MachineBasicBlock *MBB = CI.I->getParent();
1947   DebugLoc DL = CI.I->getDebugLoc();
1948 
1949   const unsigned Opcode = getNewOpcode(CI, Paired);
1950 
1951   Register SrcReg =
1952       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1953 
1954   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1955                  .addReg(SrcReg, RegState::Kill);
1956 
1957   AddressRegs Regs = getRegs(Opcode, *TII);
1958 
1959   if (Regs.VAddr)
1960     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1961 
1963   // It shouldn't be possible to get this far if the two instructions
1964   // don't have a single memoperand, because MachineInstr::mayAlias()
1965   // will return true if this is the case.
1966   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1967 
1968   MachineInstr *New =
1969     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1970         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1971         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1972         .addImm(CI.CPol)                            // cpol
1973         .addImm(0)                                  // swz
1974         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1975 
1976   CI.I->eraseFromParent();
1977   Paired.I->eraseFromParent();
1978   return New;
1979 }
1980 
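// Return Val as an immediate operand if it is an inline constant; otherwise
// materialize it into a fresh SGPR with S_MOV_B32 and return that register.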
1981 MachineOperand
1982 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1983   APInt V(32, Val, true);
1984   if (TII->isInlineConstant(V))
1985     return MachineOperand::CreateImm(Val);
1986 
1987   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1988   MachineInstr *Mov =
1989   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1990           TII->get(AMDGPU::S_MOV_B32), Reg)
1991     .addImm(Val);
1992   (void)Mov;
1993   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1994   return MachineOperand::CreateReg(Reg, false);
1995 }
1996 
1997 // Compute base address using Addr and return the final register.
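// The sum is built as a 64-bit VGPR pair: V_ADD_CO_U32 adds the low half of
// the offset, V_ADDC_U32 folds the carry into the high half, and a
// REG_SEQUENCE stitches the two halves back together.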
1998 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1999                                            const MemAddress &Addr) const {
2000   MachineBasicBlock *MBB = MI.getParent();
2001   MachineBasicBlock::iterator MBBI = MI.getIterator();
2002   DebugLoc DL = MI.getDebugLoc();
2003 
2004   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2005           Addr.Base.LoSubReg) &&
2006          "Expected 32-bit Base-Register-Low!!");
2007 
2008   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2009           Addr.Base.HiSubReg) &&
2010          "Expected 32-bit Base-Register-Hi!!");
2011 
2012   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
2013   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
2014   MachineOperand OffsetHi =
2015     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2016 
2017   const auto *CarryRC = TRI->getWaveMaskRegClass();
2018   Register CarryReg = MRI->createVirtualRegister(CarryRC);
2019   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2020 
2021   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2022   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2023   MachineInstr *LoHalf =
2024     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2025       .addReg(CarryReg, RegState::Define)
2026       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2027       .add(OffsetLo)
2028       .addImm(0); // clamp bit
2029   (void)LoHalf;
2030   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
2031 
2032   MachineInstr *HiHalf =
2033   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2034     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2035     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2036     .add(OffsetHi)
2037     .addReg(CarryReg, RegState::Kill)
2038     .addImm(0); // clamp bit
2039   (void)HiHalf;
2040   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
2041 
2042   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2043   MachineInstr *FullBase =
2044     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2045       .addReg(DestSub0)
2046       .addImm(AMDGPU::sub0)
2047       .addReg(DestSub1)
2048       .addImm(AMDGPU::sub1);
2049   (void)FullBase;
2050   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
2051 
2052   return FullDestReg;
2053 }
2054 
2055 // Update base and offset with the NewBase and NewOffset in MI.
2056 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2057                                                Register NewBase,
2058                                                int32_t NewOffset) const {
2059   auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2060   Base->setReg(NewBase);
2061   Base->setIsKill(false);
2062   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2063 }
2064 
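// Return the constant behind Op: either an immediate operand, or the value
// moved by the unique S_MOV_B32 that defines the register; std::nullopt
// otherwise.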
2065 std::optional<int32_t>
2066 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2067   if (Op.isImm())
2068     return Op.getImm();
2069 
2070   if (!Op.isReg())
2071     return std::nullopt;
2072 
2073   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2074   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2075       !Def->getOperand(1).isImm())
2076     return std::nullopt;
2077 
2078   return Def->getOperand(1).getImm();
2079 }
2080 
2081 // Analyzes Base and extracts:
2082 //  - 32bit base registers, subregisters
2083 //  - 64bit constant offset
2084 // Expecting base computation as:
2085 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2086 //   %LO:vgpr_32, %c:sreg_64_xexec =
2087 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2088 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2089 //   %Base:vreg_64 =
2090 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2091 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2092                                                       MemAddress &Addr) const {
2093   if (!Base.isReg())
2094     return;
2095 
2096   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2097   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
2098       Def->getNumOperands() != 5)
2099     return;
2100 
2101   MachineOperand BaseLo = Def->getOperand(1);
2102   MachineOperand BaseHi = Def->getOperand(3);
2103   if (!BaseLo.isReg() || !BaseHi.isReg())
2104     return;
2105 
2106   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2107   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2108 
2109   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2110       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2111     return;
2112 
2113   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2114   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2115 
2116   auto Offset0P = extractConstOffset(*Src0);
2117   if (Offset0P)
2118     BaseLo = *Src1;
2119   else {
2120     if (!(Offset0P = extractConstOffset(*Src1)))
2121       return;
2122     BaseLo = *Src0;
2123   }
2124 
2125   if (!BaseLo.isReg())
2126     return;
2127 
2128   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2129   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2130 
2131   if (Src0->isImm())
2132     std::swap(Src0, Src1);
2133 
2134   if (!Src1->isImm() || Src0->isImm())
2135     return;
2136 
2137   uint64_t Offset1 = Src1->getImm();
2138   BaseHi = *Src0;
2139 
2140   if (!BaseHi.isReg())
2141     return;
2142 
2143   Addr.Base.LoReg = BaseLo.getReg();
2144   Addr.Base.HiReg = BaseHi.getReg();
2145   Addr.Base.LoSubReg = BaseLo.getSubReg();
2146   Addr.Base.HiSubReg = BaseHi.getSubReg();
2147   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2148 }
2149 
2150 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2151     MachineInstr &MI,
2152     MemInfoMap &Visited,
2153     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2154 
2155   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2156     return false;
2157 
2158   // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2159   if (SIInstrInfo::isFLATScratch(MI))
2160     return false;
2161 
2162   unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2163                                               : AMDGPUAS::FLAT_ADDRESS;
2164 
2165   if (AnchorList.count(&MI))
2166     return false;
2167 
2168   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2169 
2170   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2171     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2172     return false;
2173   }
2174 
2175   // Step1: Find the base-registers and a 64bit constant offset.
2176   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2177   MemAddress MAddr;
2178   if (!Visited.contains(&MI)) {
2179     processBaseWithConstOffset(Base, MAddr);
2180     Visited[&MI] = MAddr;
2181   } else
2182     MAddr = Visited[&MI];
2183 
2184   if (MAddr.Offset == 0) {
2185     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2186                          " constant offsets that can be promoted.\n";);
2187     return false;
2188   }
2189 
2190   LLVM_DEBUG(dbgs() << "  BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
2191                     << printReg(MAddr.Base.LoReg, TRI)
2192                     << "} Offset: " << MAddr.Offset << "\n\n";);
2193 
2194   // Step2: Traverse through MI's basic block and find an anchor (that has the
2195   // same base-registers) with the highest 13bit distance from MI's offset.
2196   // E.g. (64bit loads)
2197   // bb:
2198   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2199   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2200   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2201   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2202   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2203   //
2204   // Starting from the first load, the optimization will try to find a new base
2205   // from which (&a + 4096) has a 13bit distance. Both &a + 6144 and &a + 8192
2206   // have a 13bit distance from &a + 4096. The heuristic considers &a + 8192
2207   // as the new base (anchor) because it has the maximum distance, which can
2208   // presumably accommodate more intermediate bases.
2209   //
2210   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2211   // (&a + 8192) for load1, load2, load4.
2212   //   addr = &a + 8192
2213   //   load1 = load(addr,       -4096)
2214   //   load2 = load(addr,       -2048)
2215   //   load3 = load(addr,       0)
2216   //   load4 = load(addr,       2048)
2217   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2218   //
2219   MachineInstr *AnchorInst = nullptr;
2220   MemAddress AnchorAddr;
2221   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2222   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2223 
2224   MachineBasicBlock *MBB = MI.getParent();
2225   MachineBasicBlock::iterator E = MBB->end();
2226   MachineBasicBlock::iterator MBBI = MI.getIterator();
2227   ++MBBI;
2228   const SITargetLowering *TLI =
2229     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2230 
2231   for ( ; MBBI != E; ++MBBI) {
2232     MachineInstr &MINext = *MBBI;
2233     // TODO: Support finding an anchor (with the same base) from store
2234     // addresses or any other load addresses where the opcodes are different.
2235     if (MINext.getOpcode() != MI.getOpcode() ||
2236         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2237       continue;
2238 
2239     const MachineOperand &BaseNext =
2240       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2241     MemAddress MAddrNext;
2242     if (!Visited.contains(&MINext)) {
2243       processBaseWithConstOffset(BaseNext, MAddrNext);
2244       Visited[&MINext] = MAddrNext;
2245     } else
2246       MAddrNext = Visited[&MINext];
2247 
2248     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2249         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2250         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2251         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2252       continue;
2253 
2254     InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2255 
2256     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2257     TargetLoweringBase::AddrMode AM;
2258     AM.HasBaseReg = true;
2259     AM.BaseOffs = Dist;
2260     if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2261         (uint32_t)std::abs(Dist) > MaxDist) {
2262       MaxDist = std::abs(Dist);
2263 
2264       AnchorAddr = MAddrNext;
2265       AnchorInst = &MINext;
2266     }
2267   }
2268 
2269   if (AnchorInst) {
2270     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2271                AnchorInst->dump());
2272     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2273                <<  AnchorAddr.Offset << "\n\n");
2274 
2275     // Instead of moving up, re-compute the anchor instruction's base address.
2276     Register Base = computeBase(MI, AnchorAddr);
2277 
2278     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2279     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2280 
2281     for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2282       TargetLoweringBase::AddrMode AM;
2283       AM.HasBaseReg = true;
2284       AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2285 
2286       if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2287         LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset << ")";
2288                    OtherMI->dump());
2289         updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2290         LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
2291       }
2292     }
2293     AnchorList.insert(AnchorInst);
2294     return true;
2295   }
2296 
2297   return false;
2298 }
2299 
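// Append CI to the list whose entries share its instruction class, AGPR-ness
// and base address; if no such list exists yet, start a new one.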
2300 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2301                  std::list<std::list<CombineInfo>> &MergeableInsts) const {
2302   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2303     if (AddrList.front().InstClass == CI.InstClass &&
2304         AddrList.front().IsAGPR == CI.IsAGPR &&
2305         AddrList.front().hasSameBaseAddress(CI)) {
2306       AddrList.emplace_back(CI);
2307       return;
2308     }
2309   }
2310 
2311   // Base address not found, so add a new list.
2312   MergeableInsts.emplace_back(1, CI);
2313 }
2314 
2315 std::pair<MachineBasicBlock::iterator, bool>
2316 SILoadStoreOptimizer::collectMergeableInsts(
2317     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2318     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2319     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2320   bool Modified = false;
2321 
2322   // Sort potentially mergeable instructions into lists; one per base address.
2323   unsigned Order = 0;
2324   MachineBasicBlock::iterator BlockI = Begin;
2325   for (; BlockI != End; ++BlockI) {
2326     MachineInstr &MI = *BlockI;
2327 
2328     // We run this before checking if an address is mergeable, because it can
2329     // produce better code even if the instructions aren't mergeable.
2330     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2331       Modified = true;
2332 
2333     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2334     // barriers. We can look after this barrier for separate merges.
2335     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2336       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2337 
2338       // Search will resume after this instruction in a separate merge list.
2339       ++BlockI;
2340       break;
2341     }
2342 
2343     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2344     if (InstClass == UNKNOWN)
2345       continue;
2346 
2347     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2348     int Swizzled =
2349         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2350     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2351       continue;
2352 
2353     CombineInfo CI;
2354     CI.setMI(MI, *this);
2355     CI.Order = Order++;
2356 
2357     if (!CI.hasMergeableAddress(*MRI))
2358       continue;
2359 
2360     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2361       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2362       //        operands. However, we report that ds_write2 shall have
2363       //        only VGPR data so that machine copy propagation does not
2364       //        create an illegal instruction with VGPR and AGPR sources.
2365       //        Consequently, if we create such an instruction the verifier
2366       //        will complain.
2367       continue;
2368     }
2369 
2370     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2371 
2372     addInstToMergeableList(CI, MergeableInsts);
2373   }
2374 
2375   // At this point we have lists of Mergeable instructions.
2376   //
2377   // Part 2: Sort lists by offset and then for each CombineInfo object in the
2378   // list try to find an instruction that can be merged with I. If an
2379   // instruction is found, it is stored in the Paired field. If no instructions
2380   // are found, then the CombineInfo object is deleted from the list.
2381 
2382   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2383                                                    E = MergeableInsts.end(); I != E;) {
2384 
2385     std::list<CombineInfo> &MergeList = *I;
2386     if (MergeList.size() <= 1) {
2387       // This means we have found only one instruction with a given address
2388       // that can be merged, and we need at least 2 instructions to do a merge,
2389       // so this list can be discarded.
2390       I = MergeableInsts.erase(I);
2391       continue;
2392     }
2393 
2394     // Sort the lists by offsets, this way mergeable instructions will be
2395     // adjacent to each other in the list, which will make it easier to find
2396     // matches.
2397     MergeList.sort(
2398         [] (const CombineInfo &A, const CombineInfo &B) {
2399           return A.Offset < B.Offset;
2400         });
2401     ++I;
2402   }
2403 
2404   return {BlockI, Modified};
2405 }
2406 
2407 // Scan through looking for adjacent LDS operations with constant offsets from
2408 // the same base register. We rely on the scheduler to do the hard work of
2409 // clustering nearby loads, and assume these are all adjacent.
2410 bool SILoadStoreOptimizer::optimizeBlock(
2411                        std::list<std::list<CombineInfo>> &MergeableInsts) {
2412   bool Modified = false;
2413 
2414   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2415                                                    E = MergeableInsts.end(); I != E;) {
2416     std::list<CombineInfo> &MergeList = *I;
2417 
2418     bool OptimizeListAgain = false;
2419     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2420       // We weren't able to make any changes, so delete the list so we don't
2421       // process the same instructions the next time we try to optimize this
2422       // block.
2423       I = MergeableInsts.erase(I);
2424       continue;
2425     }
2426 
2427     Modified = true;
2428 
2429     // We made changes, but also determined that there were no more optimization
2430     // opportunities, so we don't need to reprocess the list.
2431     if (!OptimizeListAgain) {
2432       I = MergeableInsts.erase(I);
2433       continue;
2434     }
2435     OptimizeAgain = true;
2436   }
2437   return Modified;
2438 }
2439 
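// Walk the offset-sorted list pairwise, merging each mergeable pair in place.
// OptimizeListAgain is set when a merged result is still narrow enough that a
// further merge with a later element may be possible.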
2440 bool
2441 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2442                                           std::list<CombineInfo> &MergeList,
2443                                           bool &OptimizeListAgain) {
2444   if (MergeList.empty())
2445     return false;
2446 
2447   bool Modified = false;
2448 
2449   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2450        Next = std::next(I)) {
2451 
2452     auto First = I;
2453     auto Second = Next;
2454 
2455     if ((*First).Order > (*Second).Order)
2456       std::swap(First, Second);
2457     CombineInfo &CI = *First;
2458     CombineInfo &Paired = *Second;
2459 
2460     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2461     if (!Where) {
2462       ++I;
2463       continue;
2464     }
2465 
2466     Modified = true;
2467 
2468     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2469 
2470     MachineBasicBlock::iterator NewMI;
2471     switch (CI.InstClass) {
2472     default:
2473       llvm_unreachable("unknown InstClass");
2474       break;
2475     case DS_READ:
2476       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2477       break;
2478     case DS_WRITE:
2479       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2480       break;
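    // SMEM loads can merge all the way up to dwordx8, so keep reprocessing
    // until that width is reached; the VMEM classes below top out at dwordx4.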
2481     case S_BUFFER_LOAD_IMM:
2482     case S_BUFFER_LOAD_SGPR_IMM:
2483     case S_LOAD_IMM:
2484       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2485       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2486       break;
2487     case BUFFER_LOAD:
2488       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2489       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2490       break;
2491     case BUFFER_STORE:
2492       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2493       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2494       break;
2495     case MIMG:
2496       NewMI = mergeImagePair(CI, Paired, Where->I);
2497       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2498       break;
2499     case TBUFFER_LOAD:
2500       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2501       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2502       break;
2503     case TBUFFER_STORE:
2504       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2505       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2506       break;
2507     case FLAT_LOAD:
2508     case GLOBAL_LOAD:
2509     case GLOBAL_LOAD_SADDR:
2510       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2511       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2512       break;
2513     case FLAT_STORE:
2514     case GLOBAL_STORE:
2515     case GLOBAL_STORE_SADDR:
2516       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2517       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2518       break;
2519     }
2520     CI.setMI(NewMI, *this);
2521     CI.Order = Where->Order;
2522     if (I == Second)
2523       I = Next;
2524 
2525     MergeList.erase(Second);
2526   }
2527 
2528   return Modified;
2529 }
2530 
2531 bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2532   if (skipFunction(MF.getFunction()))
2533     return false;
2534   return SILoadStoreOptimizer(
2535              &getAnalysis<AAResultsWrapperPass>().getAAResults())
2536       .run(MF);
2537 }
2538 
2539 bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2540   STM = &MF.getSubtarget<GCNSubtarget>();
2541   if (!STM->loadStoreOptEnabled())
2542     return false;
2543 
2544   TII = STM->getInstrInfo();
2545   TRI = &TII->getRegisterInfo();
2546 
2547   MRI = &MF.getRegInfo();
2548 
2549   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2550 
2551   bool Modified = false;
2552 
2553   // Contains the list of instructions for which constant offsets are being
2554   // promoted to the IMM. This is tracked for an entire block at a time.
2555   SmallPtrSet<MachineInstr *, 4> AnchorList;
2556   MemInfoMap Visited;
2557 
2558   for (MachineBasicBlock &MBB : MF) {
2559     MachineBasicBlock::iterator SectionEnd;
2560     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2561          I = SectionEnd) {
2562       bool CollectModified;
2563       std::list<std::list<CombineInfo>> MergeableInsts;
2564 
2565       // First pass: Collect list of all instructions we know how to merge in a
2566       // subset of the block.
2567       std::tie(SectionEnd, CollectModified) =
2568           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2569 
2570       Modified |= CollectModified;
2571 
2572       do {
2573         OptimizeAgain = false;
2574         Modified |= optimizeBlock(MergeableInsts);
2575       } while (OptimizeAgain);
2576     }
2577 
2578     Visited.clear();
2579     AnchorList.clear();
2580   }
2581 
2582   return Modified;
2583 }
2584 
2585 PreservedAnalyses
2586 SILoadStoreOptimizerPass::run(MachineFunction &MF,
2587                               MachineFunctionAnalysisManager &MFAM) {
2588   MFPropsModifier _(*this, MF);
2589 
2590   if (MF.getFunction().hasOptNone())
2591     return PreservedAnalyses::all();
2592 
2593   auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2594                   .getManager();
2595   AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
2596 
2597   bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2598   if (!Changed)
2599     return PreservedAnalyses::all();
2600 
2601   PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2602   PA.preserveSet<CFGAnalyses>();
2603   return PA;
2604 }
2605