1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
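// (ds_read2 offsets are in units of the element size, so the byte offsets 16
// and 32 above become offset0 = 16/4 = 4 and offset1 = 32/4 = 8.)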
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from nearby instructions that
24 // allows it to have a 13-bit constant offset, which is then promoted to the
25 // immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
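// (The two original bases differ by 0x1800 - 0x1000 = 0x800 = 2048 bytes,
// which fits in the 13-bit immediate offset, so the second load can reuse the
// v[5:6] base with offset:2048.)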
42 //
43 // Future improvements:
44 //
45 // - Stores of constants are currently missed because the load of the
46 //   constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputation seems inefficient. The pass currently matches
50 //   only one pair, recomputes live intervals, and moves on to the next pair.
51 //   It would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but are close enough together that their differences do
56 //   fit, we can add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   S_BUFFER_LOAD_SGPR_IMM,
78   S_LOAD_IMM,
79   BUFFER_LOAD,
80   BUFFER_STORE,
81   MIMG,
82   TBUFFER_LOAD,
83   TBUFFER_STORE,
84   GLOBAL_LOAD_SADDR,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE,
88   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89   GLOBAL_STORE // any CombineInfo, they are only ever returned by
90                // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94   unsigned char NumVAddrs = 0;
95   bool SBase = false;
96   bool SRsrc = false;
97   bool SOffset = false;
98   bool SAddr = false;
99   bool VAddr = false;
100   bool Addr = false;
101   bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
123     unsigned Order;
124 
125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145           return false;
146         }
147       }
148       return true;
149     }
150 
151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that aren't either immediates or
159         // registers. TODO: It should be possible to merge FrameIndexes and
160         // maybe some other non-register operands.
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge instructions with other physical reg
165         // addresses too.
166         if (AddrOp->getReg().isPhysical() &&
167             AddrOp->getReg() != AMDGPU::SGPR_NULL)
168           return false;
169 
170         // If an address has only one use then there will be no other
171         // instructions with the same address, so we can't merge this one.
172         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173           return false;
174       }
175       return true;
176     }
177 
178     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179 
180     // Compare by pointer order.
181     bool operator<(const CombineInfo& Other) const {
182       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183     }
184   };
185 
186   struct BaseRegisters {
187     Register LoReg;
188     Register HiReg;
189 
190     unsigned LoSubReg = 0;
191     unsigned HiSubReg = 0;
192   };
193 
194   struct MemAddress {
195     BaseRegisters Base;
196     int64_t Offset = 0;
197   };
198 
199   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200 
201 private:
202   const GCNSubtarget *STM = nullptr;
203   const SIInstrInfo *TII = nullptr;
204   const SIRegisterInfo *TRI = nullptr;
205   MachineRegisterInfo *MRI = nullptr;
206   AliasAnalysis *AA = nullptr;
207   bool OptimizeAgain;
208 
209   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210                            const DenseSet<Register> &ARegUses,
211                            const MachineInstr &A, const MachineInstr &B) const;
212   static bool dmasksCanBeCombined(const CombineInfo &CI,
213                                   const SIInstrInfo &TII,
214                                   const CombineInfo &Paired);
215   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216                                    CombineInfo &Paired, bool Modify = false);
217   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218                         const CombineInfo &Paired);
219   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221                                                      const CombineInfo &Paired);
222   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
223                                                     const CombineInfo &Paired);
224   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
225 
226   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
227 
228   unsigned read2Opcode(unsigned EltSize) const;
229   unsigned read2ST64Opcode(unsigned EltSize) const;
230   MachineBasicBlock::iterator
231   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
232                  MachineBasicBlock::iterator InsertBefore);
233 
234   unsigned write2Opcode(unsigned EltSize) const;
235   unsigned write2ST64Opcode(unsigned EltSize) const;
236   MachineBasicBlock::iterator
237   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
238                   MachineBasicBlock::iterator InsertBefore);
239   MachineBasicBlock::iterator
240   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
241                  MachineBasicBlock::iterator InsertBefore);
242   MachineBasicBlock::iterator
243   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
244                        MachineBasicBlock::iterator InsertBefore);
245   MachineBasicBlock::iterator
246   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
247                       MachineBasicBlock::iterator InsertBefore);
248   MachineBasicBlock::iterator
249   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
250                        MachineBasicBlock::iterator InsertBefore);
251   MachineBasicBlock::iterator
252   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
253                        MachineBasicBlock::iterator InsertBefore);
254   MachineBasicBlock::iterator
255   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
256                         MachineBasicBlock::iterator InsertBefore);
257   MachineBasicBlock::iterator
258   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
259                     MachineBasicBlock::iterator InsertBefore);
260   MachineBasicBlock::iterator
261   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
262                      MachineBasicBlock::iterator InsertBefore);
263 
264   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
265                            int32_t NewOffset) const;
266   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
267   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
268   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
269   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
270   /// Promotes a constant offset to the immediate by adjusting the base. It
271   /// tries to use a base from nearby instructions that allows the access to
272   /// have a 13-bit constant offset, which is then promoted to the immediate.
273   bool promoteConstantOffsetToImm(MachineInstr &CI,
274                                   MemInfoMap &Visited,
275                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
276   void addInstToMergeableList(const CombineInfo &CI,
277                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
278 
279   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
280       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
281       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
282       std::list<std::list<CombineInfo>> &MergeableInsts) const;
283 
284   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
285                                                      const CombineInfo &Paired);
286 
287   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
288                                           const CombineInfo &Paired);
289 
290 public:
291   static char ID;
292 
293   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
294     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
295   }
296 
297   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
298                                      bool &OptimizeListAgain);
299   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
300 
301   bool runOnMachineFunction(MachineFunction &MF) override;
302 
303   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
304 
305   void getAnalysisUsage(AnalysisUsage &AU) const override {
306     AU.setPreservesCFG();
307     AU.addRequired<AAResultsWrapperPass>();
308 
309     MachineFunctionPass::getAnalysisUsage(AU);
310   }
311 
312   MachineFunctionProperties getRequiredProperties() const override {
313     return MachineFunctionProperties()
314       .set(MachineFunctionProperties::Property::IsSSA);
315   }
316 };
317 
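/// Returns the width of \p MI in units of dwords or format/dmask components,
/// or 0 if the opcode is not handled.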
318 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
319   const unsigned Opc = MI.getOpcode();
320 
321   if (TII.isMUBUF(Opc)) {
322     // FIXME: Handle d16 correctly
323     return AMDGPU::getMUBUFElements(Opc);
324   }
325   if (TII.isImage(MI)) {
326     uint64_t DMaskImm =
327         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
328     return llvm::popcount(DMaskImm);
329   }
330   if (TII.isMTBUF(Opc)) {
331     return AMDGPU::getMTBUFElements(Opc);
332   }
333 
334   switch (Opc) {
335   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
336   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
337   case AMDGPU::S_LOAD_DWORD_IMM:
338   case AMDGPU::GLOBAL_LOAD_DWORD:
339   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
340   case AMDGPU::GLOBAL_STORE_DWORD:
341   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
342   case AMDGPU::FLAT_LOAD_DWORD:
343   case AMDGPU::FLAT_STORE_DWORD:
344     return 1;
345   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
346   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347   case AMDGPU::S_LOAD_DWORDX2_IMM:
348   case AMDGPU::GLOBAL_LOAD_DWORDX2:
349   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350   case AMDGPU::GLOBAL_STORE_DWORDX2:
351   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352   case AMDGPU::FLAT_LOAD_DWORDX2:
353   case AMDGPU::FLAT_STORE_DWORDX2:
354     return 2;
355   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
356   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
357   case AMDGPU::S_LOAD_DWORDX3_IMM:
358   case AMDGPU::GLOBAL_LOAD_DWORDX3:
359   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
360   case AMDGPU::GLOBAL_STORE_DWORDX3:
361   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
362   case AMDGPU::FLAT_LOAD_DWORDX3:
363   case AMDGPU::FLAT_STORE_DWORDX3:
364     return 3;
365   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
366   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
367   case AMDGPU::S_LOAD_DWORDX4_IMM:
368   case AMDGPU::GLOBAL_LOAD_DWORDX4:
369   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
370   case AMDGPU::GLOBAL_STORE_DWORDX4:
371   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
372   case AMDGPU::FLAT_LOAD_DWORDX4:
373   case AMDGPU::FLAT_STORE_DWORDX4:
374     return 4;
375   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
376   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
377   case AMDGPU::S_LOAD_DWORDX8_IMM:
378     return 8;
379   case AMDGPU::DS_READ_B32:
380   case AMDGPU::DS_READ_B32_gfx9:
381   case AMDGPU::DS_WRITE_B32:
382   case AMDGPU::DS_WRITE_B32_gfx9:
383     return 1;
384   case AMDGPU::DS_READ_B64:
385   case AMDGPU::DS_READ_B64_gfx9:
386   case AMDGPU::DS_WRITE_B64:
387   case AMDGPU::DS_WRITE_B64_gfx9:
388     return 2;
389   default:
390     return 0;
391   }
392 }
393 
394 /// Maps instruction opcode to enum InstClassEnum.
395 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
396   switch (Opc) {
397   default:
398     if (TII.isMUBUF(Opc)) {
399       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
400       default:
401         return UNKNOWN;
402       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
403       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
404       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
405       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
406       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
407       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
408       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
409       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
410       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
411       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
412       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
413       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
414       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
415       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
416       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
417       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
418         return BUFFER_LOAD;
419       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
420       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
421       case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
422       case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
423       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
424       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
425       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
426       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
427       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
428       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
429       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
430       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
431       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
432       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
433       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
434       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
435         return BUFFER_STORE;
436       }
437     }
438     if (TII.isImage(Opc)) {
439       // Ignore instructions encoded without vaddr.
440       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
441           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
442         return UNKNOWN;
443       // Ignore BVH instructions
444       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
445         return UNKNOWN;
446       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
447       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
448           TII.isGather4(Opc))
449         return UNKNOWN;
450       return MIMG;
451     }
452     if (TII.isMTBUF(Opc)) {
453       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
454       default:
455         return UNKNOWN;
456       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
457       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
458       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
459       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
460       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
461       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
462       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
463       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
464       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
465       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
466       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
467       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
468       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
469       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
470       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
471       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
472         return TBUFFER_LOAD;
473       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
474       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
475       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
476       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
477       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
478       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
479       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
480       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
481         return TBUFFER_STORE;
482       }
483     }
484     return UNKNOWN;
485   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
486   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
487   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
488   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
489   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
490     return S_BUFFER_LOAD_IMM;
491   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
492   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
493   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
494   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
495   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
496     return S_BUFFER_LOAD_SGPR_IMM;
497   case AMDGPU::S_LOAD_DWORD_IMM:
498   case AMDGPU::S_LOAD_DWORDX2_IMM:
499   case AMDGPU::S_LOAD_DWORDX3_IMM:
500   case AMDGPU::S_LOAD_DWORDX4_IMM:
501   case AMDGPU::S_LOAD_DWORDX8_IMM:
502     return S_LOAD_IMM;
503   case AMDGPU::DS_READ_B32:
504   case AMDGPU::DS_READ_B32_gfx9:
505   case AMDGPU::DS_READ_B64:
506   case AMDGPU::DS_READ_B64_gfx9:
507     return DS_READ;
508   case AMDGPU::DS_WRITE_B32:
509   case AMDGPU::DS_WRITE_B32_gfx9:
510   case AMDGPU::DS_WRITE_B64:
511   case AMDGPU::DS_WRITE_B64_gfx9:
512     return DS_WRITE;
513   case AMDGPU::GLOBAL_LOAD_DWORD:
514   case AMDGPU::GLOBAL_LOAD_DWORDX2:
515   case AMDGPU::GLOBAL_LOAD_DWORDX3:
516   case AMDGPU::GLOBAL_LOAD_DWORDX4:
517   case AMDGPU::FLAT_LOAD_DWORD:
518   case AMDGPU::FLAT_LOAD_DWORDX2:
519   case AMDGPU::FLAT_LOAD_DWORDX3:
520   case AMDGPU::FLAT_LOAD_DWORDX4:
521     return FLAT_LOAD;
522   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
523   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
524   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
525   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
526     return GLOBAL_LOAD_SADDR;
527   case AMDGPU::GLOBAL_STORE_DWORD:
528   case AMDGPU::GLOBAL_STORE_DWORDX2:
529   case AMDGPU::GLOBAL_STORE_DWORDX3:
530   case AMDGPU::GLOBAL_STORE_DWORDX4:
531   case AMDGPU::FLAT_STORE_DWORD:
532   case AMDGPU::FLAT_STORE_DWORDX2:
533   case AMDGPU::FLAT_STORE_DWORDX3:
534   case AMDGPU::FLAT_STORE_DWORDX4:
535     return FLAT_STORE;
536   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
537   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
538   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
539   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
540     return GLOBAL_STORE_SADDR;
541   }
542 }
543 
544 /// Determines instruction subclass from opcode. Only instructions
545 /// of the same subclass can be merged together. The merged instruction may have
546 /// a different subclass but must have the same class.
547 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
548   switch (Opc) {
549   default:
550     if (TII.isMUBUF(Opc))
551       return AMDGPU::getMUBUFBaseOpcode(Opc);
552     if (TII.isImage(Opc)) {
553       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
554       assert(Info);
555       return Info->BaseOpcode;
556     }
557     if (TII.isMTBUF(Opc))
558       return AMDGPU::getMTBUFBaseOpcode(Opc);
559     return -1;
560   case AMDGPU::DS_READ_B32:
561   case AMDGPU::DS_READ_B32_gfx9:
562   case AMDGPU::DS_READ_B64:
563   case AMDGPU::DS_READ_B64_gfx9:
564   case AMDGPU::DS_WRITE_B32:
565   case AMDGPU::DS_WRITE_B32_gfx9:
566   case AMDGPU::DS_WRITE_B64:
567   case AMDGPU::DS_WRITE_B64_gfx9:
568     return Opc;
569   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
570   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
571   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
572   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
573   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
574     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
575   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
576   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
577   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
578   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
579   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
580     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
581   case AMDGPU::S_LOAD_DWORD_IMM:
582   case AMDGPU::S_LOAD_DWORDX2_IMM:
583   case AMDGPU::S_LOAD_DWORDX3_IMM:
584   case AMDGPU::S_LOAD_DWORDX4_IMM:
585   case AMDGPU::S_LOAD_DWORDX8_IMM:
586     return AMDGPU::S_LOAD_DWORD_IMM;
587   case AMDGPU::GLOBAL_LOAD_DWORD:
588   case AMDGPU::GLOBAL_LOAD_DWORDX2:
589   case AMDGPU::GLOBAL_LOAD_DWORDX3:
590   case AMDGPU::GLOBAL_LOAD_DWORDX4:
591   case AMDGPU::FLAT_LOAD_DWORD:
592   case AMDGPU::FLAT_LOAD_DWORDX2:
593   case AMDGPU::FLAT_LOAD_DWORDX3:
594   case AMDGPU::FLAT_LOAD_DWORDX4:
595     return AMDGPU::FLAT_LOAD_DWORD;
596   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
597   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
598   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
599   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
600     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
601   case AMDGPU::GLOBAL_STORE_DWORD:
602   case AMDGPU::GLOBAL_STORE_DWORDX2:
603   case AMDGPU::GLOBAL_STORE_DWORDX3:
604   case AMDGPU::GLOBAL_STORE_DWORDX4:
605   case AMDGPU::FLAT_STORE_DWORD:
606   case AMDGPU::FLAT_STORE_DWORDX2:
607   case AMDGPU::FLAT_STORE_DWORDX3:
608   case AMDGPU::FLAT_STORE_DWORDX4:
609     return AMDGPU::FLAT_STORE_DWORD;
610   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
611   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
612   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
613   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
614     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
615   }
616 }
617 
618 // GLOBAL loads and stores are classified as FLAT initially. If both combined
619 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
620 // GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
621 // the resulting combined operation will be FLAT, potentially promoting one of
622 // the GLOBAL operations to FLAT.
623 // For other instructions, return the original class unmodified.
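// E.g., two GLOBAL_LOAD_DWORDs (both initially classified as FLAT_LOAD)
// combine as GLOBAL_LOAD, while a GLOBAL_LOAD_DWORD paired with a plain
// FLAT_LOAD_DWORD stays FLAT_LOAD.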
624 InstClassEnum
625 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
626                                          const CombineInfo &Paired) {
627   assert(CI.InstClass == Paired.InstClass);
628 
629   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
630       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
631     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
632 
633   return CI.InstClass;
634 }
635 
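/// Determines which address operands (vaddr, saddr, srsrc, soffset, etc.) are
/// present for the given opcode.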
636 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
637   AddressRegs Result;
638 
639   if (TII.isMUBUF(Opc)) {
640     if (AMDGPU::getMUBUFHasVAddr(Opc))
641       Result.VAddr = true;
642     if (AMDGPU::getMUBUFHasSrsrc(Opc))
643       Result.SRsrc = true;
644     if (AMDGPU::getMUBUFHasSoffset(Opc))
645       Result.SOffset = true;
646 
647     return Result;
648   }
649 
650   if (TII.isImage(Opc)) {
651     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
652     if (VAddr0Idx >= 0) {
653       int RsrcName =
654           TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
655       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
656       Result.NumVAddrs = RsrcIdx - VAddr0Idx;
657     } else {
658       Result.VAddr = true;
659     }
660     Result.SRsrc = true;
661     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
662     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
663       Result.SSamp = true;
664 
665     return Result;
666   }
667   if (TII.isMTBUF(Opc)) {
668     if (AMDGPU::getMTBUFHasVAddr(Opc))
669       Result.VAddr = true;
670     if (AMDGPU::getMTBUFHasSrsrc(Opc))
671       Result.SRsrc = true;
672     if (AMDGPU::getMTBUFHasSoffset(Opc))
673       Result.SOffset = true;
674 
675     return Result;
676   }
677 
678   switch (Opc) {
679   default:
680     return Result;
681   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
682   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
683   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
684   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
685   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
686     Result.SOffset = true;
687     [[fallthrough]];
688   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
689   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
690   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
691   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
692   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
693   case AMDGPU::S_LOAD_DWORD_IMM:
694   case AMDGPU::S_LOAD_DWORDX2_IMM:
695   case AMDGPU::S_LOAD_DWORDX3_IMM:
696   case AMDGPU::S_LOAD_DWORDX4_IMM:
697   case AMDGPU::S_LOAD_DWORDX8_IMM:
698     Result.SBase = true;
699     return Result;
700   case AMDGPU::DS_READ_B32:
701   case AMDGPU::DS_READ_B64:
702   case AMDGPU::DS_READ_B32_gfx9:
703   case AMDGPU::DS_READ_B64_gfx9:
704   case AMDGPU::DS_WRITE_B32:
705   case AMDGPU::DS_WRITE_B64:
706   case AMDGPU::DS_WRITE_B32_gfx9:
707   case AMDGPU::DS_WRITE_B64_gfx9:
708     Result.Addr = true;
709     return Result;
710   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
711   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
712   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
713   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
714   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
715   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
716   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
717   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
718     Result.SAddr = true;
719     [[fallthrough]];
720   case AMDGPU::GLOBAL_LOAD_DWORD:
721   case AMDGPU::GLOBAL_LOAD_DWORDX2:
722   case AMDGPU::GLOBAL_LOAD_DWORDX3:
723   case AMDGPU::GLOBAL_LOAD_DWORDX4:
724   case AMDGPU::GLOBAL_STORE_DWORD:
725   case AMDGPU::GLOBAL_STORE_DWORDX2:
726   case AMDGPU::GLOBAL_STORE_DWORDX3:
727   case AMDGPU::GLOBAL_STORE_DWORDX4:
728   case AMDGPU::FLAT_LOAD_DWORD:
729   case AMDGPU::FLAT_LOAD_DWORDX2:
730   case AMDGPU::FLAT_LOAD_DWORDX3:
731   case AMDGPU::FLAT_LOAD_DWORDX4:
732   case AMDGPU::FLAT_STORE_DWORD:
733   case AMDGPU::FLAT_STORE_DWORDX2:
734   case AMDGPU::FLAT_STORE_DWORDX3:
735   case AMDGPU::FLAT_STORE_DWORDX4:
736     Result.VAddr = true;
737     return Result;
738   }
739 }
740 
741 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
742                                               const SILoadStoreOptimizer &LSO) {
743   I = MI;
744   unsigned Opc = MI->getOpcode();
745   InstClass = getInstClass(Opc, *LSO.TII);
746 
747   if (InstClass == UNKNOWN)
748     return;
749 
750   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
751 
752   switch (InstClass) {
753   case DS_READ:
754     EltSize =
755           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
756                                                                           : 4;
757     break;
758   case DS_WRITE:
759     EltSize =
760           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
761                                                                             : 4;
762     break;
763   case S_BUFFER_LOAD_IMM:
764   case S_BUFFER_LOAD_SGPR_IMM:
765   case S_LOAD_IMM:
766     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
767     break;
768   default:
769     EltSize = 4;
770     break;
771   }
772 
773   if (InstClass == MIMG) {
774     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
775     // Offset is not considered for MIMG instructions.
776     Offset = 0;
777   } else {
778     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
779     Offset = I->getOperand(OffsetIdx).getImm();
780   }
781 
782   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
783     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
784 
785   Width = getOpcodeWidth(*I, *LSO.TII);
786 
787   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
788     Offset &= 0xffff;
789   } else if (InstClass != MIMG) {
790     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
791   }
792 
793   AddressRegs Regs = getRegs(Opc, *LSO.TII);
794   bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
795 
796   NumAddresses = 0;
797   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
798     AddrIdx[NumAddresses++] =
799         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
800   if (Regs.Addr)
801     AddrIdx[NumAddresses++] =
802         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
803   if (Regs.SBase)
804     AddrIdx[NumAddresses++] =
805         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
806   if (Regs.SRsrc)
807     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
808         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
809   if (Regs.SOffset)
810     AddrIdx[NumAddresses++] =
811         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
812   if (Regs.SAddr)
813     AddrIdx[NumAddresses++] =
814         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
815   if (Regs.VAddr)
816     AddrIdx[NumAddresses++] =
817         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
818   if (Regs.SSamp)
819     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
820         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
821   assert(NumAddresses <= MaxAddressRegs);
822 
823   for (unsigned J = 0; J < NumAddresses; J++)
824     AddrReg[J] = &I->getOperand(AddrIdx[J]);
825 }
826 
827 } // end anonymous namespace.
828 
829 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
830                       "SI Load Store Optimizer", false, false)
831 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
832 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
833                     false, false)
834 
835 char SILoadStoreOptimizer::ID = 0;
836 
837 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
838 
839 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
840   return new SILoadStoreOptimizer();
841 }
842 
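// Collects the registers defined and read by \p MI into \p RegDefs and
// \p RegUses.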
843 static void addDefsUsesToList(const MachineInstr &MI,
844                               DenseSet<Register> &RegDefs,
845                               DenseSet<Register> &RegUses) {
846   for (const auto &Op : MI.operands()) {
847     if (!Op.isReg())
848       continue;
849     if (Op.isDef())
850       RegDefs.insert(Op.getReg());
851     if (Op.readsReg())
852       RegUses.insert(Op.getReg());
853   }
854 }
855 
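// Returns true if \p B can safely be moved across \p A, i.e. there is no
// potentially aliasing memory dependence involving a store and no register
// dependence between \p B and the defs/uses of \p A given in \p ARegDefs and
// \p ARegUses.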
856 bool SILoadStoreOptimizer::canSwapInstructions(
857     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
858     const MachineInstr &A, const MachineInstr &B) const {
859   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
860       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
861     return false;
862   for (const auto &BOp : B.operands()) {
863     if (!BOp.isReg())
864       continue;
865     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
866       return false;
867     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
868       return false;
869   }
870   return true;
871 }
872 
873 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
874 // MMO for the combined operation with a new access size.
875 MachineMemOperand *
876 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
877                                                const CombineInfo &Paired) {
878   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
879   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
880 
881   unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
882 
883   // A base pointer for the combined operation is the same as the leading
884   // operation's pointer.
885   if (Paired < CI)
886     std::swap(MMOa, MMOb);
887 
888   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
889   // If merging FLAT and GLOBAL, set the address space to FLAT.
890   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
891     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
892 
893   MachineFunction *MF = CI.I->getMF();
894   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
895 }
896 
897 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
898                                                const SIInstrInfo &TII,
899                                                const CombineInfo &Paired) {
900   assert(CI.InstClass == MIMG);
901 
902   // Ignore instructions with tfe/lwe set.
903   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
904   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
905 
906   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
907     return false;
908 
909   // Check other optional immediate operands for equality.
910   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
911                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
912                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
913 
914   for (auto op : OperandsToMatch) {
915     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
916     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
917       return false;
918     if (Idx != -1 &&
919         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
920       return false;
921   }
922 
923   // Check DMask for overlaps.
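  // E.g., dmasks 0b0011 and 0b1100 can be combined, since every component of
  // the smaller mask lies below the lowest component of the larger one, but
  // 0b0011 and 0b0101 cannot because their components interleave.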
924   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
925   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
926 
927   if (!MaxMask)
928     return false;
929 
930   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
931   if ((1u << AllowedBitsForMin) <= MinMask)
932     return false;
933 
934   return true;
935 }
936 
937 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
938                                              unsigned ComponentCount,
939                                              const GCNSubtarget &STI) {
940   if (ComponentCount > 4)
941     return 0;
942 
943   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
944       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
945   if (!OldFormatInfo)
946     return 0;
947 
948   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
949       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
950                                            ComponentCount,
951                                            OldFormatInfo->NumFormat, STI);
952 
953   if (!NewFormatInfo)
954     return 0;
955 
956   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
957          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
958 
959   return NewFormatInfo->Format;
960 }
961 
962 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
963 // highest power of two. Note that the result is well defined for all inputs
964 // including corner cases like:
965 // - if Lo == Hi, return that value
966 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
967 // - if Lo > Hi, return 0 (as if the range wrapped around)
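// E.g., mostAlignedValueInRange(0x2F, 0x41) == 0x40: (Lo - 1) ^ Hi == 0x6F has
// 25 leading zeros, so the mask keeps the top 26 bits and Hi & 0xFFFFFFC0
// yields 0x40, the only multiple of 64 in the range.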
968 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
969   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
970 }
971 
972 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
973                                                 const GCNSubtarget &STI,
974                                                 CombineInfo &Paired,
975                                                 bool Modify) {
976   assert(CI.InstClass != MIMG);
977 
978   // XXX - Would the same offset be OK? Is there any reason this would happen or
979   // be useful?
980   if (CI.Offset == Paired.Offset)
981     return false;
982 
983   // This won't be valid if the offset isn't aligned.
984   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
985     return false;
986 
987   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
988 
989     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
990         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
991     if (!Info0)
992       return false;
993     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
994         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
995     if (!Info1)
996       return false;
997 
998     if (Info0->BitsPerComp != Info1->BitsPerComp ||
999         Info0->NumFormat != Info1->NumFormat)
1000       return false;
1001 
1002     // TODO: Should be possible to support more formats, but if format loads
1003     // are not dword-aligned, the merged load might not be valid.
1004     if (Info0->BitsPerComp != 32)
1005       return false;
1006 
1007     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1008       return false;
1009   }
1010 
1011   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1012   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1013   CI.UseST64 = false;
1014   CI.BaseOff = 0;
1015 
1016   // Handle all non-DS instructions.
1017   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1018     if (EltOffset0 + CI.Width != EltOffset1 &&
1019             EltOffset1 + Paired.Width != EltOffset0)
1020       return false;
1021     if (CI.CPol != Paired.CPol)
1022       return false;
1023     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1024         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1025       // Reject cases like:
1026       //   dword + dwordx2 -> dwordx3
1027       //   dword + dwordx3 -> dwordx4
1028       // If we tried to combine these cases, we would fail to extract a subreg
1029       // for the result of the second load due to SGPR alignment requirements.
1030       if (CI.Width != Paired.Width &&
1031           (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1032         return false;
1033     }
1034     return true;
1035   }
1036 
1037   // If the offsets in elements don't fit in 8 bits, we might be able to use
1038   // the stride 64 versions.
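  // E.g., DS_READ_B32 at byte offsets 0 and 0x8000 gives element offsets 0 and
  // 8192; both are multiples of 64 and 8192 / 64 == 128 fits in 8 bits, so a
  // ds_read2st64_b32 with offset0 = 0 and offset1 = 128 can be used.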
1039   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1040       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1041     if (Modify) {
1042       CI.Offset = EltOffset0 / 64;
1043       Paired.Offset = EltOffset1 / 64;
1044       CI.UseST64 = true;
1045     }
1046     return true;
1047   }
1048 
1049   // Check if the new offsets fit in the reduced 8-bit range.
1050   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1051     if (Modify) {
1052       CI.Offset = EltOffset0;
1053       Paired.Offset = EltOffset1;
1054     }
1055     return true;
1056   }
1057 
1058   // Try to shift base address to decrease offsets.
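  // E.g., DS_READ_B32 at byte offsets 16384 and 16388 gives element offsets
  // 4096 and 4097: neither the ST64 form nor the plain 8-bit offsets apply
  // directly, but with BaseOff = 16384 added to the base the remaining offsets
  // are just 0 and 1.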
1059   uint32_t Min = std::min(EltOffset0, EltOffset1);
1060   uint32_t Max = std::max(EltOffset0, EltOffset1);
1061 
1062   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1063   if (((Max - Min) & ~Mask) == 0) {
1064     if (Modify) {
1065       // From the range of values we could use for BaseOff, choose the one that
1066       // is aligned to the highest power of two, to maximise the chance that
1067       // the same offset can be reused for other load/store pairs.
1068       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1069       // Copy the low bits of the offsets, so that when we adjust them by
1070       // subtracting BaseOff they will be multiples of 64.
1071       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1072       CI.BaseOff = BaseOff * CI.EltSize;
1073       CI.Offset = (EltOffset0 - BaseOff) / 64;
1074       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1075       CI.UseST64 = true;
1076     }
1077     return true;
1078   }
1079 
1080   if (isUInt<8>(Max - Min)) {
1081     if (Modify) {
1082       // From the range of values we could use for BaseOff, choose the one that
1083       // is aligned to the highest power of two, to maximise the chance that
1084       // the same offset can be reused for other load/store pairs.
1085       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1086       CI.BaseOff = BaseOff * CI.EltSize;
1087       CI.Offset = EltOffset0 - BaseOff;
1088       Paired.Offset = EltOffset1 - BaseOff;
1089     }
1090     return true;
1091   }
1092 
1093   return false;
1094 }
1095 
1096 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1097                                      const CombineInfo &CI,
1098                                      const CombineInfo &Paired) {
1099   const unsigned Width = (CI.Width + Paired.Width);
1100   switch (CI.InstClass) {
1101   default:
1102     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1103   case S_BUFFER_LOAD_IMM:
1104   case S_BUFFER_LOAD_SGPR_IMM:
1105   case S_LOAD_IMM:
1106     switch (Width) {
1107     default:
1108       return false;
1109     case 2:
1110     case 4:
1111     case 8:
1112       return true;
1113     case 3:
1114       return STM.hasScalarDwordx3Loads();
1115     }
1116   }
1117 }
1118 
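/// Returns the register class of the data operand (vdst, vdata, data0, sdst
/// or sdata) of \p MI, or nullptr if none is present.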
1119 const TargetRegisterClass *
1120 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1121   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1122     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1123   }
1124   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1125     return TRI->getRegClassForReg(*MRI, Src->getReg());
1126   }
1127   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1128     return TRI->getRegClassForReg(*MRI, Src->getReg());
1129   }
1130   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1131     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1132   }
1133   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1134     return TRI->getRegClassForReg(*MRI, Src->getReg());
1135   }
1136   return nullptr;
1137 }
1138 
1139 /// This function assumes that CI comes before Paired in a basic block. Return
1140 /// an insertion point for the merged instruction or nullptr on failure.
1141 SILoadStoreOptimizer::CombineInfo *
1142 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1143                                            CombineInfo &Paired) {
1144   // If another instruction has already been merged into CI, it may now be a
1145   // type that we can't do any further merging into.
1146   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1147     return nullptr;
1148   assert(CI.InstClass == Paired.InstClass);
1149 
1150   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1151       getInstSubclass(Paired.I->getOpcode(), *TII))
1152     return nullptr;
1153 
1154   // Check both offsets (or masks for MIMG) can be combined and fit in the
1155   // reduced range.
1156   if (CI.InstClass == MIMG) {
1157     if (!dmasksCanBeCombined(CI, *TII, Paired))
1158       return nullptr;
1159   } else {
1160     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1161       return nullptr;
1162   }
1163 
1164   DenseSet<Register> RegDefs;
1165   DenseSet<Register> RegUses;
1166   CombineInfo *Where;
1167   if (CI.I->mayLoad()) {
1168     // Try to hoist Paired up to CI.
1169     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1170     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1171       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1172         return nullptr;
1173     }
1174     Where = &CI;
1175   } else {
1176     // Try to sink CI down to Paired.
1177     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1178     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1179       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1180         return nullptr;
1181     }
1182     Where = &Paired;
1183   }
1184 
1185   // Call offsetsCanBeCombined with modify = true so that the offsets are
1186   // correct for the new instruction.  This should return true, because
1187   // this function should only be called on CombineInfo objects that
1188   // have already been confirmed to be mergeable.
1189   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1190     offsetsCanBeCombined(CI, *STM, Paired, true);
1191   return Where;
1192 }
1193 
1194 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1195   if (STM->ldsRequiresM0Init())
1196     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1197   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1198 }
1199 
1200 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1201   if (STM->ldsRequiresM0Init())
1202     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1203 
1204   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1205                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1206 }
1207 
1208 MachineBasicBlock::iterator
1209 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1210                                      MachineBasicBlock::iterator InsertBefore) {
1211   MachineBasicBlock *MBB = CI.I->getParent();
1212 
1213   // Be careful, since the addresses could be subregisters themselves in weird
1214   // cases, like vectors of pointers.
1215   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1216 
1217   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1218   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1219 
1220   unsigned NewOffset0 = CI.Offset;
1221   unsigned NewOffset1 = Paired.Offset;
1222   unsigned Opc =
1223       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1224 
1225   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1226   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1227 
1228   if (NewOffset0 > NewOffset1) {
1229     // Canonicalize the merged instruction so the smaller offset comes first.
1230     std::swap(NewOffset0, NewOffset1);
1231     std::swap(SubRegIdx0, SubRegIdx1);
1232   }
1233 
1234   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1235          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1236 
1237   const MCInstrDesc &Read2Desc = TII->get(Opc);
1238 
1239   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1240   Register DestReg = MRI->createVirtualRegister(SuperRC);
1241 
1242   DebugLoc DL = CI.I->getDebugLoc();
1243 
1244   Register BaseReg = AddrReg->getReg();
1245   unsigned BaseSubReg = AddrReg->getSubReg();
1246   unsigned BaseRegFlags = 0;
1247   if (CI.BaseOff) {
1248     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1249     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1250         .addImm(CI.BaseOff);
1251 
1252     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1253     BaseRegFlags = RegState::Kill;
1254 
1255     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1256         .addReg(ImmReg)
1257         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1258         .addImm(0); // clamp bit
1259     BaseSubReg = 0;
1260   }
1261 
1262   MachineInstrBuilder Read2 =
1263       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1264           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1265           .addImm(NewOffset0)                        // offset0
1266           .addImm(NewOffset1)                        // offset1
1267           .addImm(0)                                 // gds
1268           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1269 
1270   (void)Read2;
1271 
1272   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1273 
1274   // Copy to the old destination registers.
1275   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1276       .add(*Dest0) // Copy to same destination including flags and sub reg.
1277       .addReg(DestReg, 0, SubRegIdx0);
1278   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1279       .add(*Dest1)
1280       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1281 
1282   CI.I->eraseFromParent();
1283   Paired.I->eraseFromParent();
1284 
1285   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1286   return Read2;
1287 }
1288 
1289 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1290   if (STM->ldsRequiresM0Init())
1291     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1292   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1293                         : AMDGPU::DS_WRITE2_B64_gfx9;
1294 }
1295 
1296 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1297   if (STM->ldsRequiresM0Init())
1298     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1299                           : AMDGPU::DS_WRITE2ST64_B64;
1300 
1301   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1302                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1303 }
1304 
1305 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1306     CombineInfo &CI, CombineInfo &Paired,
1307     MachineBasicBlock::iterator InsertBefore) {
1308   MachineBasicBlock *MBB = CI.I->getParent();
1309 
1310   // Be sure to use .add(), and not .addReg(), with these. We want to be
1311   // sure we preserve the subregister index and any register flags set on them.
1312   const MachineOperand *AddrReg =
1313       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1314   const MachineOperand *Data0 =
1315       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1316   const MachineOperand *Data1 =
1317       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1318 
1319   unsigned NewOffset0 = CI.Offset;
1320   unsigned NewOffset1 = Paired.Offset;
1321   unsigned Opc =
1322       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1323 
1324   if (NewOffset0 > NewOffset1) {
1325     // Canonicalize the merged instruction so the smaller offset comes first.
1326     std::swap(NewOffset0, NewOffset1);
1327     std::swap(Data0, Data1);
1328   }
1329 
1330   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1331          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1332 
1333   const MCInstrDesc &Write2Desc = TII->get(Opc);
1334   DebugLoc DL = CI.I->getDebugLoc();
1335 
1336   Register BaseReg = AddrReg->getReg();
1337   unsigned BaseSubReg = AddrReg->getSubReg();
1338   unsigned BaseRegFlags = 0;
1339   if (CI.BaseOff) {
1340     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1341     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1342         .addImm(CI.BaseOff);
1343 
1344     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1345     BaseRegFlags = RegState::Kill;
1346 
1347     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1348         .addReg(ImmReg)
1349         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1350         .addImm(0); // clamp bit
1351     BaseSubReg = 0;
1352   }
1353 
1354   MachineInstrBuilder Write2 =
1355       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1356           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1357           .add(*Data0)                               // data0
1358           .add(*Data1)                               // data1
1359           .addImm(NewOffset0)                        // offset0
1360           .addImm(NewOffset1)                        // offset1
1361           .addImm(0)                                 // gds
1362           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1363 
1364   CI.I->eraseFromParent();
1365   Paired.I->eraseFromParent();
1366 
1367   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1368   return Write2;
1369 }
1370 
1371 MachineBasicBlock::iterator
1372 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1373                                      MachineBasicBlock::iterator InsertBefore) {
1374   MachineBasicBlock *MBB = CI.I->getParent();
1375   DebugLoc DL = CI.I->getDebugLoc();
1376   const unsigned Opcode = getNewOpcode(CI, Paired);
1377 
1378   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1379 
1380   Register DestReg = MRI->createVirtualRegister(SuperRC);
1381   unsigned MergedDMask = CI.DMask | Paired.DMask;
1382   unsigned DMaskIdx =
1383       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1384 
1385   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1386   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1387     if (I == DMaskIdx)
1388       MIB.addImm(MergedDMask);
1389     else
1390       MIB.add((*CI.I).getOperand(I));
1391   }
1392 
1393   // It shouldn't be possible to get this far if the two instructions
1394   // don't have a single memoperand, because MachineInstr::mayAlias()
1395   // will return true if this is the case.
1396   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1397 
1398   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1399 
1400   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1401 
1402   // Copy to the old destination registers.
1403   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1404   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1405   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1406 
1407   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1408       .add(*Dest0) // Copy to same destination including flags and sub reg.
1409       .addReg(DestReg, 0, SubRegIdx0);
1410   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1411       .add(*Dest1)
1412       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1413 
1414   CI.I->eraseFromParent();
1415   Paired.I->eraseFromParent();
1416   return New;
1417 }
1418 
1419 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1420     CombineInfo &CI, CombineInfo &Paired,
1421     MachineBasicBlock::iterator InsertBefore) {
1422   MachineBasicBlock *MBB = CI.I->getParent();
1423   DebugLoc DL = CI.I->getDebugLoc();
1424   const unsigned Opcode = getNewOpcode(CI, Paired);
1425 
1426   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1427 
1428   Register DestReg = MRI->createVirtualRegister(SuperRC);
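       // The merged load starts at the smaller of the two offsets and covers
       // both original ranges; the copies below split the wide result back out
       // into the original destination registers.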
1429   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1430 
1431   // It shouldn't be possible to get this far if the two instructions
1432   // don't have a single memoperand, because MachineInstr::mayAlias()
1433   // will return true if this is the case.
1434   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1435 
1436   MachineInstrBuilder New =
1437       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1438           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1439   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1440     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1441   New.addImm(MergedOffset);
1442   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1443 
1444   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1445 
1446   // Copy to the old destination registers.
1447   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1448   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1449   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1450 
1451   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1452       .add(*Dest0) // Copy to same destination including flags and sub reg.
1453       .addReg(DestReg, 0, SubRegIdx0);
1454   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1455       .add(*Dest1)
1456       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1457 
1458   CI.I->eraseFromParent();
1459   Paired.I->eraseFromParent();
1460   return New;
1461 }
1462 
1463 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1464     CombineInfo &CI, CombineInfo &Paired,
1465     MachineBasicBlock::iterator InsertBefore) {
1466   MachineBasicBlock *MBB = CI.I->getParent();
1467   DebugLoc DL = CI.I->getDebugLoc();
1468 
1469   const unsigned Opcode = getNewOpcode(CI, Paired);
1470 
1471   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1472 
1473   // Copy to the new source register.
1474   Register DestReg = MRI->createVirtualRegister(SuperRC);
1475   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1476 
1477   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1478 
1479   AddressRegs Regs = getRegs(Opcode, *TII);
1480 
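       // getRegs() reports which address operands the merged opcode takes; only
       // add a vaddr operand if the opcode actually has one.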
1481   if (Regs.VAddr)
1482     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1483 
1484   // It shouldn't be possible to get this far if the two instructions
1485   // don't have a single memoperand, because MachineInstr::mayAlias()
1486   // will return true if this is the case.
1487   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1488 
1489   MachineInstr *New =
1490     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1491         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1492         .addImm(MergedOffset) // offset
1493         .addImm(CI.CPol)      // cpol
1494         .addImm(0)            // swz
1495         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1496 
1497   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1498 
1499   // Copy to the old destination registers.
1500   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1501   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1502   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1503 
1504   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1505       .add(*Dest0) // Copy to same destination including flags and sub reg.
1506       .addReg(DestReg, 0, SubRegIdx0);
1507   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1508       .add(*Dest1)
1509       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1510 
1511   CI.I->eraseFromParent();
1512   Paired.I->eraseFromParent();
1513   return New;
1514 }
1515 
1516 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1517     CombineInfo &CI, CombineInfo &Paired,
1518     MachineBasicBlock::iterator InsertBefore) {
1519   MachineBasicBlock *MBB = CI.I->getParent();
1520   DebugLoc DL = CI.I->getDebugLoc();
1521 
1522   const unsigned Opcode = getNewOpcode(CI, Paired);
1523 
1524   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1525 
1526   // Copy to the new source register.
1527   Register DestReg = MRI->createVirtualRegister(SuperRC);
1528   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1529 
1530   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1531 
1532   AddressRegs Regs = getRegs(Opcode, *TII);
1533 
1534   if (Regs.VAddr)
1535     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1536 
1537   unsigned JoinedFormat =
1538       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1539 
1540   // It shouldn't be possible to get this far if the two instructions
1541   // don't have a single memoperand, because MachineInstr::mayAlias()
1542   // will return true if this is the case.
1543   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1544 
1545   MachineInstr *New =
1546       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1547           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1548           .addImm(MergedOffset) // offset
1549           .addImm(JoinedFormat) // format
1550           .addImm(CI.CPol)      // cpol
1551           .addImm(0)            // swz
1552           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1553 
1554   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1555 
1556   // Copy to the old destination registers.
1557   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1558   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1559   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1560 
1561   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1562       .add(*Dest0) // Copy to same destination including flags and sub reg.
1563       .addReg(DestReg, 0, SubRegIdx0);
1564   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1565       .add(*Dest1)
1566       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1567 
1568   CI.I->eraseFromParent();
1569   Paired.I->eraseFromParent();
1570   return New;
1571 }
1572 
1573 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1574     CombineInfo &CI, CombineInfo &Paired,
1575     MachineBasicBlock::iterator InsertBefore) {
1576   MachineBasicBlock *MBB = CI.I->getParent();
1577   DebugLoc DL = CI.I->getDebugLoc();
1578 
1579   const unsigned Opcode = getNewOpcode(CI, Paired);
1580 
1581   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1582 
1583   // Copy to the new source register.
1584   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1585   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1586 
1587   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1588   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1589 
1590   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1591       .add(*Src0)
1592       .addImm(SubRegIdx0)
1593       .add(*Src1)
1594       .addImm(SubRegIdx1);
1595 
1596   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1597                  .addReg(SrcReg, RegState::Kill);
1598 
1599   AddressRegs Regs = getRegs(Opcode, *TII);
1600 
1601   if (Regs.VAddr)
1602     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1603 
1604   unsigned JoinedFormat =
1605       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1606 
1607   // It shouldn't be possible to get this far if the two instructions
1608   // don't have a single memoperand, because MachineInstr::mayAlias()
1609   // will return true if this is the case.
1610   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1611 
1612   MachineInstr *New =
1613       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1614           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1615           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1616           .addImm(JoinedFormat)                     // format
1617           .addImm(CI.CPol)                          // cpol
1618           .addImm(0)                                // swz
1619           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1620 
1621   CI.I->eraseFromParent();
1622   Paired.I->eraseFromParent();
1623   return New;
1624 }
1625 
1626 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1627     CombineInfo &CI, CombineInfo &Paired,
1628     MachineBasicBlock::iterator InsertBefore) {
1629   MachineBasicBlock *MBB = CI.I->getParent();
1630   DebugLoc DL = CI.I->getDebugLoc();
1631 
1632   const unsigned Opcode = getNewOpcode(CI, Paired);
1633 
1634   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1635   Register DestReg = MRI->createVirtualRegister(SuperRC);
1636 
1637   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1638 
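       // Only the SADDR variants carry a scalar base-address operand; forward it
       // from CI when present.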
1639   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1640     MIB.add(*SAddr);
1641 
1642   MachineInstr *New =
1643     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1644        .addImm(std::min(CI.Offset, Paired.Offset))
1645        .addImm(CI.CPol)
1646        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1647 
1648   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1649 
1650   // Copy to the old destination registers.
1651   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1652   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1653   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1654 
1655   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1656       .add(*Dest0) // Copy to same destination including flags and sub reg.
1657       .addReg(DestReg, 0, SubRegIdx0);
1658   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1659       .add(*Dest1)
1660       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1661 
1662   CI.I->eraseFromParent();
1663   Paired.I->eraseFromParent();
1664   return New;
1665 }
1666 
1667 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1668     CombineInfo &CI, CombineInfo &Paired,
1669     MachineBasicBlock::iterator InsertBefore) {
1670   MachineBasicBlock *MBB = CI.I->getParent();
1671   DebugLoc DL = CI.I->getDebugLoc();
1672 
1673   const unsigned Opcode = getNewOpcode(CI, Paired);
1674 
1675   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1676 
1677   // Copy to the new source register.
1678   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1679   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1680 
1681   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1682   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1683 
1684   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1685       .add(*Src0)
1686       .addImm(SubRegIdx0)
1687       .add(*Src1)
1688       .addImm(SubRegIdx1);
1689 
1690   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1691                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1692                  .addReg(SrcReg, RegState::Kill);
1693 
1694   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1695     MIB.add(*SAddr);
1696 
1697   MachineInstr *New =
1698     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1699        .addImm(CI.CPol)
1700        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1701 
1702   CI.I->eraseFromParent();
1703   Paired.I->eraseFromParent();
1704   return New;
1705 }
1706 
1707 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1708                                             const CombineInfo &Paired) {
1709   const unsigned Width = CI.Width + Paired.Width;
1710 
1711   switch (getCommonInstClass(CI, Paired)) {
1712   default:
1713     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1714     // FIXME: Handle d16 correctly
1715     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1716                                   Width);
1717   case TBUFFER_LOAD:
1718   case TBUFFER_STORE:
1719     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1720                                   Width);
1721 
1722   case UNKNOWN:
1723     llvm_unreachable("Unknown instruction class");
1724   case S_BUFFER_LOAD_IMM:
1725     switch (Width) {
1726     default:
1727       return 0;
1728     case 2:
1729       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1730     case 3:
1731       return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1732     case 4:
1733       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1734     case 8:
1735       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1736     }
1737   case S_BUFFER_LOAD_SGPR_IMM:
1738     switch (Width) {
1739     default:
1740       return 0;
1741     case 2:
1742       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1743     case 3:
1744       return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1745     case 4:
1746       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1747     case 8:
1748       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1749     }
1750   case S_LOAD_IMM:
1751     switch (Width) {
1752     default:
1753       return 0;
1754     case 2:
1755       return AMDGPU::S_LOAD_DWORDX2_IMM;
1756     case 3:
1757       return AMDGPU::S_LOAD_DWORDX3_IMM;
1758     case 4:
1759       return AMDGPU::S_LOAD_DWORDX4_IMM;
1760     case 8:
1761       return AMDGPU::S_LOAD_DWORDX8_IMM;
1762     }
1763   case GLOBAL_LOAD:
1764     switch (Width) {
1765     default:
1766       return 0;
1767     case 2:
1768       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1769     case 3:
1770       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1771     case 4:
1772       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1773     }
1774   case GLOBAL_LOAD_SADDR:
1775     switch (Width) {
1776     default:
1777       return 0;
1778     case 2:
1779       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1780     case 3:
1781       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1782     case 4:
1783       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1784     }
1785   case GLOBAL_STORE:
1786     switch (Width) {
1787     default:
1788       return 0;
1789     case 2:
1790       return AMDGPU::GLOBAL_STORE_DWORDX2;
1791     case 3:
1792       return AMDGPU::GLOBAL_STORE_DWORDX3;
1793     case 4:
1794       return AMDGPU::GLOBAL_STORE_DWORDX4;
1795     }
1796   case GLOBAL_STORE_SADDR:
1797     switch (Width) {
1798     default:
1799       return 0;
1800     case 2:
1801       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1802     case 3:
1803       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1804     case 4:
1805       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1806     }
1807   case FLAT_LOAD:
1808     switch (Width) {
1809     default:
1810       return 0;
1811     case 2:
1812       return AMDGPU::FLAT_LOAD_DWORDX2;
1813     case 3:
1814       return AMDGPU::FLAT_LOAD_DWORDX3;
1815     case 4:
1816       return AMDGPU::FLAT_LOAD_DWORDX4;
1817     }
1818   case FLAT_STORE:
1819     switch (Width) {
1820     default:
1821       return 0;
1822     case 2:
1823       return AMDGPU::FLAT_STORE_DWORDX2;
1824     case 3:
1825       return AMDGPU::FLAT_STORE_DWORDX3;
1826     case 4:
1827       return AMDGPU::FLAT_STORE_DWORDX4;
1828     }
1829   case MIMG:
1830     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1831            "No overlaps");
1832     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1833   }
1834 }
1835 
1836 std::pair<unsigned, unsigned>
1837 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1838                                     const CombineInfo &Paired) {
1839   assert((CI.InstClass != MIMG ||
1840           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1841            CI.Width + Paired.Width)) &&
1842          "No overlaps");
1843 
1844   unsigned Idx0;
1845   unsigned Idx1;
1846 
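       // Idxs[Start][Width - 1] is the subregister index covering Width
       // consecutive 32-bit subregisters starting at subregister Start.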
1847   static const unsigned Idxs[5][4] = {
1848       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1849       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1850       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1851       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1852       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1853   };
1854 
1855   assert(CI.Width >= 1 && CI.Width <= 4);
1856   assert(Paired.Width >= 1 && Paired.Width <= 4);
1857 
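       // The CombineInfo that compares lower occupies the low subregisters of
       // the merged result; the other one follows immediately after it.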
1858   if (Paired < CI) {
1859     Idx1 = Idxs[0][Paired.Width - 1];
1860     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1861   } else {
1862     Idx0 = Idxs[0][CI.Width - 1];
1863     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1864   }
1865 
1866   return {Idx0, Idx1};
1867 }
1868 
1869 const TargetRegisterClass *
1870 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1871                                              const CombineInfo &Paired) {
1872   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1873       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1874     switch (CI.Width + Paired.Width) {
1875     default:
1876       return nullptr;
1877     case 2:
1878       return &AMDGPU::SReg_64_XEXECRegClass;
1879     case 3:
1880       return &AMDGPU::SGPR_96RegClass;
1881     case 4:
1882       return &AMDGPU::SGPR_128RegClass;
1883     case 8:
1884       return &AMDGPU::SGPR_256RegClass;
1885     case 16:
1886       return &AMDGPU::SGPR_512RegClass;
1887     }
1888   }
1889 
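       // Each unit of Width corresponds to 32 bits; pick an AGPR or VGPR class
       // wide enough for the combined access, matching the register bank of the
       // original data register.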
1890   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1891   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1892              ? TRI->getAGPRClassForBitWidth(BitWidth)
1893              : TRI->getVGPRClassForBitWidth(BitWidth);
1894 }
1895 
1896 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1897     CombineInfo &CI, CombineInfo &Paired,
1898     MachineBasicBlock::iterator InsertBefore) {
1899   MachineBasicBlock *MBB = CI.I->getParent();
1900   DebugLoc DL = CI.I->getDebugLoc();
1901 
1902   const unsigned Opcode = getNewOpcode(CI, Paired);
1903 
1904   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1905 
1906   // Copy to the new source register.
1907   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1908   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1909 
1910   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1911   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1912 
1913   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1914       .add(*Src0)
1915       .addImm(SubRegIdx0)
1916       .add(*Src1)
1917       .addImm(SubRegIdx1);
1918 
1919   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1920                  .addReg(SrcReg, RegState::Kill);
1921 
1922   AddressRegs Regs = getRegs(Opcode, *TII);
1923 
1924   if (Regs.VAddr)
1925     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1926 
1928   // It shouldn't be possible to get this far if the two instructions
1929   // don't have a single memoperand, because MachineInstr::mayAlias()
1930   // will return true if this is the case.
1931   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1932 
1933   MachineInstr *New =
1934     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1935         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1936         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1937         .addImm(CI.CPol)      // cpol
1938         .addImm(0)            // swz
1939         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1940 
1941   CI.I->eraseFromParent();
1942   Paired.I->eraseFromParent();
1943   return New;
1944 }
1945 
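     // Return Val as an immediate operand if it can be encoded as an inline
     // constant; otherwise materialize it into an SGPR with S_MOV_B32 and return
     // a use of that register.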
1946 MachineOperand
1947 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1948   APInt V(32, Val, true);
1949   if (TII->isInlineConstant(V))
1950     return MachineOperand::CreateImm(Val);
1951 
1952   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1953   MachineInstr *Mov =
1954   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1955           TII->get(AMDGPU::S_MOV_B32), Reg)
1956     .addImm(Val);
1957   (void)Mov;
1958   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1959   return MachineOperand::CreateReg(Reg, false);
1960 }
1961 
1962 // Compute base address using Addr and return the final register.
1963 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1964                                            const MemAddress &Addr) const {
1965   MachineBasicBlock *MBB = MI.getParent();
1966   MachineBasicBlock::iterator MBBI = MI.getIterator();
1967   DebugLoc DL = MI.getDebugLoc();
1968 
1969   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1970           Addr.Base.LoSubReg) &&
1971          "Expected 32-bit Base-Register-Low!!");
1972 
1973   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1974           Addr.Base.HiSubReg) &&
1975          "Expected 32-bit Base-Register-Hi!!");
1976 
1977   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1978   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1979   MachineOperand OffsetHi =
1980     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1981 
1982   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1983   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1984   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1985 
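       // Materialize Base + Offset as a full 64-bit add: V_ADD_CO_U32 produces
       // the low dword and a carry, V_ADDC_U32 adds the high dwords plus the
       // carry, and a REG_SEQUENCE reassembles the two halves.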
1986   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1987   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1988   MachineInstr *LoHalf =
1989     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1990       .addReg(CarryReg, RegState::Define)
1991       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1992       .add(OffsetLo)
1993       .addImm(0); // clamp bit
1994   (void)LoHalf;
1995   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1996 
1997   MachineInstr *HiHalf =
1998   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1999     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2000     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2001     .add(OffsetHi)
2002     .addReg(CarryReg, RegState::Kill)
2003     .addImm(0); // clamp bit
2004   (void)HiHalf;
2005   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
2006 
2007   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2008   MachineInstr *FullBase =
2009     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2010       .addReg(DestSub0)
2011       .addImm(AMDGPU::sub0)
2012       .addReg(DestSub1)
2013       .addImm(AMDGPU::sub1);
2014   (void)FullBase;
2015   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
2016 
2017   return FullDestReg;
2018 }
2019 
2020 // Update base and offset with the NewBase and NewOffset in MI.
2021 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2022                                                Register NewBase,
2023                                                int32_t NewOffset) const {
2024   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2025   Base->setReg(NewBase);
2026   Base->setIsKill(false);
2027   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2028 }
2029 
2030 std::optional<int32_t>
2031 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2032   if (Op.isImm())
2033     return Op.getImm();
2034 
2035   if (!Op.isReg())
2036     return std::nullopt;
2037 
2038   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2039   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2040       !Def->getOperand(1).isImm())
2041     return std::nullopt;
2042 
2043   return Def->getOperand(1).getImm();
2044 }
2045 
2046 // Analyzes Base and extracts:
2047 //  - 32-bit base registers and subregisters
2048 //  - a 64-bit constant offset
2049 // Expecting base computation as:
2050 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2051 //   %LO:vgpr_32, %c:sreg_64_xexec =
2052 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2053 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2054 //   %Base:vreg_64 =
2055 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2056 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2057                                                       MemAddress &Addr) const {
2058   if (!Base.isReg())
2059     return;
2060 
2061   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2062   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2063       || Def->getNumOperands() != 5)
2064     return;
2065 
2066   MachineOperand BaseLo = Def->getOperand(1);
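       // A five-operand REG_SEQUENCE is (dst, src0, subreg0, src1, subreg1); per
       // the expected pattern above, operand 1 is the low half of the base and
       // operand 3 the high half.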
2067   MachineOperand BaseHi = Def->getOperand(3);
2068   if (!BaseLo.isReg() || !BaseHi.isReg())
2069     return;
2070 
2071   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2072   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2073 
2074   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2075       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2076     return;
2077 
2078   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2079   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2080 
2081   auto Offset0P = extractConstOffset(*Src0);
2082   if (Offset0P)
2083     BaseLo = *Src1;
2084   else {
2085     if (!(Offset0P = extractConstOffset(*Src1)))
2086       return;
2087     BaseLo = *Src0;
2088   }
2089 
2090   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2091   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2092 
2093   if (Src0->isImm())
2094     std::swap(Src0, Src1);
2095 
2096   if (!Src1->isImm())
2097     return;
2098 
2099   uint64_t Offset1 = Src1->getImm();
2100   BaseHi = *Src0;
2101 
2102   Addr.Base.LoReg = BaseLo.getReg();
2103   Addr.Base.HiReg = BaseHi.getReg();
2104   Addr.Base.LoSubReg = BaseLo.getSubReg();
2105   Addr.Base.HiSubReg = BaseHi.getSubReg();
2106   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2107 }
2108 
2109 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2110     MachineInstr &MI,
2111     MemInfoMap &Visited,
2112     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2113 
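       // Only handle instructions that are purely a load or purely a store;
       // anything that both reads and writes memory (e.g. atomics) is skipped.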
2114   if (!(MI.mayLoad() ^ MI.mayStore()))
2115     return false;
2116 
2117   // TODO: Support flat and scratch.
2118   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2119     return false;
2120 
2121   if (MI.mayLoad() &&
2122       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2123     return false;
2124 
2125   if (AnchorList.count(&MI))
2126     return false;
2127 
2128   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2129 
2130   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2131     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2132     return false;
2133   }
2134 
2135   // Step1: Find the base-registers and a 64bit constant offset.
2136   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2137   MemAddress MAddr;
2138   if (!Visited.contains(&MI)) {
2139     processBaseWithConstOffset(Base, MAddr);
2140     Visited[&MI] = MAddr;
2141   } else
2142     MAddr = Visited[&MI];
2143 
2144   if (MAddr.Offset == 0) {
2145     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2146                          " constant offsets that can be promoted.\n";);
2147     return false;
2148   }
2149 
2150   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2151              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2152 
2153   // Step2: Traverse MI's basic block and find an anchor (an instruction with
2154   // the same base registers) at the highest 13-bit distance from MI's offset.
2155   // E.g. (64bit loads)
2156   // bb:
2157   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2158   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2159   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2160   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2161   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2162   //
2163   // Starting from the first load, the optimization tries to find a new base
2164   // within a 13-bit distance of (&a + 4096). Both &a + 6144 and &a + 8192 are
2165   // within a 13-bit distance of &a + 4096. The heuristic picks &a + 8192 as
2166   // the new base (anchor) because it is the farthest away, which presumably
2167   // lets it accommodate the most intermediate bases.
2168   //
2169   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2170   // (&a + 8192) for load1, load2, load4.
2171   //   addr = &a + 8192
2172   //   load1 = load(addr,       -4096)
2173   //   load2 = load(addr,       -2048)
2174   //   load3 = load(addr,       0)
2175   //   load4 = load(addr,       2048)
2176   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2177   //
2178   MachineInstr *AnchorInst = nullptr;
2179   MemAddress AnchorAddr;
2180   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2181   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2182 
2183   MachineBasicBlock *MBB = MI.getParent();
2184   MachineBasicBlock::iterator E = MBB->end();
2185   MachineBasicBlock::iterator MBBI = MI.getIterator();
2186   ++MBBI;
2187   const SITargetLowering *TLI =
2188     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2189 
2190   for ( ; MBBI != E; ++MBBI) {
2191     MachineInstr &MINext = *MBBI;
2192     // TODO: Support finding an anchor (with the same base) from store
2193     // addresses or any other load addresses where the opcodes are different.
2194     if (MINext.getOpcode() != MI.getOpcode() ||
2195         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2196       continue;
2197 
2198     const MachineOperand &BaseNext =
2199       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2200     MemAddress MAddrNext;
2201     if (!Visited.contains(&MINext)) {
2202       processBaseWithConstOffset(BaseNext, MAddrNext);
2203       Visited[&MINext] = MAddrNext;
2204     } else
2205       MAddrNext = Visited[&MINext];
2206 
2207     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2208         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2209         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2210         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2211       continue;
2212 
2213     InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2214 
2215     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2216     TargetLoweringBase::AddrMode AM;
2217     AM.HasBaseReg = true;
2218     AM.BaseOffs = Dist;
2219     if (TLI->isLegalGlobalAddressingMode(AM) &&
2220         (uint32_t)std::abs(Dist) > MaxDist) {
2221       MaxDist = std::abs(Dist);
2222 
2223       AnchorAddr = MAddrNext;
2224       AnchorInst = &MINext;
2225     }
2226   }
2227 
2228   if (AnchorInst) {
2229     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2230                AnchorInst->dump());
2231     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2232                <<  AnchorAddr.Offset << "\n\n");
2233 
2234     // Instead of moving up, just re-compute anchor-instruction's base address.
2235     Register Base = computeBase(MI, AnchorAddr);
2236 
2237     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2238     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2239 
2240     for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2241       TargetLoweringBase::AddrMode AM;
2242       AM.HasBaseReg = true;
2243       AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2244 
2245       if (TLI->isLegalGlobalAddressingMode(AM)) {
2246         LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
2247                    OtherMI->dump());
2248         updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2249         LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
2250       }
2251     }
2252     AnchorList.insert(AnchorInst);
2253     return true;
2254   }
2255 
2256   return false;
2257 }
2258 
2259 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2260                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2261   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2262     if (AddrList.front().InstClass == CI.InstClass &&
2263         AddrList.front().IsAGPR == CI.IsAGPR &&
2264         AddrList.front().hasSameBaseAddress(CI)) {
2265       AddrList.emplace_back(CI);
2266       return;
2267     }
2268   }
2269 
2270   // Base address not found, so add a new list.
2271   MergeableInsts.emplace_back(1, CI);
2272 }
2273 
2274 std::pair<MachineBasicBlock::iterator, bool>
2275 SILoadStoreOptimizer::collectMergeableInsts(
2276     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2277     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2278     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2279   bool Modified = false;
2280 
2281   // Sort potentially mergeable instructions into lists.  One list per base address.
2282   unsigned Order = 0;
2283   MachineBasicBlock::iterator BlockI = Begin;
2284   for (; BlockI != End; ++BlockI) {
2285     MachineInstr &MI = *BlockI;
2286 
2287     // We run this before checking if an address is mergeable, because it can produce
2288     // better code even if the instructions aren't mergeable.
2289     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2290       Modified = true;
2291 
2292     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2293     // barriers. We can still look for separate merges after such a barrier.
2294     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2295       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2296 
2297       // Search will resume after this instruction in a separate merge list.
2298       ++BlockI;
2299       break;
2300     }
2301 
2302     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2303     if (InstClass == UNKNOWN)
2304       continue;
2305 
2306     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2307     int Swizzled =
2308         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2309     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2310       continue;
2311 
2312     CombineInfo CI;
2313     CI.setMI(MI, *this);
2314     CI.Order = Order++;
2315 
2316     if (!CI.hasMergeableAddress(*MRI))
2317       continue;
2318 
2319     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2320       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2321       //        operands. However, we report that ds_write2 takes only VGPR
2322       //        data so that machine copy propagation does not create an
2323       //        illegal instruction with one VGPR and one AGPR source.
2324       //        Consequently, if we created such an instruction the verifier
2325       //        would complain.
2326       continue;
2327     }
2328 
2329     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2330 
2331     addInstToMergeableList(CI, MergeableInsts);
2332   }
2333 
2334   // At this point we have lists of Mergeable instructions.
2335   //
2336   // Part 2: Sort each list by offset so that instructions which can be merged
2337   // are adjacent to each other in the list, and drop lists with fewer than two
2338   // instructions, since a merge needs at least two candidates.  The actual
2339   // pairing is done later, in optimizeInstsWithSameBaseAddr().
2340 
2341   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2342                                                    E = MergeableInsts.end(); I != E;) {
2343 
2344     std::list<CombineInfo> &MergeList = *I;
2345     if (MergeList.size() <= 1) {
2346       // This means we have found only one instruction with a given address
2347       // that can be merged, and we need at least 2 instructions to do a merge,
2348       // so this list can be discarded.
2349       I = MergeableInsts.erase(I);
2350       continue;
2351     }
2352 
2353     // Sort the lists by offsets, this way mergeable instructions will be
2354     // adjacent to each other in the list, which will make it easier to find
2355     // matches.
2356     MergeList.sort(
2357         [] (const CombineInfo &A, const CombineInfo &B) {
2358           return A.Offset < B.Offset;
2359         });
2360     ++I;
2361   }
2362 
2363   return {BlockI, Modified};
2364 }
2365 
2366 // Scan through looking for adjacent LDS operations with constant offsets from
2367 // the same base register. We rely on the scheduler to do the hard work of
2368 // clustering nearby loads, and assume these are all adjacent.
2369 bool SILoadStoreOptimizer::optimizeBlock(
2370                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2371   bool Modified = false;
2372 
2373   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2374                                                    E = MergeableInsts.end(); I != E;) {
2375     std::list<CombineInfo> &MergeList = *I;
2376 
2377     bool OptimizeListAgain = false;
2378     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2379       // We weren't able to make any changes, so delete the list so we don't
2380       // process the same instructions the next time we try to optimize this
2381       // block.
2382       I = MergeableInsts.erase(I);
2383       continue;
2384     }
2385 
2386     Modified = true;
2387 
2388     // We made changes, but also determined that there were no more optimization
2389     // opportunities, so we don't need to reprocess the list.
2390     if (!OptimizeListAgain) {
2391       I = MergeableInsts.erase(I);
2392       continue;
2393     }
2394     OptimizeAgain = true;
2395   }
2396   return Modified;
2397 }
2398 
2399 bool
2400 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2401                                           std::list<CombineInfo> &MergeList,
2402                                           bool &OptimizeListAgain) {
2403   if (MergeList.empty())
2404     return false;
2405 
2406   bool Modified = false;
2407 
2408   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2409        Next = std::next(I)) {
2410 
2411     auto First = I;
2412     auto Second = Next;
2413 
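         // CI should be the instruction that comes first in program order (lower
         // Order); swap if the offset-sorted list order disagrees with that.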
2414     if ((*First).Order > (*Second).Order)
2415       std::swap(First, Second);
2416     CombineInfo &CI = *First;
2417     CombineInfo &Paired = *Second;
2418 
2419     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2420     if (!Where) {
2421       ++I;
2422       continue;
2423     }
2424 
2425     Modified = true;
2426 
2427     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2428 
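         // A merged access may itself be mergeable again on a later iteration
         // (e.g. two dword loads become a dwordx2 that can pair with another
         // dwordx2), so request another pass while the combined width is still
         // below the maximum for the instruction class.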
2429     MachineBasicBlock::iterator NewMI;
2430     switch (CI.InstClass) {
2431     default:
2432       llvm_unreachable("unknown InstClass");
2433       break;
2434     case DS_READ:
2435       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2436       break;
2437     case DS_WRITE:
2438       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2439       break;
2440     case S_BUFFER_LOAD_IMM:
2441     case S_BUFFER_LOAD_SGPR_IMM:
2442     case S_LOAD_IMM:
2443       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2444       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2445       break;
2446     case BUFFER_LOAD:
2447       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2448       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2449       break;
2450     case BUFFER_STORE:
2451       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2452       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2453       break;
2454     case MIMG:
2455       NewMI = mergeImagePair(CI, Paired, Where->I);
2456       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2457       break;
2458     case TBUFFER_LOAD:
2459       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2460       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2461       break;
2462     case TBUFFER_STORE:
2463       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2464       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2465       break;
2466     case FLAT_LOAD:
2467     case GLOBAL_LOAD:
2468     case GLOBAL_LOAD_SADDR:
2469       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2470       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2471       break;
2472     case FLAT_STORE:
2473     case GLOBAL_STORE:
2474     case GLOBAL_STORE_SADDR:
2475       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2476       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2477       break;
2478     }
2479     CI.setMI(NewMI, *this);
2480     CI.Order = Where->Order;
2481     if (I == Second)
2482       I = Next;
2483 
2484     MergeList.erase(Second);
2485   }
2486 
2487   return Modified;
2488 }
2489 
2490 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2491   if (skipFunction(MF.getFunction()))
2492     return false;
2493 
2494   STM = &MF.getSubtarget<GCNSubtarget>();
2495   if (!STM->loadStoreOptEnabled())
2496     return false;
2497 
2498   TII = STM->getInstrInfo();
2499   TRI = &TII->getRegisterInfo();
2500 
2501   MRI = &MF.getRegInfo();
2502   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2503 
2504   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2505 
2506   bool Modified = false;
2507 
2508   // Contains the list of instructions for which constant offsets are being
2509   // promoted to the immediate. This is tracked for an entire block at a time.
2510   SmallPtrSet<MachineInstr *, 4> AnchorList;
2511   MemInfoMap Visited;
2512 
2513   for (MachineBasicBlock &MBB : MF) {
2514     MachineBasicBlock::iterator SectionEnd;
2515     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2516          I = SectionEnd) {
2517       bool CollectModified;
2518       std::list<std::list<CombineInfo>> MergeableInsts;
2519 
2520       // First pass: Collect list of all instructions we know how to merge in a
2521       // subset of the block.
2522       std::tie(SectionEnd, CollectModified) =
2523           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2524 
2525       Modified |= CollectModified;
2526 
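           // Second pass: merge within the collected lists. One round of merging
           // can expose further merges, so repeat until optimizeBlock() reports
           // no remaining opportunities.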
2527       do {
2528         OptimizeAgain = false;
2529         Modified |= optimizeBlock(MergeableInsts);
2530       } while (OptimizeAgain);
2531     }
2532 
2533     Visited.clear();
2534     AnchorList.clear();
2535   }
2536 
2537   return Modified;
2538 }
2539