xref: /llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (revision 26b14aedb7a936d43fa753cf9f311524f2c7ad70)
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset into the instruction's
23 // immediate field by adjusting the base. It tries to use a base from nearby
24 // instructions that allows a 13-bit constant offset, which is then promoted
25 // to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently misses stores of constants because the load of the
46 //   constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputation seems inefficient. This currently matches
50 //   only one pair, recomputes live intervals, and moves on to the next pair.
51 //   It would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but close enough together that their differences do fit,
56 //   we can add to the base pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   S_BUFFER_LOAD_SGPR_IMM,
78   S_LOAD_IMM,
79   BUFFER_LOAD,
80   BUFFER_STORE,
81   MIMG,
82   TBUFFER_LOAD,
83   TBUFFER_STORE,
84   GLOBAL_LOAD_SADDR,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE,
88   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89   GLOBAL_STORE // any CombineInfo; they are only ever returned by
90                // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94   unsigned char NumVAddrs = 0;
95   bool SBase = false;
96   bool SRsrc = false;
97   bool SOffset = false;
98   bool SAddr = false;
99   bool VAddr = false;
100   bool Addr = false;
101   bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
123     unsigned Order;
124 
125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145           return false;
146         }
147       }
148       return true;
149     }
150 
151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that aren't either immediates or registers.
159         // TODO: Should be possible to merge FrameIndexes and maybe some other
160         // non-register operands.
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge instructions with other physical reg
165         // addresses too.
166         if (AddrOp->getReg().isPhysical() &&
167             AddrOp->getReg() != AMDGPU::SGPR_NULL)
168           return false;
169 
170         // If an address has only one use then there will be no other
171         // instructions with the same address, so we can't merge this one.
172         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173           return false;
174       }
175       return true;
176     }
177 
178     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179 
180     // Compare by pointer order.
181     bool operator<(const CombineInfo& Other) const {
182       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183     }
184   };
185 
186   struct BaseRegisters {
187     Register LoReg;
188     Register HiReg;
189 
190     unsigned LoSubReg = 0;
191     unsigned HiSubReg = 0;
192   };
193 
194   struct MemAddress {
195     BaseRegisters Base;
196     int64_t Offset = 0;
197   };
198 
199   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200 
201 private:
202   const GCNSubtarget *STM = nullptr;
203   const SIInstrInfo *TII = nullptr;
204   const SIRegisterInfo *TRI = nullptr;
205   MachineRegisterInfo *MRI = nullptr;
206   AliasAnalysis *AA = nullptr;
207   bool OptimizeAgain;
208 
209   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210                            const DenseSet<Register> &ARegUses,
211                            const MachineInstr &A, const MachineInstr &B) const;
212   static bool dmasksCanBeCombined(const CombineInfo &CI,
213                                   const SIInstrInfo &TII,
214                                   const CombineInfo &Paired);
215   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216                                    CombineInfo &Paired, bool Modify = false);
217   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218                         const CombineInfo &Paired);
219   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221                                                      const CombineInfo &Paired);
222   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
223                                                     const CombineInfo &Paired);
224   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
225 
226   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
227 
228   unsigned read2Opcode(unsigned EltSize) const;
229   unsigned read2ST64Opcode(unsigned EltSize) const;
230   MachineBasicBlock::iterator
231   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
232                  MachineBasicBlock::iterator InsertBefore);
233 
234   unsigned write2Opcode(unsigned EltSize) const;
235   unsigned write2ST64Opcode(unsigned EltSize) const;
236   MachineBasicBlock::iterator
237   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
238                   MachineBasicBlock::iterator InsertBefore);
239   MachineBasicBlock::iterator
240   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
241                  MachineBasicBlock::iterator InsertBefore);
242   MachineBasicBlock::iterator
243   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
244                        MachineBasicBlock::iterator InsertBefore);
245   MachineBasicBlock::iterator
246   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
247                       MachineBasicBlock::iterator InsertBefore);
248   MachineBasicBlock::iterator
249   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
250                        MachineBasicBlock::iterator InsertBefore);
251   MachineBasicBlock::iterator
252   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
253                        MachineBasicBlock::iterator InsertBefore);
254   MachineBasicBlock::iterator
255   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
256                         MachineBasicBlock::iterator InsertBefore);
257   MachineBasicBlock::iterator
258   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
259                     MachineBasicBlock::iterator InsertBefore);
260   MachineBasicBlock::iterator
261   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
262                      MachineBasicBlock::iterator InsertBefore);
263 
264   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
265                            int32_t NewOffset) const;
266   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
267   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
268   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
269   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
270   /// Promotes a constant offset into the immediate field by adjusting the
271   /// base. It tries to use a base from nearby instructions that allows a
272   /// 13-bit constant offset, which is then promoted to the immediate.
273   bool promoteConstantOffsetToImm(MachineInstr &CI,
274                                   MemInfoMap &Visited,
275                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
276   void addInstToMergeableList(const CombineInfo &CI,
277                   std::list<std::list<CombineInfo>> &MergeableInsts) const;
278 
279   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
280       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
281       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
282       std::list<std::list<CombineInfo>> &MergeableInsts) const;
283 
284   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
285                                                      const CombineInfo &Paired);
286 
287   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
288                                           const CombineInfo &Paired);
289 
290 public:
291   static char ID;
292 
293   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
294     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
295   }
296 
297   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
298                                      bool &OptimizeListAgain);
299   bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);
300 
301   bool runOnMachineFunction(MachineFunction &MF) override;
302 
303   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
304 
305   void getAnalysisUsage(AnalysisUsage &AU) const override {
306     AU.setPreservesCFG();
307     AU.addRequired<AAResultsWrapperPass>();
308 
309     MachineFunctionPass::getAnalysisUsage(AU);
310   }
311 
312   MachineFunctionProperties getRequiredProperties() const override {
313     return MachineFunctionProperties()
314       .set(MachineFunctionProperties::Property::IsSSA);
315   }
316 };
317 
318 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
319   const unsigned Opc = MI.getOpcode();
320 
321   if (TII.isMUBUF(Opc)) {
322     // FIXME: Handle d16 correctly
323     return AMDGPU::getMUBUFElements(Opc);
324   }
325   if (TII.isImage(MI)) {
326     uint64_t DMaskImm =
327         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
328     return llvm::popcount(DMaskImm);
329   }
330   if (TII.isMTBUF(Opc)) {
331     return AMDGPU::getMTBUFElements(Opc);
332   }
333 
334   switch (Opc) {
335   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
336   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
337   case AMDGPU::S_LOAD_DWORD_IMM:
338   case AMDGPU::GLOBAL_LOAD_DWORD:
339   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
340   case AMDGPU::GLOBAL_STORE_DWORD:
341   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
342   case AMDGPU::FLAT_LOAD_DWORD:
343   case AMDGPU::FLAT_STORE_DWORD:
344     return 1;
345   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
346   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347   case AMDGPU::S_LOAD_DWORDX2_IMM:
348   case AMDGPU::GLOBAL_LOAD_DWORDX2:
349   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350   case AMDGPU::GLOBAL_STORE_DWORDX2:
351   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352   case AMDGPU::FLAT_LOAD_DWORDX2:
353   case AMDGPU::FLAT_STORE_DWORDX2:
354     return 2;
355   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
356   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
357   case AMDGPU::S_LOAD_DWORDX3_IMM:
358   case AMDGPU::GLOBAL_LOAD_DWORDX3:
359   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
360   case AMDGPU::GLOBAL_STORE_DWORDX3:
361   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
362   case AMDGPU::FLAT_LOAD_DWORDX3:
363   case AMDGPU::FLAT_STORE_DWORDX3:
364     return 3;
365   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
366   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
367   case AMDGPU::S_LOAD_DWORDX4_IMM:
368   case AMDGPU::GLOBAL_LOAD_DWORDX4:
369   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
370   case AMDGPU::GLOBAL_STORE_DWORDX4:
371   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
372   case AMDGPU::FLAT_LOAD_DWORDX4:
373   case AMDGPU::FLAT_STORE_DWORDX4:
374     return 4;
375   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
376   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
377   case AMDGPU::S_LOAD_DWORDX8_IMM:
378     return 8;
379   case AMDGPU::DS_READ_B32:      [[fallthrough]];
380   case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
381   case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
382   case AMDGPU::DS_WRITE_B32_gfx9:
383     return 1;
384   case AMDGPU::DS_READ_B64:      [[fallthrough]];
385   case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
386   case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
387   case AMDGPU::DS_WRITE_B64_gfx9:
388     return 2;
389   default:
390     return 0;
391   }
392 }
393 
394 /// Maps instruction opcode to enum InstClassEnum.
395 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
396   switch (Opc) {
397   default:
398     if (TII.isMUBUF(Opc)) {
399       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
400       default:
401         return UNKNOWN;
402       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
403       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
404       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
405       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
406         return BUFFER_LOAD;
407       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
408       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
409       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
410       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
411         return BUFFER_STORE;
412       }
413     }
414     if (TII.isImage(Opc)) {
415       // Ignore instructions encoded without vaddr.
416       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
417           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
418         return UNKNOWN;
419       // Ignore BVH instructions
420       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
421         return UNKNOWN;
422       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
423       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
424           TII.isGather4(Opc))
425         return UNKNOWN;
426       return MIMG;
427     }
428     if (TII.isMTBUF(Opc)) {
429       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
430       default:
431         return UNKNOWN;
432       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
433       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
434       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
435       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
436       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
437       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
438       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
439       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
440         return TBUFFER_LOAD;
441       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
442       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
443       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
444       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
445         return TBUFFER_STORE;
446       }
447     }
448     return UNKNOWN;
449   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
450   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
451   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
452   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
453   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
454     return S_BUFFER_LOAD_IMM;
455   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
456   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
457   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
458   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
459   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
460     return S_BUFFER_LOAD_SGPR_IMM;
461   case AMDGPU::S_LOAD_DWORD_IMM:
462   case AMDGPU::S_LOAD_DWORDX2_IMM:
463   case AMDGPU::S_LOAD_DWORDX3_IMM:
464   case AMDGPU::S_LOAD_DWORDX4_IMM:
465   case AMDGPU::S_LOAD_DWORDX8_IMM:
466     return S_LOAD_IMM;
467   case AMDGPU::DS_READ_B32:
468   case AMDGPU::DS_READ_B32_gfx9:
469   case AMDGPU::DS_READ_B64:
470   case AMDGPU::DS_READ_B64_gfx9:
471     return DS_READ;
472   case AMDGPU::DS_WRITE_B32:
473   case AMDGPU::DS_WRITE_B32_gfx9:
474   case AMDGPU::DS_WRITE_B64:
475   case AMDGPU::DS_WRITE_B64_gfx9:
476     return DS_WRITE;
477   case AMDGPU::GLOBAL_LOAD_DWORD:
478   case AMDGPU::GLOBAL_LOAD_DWORDX2:
479   case AMDGPU::GLOBAL_LOAD_DWORDX3:
480   case AMDGPU::GLOBAL_LOAD_DWORDX4:
481   case AMDGPU::FLAT_LOAD_DWORD:
482   case AMDGPU::FLAT_LOAD_DWORDX2:
483   case AMDGPU::FLAT_LOAD_DWORDX3:
484   case AMDGPU::FLAT_LOAD_DWORDX4:
485     return FLAT_LOAD;
486   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
487   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
488   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
489   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
490     return GLOBAL_LOAD_SADDR;
491   case AMDGPU::GLOBAL_STORE_DWORD:
492   case AMDGPU::GLOBAL_STORE_DWORDX2:
493   case AMDGPU::GLOBAL_STORE_DWORDX3:
494   case AMDGPU::GLOBAL_STORE_DWORDX4:
495   case AMDGPU::FLAT_STORE_DWORD:
496   case AMDGPU::FLAT_STORE_DWORDX2:
497   case AMDGPU::FLAT_STORE_DWORDX3:
498   case AMDGPU::FLAT_STORE_DWORDX4:
499     return FLAT_STORE;
500   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
501   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
502   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
503   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
504     return GLOBAL_STORE_SADDR;
505   }
506 }
507 
508 /// Determines instruction subclass from opcode. Only instructions
509 /// of the same subclass can be merged together. The merged instruction may have
510 /// a different subclass but must have the same class.
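    /// For example, S_LOAD_DWORD_IMM and S_LOAD_DWORDX2_IMM both map to the
    /// subclass S_LOAD_DWORD_IMM and can therefore be considered for merging.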
511 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
512   switch (Opc) {
513   default:
514     if (TII.isMUBUF(Opc))
515       return AMDGPU::getMUBUFBaseOpcode(Opc);
516     if (TII.isImage(Opc)) {
517       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
518       assert(Info);
519       return Info->BaseOpcode;
520     }
521     if (TII.isMTBUF(Opc))
522       return AMDGPU::getMTBUFBaseOpcode(Opc);
523     return -1;
524   case AMDGPU::DS_READ_B32:
525   case AMDGPU::DS_READ_B32_gfx9:
526   case AMDGPU::DS_READ_B64:
527   case AMDGPU::DS_READ_B64_gfx9:
528   case AMDGPU::DS_WRITE_B32:
529   case AMDGPU::DS_WRITE_B32_gfx9:
530   case AMDGPU::DS_WRITE_B64:
531   case AMDGPU::DS_WRITE_B64_gfx9:
532     return Opc;
533   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
534   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
535   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
536   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
537   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
538     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
539   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
540   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
541   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
542   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
543   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
544     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
545   case AMDGPU::S_LOAD_DWORD_IMM:
546   case AMDGPU::S_LOAD_DWORDX2_IMM:
547   case AMDGPU::S_LOAD_DWORDX3_IMM:
548   case AMDGPU::S_LOAD_DWORDX4_IMM:
549   case AMDGPU::S_LOAD_DWORDX8_IMM:
550     return AMDGPU::S_LOAD_DWORD_IMM;
551   case AMDGPU::GLOBAL_LOAD_DWORD:
552   case AMDGPU::GLOBAL_LOAD_DWORDX2:
553   case AMDGPU::GLOBAL_LOAD_DWORDX3:
554   case AMDGPU::GLOBAL_LOAD_DWORDX4:
555   case AMDGPU::FLAT_LOAD_DWORD:
556   case AMDGPU::FLAT_LOAD_DWORDX2:
557   case AMDGPU::FLAT_LOAD_DWORDX3:
558   case AMDGPU::FLAT_LOAD_DWORDX4:
559     return AMDGPU::FLAT_LOAD_DWORD;
560   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
561   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
562   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
563   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
564     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
565   case AMDGPU::GLOBAL_STORE_DWORD:
566   case AMDGPU::GLOBAL_STORE_DWORDX2:
567   case AMDGPU::GLOBAL_STORE_DWORDX3:
568   case AMDGPU::GLOBAL_STORE_DWORDX4:
569   case AMDGPU::FLAT_STORE_DWORD:
570   case AMDGPU::FLAT_STORE_DWORDX2:
571   case AMDGPU::FLAT_STORE_DWORDX3:
572   case AMDGPU::FLAT_STORE_DWORDX4:
573     return AMDGPU::FLAT_STORE_DWORD;
574   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
575   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
576   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
577   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
578     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
579   }
580 }
581 
582 // GLOBAL loads and stores are classified as FLAT initially. If both combined
583 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
584 // GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
585 // the resulting combined operation will be FLAT, potentially promoting one of
586 // the GLOBAL operations to FLAT.
587 // For other instructions, return the original class unmodified.
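    // For example, merging two FLAT loads that are both FLAT Global yields a
    // GLOBAL_LOAD, while merging a FLAT Global load with a plain FLAT load
    // remains a FLAT_LOAD.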
588 InstClassEnum
589 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
590                                          const CombineInfo &Paired) {
591   assert(CI.InstClass == Paired.InstClass);
592 
593   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
594       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
595     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
596 
597   return CI.InstClass;
598 }
599 
600 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
601   AddressRegs Result;
602 
603   if (TII.isMUBUF(Opc)) {
604     if (AMDGPU::getMUBUFHasVAddr(Opc))
605       Result.VAddr = true;
606     if (AMDGPU::getMUBUFHasSrsrc(Opc))
607       Result.SRsrc = true;
608     if (AMDGPU::getMUBUFHasSoffset(Opc))
609       Result.SOffset = true;
610 
611     return Result;
612   }
613 
614   if (TII.isImage(Opc)) {
615     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
616     if (VAddr0Idx >= 0) {
617       int RsrcName =
618           TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
619       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
620       Result.NumVAddrs = RsrcIdx - VAddr0Idx;
621     } else {
622       Result.VAddr = true;
623     }
624     Result.SRsrc = true;
625     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
626     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
627       Result.SSamp = true;
628 
629     return Result;
630   }
631   if (TII.isMTBUF(Opc)) {
632     if (AMDGPU::getMTBUFHasVAddr(Opc))
633       Result.VAddr = true;
634     if (AMDGPU::getMTBUFHasSrsrc(Opc))
635       Result.SRsrc = true;
636     if (AMDGPU::getMTBUFHasSoffset(Opc))
637       Result.SOffset = true;
638 
639     return Result;
640   }
641 
642   switch (Opc) {
643   default:
644     return Result;
645   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
646   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
647   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
648   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
649   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
650     Result.SOffset = true;
651     [[fallthrough]];
652   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
653   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
654   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
655   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
656   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
657   case AMDGPU::S_LOAD_DWORD_IMM:
658   case AMDGPU::S_LOAD_DWORDX2_IMM:
659   case AMDGPU::S_LOAD_DWORDX3_IMM:
660   case AMDGPU::S_LOAD_DWORDX4_IMM:
661   case AMDGPU::S_LOAD_DWORDX8_IMM:
662     Result.SBase = true;
663     return Result;
664   case AMDGPU::DS_READ_B32:
665   case AMDGPU::DS_READ_B64:
666   case AMDGPU::DS_READ_B32_gfx9:
667   case AMDGPU::DS_READ_B64_gfx9:
668   case AMDGPU::DS_WRITE_B32:
669   case AMDGPU::DS_WRITE_B64:
670   case AMDGPU::DS_WRITE_B32_gfx9:
671   case AMDGPU::DS_WRITE_B64_gfx9:
672     Result.Addr = true;
673     return Result;
674   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
675   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
676   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
677   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
678   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
679   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
680   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
681   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
682     Result.SAddr = true;
683     [[fallthrough]];
684   case AMDGPU::GLOBAL_LOAD_DWORD:
685   case AMDGPU::GLOBAL_LOAD_DWORDX2:
686   case AMDGPU::GLOBAL_LOAD_DWORDX3:
687   case AMDGPU::GLOBAL_LOAD_DWORDX4:
688   case AMDGPU::GLOBAL_STORE_DWORD:
689   case AMDGPU::GLOBAL_STORE_DWORDX2:
690   case AMDGPU::GLOBAL_STORE_DWORDX3:
691   case AMDGPU::GLOBAL_STORE_DWORDX4:
692   case AMDGPU::FLAT_LOAD_DWORD:
693   case AMDGPU::FLAT_LOAD_DWORDX2:
694   case AMDGPU::FLAT_LOAD_DWORDX3:
695   case AMDGPU::FLAT_LOAD_DWORDX4:
696   case AMDGPU::FLAT_STORE_DWORD:
697   case AMDGPU::FLAT_STORE_DWORDX2:
698   case AMDGPU::FLAT_STORE_DWORDX3:
699   case AMDGPU::FLAT_STORE_DWORDX4:
700     Result.VAddr = true;
701     return Result;
702   }
703 }
704 
705 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
706                                               const SILoadStoreOptimizer &LSO) {
707   I = MI;
708   unsigned Opc = MI->getOpcode();
709   InstClass = getInstClass(Opc, *LSO.TII);
710 
711   if (InstClass == UNKNOWN)
712     return;
713 
714   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
715 
716   switch (InstClass) {
717   case DS_READ:
718     EltSize =
719           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
720                                                                           : 4;
721     break;
722   case DS_WRITE:
723     EltSize =
724           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
725                                                                             : 4;
726     break;
727   case S_BUFFER_LOAD_IMM:
728   case S_BUFFER_LOAD_SGPR_IMM:
729   case S_LOAD_IMM:
730     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
731     break;
732   default:
733     EltSize = 4;
734     break;
735   }
736 
737   if (InstClass == MIMG) {
738     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
739     // Offset is not considered for MIMG instructions.
740     Offset = 0;
741   } else {
742     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
743     Offset = I->getOperand(OffsetIdx).getImm();
744   }
745 
746   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
747     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
748 
749   Width = getOpcodeWidth(*I, *LSO.TII);
750 
751   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
752     Offset &= 0xffff;
753   } else if (InstClass != MIMG) {
754     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
755   }
756 
757   AddressRegs Regs = getRegs(Opc, *LSO.TII);
758   bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
759 
760   NumAddresses = 0;
761   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
762     AddrIdx[NumAddresses++] =
763         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
764   if (Regs.Addr)
765     AddrIdx[NumAddresses++] =
766         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
767   if (Regs.SBase)
768     AddrIdx[NumAddresses++] =
769         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
770   if (Regs.SRsrc)
771     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
772         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
773   if (Regs.SOffset)
774     AddrIdx[NumAddresses++] =
775         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
776   if (Regs.SAddr)
777     AddrIdx[NumAddresses++] =
778         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
779   if (Regs.VAddr)
780     AddrIdx[NumAddresses++] =
781         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
782   if (Regs.SSamp)
783     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
784         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
785   assert(NumAddresses <= MaxAddressRegs);
786 
787   for (unsigned J = 0; J < NumAddresses; J++)
788     AddrReg[J] = &I->getOperand(AddrIdx[J]);
789 }
790 
791 } // end anonymous namespace.
792 
793 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
794                       "SI Load Store Optimizer", false, false)
795 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
796 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
797                     false, false)
798 
799 char SILoadStoreOptimizer::ID = 0;
800 
801 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
802 
803 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
804   return new SILoadStoreOptimizer();
805 }
806 
807 static void addDefsUsesToList(const MachineInstr &MI,
808                               DenseSet<Register> &RegDefs,
809                               DenseSet<Register> &RegUses) {
810   for (const auto &Op : MI.operands()) {
811     if (!Op.isReg())
812       continue;
813     if (Op.isDef())
814       RegDefs.insert(Op.getReg());
815     if (Op.readsReg())
816       RegUses.insert(Op.getReg());
817   }
818 }
819 
820 bool SILoadStoreOptimizer::canSwapInstructions(
821     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
822     const MachineInstr &A, const MachineInstr &B) const {
823   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
824       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
825     return false;
826   for (const auto &BOp : B.operands()) {
827     if (!BOp.isReg())
828       continue;
829     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
830       return false;
831     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
832       return false;
833   }
834   return true;
835 }
836 
837 // Given that \p CI and \p Paired are adjacent memory operations, produce a
838 // new MMO for the combined operation with a new access size.
839 MachineMemOperand *
840 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
841                                                const CombineInfo &Paired) {
842   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
843   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
844 
845   unsigned Size = MMOa->getSize() + MMOb->getSize();
846 
847   // The base pointer for the combined operation is the same as the leading
848   // operation's pointer.
849   if (Paired < CI)
850     std::swap(MMOa, MMOb);
851 
852   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
853   // If merging FLAT and GLOBAL set address space to FLAT.
854   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
855     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
856 
857   MachineFunction *MF = CI.I->getMF();
858   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
859 }
860 
861 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
862                                                const SIInstrInfo &TII,
863                                                const CombineInfo &Paired) {
864   assert(CI.InstClass == MIMG);
865 
866   // Ignore instructions with tfe/lwe set.
867   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
868   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
869 
870   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
871     return false;
872 
873   // Check other optional immediate operands for equality.
874   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
875                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
876                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
877 
878   for (auto op : OperandsToMatch) {
879     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
880     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
881       return false;
882     if (Idx != -1 &&
883         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
884       return false;
885   }
886 
887   // Check DMask for overlaps.
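      // The masks are only mergeable when every set bit of the smaller mask
      // lies below the lowest set bit of the larger mask, e.g. 0x3 and 0xc can
      // be combined but 0x5 and 0x6 cannot.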
888   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
889   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
890 
891   if (!MaxMask)
892     return false;
893 
894   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
895   if ((1u << AllowedBitsForMin) <= MinMask)
896     return false;
897 
898   return true;
899 }
900 
901 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
902                                        unsigned ComponentCount,
903                                        const GCNSubtarget &STI) {
904   if (ComponentCount > 4)
905     return 0;
906 
907   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
908       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
909   if (!OldFormatInfo)
910     return 0;
911 
912   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
913       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
914                                            ComponentCount,
915                                            OldFormatInfo->NumFormat, STI);
916 
917   if (!NewFormatInfo)
918     return 0;
919 
920   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
921          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
922 
923   return NewFormatInfo->Format;
924 }
925 
926 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
927 // highest power of two. Note that the result is well defined for all inputs
928 // including corner cases like:
929 // - if Lo == Hi, return that value
930 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
931 // - if Lo > Hi, return 0 (as if the range wrapped around)
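    // For example, mostAlignedValueInRange(5, 9) == 8, since 8 is the value
    // in [5,9] aligned to the largest power of two.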
932 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
933   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
934 }
935 
936 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
937                                                 const GCNSubtarget &STI,
938                                                 CombineInfo &Paired,
939                                                 bool Modify) {
940   assert(CI.InstClass != MIMG);
941 
942   // XXX - Would the same offset be OK? Is there any reason this would happen or
943   // be useful?
944   if (CI.Offset == Paired.Offset)
945     return false;
946 
947   // This won't be valid if the offset isn't aligned.
948   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
949     return false;
950 
951   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
952 
953     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
954         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
955     if (!Info0)
956       return false;
957     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
958         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
959     if (!Info1)
960       return false;
961 
962     if (Info0->BitsPerComp != Info1->BitsPerComp ||
963         Info0->NumFormat != Info1->NumFormat)
964       return false;
965 
966     // TODO: Should be possible to support more formats, but if format loads
967     // are not dword-aligned, the merged load might not be valid.
968     if (Info0->BitsPerComp != 32)
969       return false;
970 
971     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
972       return false;
973   }
974 
975   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
976   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
977   CI.UseST64 = false;
978   CI.BaseOff = 0;
979 
980   // Handle all non-DS instructions.
981   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
982     if (EltOffset0 + CI.Width != EltOffset1 &&
983             EltOffset1 + Paired.Width != EltOffset0)
984       return false;
985     if (CI.CPol != Paired.CPol)
986       return false;
987     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
988         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
989       // Reject cases like:
990       //   dword + dwordx2 -> dwordx3
991       //   dword + dwordx3 -> dwordx4
992       // If we tried to combine these cases, we would fail to extract a subreg
993       // for the result of the second load due to SGPR alignment requirements.
994       if (CI.Width != Paired.Width &&
995           (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
996         return false;
997     }
998     return true;
999   }
1000 
1001   // If the offsets in elements don't fit in 8 bits, we might be able to use
1002   // the stride 64 versions.
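       // For example, with element offsets 0 and 4096, 4096 does not fit in 8
       // bits, but 0/64 = 0 and 4096/64 = 64 both do, so the ST64 form can
       // encode the pair.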
1003   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1004       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1005     if (Modify) {
1006       CI.Offset = EltOffset0 / 64;
1007       Paired.Offset = EltOffset1 / 64;
1008       CI.UseST64 = true;
1009     }
1010     return true;
1011   }
1012 
1013   // Check if the new offsets fit in the reduced 8-bit range.
1014   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1015     if (Modify) {
1016       CI.Offset = EltOffset0;
1017       Paired.Offset = EltOffset1;
1018     }
1019     return true;
1020   }
1021 
1022   // Try to shift base address to decrease offsets.
1023   uint32_t Min = std::min(EltOffset0, EltOffset1);
1024   uint32_t Max = std::max(EltOffset0, EltOffset1);
1025 
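       // First try the ST64 form: it works when the two element offsets differ
       // by a multiple of 64 that is at most 255 * 64. Otherwise, fall back to
       // plain offsets below when the difference itself fits in 8 bits.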
1026   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1027   if (((Max - Min) & ~Mask) == 0) {
1028     if (Modify) {
1029       // From the range of values we could use for BaseOff, choose the one that
1030       // is aligned to the highest power of two, to maximise the chance that
1031       // the same offset can be reused for other load/store pairs.
1032       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1033       // Copy the low bits of the offsets, so that when we adjust them by
1034       // subtracting BaseOff they will be multiples of 64.
1035       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1036       CI.BaseOff = BaseOff * CI.EltSize;
1037       CI.Offset = (EltOffset0 - BaseOff) / 64;
1038       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1039       CI.UseST64 = true;
1040     }
1041     return true;
1042   }
1043 
1044   if (isUInt<8>(Max - Min)) {
1045     if (Modify) {
1046       // From the range of values we could use for BaseOff, choose the one that
1047       // is aligned to the highest power of two, to maximise the chance that
1048       // the same offset can be reused for other load/store pairs.
1049       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1050       CI.BaseOff = BaseOff * CI.EltSize;
1051       CI.Offset = EltOffset0 - BaseOff;
1052       Paired.Offset = EltOffset1 - BaseOff;
1053     }
1054     return true;
1055   }
1056 
1057   return false;
1058 }
1059 
1060 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1061                                      const CombineInfo &CI,
1062                                      const CombineInfo &Paired) {
1063   const unsigned Width = (CI.Width + Paired.Width);
1064   switch (CI.InstClass) {
1065   default:
1066     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1067   case S_BUFFER_LOAD_IMM:
1068   case S_BUFFER_LOAD_SGPR_IMM:
1069   case S_LOAD_IMM:
1070     switch (Width) {
1071     default:
1072       return false;
1073     case 2:
1074     case 4:
1075     case 8:
1076       return true;
1077     case 3:
1078       return STM.hasScalarDwordx3Loads();
1079     }
1080   }
1081 }
1082 
1083 const TargetRegisterClass *
1084 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1085   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1086     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1087   }
1088   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1089     return TRI->getRegClassForReg(*MRI, Src->getReg());
1090   }
1091   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1092     return TRI->getRegClassForReg(*MRI, Src->getReg());
1093   }
1094   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1095     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1096   }
1097   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1098     return TRI->getRegClassForReg(*MRI, Src->getReg());
1099   }
1100   return nullptr;
1101 }
1102 
1103 /// This function assumes that CI comes before Paired in a basic block. Return
1104 /// an insertion point for the merged instruction or nullptr on failure.
1105 SILoadStoreOptimizer::CombineInfo *
1106 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1107                                            CombineInfo &Paired) {
1108   // If another instruction has already been merged into CI, it may now be a
1109   // type that we can't do any further merging into.
1110   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1111     return nullptr;
1112   assert(CI.InstClass == Paired.InstClass);
1113 
1114   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1115       getInstSubclass(Paired.I->getOpcode(), *TII))
1116     return nullptr;
1117 
1118   // Check that both offsets (or masks for MIMG) can be combined and fit in
1119   // the reduced range.
1120   if (CI.InstClass == MIMG) {
1121     if (!dmasksCanBeCombined(CI, *TII, Paired))
1122       return nullptr;
1123   } else {
1124     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1125       return nullptr;
1126   }
1127 
1128   DenseSet<Register> RegDefs;
1129   DenseSet<Register> RegUses;
1130   CombineInfo *Where;
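       // The merged instruction is inserted at the first load (Paired is
       // hoisted up to CI) or at the second store (CI is sunk down to Paired),
       // so every instruction in between must be safe to swap past.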
1131   if (CI.I->mayLoad()) {
1132     // Try to hoist Paired up to CI.
1133     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1134     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1135       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1136         return nullptr;
1137     }
1138     Where = &CI;
1139   } else {
1140     // Try to sink CI down to Paired.
1141     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1142     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1143       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1144         return nullptr;
1145     }
1146     Where = &Paired;
1147   }
1148 
1149   // Call offsetsCanBeCombined with Modify = true so that the offsets are
1150   // correct for the new instruction. This should return true, because this
1151   // function should only be called on CombineInfo objects that have already
1152   // been confirmed to be mergeable.
1153   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1154     offsetsCanBeCombined(CI, *STM, Paired, true);
1155   return Where;
1156 }
1157 
1158 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1159   if (STM->ldsRequiresM0Init())
1160     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1161   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1162 }
1163 
1164 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1165   if (STM->ldsRequiresM0Init())
1166     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1167 
1168   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1169                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1170 }
1171 
1172 MachineBasicBlock::iterator
1173 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1174                                      MachineBasicBlock::iterator InsertBefore) {
1175   MachineBasicBlock *MBB = CI.I->getParent();
1176 
1177   // Be careful, since the addresses could be subregisters themselves in weird
1178   // cases, like vectors of pointers.
1179   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1180 
1181   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1182   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1183 
1184   unsigned NewOffset0 = CI.Offset;
1185   unsigned NewOffset1 = Paired.Offset;
1186   unsigned Opc =
1187       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1188 
1189   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1190   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
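       // A read2_b32 defines a 64-bit pair split as sub0/sub1; a read2_b64
       // defines a 128-bit pair split as sub0_sub1/sub2_sub3.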
1191 
1192   if (NewOffset0 > NewOffset1) {
1193     // Canonicalize the merged instruction so the smaller offset comes first.
1194     std::swap(NewOffset0, NewOffset1);
1195     std::swap(SubRegIdx0, SubRegIdx1);
1196   }
1197 
1198   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1199          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1200 
1201   const MCInstrDesc &Read2Desc = TII->get(Opc);
1202 
1203   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1204   Register DestReg = MRI->createVirtualRegister(SuperRC);
1205 
1206   DebugLoc DL = CI.I->getDebugLoc();
1207 
1208   Register BaseReg = AddrReg->getReg();
1209   unsigned BaseSubReg = AddrReg->getSubReg();
1210   unsigned BaseRegFlags = 0;
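       // If offsetsCanBeCombined chose a nonzero BaseOff, materialize it and
       // add it to the original address so the reduced offsets fit the read2
       // encoding.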
1211   if (CI.BaseOff) {
1212     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1213     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1214         .addImm(CI.BaseOff);
1215 
1216     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1217     BaseRegFlags = RegState::Kill;
1218 
1219     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1220         .addReg(ImmReg)
1221         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1222         .addImm(0); // clamp bit
1223     BaseSubReg = 0;
1224   }
1225 
1226   MachineInstrBuilder Read2 =
1227       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1228           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1229           .addImm(NewOffset0)                        // offset0
1230           .addImm(NewOffset1)                        // offset1
1231           .addImm(0)                                 // gds
1232           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1233 
1234   (void)Read2;
1235 
1236   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1237 
1238   // Copy to the old destination registers.
1239   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1240       .add(*Dest0) // Copy to same destination including flags and sub reg.
1241       .addReg(DestReg, 0, SubRegIdx0);
1242   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1243       .add(*Dest1)
1244       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1245 
1246   CI.I->eraseFromParent();
1247   Paired.I->eraseFromParent();
1248 
1249   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1250   return Read2;
1251 }
1252 
1253 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1254   if (STM->ldsRequiresM0Init())
1255     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1256   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1257                         : AMDGPU::DS_WRITE2_B64_gfx9;
1258 }
1259 
1260 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1261   if (STM->ldsRequiresM0Init())
1262     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1263                           : AMDGPU::DS_WRITE2ST64_B64;
1264 
1265   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1266                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1267 }
1268 
1269 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1270     CombineInfo &CI, CombineInfo &Paired,
1271     MachineBasicBlock::iterator InsertBefore) {
1272   MachineBasicBlock *MBB = CI.I->getParent();
1273 
1274   // Be sure to use .add(), and not .addReg(), with these. We want to be sure
1275   // we preserve the subregister index and any register flags set on them.
1276   const MachineOperand *AddrReg =
1277       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1278   const MachineOperand *Data0 =
1279       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1280   const MachineOperand *Data1 =
1281       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1282 
1283   unsigned NewOffset0 = CI.Offset;
1284   unsigned NewOffset1 = Paired.Offset;
1285   unsigned Opc =
1286       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1287 
1288   if (NewOffset0 > NewOffset1) {
1289     // Canonicalize the merged instruction so the smaller offset comes first.
1290     std::swap(NewOffset0, NewOffset1);
1291     std::swap(Data0, Data1);
1292   }
1293 
1294   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1295          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1296 
1297   const MCInstrDesc &Write2Desc = TII->get(Opc);
1298   DebugLoc DL = CI.I->getDebugLoc();
1299 
1300   Register BaseReg = AddrReg->getReg();
1301   unsigned BaseSubReg = AddrReg->getSubReg();
1302   unsigned BaseRegFlags = 0;
1303   if (CI.BaseOff) {
1304     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1305     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1306         .addImm(CI.BaseOff);
1307 
1308     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1309     BaseRegFlags = RegState::Kill;
1310 
1311     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1312         .addReg(ImmReg)
1313         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1314         .addImm(0); // clamp bit
1315     BaseSubReg = 0;
1316   }
1317 
1318   MachineInstrBuilder Write2 =
1319       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1320           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1321           .add(*Data0)                               // data0
1322           .add(*Data1)                               // data1
1323           .addImm(NewOffset0)                        // offset0
1324           .addImm(NewOffset1)                        // offset1
1325           .addImm(0)                                 // gds
1326           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1327 
1328   CI.I->eraseFromParent();
1329   Paired.I->eraseFromParent();
1330 
1331   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1332   return Write2;
1333 }
1334 
1335 MachineBasicBlock::iterator
1336 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1337                                      MachineBasicBlock::iterator InsertBefore) {
1338   MachineBasicBlock *MBB = CI.I->getParent();
1339   DebugLoc DL = CI.I->getDebugLoc();
1340   const unsigned Opcode = getNewOpcode(CI, Paired);
1341 
1342   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1343 
1344   Register DestReg = MRI->createVirtualRegister(SuperRC);
1345   unsigned MergedDMask = CI.DMask | Paired.DMask;
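       // dmasksCanBeCombined guarantees the two dmasks do not overlap, so the
       // merged instruction simply takes their union.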
1346   unsigned DMaskIdx =
1347       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1348 
1349   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1350   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1351     if (I == DMaskIdx)
1352       MIB.addImm(MergedDMask);
1353     else
1354       MIB.add((*CI.I).getOperand(I));
1355   }
1356 
1357   // It shouldn't be possible to get this far if the two instructions
1358   // don't have a single memoperand, because MachineInstr::mayAlias()
1359   // will return true if this is the case.
1360   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1361 
1362   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1363 
1364   unsigned SubRegIdx0, SubRegIdx1;
1365   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1366 
1367   // Copy to the old destination registers.
1368   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1369   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1370   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1371 
1372   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1373       .add(*Dest0) // Copy to same destination including flags and sub reg.
1374       .addReg(DestReg, 0, SubRegIdx0);
1375   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1376       .add(*Dest1)
1377       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1378 
1379   CI.I->eraseFromParent();
1380   Paired.I->eraseFromParent();
1381   return New;
1382 }
1383 
1384 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1385     CombineInfo &CI, CombineInfo &Paired,
1386     MachineBasicBlock::iterator InsertBefore) {
1387   MachineBasicBlock *MBB = CI.I->getParent();
1388   DebugLoc DL = CI.I->getDebugLoc();
1389   const unsigned Opcode = getNewOpcode(CI, Paired);
1390 
1391   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1392 
1393   Register DestReg = MRI->createVirtualRegister(SuperRC);
1394   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1395 
1396   // It shouldn't be possible to get this far if the two instructions
1397   // don't have a single memoperand, because MachineInstr::mayAlias()
1398   // will return true if this is the case.
1399   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1400 
1401   MachineInstrBuilder New =
1402       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1403           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1404   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1405     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1406   New.addImm(MergedOffset);
1407   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1408 
1409   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1410   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1411   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1412 
1413   // Copy to the old destination registers.
1414   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1415   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1416   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1417 
1418   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1419       .add(*Dest0) // Copy to same destination including flags and sub reg.
1420       .addReg(DestReg, 0, SubRegIdx0);
1421   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1422       .add(*Dest1)
1423       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1424 
1425   CI.I->eraseFromParent();
1426   Paired.I->eraseFromParent();
1427   return New;
1428 }
1429 
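// Merge a pair of MUBUF loads through the same descriptor into one wider
// load, e.g. (illustrative; cache-policy bits are carried over via cpol):
//   buffer_load_dword v0, v2, s[0:3], 0 offen offset:16
//   buffer_load_dword v1, v2, s[0:3], 0 offen offset:20
// ==>
//   buffer_load_dwordx2 v[0:1], v2, s[0:3], 0 offen offset:16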
1430 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1431     CombineInfo &CI, CombineInfo &Paired,
1432     MachineBasicBlock::iterator InsertBefore) {
1433   MachineBasicBlock *MBB = CI.I->getParent();
1434   DebugLoc DL = CI.I->getDebugLoc();
1435 
1436   const unsigned Opcode = getNewOpcode(CI, Paired);
1437 
1438   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1439 
1440   // Copy to the new source register.
1441   Register DestReg = MRI->createVirtualRegister(SuperRC);
1442   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1443 
1444   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1445 
1446   AddressRegs Regs = getRegs(Opcode, *TII);
1447 
1448   if (Regs.VAddr)
1449     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1450 
1451   // It shouldn't be possible to get this far if the two instructions
1452   // don't have a single memoperand, because MachineInstr::mayAlias()
1453   // will return true if this is the case.
1454   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1455 
1456   MachineInstr *New =
1457     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1458         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1459         .addImm(MergedOffset) // offset
1460         .addImm(CI.CPol)      // cpol
1461         .addImm(0)            // swz
1462         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1463 
1464   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1465   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1466   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1467 
1468   // Copy to the old destination registers.
1469   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1470   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1471   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1472 
1473   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1474       .add(*Dest0) // Copy to same destination including flags and sub reg.
1475       .addReg(DestReg, 0, SubRegIdx0);
1476   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1477       .add(*Dest1)
1478       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1479 
1480   CI.I->eraseFromParent();
1481   Paired.I->eraseFromParent();
1482   return New;
1483 }
1484 
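// Merge a pair of MTBUF loads. In addition to widening the access, the buffer
// format is rewritten for the combined component count through
// getBufferFormatWithCompCount, e.g. a single-component (X) format is
// presumably replaced by the matching two-component (XY) format when two
// one-dword loads are merged.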
1485 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1486     CombineInfo &CI, CombineInfo &Paired,
1487     MachineBasicBlock::iterator InsertBefore) {
1488   MachineBasicBlock *MBB = CI.I->getParent();
1489   DebugLoc DL = CI.I->getDebugLoc();
1490 
1491   const unsigned Opcode = getNewOpcode(CI, Paired);
1492 
1493   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1494 
1495   // Copy to the new source register.
1496   Register DestReg = MRI->createVirtualRegister(SuperRC);
1497   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1498 
1499   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1500 
1501   AddressRegs Regs = getRegs(Opcode, *TII);
1502 
1503   if (Regs.VAddr)
1504     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1505 
1506   unsigned JoinedFormat =
1507       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1508 
1509   // It shouldn't be possible to get this far if the two instructions
1510   // don't have a single memoperand, because MachineInstr::mayAlias()
1511   // will return true if this is the case.
1512   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1513 
1514   MachineInstr *New =
1515       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1516           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1517           .addImm(MergedOffset) // offset
1518           .addImm(JoinedFormat) // format
1519           .addImm(CI.CPol)      // cpol
1520           .addImm(0)            // swz
1521           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1522 
1523   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1524   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1525   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1526 
1527   // Copy to the old destination registers.
1528   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1529   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1530   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1531 
1532   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1533       .add(*Dest0) // Copy to same destination including flags and sub reg.
1534       .addReg(DestReg, 0, SubRegIdx0);
1535   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1536       .add(*Dest1)
1537       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1538 
1539   CI.I->eraseFromParent();
1540   Paired.I->eraseFromParent();
1541   return New;
1542 }
1543 
1544 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1545     CombineInfo &CI, CombineInfo &Paired,
1546     MachineBasicBlock::iterator InsertBefore) {
1547   MachineBasicBlock *MBB = CI.I->getParent();
1548   DebugLoc DL = CI.I->getDebugLoc();
1549 
1550   const unsigned Opcode = getNewOpcode(CI, Paired);
1551 
1552   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1553   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1554   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1555 
1556   // Copy to the new source register.
1557   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1558   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1559 
1560   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1561   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1562 
1563   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1564       .add(*Src0)
1565       .addImm(SubRegIdx0)
1566       .add(*Src1)
1567       .addImm(SubRegIdx1);
1568 
1569   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1570                  .addReg(SrcReg, RegState::Kill);
1571 
1572   AddressRegs Regs = getRegs(Opcode, *TII);
1573 
1574   if (Regs.VAddr)
1575     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1576 
1577   unsigned JoinedFormat =
1578       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1579 
1580   // It shouldn't be possible to get this far if the two instructions
1581   // don't have a single memoperand, because MachineInstr::mayAlias()
1582   // will return true if this is the case.
1583   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1584 
1585   MachineInstr *New =
1586       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1587           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1588           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1589           .addImm(JoinedFormat)                     // format
1590           .addImm(CI.CPol)                          // cpol
1591           .addImm(0)                                // swz
1592           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1593 
1594   CI.I->eraseFromParent();
1595   Paired.I->eraseFromParent();
1596   return New;
1597 }
1598 
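// Merge a pair of FLAT or GLOBAL loads with adjacent offsets into one wider
// load, e.g. (illustrative) for the GLOBAL_LOAD class:
//   global_load_dword v0, v[2:3], off offset:16
//   global_load_dword v1, v[2:3], off offset:20
// ==>
//   global_load_dwordx2 v[0:1], v[2:3], off offset:16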
1599 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1600     CombineInfo &CI, CombineInfo &Paired,
1601     MachineBasicBlock::iterator InsertBefore) {
1602   MachineBasicBlock *MBB = CI.I->getParent();
1603   DebugLoc DL = CI.I->getDebugLoc();
1604 
1605   const unsigned Opcode = getNewOpcode(CI, Paired);
1606 
1607   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1608   Register DestReg = MRI->createVirtualRegister(SuperRC);
1609 
1610   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1611 
1612   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1613     MIB.add(*SAddr);
1614 
1615   MachineInstr *New =
1616     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1617        .addImm(std::min(CI.Offset, Paired.Offset))
1618        .addImm(CI.CPol)
1619        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1620 
1621   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1622   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1623   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1624 
1625   // Copy to the old destination registers.
1626   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1627   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1628   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1629 
1630   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1631       .add(*Dest0) // Copy to same destination including flags and sub reg.
1632       .addReg(DestReg, 0, SubRegIdx0);
1633   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1634       .add(*Dest1)
1635       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1636 
1637   CI.I->eraseFromParent();
1638   Paired.I->eraseFromParent();
1639   return New;
1640 }
1641 
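// Merge a pair of FLAT or GLOBAL stores. A REG_SEQUENCE first packs the two
// data registers into one wider register, which is then written by a single
// wider store, e.g. (illustrative):
//   global_store_dword v[2:3], v0, off offset:16
//   global_store_dword v[2:3], v1, off offset:20
// ==>
//   global_store_dwordx2 v[2:3], v[0:1], off offset:16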
1642 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1643     CombineInfo &CI, CombineInfo &Paired,
1644     MachineBasicBlock::iterator InsertBefore) {
1645   MachineBasicBlock *MBB = CI.I->getParent();
1646   DebugLoc DL = CI.I->getDebugLoc();
1647 
1648   const unsigned Opcode = getNewOpcode(CI, Paired);
1649 
1650   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1651   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1652   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1653 
1654   // Copy to the new source register.
1655   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1656   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1657 
1658   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1659   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1660 
1661   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1662       .add(*Src0)
1663       .addImm(SubRegIdx0)
1664       .add(*Src1)
1665       .addImm(SubRegIdx1);
1666 
1667   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1668                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1669                  .addReg(SrcReg, RegState::Kill);
1670 
1671   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1672     MIB.add(*SAddr);
1673 
1674   MachineInstr *New =
1675     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1676        .addImm(CI.CPol)
1677        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1678 
1679   CI.I->eraseFromParent();
1680   Paired.I->eraseFromParent();
1681   return New;
1682 }
1683 
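// Select the opcode of the combined access for a mergeable pair, e.g.
// (illustrative) merging two single-dword S_LOAD_IMM loads (a combined Width
// of 2) selects S_LOAD_DWORDX2_IMM; a width with no wider variant returns 0.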
1684 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1685                                             const CombineInfo &Paired) {
1686   const unsigned Width = CI.Width + Paired.Width;
1687 
1688   switch (getCommonInstClass(CI, Paired)) {
1689   default:
1690     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1691     // FIXME: Handle d16 correctly
1692     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1693                                   Width);
1694   case TBUFFER_LOAD:
1695   case TBUFFER_STORE:
1696     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1697                                   Width);
1698 
1699   case UNKNOWN:
1700     llvm_unreachable("Unknown instruction class");
1701   case S_BUFFER_LOAD_IMM:
1702     switch (Width) {
1703     default:
1704       return 0;
1705     case 2:
1706       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1707     case 3:
1708       return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1709     case 4:
1710       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1711     case 8:
1712       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1713     }
1714   case S_BUFFER_LOAD_SGPR_IMM:
1715     switch (Width) {
1716     default:
1717       return 0;
1718     case 2:
1719       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1720     case 3:
1721       return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1722     case 4:
1723       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1724     case 8:
1725       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1726     }
1727   case S_LOAD_IMM:
1728     switch (Width) {
1729     default:
1730       return 0;
1731     case 2:
1732       return AMDGPU::S_LOAD_DWORDX2_IMM;
1733     case 3:
1734       return AMDGPU::S_LOAD_DWORDX3_IMM;
1735     case 4:
1736       return AMDGPU::S_LOAD_DWORDX4_IMM;
1737     case 8:
1738       return AMDGPU::S_LOAD_DWORDX8_IMM;
1739     }
1740   case GLOBAL_LOAD:
1741     switch (Width) {
1742     default:
1743       return 0;
1744     case 2:
1745       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1746     case 3:
1747       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1748     case 4:
1749       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1750     }
1751   case GLOBAL_LOAD_SADDR:
1752     switch (Width) {
1753     default:
1754       return 0;
1755     case 2:
1756       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1757     case 3:
1758       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1759     case 4:
1760       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1761     }
1762   case GLOBAL_STORE:
1763     switch (Width) {
1764     default:
1765       return 0;
1766     case 2:
1767       return AMDGPU::GLOBAL_STORE_DWORDX2;
1768     case 3:
1769       return AMDGPU::GLOBAL_STORE_DWORDX3;
1770     case 4:
1771       return AMDGPU::GLOBAL_STORE_DWORDX4;
1772     }
1773   case GLOBAL_STORE_SADDR:
1774     switch (Width) {
1775     default:
1776       return 0;
1777     case 2:
1778       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1779     case 3:
1780       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1781     case 4:
1782       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1783     }
1784   case FLAT_LOAD:
1785     switch (Width) {
1786     default:
1787       return 0;
1788     case 2:
1789       return AMDGPU::FLAT_LOAD_DWORDX2;
1790     case 3:
1791       return AMDGPU::FLAT_LOAD_DWORDX3;
1792     case 4:
1793       return AMDGPU::FLAT_LOAD_DWORDX4;
1794     }
1795   case FLAT_STORE:
1796     switch (Width) {
1797     default:
1798       return 0;
1799     case 2:
1800       return AMDGPU::FLAT_STORE_DWORDX2;
1801     case 3:
1802       return AMDGPU::FLAT_STORE_DWORDX3;
1803     case 4:
1804       return AMDGPU::FLAT_STORE_DWORDX4;
1805     }
1806   case MIMG:
1807     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1808            "No overlaps");
1809     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1810   }
1811 }
1812 
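// Compute the sub-register indices used to split the merged register back
// into the two original accesses. Worked example (illustrative): if the first
// access in the merged order is one dword wide and the second is two dwords
// wide, they receive sub0 and sub1_sub2 respectively (Idxs[0][0] and
// Idxs[1][1] in the table below).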
1813 std::pair<unsigned, unsigned>
1814 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1815                                     const CombineInfo &Paired) {
1816   assert((CI.InstClass != MIMG ||
1817           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1818            CI.Width + Paired.Width)) &&
1819          "No overlaps");
1820 
1821   unsigned Idx0;
1822   unsigned Idx1;
1823 
1824   static const unsigned Idxs[5][4] = {
1825       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1826       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1827       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1828       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1829       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1830   };
1831 
1832   assert(CI.Width >= 1 && CI.Width <= 4);
1833   assert(Paired.Width >= 1 && Paired.Width <= 4);
1834 
1835   if (Paired < CI) {
1836     Idx1 = Idxs[0][Paired.Width - 1];
1837     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1838   } else {
1839     Idx0 = Idxs[0][CI.Width - 1];
1840     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1841   }
1842 
1843   return std::pair(Idx0, Idx1);
1844 }
1845 
1846 const TargetRegisterClass *
1847 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1848                                              const CombineInfo &Paired) {
1849   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1850       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1851     switch (CI.Width + Paired.Width) {
1852     default:
1853       return nullptr;
1854     case 2:
1855       return &AMDGPU::SReg_64_XEXECRegClass;
1856     case 3:
1857       return &AMDGPU::SGPR_96RegClass;
1858     case 4:
1859       return &AMDGPU::SGPR_128RegClass;
1860     case 8:
1861       return &AMDGPU::SGPR_256RegClass;
1862     case 16:
1863       return &AMDGPU::SGPR_512RegClass;
1864     }
1865   }
1866 
1867   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1868   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1869              ? TRI->getAGPRClassForBitWidth(BitWidth)
1870              : TRI->getVGPRClassForBitWidth(BitWidth);
1871 }
1872 
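// Merge a pair of MUBUF stores: pack the two data registers with a
// REG_SEQUENCE and emit a single wider store, e.g. (illustrative):
//   buffer_store_dword v0, v2, s[0:3], 0 offen offset:16
//   buffer_store_dword v1, v2, s[0:3], 0 offen offset:20
// ==>
//   buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen offset:16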
1873 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1874     CombineInfo &CI, CombineInfo &Paired,
1875     MachineBasicBlock::iterator InsertBefore) {
1876   MachineBasicBlock *MBB = CI.I->getParent();
1877   DebugLoc DL = CI.I->getDebugLoc();
1878 
1879   const unsigned Opcode = getNewOpcode(CI, Paired);
1880 
1881   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1882   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1883   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1884 
1885   // Copy to the new source register.
1886   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1887   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1888 
1889   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1890   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1891 
1892   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1893       .add(*Src0)
1894       .addImm(SubRegIdx0)
1895       .add(*Src1)
1896       .addImm(SubRegIdx1);
1897 
1898   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1899                  .addReg(SrcReg, RegState::Kill);
1900 
1901   AddressRegs Regs = getRegs(Opcode, *TII);
1902 
1903   if (Regs.VAddr)
1904     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1905 
1906 
1907   // It shouldn't be possible to get this far if the two instructions
1908   // don't have a single memoperand, because MachineInstr::mayAlias()
1909   // will return true if this is the case.
1910   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1911 
1912   MachineInstr *New =
1913     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1914         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1915         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1916         .addImm(CI.CPol)      // cpol
1917         .addImm(0)            // swz
1918         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1919 
1920   CI.I->eraseFromParent();
1921   Paired.I->eraseFromParent();
1922   return New;
1923 }
1924 
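// Wrap a 32-bit constant either as an immediate operand, if it is an inline
// constant, or as an SGPR materialized with S_MOV_B32 otherwise. E.g.
// (illustrative) 64 is inlinable and stays an immediate, while 0x1800 is
// first moved into an SGPR.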
1925 MachineOperand
1926 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1927   APInt V(32, Val, true);
1928   if (TII->isInlineConstant(V))
1929     return MachineOperand::CreateImm(Val);
1930 
1931   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1932   MachineInstr *Mov =
1933   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1934           TII->get(AMDGPU::S_MOV_B32), Reg)
1935     .addImm(Val);
1936   (void)Mov;
1937   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1938   return MachineOperand::CreateReg(Reg, false);
1939 }
1940 
1941 // Compute base address using Addr and return the final register.
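// Roughly (illustrative; virtual register names are arbitrary and the offset
// halves come from createRegOrImm):
//   %lo, %carry = V_ADD_CO_U32_e64 %base_lo, <offset_lo>, 0
//   %hi, %dead  = V_ADDC_U32_e64 %base_hi, <offset_hi>, killed %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1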
1942 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1943                                            const MemAddress &Addr) const {
1944   MachineBasicBlock *MBB = MI.getParent();
1945   MachineBasicBlock::iterator MBBI = MI.getIterator();
1946   DebugLoc DL = MI.getDebugLoc();
1947 
1948   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1949           Addr.Base.LoSubReg) &&
1950          "Expected 32-bit Base-Register-Low!!");
1951 
1952   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1953           Addr.Base.HiSubReg) &&
1954          "Expected 32-bit Base-Register-Hi!!");
1955 
1956   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1957   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1958   MachineOperand OffsetHi =
1959     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1960 
1961   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1962   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1963   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1964 
1965   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1966   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1967   MachineInstr *LoHalf =
1968     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1969       .addReg(CarryReg, RegState::Define)
1970       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1971       .add(OffsetLo)
1972       .addImm(0); // clamp bit
1973   (void)LoHalf;
1974   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1975 
1976   MachineInstr *HiHalf =
1977   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1978     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1979     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1980     .add(OffsetHi)
1981     .addReg(CarryReg, RegState::Kill)
1982     .addImm(0); // clamp bit
1983   (void)HiHalf;
1984   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1985 
1986   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1987   MachineInstr *FullBase =
1988     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1989       .addReg(DestSub0)
1990       .addImm(AMDGPU::sub0)
1991       .addReg(DestSub1)
1992       .addImm(AMDGPU::sub1);
1993   (void)FullBase;
1994   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1995 
1996   return FullDestReg;
1997 }
1998 
1999 // Update MI's base register and offset to NewBase and NewOffset.
2000 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2001                                                Register NewBase,
2002                                                int32_t NewOffset) const {
2003   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2004   Base->setReg(NewBase);
2005   Base->setIsKill(false);
2006   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2007 }
2008 
2009 std::optional<int32_t>
2010 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2011   if (Op.isImm())
2012     return Op.getImm();
2013 
2014   if (!Op.isReg())
2015     return std::nullopt;
2016 
2017   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2018   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2019       !Def->getOperand(1).isImm())
2020     return std::nullopt;
2021 
2022   return Def->getOperand(1).getImm();
2023 }
2024 
2025 // Analyzes Base and extracts:
2026 //  - 32-bit base registers and subregisters
2027 //  - 64-bit constant offset
2028 // Expecting a base computation of the form:
2029 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2030 //   %LO:vgpr_32, %c:sreg_64_xexec =
2031 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
2032 //   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2033 //   %Base:vreg_64 =
2034 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2035 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2036 void SILoadStoreOptimizer::processBaseWithConstOffset(
2037     const MachineOperand &Base, MemAddress &Addr) const {
2038     return;
2039 
2040   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2041   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
2042       Def->getNumOperands() != 5)
2043     return;
2044 
2045   MachineOperand BaseLo = Def->getOperand(1);
2046   MachineOperand BaseHi = Def->getOperand(3);
2047   if (!BaseLo.isReg() || !BaseHi.isReg())
2048     return;
2049 
2050   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2051   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2052 
2053   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2054       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2055     return;
2056 
2057   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2058   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2059 
2060   auto Offset0P = extractConstOffset(*Src0);
2061   if (Offset0P)
2062     BaseLo = *Src1;
2063   else {
2064     if (!(Offset0P = extractConstOffset(*Src1)))
2065       return;
2066     BaseLo = *Src0;
2067   }
2068 
2069   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2070   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2071 
2072   if (Src0->isImm())
2073     std::swap(Src0, Src1);
2074 
2075   if (!Src1->isImm())
2076     return;
2077 
2078   uint64_t Offset1 = Src1->getImm();
2079   BaseHi = *Src0;
2080 
2081   Addr.Base.LoReg = BaseLo.getReg();
2082   Addr.Base.HiReg = BaseHi.getReg();
2083   Addr.Base.LoSubReg = BaseLo.getSubReg();
2084   Addr.Base.HiSubReg = BaseHi.getSubReg();
2085   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2086 }
2087 
2088 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2089     MachineInstr &MI,
2090     MemInfoMap &Visited,
2091     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2092 
2093   if (!(MI.mayLoad() ^ MI.mayStore()))
2094     return false;
2095 
2096   // TODO: Support flat and scratch.
2097   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2098     return false;
2099 
2100   if (MI.mayLoad() &&
2101       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2102     return false;
2103 
2104   if (AnchorList.count(&MI))
2105     return false;
2106 
2107   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2108 
2109   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2110     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2111     return false;
2112   }
2113 
2114   // Step 1: Find the base registers and a 64-bit constant offset.
2115   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2116   MemAddress MAddr;
2117   if (!Visited.contains(&MI)) {
2118     processBaseWithConstOffset(Base, MAddr);
2119     Visited[&MI] = MAddr;
2120   } else
2121     MAddr = Visited[&MI];
2122 
2123   if (MAddr.Offset == 0) {
2124     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2125                          " constant offsets that can be promoted.\n";);
2126     return false;
2127   }
2128 
2129   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2130              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2131 
2132   // Step 2: Traverse MI's basic block and find an anchor (that has the same
2133   // base registers) with the highest 13-bit distance from MI's offset.
2134   // E.g. (64bit loads)
2135   // bb:
2136   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2137   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2138   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2139   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2140   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2141   //
2142   // Starting from the first load, the optimization tries to find a new base
2143   // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
2144   // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
2145   // the new base (anchor) because the larger distance can presumably
2146   // accommodate more intermediate bases.
2147   //
2148   // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
2149   // (&a + 8192) for load1, load2 and load4.
2150   //   addr = &a + 8192
2151   //   load1 = load(addr,       -4096)
2152   //   load2 = load(addr,       -2048)
2153   //   load3 = load(addr,       0)
2154   //   load4 = load(addr,       2048)
2155   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2156   //
2157   MachineInstr *AnchorInst = nullptr;
2158   MemAddress AnchorAddr;
2159   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2160   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2161 
2162   MachineBasicBlock *MBB = MI.getParent();
2163   MachineBasicBlock::iterator E = MBB->end();
2164   MachineBasicBlock::iterator MBBI = MI.getIterator();
2165   ++MBBI;
2166   const SITargetLowering *TLI =
2167     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2168 
2169   for ( ; MBBI != E; ++MBBI) {
2170     MachineInstr &MINext = *MBBI;
2171     // TODO: Support finding an anchor (with the same base) from store
2172     // addresses or any other load addresses where the opcodes are different.
2173     if (MINext.getOpcode() != MI.getOpcode() ||
2174         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2175       continue;
2176 
2177     const MachineOperand &BaseNext =
2178       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2179     MemAddress MAddrNext;
2180     if (!Visited.contains(&MINext)) {
2181       processBaseWithConstOffset(BaseNext, MAddrNext);
2182       Visited[&MINext] = MAddrNext;
2183     } else
2184       MAddrNext = Visited[&MINext];
2185 
2186     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2187         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2188         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2189         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2190       continue;
2191 
2192     InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2193 
2194     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2195     TargetLoweringBase::AddrMode AM;
2196     AM.HasBaseReg = true;
2197     AM.BaseOffs = Dist;
2198     if (TLI->isLegalGlobalAddressingMode(AM) &&
2199         (uint32_t)std::abs(Dist) > MaxDist) {
2200       MaxDist = std::abs(Dist);
2201 
2202       AnchorAddr = MAddrNext;
2203       AnchorInst = &MINext;
2204     }
2205   }
2206 
2207   if (AnchorInst) {
2208     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2209                AnchorInst->dump());
2210     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2211                <<  AnchorAddr.Offset << "\n\n");
2212 
2213     // Instead of moving up, just re-compute anchor-instruction's base address.
2214     Register Base = computeBase(MI, AnchorAddr);
2215 
2216     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2217     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2218 
2219     for (auto P : InstsWCommonBase) {
2220       TargetLoweringBase::AddrMode AM;
2221       AM.HasBaseReg = true;
2222       AM.BaseOffs = P.second - AnchorAddr.Offset;
2223 
2224       if (TLI->isLegalGlobalAddressingMode(AM)) {
2225         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
2226                    dbgs() << ")"; P.first->dump());
2227         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2228         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
2229       }
2230     }
2231     AnchorList.insert(AnchorInst);
2232     return true;
2233   }
2234 
2235   return false;
2236 }
2237 
2238 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2239                  std::list<std::list<CombineInfo>> &MergeableInsts) const {
2240   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2241     if (AddrList.front().InstClass == CI.InstClass &&
2242         AddrList.front().IsAGPR == CI.IsAGPR &&
2243         AddrList.front().hasSameBaseAddress(CI)) {
2244       AddrList.emplace_back(CI);
2245       return;
2246     }
2247   }
2248 
2249   // Base address not found, so add a new list.
2250   MergeableInsts.emplace_back(1, CI);
2251 }
2252 
2253 std::pair<MachineBasicBlock::iterator, bool>
2254 SILoadStoreOptimizer::collectMergeableInsts(
2255     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2256     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2257     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2258   bool Modified = false;
2259 
2260   // Sort potentially mergeable instructions into lists, one per base address.
2261   unsigned Order = 0;
2262   MachineBasicBlock::iterator BlockI = Begin;
2263   for (; BlockI != End; ++BlockI) {
2264     MachineInstr &MI = *BlockI;
2265 
2266     // We run this before checking if an address is mergeable, because it can
2267     // produce better code even if the instructions aren't mergeable.
2268     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2269       Modified = true;
2270 
2271     // Treat volatile accesses, ordered accesses, and unmodeled side effects as
2272     // barriers. We can still look for separate merges after such a barrier.
2273     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2274       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2275 
2276       // Search will resume after this instruction in a separate merge list.
2277       ++BlockI;
2278       break;
2279     }
2280 
2281     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2282     if (InstClass == UNKNOWN)
2283       continue;
2284 
2285     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2286     int Swizzled =
2287         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2288     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2289       continue;
2290 
2291     CombineInfo CI;
2292     CI.setMI(MI, *this);
2293     CI.Order = Order++;
2294 
2295     if (!CI.hasMergeableAddress(*MRI))
2296       continue;
2297 
2298     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2299       // FIXME: Nothing is illegal in a ds_write2 opcode with two AGPR data
2300       //        operands. However, we report that ds_write2 shall have only
2301       //        VGPR data so that machine copy propagation does not create an
2302       //        illegal instruction mixing VGPR and AGPR sources.
2303       //        Consequently, if we created such an instruction the verifier
2304       //        would complain.
2305       continue;
2306     }
2307 
2308     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2309 
2310     addInstToMergeableList(CI, MergeableInsts);
2311   }
2312 
2313   // At this point we have lists of mergeable instructions.
2314   //
2315   // Part 2: Sort the lists by offset, and then for each CombineInfo object in
2316   // a list try to find an instruction that can be merged with it. If one is
2317   // found, it is stored in the Paired field. If no instruction is found, the
2318   // CombineInfo object is deleted from the list.
2319 
2320   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2321                                                    E = MergeableInsts.end(); I != E;) {
2322 
2323     std::list<CombineInfo> &MergeList = *I;
2324     if (MergeList.size() <= 1) {
2325       // This means we have found only one instruction with a given address
2326       // that can be merged, and we need at least 2 instructions to do a merge,
2327       // so this list can be discarded.
2328       I = MergeableInsts.erase(I);
2329       continue;
2330     }
2331 
2332     // Sort the lists by offsets, this way mergeable instructions will be
2333     // adjacent to each other in the list, which will make it easier to find
2334     // matches.
2335     MergeList.sort(
2336         [] (const CombineInfo &A, const CombineInfo &B) {
2337           return A.Offset < B.Offset;
2338         });
2339     ++I;
2340   }
2341 
2342   return std::pair(BlockI, Modified);
2343 }
2344 
2345 // Scan through looking for adjacent LDS operations with constant offsets from
2346 // the same base register. We rely on the scheduler to do the hard work of
2347 // clustering nearby loads, and assume these are all adjacent.
2348 bool SILoadStoreOptimizer::optimizeBlock(
2349                        std::list<std::list<CombineInfo>> &MergeableInsts) {
2350   bool Modified = false;
2351 
2352   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2353                                                    E = MergeableInsts.end(); I != E;) {
2354     std::list<CombineInfo> &MergeList = *I;
2355 
2356     bool OptimizeListAgain = false;
2357     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2358       // We weren't able to make any changes, so delete the list so we don't
2359       // process the same instructions the next time we try to optimize this
2360       // block.
2361       I = MergeableInsts.erase(I);
2362       continue;
2363     }
2364 
2365     Modified = true;
2366 
2367     // We made changes, but also determined that there were no more optimization
2368     // opportunities, so we don't need to reprocess the list.
2369     if (!OptimizeListAgain) {
2370       I = MergeableInsts.erase(I);
2371       continue;
2372     }
2373     OptimizeAgain = true;
2374   }
2375   return Modified;
2376 }
2377 
2378 bool
2379 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2380                                           std::list<CombineInfo> &MergeList,
2381                                           bool &OptimizeListAgain) {
2382   if (MergeList.empty())
2383     return false;
2384 
2385   bool Modified = false;
2386 
2387   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2388        Next = std::next(I)) {
2389 
2390     auto First = I;
2391     auto Second = Next;
2392 
2393     if ((*First).Order > (*Second).Order)
2394       std::swap(First, Second);
2395     CombineInfo &CI = *First;
2396     CombineInfo &Paired = *Second;
2397 
2398     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2399     if (!Where) {
2400       ++I;
2401       continue;
2402     }
2403 
2404     Modified = true;
2405 
2406     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2407 
2408     MachineBasicBlock::iterator NewMI;
2409     switch (CI.InstClass) {
2410     default:
2411       llvm_unreachable("unknown InstClass");
2412       break;
2413     case DS_READ:
2414       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2415       break;
2416     case DS_WRITE:
2417       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2418       break;
2419     case S_BUFFER_LOAD_IMM:
2420     case S_BUFFER_LOAD_SGPR_IMM:
2421     case S_LOAD_IMM:
2422       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2423       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2424       break;
2425     case BUFFER_LOAD:
2426       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2427       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2428       break;
2429     case BUFFER_STORE:
2430       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2431       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2432       break;
2433     case MIMG:
2434       NewMI = mergeImagePair(CI, Paired, Where->I);
2435       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2436       break;
2437     case TBUFFER_LOAD:
2438       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2439       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2440       break;
2441     case TBUFFER_STORE:
2442       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2443       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2444       break;
2445     case FLAT_LOAD:
2446     case GLOBAL_LOAD:
2447     case GLOBAL_LOAD_SADDR:
2448       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2449       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2450       break;
2451     case FLAT_STORE:
2452     case GLOBAL_STORE:
2453     case GLOBAL_STORE_SADDR:
2454       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2455       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2456       break;
2457     }
2458     CI.setMI(NewMI, *this);
2459     CI.Order = Where->Order;
2460     if (I == Second)
2461       I = Next;
2462 
2463     MergeList.erase(Second);
2464   }
2465 
2466   return Modified;
2467 }
2468 
2469 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2470   if (skipFunction(MF.getFunction()))
2471     return false;
2472 
2473   STM = &MF.getSubtarget<GCNSubtarget>();
2474   if (!STM->loadStoreOptEnabled())
2475     return false;
2476 
2477   TII = STM->getInstrInfo();
2478   TRI = &TII->getRegisterInfo();
2479 
2480   MRI = &MF.getRegInfo();
2481   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2482 
2483   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2484 
2485   bool Modified = false;
2486 
2487   // Contains the list of instructions for which constant offsets are being
2488   // promoted to the immediate. This is tracked for an entire block at a time.
2489   SmallPtrSet<MachineInstr *, 4> AnchorList;
2490   MemInfoMap Visited;
2491 
2492   for (MachineBasicBlock &MBB : MF) {
2493     MachineBasicBlock::iterator SectionEnd;
2494     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2495          I = SectionEnd) {
2496       bool CollectModified;
2497       std::list<std::list<CombineInfo>> MergeableInsts;
2498 
2499       // First pass: Collect a list of all instructions we know how to merge in
2500       // a subset of the block.
2501       std::tie(SectionEnd, CollectModified) =
2502           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2503 
2504       Modified |= CollectModified;
2505 
2506       do {
2507         OptimizeAgain = false;
2508         Modified |= optimizeBlock(MergeableInsts);
2509       } while (OptimizeAgain);
2510     }
2511 
2512     Visited.clear();
2513     AnchorList.clear();
2514   }
2515 
2516   return Modified;
2517 }
2518