1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
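// Note that ds_read2_b32 encodes offset0/offset1 in units of the element
// size (4 bytes here), so the byte offsets 16 and 32 become 4 and 8.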
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from nearby instructions that
24 // allows it to have a 13-bit constant offset, which is then promoted to the
25 // immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
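// Here the two bases differ by 0x1800 - 0x1000 = 0x800 = 2048 bytes, which
// fits in the 13-bit immediate offset mentioned above, so the second load can
// reuse the v[5:6] base computed for the first one.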
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently matches one
50 //   pair at a time, recomputes live intervals, and moves on to the next pair.
51 //   It would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but close enough to each other that their differences do
56 //   fit, we can add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   S_BUFFER_LOAD_SGPR_IMM,
78   S_LOAD_IMM,
79   BUFFER_LOAD,
80   BUFFER_STORE,
81   MIMG,
82   TBUFFER_LOAD,
83   TBUFFER_STORE,
84   GLOBAL_LOAD_SADDR,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE,
88   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89   GLOBAL_STORE // any CombineInfo; they are only ever returned by
90                // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94   unsigned char NumVAddrs = 0;
95   bool SBase = false;
96   bool SRsrc = false;
97   bool SOffset = false;
98   bool SAddr = false;
99   bool VAddr = false;
100   bool Addr = false;
101   bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
123     unsigned Order;
124 
125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145           return false;
146         }
147       }
148       return true;
149     }
150 
151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that aren't either immediates or registers.
159         // TODO: Should be possible to merge FrameIndexes and maybe some
160         // other non-register operands.
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge instructions with other physical reg
165         // addresses too.
166         if (AddrOp->getReg().isPhysical() &&
167             AddrOp->getReg() != AMDGPU::SGPR_NULL)
168           return false;
169 
170         // If an address has only one use then there will be no other
171         // instructions with the same address, so we can't merge this one.
172         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173           return false;
174       }
175       return true;
176     }
177 
178     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179 
180     // Compare by pointer order.
181     bool operator<(const CombineInfo& Other) const {
182       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183     }
184   };
185 
186   struct BaseRegisters {
187     Register LoReg;
188     Register HiReg;
189 
190     unsigned LoSubReg = 0;
191     unsigned HiSubReg = 0;
192   };
193 
194   struct MemAddress {
195     BaseRegisters Base;
196     int64_t Offset = 0;
197   };
198 
199   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200 
201 private:
202   const GCNSubtarget *STM = nullptr;
203   const SIInstrInfo *TII = nullptr;
204   const SIRegisterInfo *TRI = nullptr;
205   MachineRegisterInfo *MRI = nullptr;
206   AliasAnalysis *AA = nullptr;
207   bool OptimizeAgain;
208 
209   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210                            const DenseSet<Register> &ARegUses,
211                            const MachineInstr &A, const MachineInstr &B) const;
212   static bool dmasksCanBeCombined(const CombineInfo &CI,
213                                   const SIInstrInfo &TII,
214                                   const CombineInfo &Paired);
215   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216                                    CombineInfo &Paired, bool Modify = false);
217   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218                         const CombineInfo &Paired);
219   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221                                                      const CombineInfo &Paired);
222   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
223                                                     const CombineInfo &Paired);
224   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
225 
226   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
227 
228   unsigned read2Opcode(unsigned EltSize) const;
229   unsigned read2ST64Opcode(unsigned EltSize) const;
230   MachineBasicBlock::iterator
231   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
232                  MachineBasicBlock::iterator InsertBefore);
233 
234   unsigned write2Opcode(unsigned EltSize) const;
235   unsigned write2ST64Opcode(unsigned EltSize) const;
236   MachineBasicBlock::iterator
237   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
238                   MachineBasicBlock::iterator InsertBefore);
239   MachineBasicBlock::iterator
240   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
241                  MachineBasicBlock::iterator InsertBefore);
242   MachineBasicBlock::iterator
243   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
244                        MachineBasicBlock::iterator InsertBefore);
245   MachineBasicBlock::iterator
246   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
247                       MachineBasicBlock::iterator InsertBefore);
248   MachineBasicBlock::iterator
249   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
250                        MachineBasicBlock::iterator InsertBefore);
251   MachineBasicBlock::iterator
252   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
253                        MachineBasicBlock::iterator InsertBefore);
254   MachineBasicBlock::iterator
255   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
256                         MachineBasicBlock::iterator InsertBefore);
257   MachineBasicBlock::iterator
258   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
259                     MachineBasicBlock::iterator InsertBefore);
260   MachineBasicBlock::iterator
261   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
262                      MachineBasicBlock::iterator InsertBefore);
263 
264   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
265                            int32_t NewOffset) const;
266   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
267   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
268   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
269   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
270   /// Promotes a constant offset to the immediate by adjusting the base. It
271   /// tries to use a base from nearby instructions that allows it to have a
272   /// 13-bit constant offset, which gets promoted to the immediate.
273   bool promoteConstantOffsetToImm(MachineInstr &CI,
274                                   MemInfoMap &Visited,
275                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
276   void addInstToMergeableList(const CombineInfo &CI,
277                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
278 
279   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
280       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
281       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
282       std::list<std::list<CombineInfo>> &MergeableInsts) const;
283 
284   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
285                                                      const CombineInfo &Paired);
286 
287   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
288                                           const CombineInfo &Paired);
289 
290 public:
291   static char ID;
292 
293   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
294     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
295   }
296 
297   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
298                                      bool &OptimizeListAgain);
299   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
300 
301   bool runOnMachineFunction(MachineFunction &MF) override;
302 
303   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
304 
305   void getAnalysisUsage(AnalysisUsage &AU) const override {
306     AU.setPreservesCFG();
307     AU.addRequired<AAResultsWrapperPass>();
308 
309     MachineFunctionPass::getAnalysisUsage(AU);
310   }
311 
312   MachineFunctionProperties getRequiredProperties() const override {
313     return MachineFunctionProperties()
314       .set(MachineFunctionProperties::Property::IsSSA);
315   }
316 };
317 
318 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
319   const unsigned Opc = MI.getOpcode();
320 
321   if (TII.isMUBUF(Opc)) {
322     // FIXME: Handle d16 correctly
323     return AMDGPU::getMUBUFElements(Opc);
324   }
325   if (TII.isImage(MI)) {
326     uint64_t DMaskImm =
327         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
328     return llvm::popcount(DMaskImm);
329   }
330   if (TII.isMTBUF(Opc)) {
331     return AMDGPU::getMTBUFElements(Opc);
332   }
333 
334   switch (Opc) {
335   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
336   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
337   case AMDGPU::S_LOAD_DWORD_IMM:
338   case AMDGPU::GLOBAL_LOAD_DWORD:
339   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
340   case AMDGPU::GLOBAL_STORE_DWORD:
341   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
342   case AMDGPU::FLAT_LOAD_DWORD:
343   case AMDGPU::FLAT_STORE_DWORD:
344     return 1;
345   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
346   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347   case AMDGPU::S_LOAD_DWORDX2_IMM:
348   case AMDGPU::GLOBAL_LOAD_DWORDX2:
349   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350   case AMDGPU::GLOBAL_STORE_DWORDX2:
351   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352   case AMDGPU::FLAT_LOAD_DWORDX2:
353   case AMDGPU::FLAT_STORE_DWORDX2:
354     return 2;
355   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
356   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
357   case AMDGPU::S_LOAD_DWORDX3_IMM:
358   case AMDGPU::GLOBAL_LOAD_DWORDX3:
359   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
360   case AMDGPU::GLOBAL_STORE_DWORDX3:
361   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
362   case AMDGPU::FLAT_LOAD_DWORDX3:
363   case AMDGPU::FLAT_STORE_DWORDX3:
364     return 3;
365   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
366   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
367   case AMDGPU::S_LOAD_DWORDX4_IMM:
368   case AMDGPU::GLOBAL_LOAD_DWORDX4:
369   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
370   case AMDGPU::GLOBAL_STORE_DWORDX4:
371   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
372   case AMDGPU::FLAT_LOAD_DWORDX4:
373   case AMDGPU::FLAT_STORE_DWORDX4:
374     return 4;
375   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
376   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
377   case AMDGPU::S_LOAD_DWORDX8_IMM:
378     return 8;
379   case AMDGPU::DS_READ_B32:      [[fallthrough]];
380   case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
381   case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
382   case AMDGPU::DS_WRITE_B32_gfx9:
383     return 1;
384   case AMDGPU::DS_READ_B64:      [[fallthrough]];
385   case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
386   case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
387   case AMDGPU::DS_WRITE_B64_gfx9:
388     return 2;
389   default:
390     return 0;
391   }
392 }
393 
394 /// Maps instruction opcode to enum InstClassEnum.
395 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
396   switch (Opc) {
397   default:
398     if (TII.isMUBUF(Opc)) {
399       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
400       default:
401         return UNKNOWN;
402       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
403       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
404       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
405       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
406       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
407       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
408       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
409       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
410       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
411       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
412       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
413       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
414       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
415       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
416       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
417       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
418         return BUFFER_LOAD;
419       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
420       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
421       case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
422       case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
423       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
424       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
425       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
426       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
427       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
428       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
429       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
430       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
431       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
432       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
433       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
434       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
435         return BUFFER_STORE;
436       }
437     }
438     if (TII.isImage(Opc)) {
439       // Ignore instructions encoded without vaddr.
440       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
441           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
442         return UNKNOWN;
443       // Ignore BVH instructions
444       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
445         return UNKNOWN;
446       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
447       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
448           TII.isGather4(Opc))
449         return UNKNOWN;
450       return MIMG;
451     }
452     if (TII.isMTBUF(Opc)) {
453       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
454       default:
455         return UNKNOWN;
456       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
457       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
458       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
459       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
460       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
461       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
462       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
463       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
464       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
465       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
466       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
467       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
468       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
469       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
470       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
471       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
472         return TBUFFER_LOAD;
473       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
474       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
475       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
476       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
477       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
478       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
479       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
480       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
481         return TBUFFER_STORE;
482       }
483     }
484     return UNKNOWN;
485   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
486   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
487   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
488   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
489   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
490     return S_BUFFER_LOAD_IMM;
491   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
492   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
493   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
494   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
495   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
496     return S_BUFFER_LOAD_SGPR_IMM;
497   case AMDGPU::S_LOAD_DWORD_IMM:
498   case AMDGPU::S_LOAD_DWORDX2_IMM:
499   case AMDGPU::S_LOAD_DWORDX3_IMM:
500   case AMDGPU::S_LOAD_DWORDX4_IMM:
501   case AMDGPU::S_LOAD_DWORDX8_IMM:
502     return S_LOAD_IMM;
503   case AMDGPU::DS_READ_B32:
504   case AMDGPU::DS_READ_B32_gfx9:
505   case AMDGPU::DS_READ_B64:
506   case AMDGPU::DS_READ_B64_gfx9:
507     return DS_READ;
508   case AMDGPU::DS_WRITE_B32:
509   case AMDGPU::DS_WRITE_B32_gfx9:
510   case AMDGPU::DS_WRITE_B64:
511   case AMDGPU::DS_WRITE_B64_gfx9:
512     return DS_WRITE;
513   case AMDGPU::GLOBAL_LOAD_DWORD:
514   case AMDGPU::GLOBAL_LOAD_DWORDX2:
515   case AMDGPU::GLOBAL_LOAD_DWORDX3:
516   case AMDGPU::GLOBAL_LOAD_DWORDX4:
517   case AMDGPU::FLAT_LOAD_DWORD:
518   case AMDGPU::FLAT_LOAD_DWORDX2:
519   case AMDGPU::FLAT_LOAD_DWORDX3:
520   case AMDGPU::FLAT_LOAD_DWORDX4:
521     return FLAT_LOAD;
522   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
523   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
524   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
525   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
526     return GLOBAL_LOAD_SADDR;
527   case AMDGPU::GLOBAL_STORE_DWORD:
528   case AMDGPU::GLOBAL_STORE_DWORDX2:
529   case AMDGPU::GLOBAL_STORE_DWORDX3:
530   case AMDGPU::GLOBAL_STORE_DWORDX4:
531   case AMDGPU::FLAT_STORE_DWORD:
532   case AMDGPU::FLAT_STORE_DWORDX2:
533   case AMDGPU::FLAT_STORE_DWORDX3:
534   case AMDGPU::FLAT_STORE_DWORDX4:
535     return FLAT_STORE;
536   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
537   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
538   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
539   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
540     return GLOBAL_STORE_SADDR;
541   }
542 }
543 
544 /// Determines instruction subclass from opcode. Only instructions
545 /// of the same subclass can be merged together. The merged instruction may have
546 /// a different subclass but must have the same class.
547 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
548   switch (Opc) {
549   default:
550     if (TII.isMUBUF(Opc))
551       return AMDGPU::getMUBUFBaseOpcode(Opc);
552     if (TII.isImage(Opc)) {
553       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
554       assert(Info);
555       return Info->BaseOpcode;
556     }
557     if (TII.isMTBUF(Opc))
558       return AMDGPU::getMTBUFBaseOpcode(Opc);
559     return -1;
560   case AMDGPU::DS_READ_B32:
561   case AMDGPU::DS_READ_B32_gfx9:
562   case AMDGPU::DS_READ_B64:
563   case AMDGPU::DS_READ_B64_gfx9:
564   case AMDGPU::DS_WRITE_B32:
565   case AMDGPU::DS_WRITE_B32_gfx9:
566   case AMDGPU::DS_WRITE_B64:
567   case AMDGPU::DS_WRITE_B64_gfx9:
568     return Opc;
569   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
570   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
571   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
572   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
573   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
574     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
575   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
576   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
577   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
578   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
579   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
580     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
581   case AMDGPU::S_LOAD_DWORD_IMM:
582   case AMDGPU::S_LOAD_DWORDX2_IMM:
583   case AMDGPU::S_LOAD_DWORDX3_IMM:
584   case AMDGPU::S_LOAD_DWORDX4_IMM:
585   case AMDGPU::S_LOAD_DWORDX8_IMM:
586     return AMDGPU::S_LOAD_DWORD_IMM;
587   case AMDGPU::GLOBAL_LOAD_DWORD:
588   case AMDGPU::GLOBAL_LOAD_DWORDX2:
589   case AMDGPU::GLOBAL_LOAD_DWORDX3:
590   case AMDGPU::GLOBAL_LOAD_DWORDX4:
591   case AMDGPU::FLAT_LOAD_DWORD:
592   case AMDGPU::FLAT_LOAD_DWORDX2:
593   case AMDGPU::FLAT_LOAD_DWORDX3:
594   case AMDGPU::FLAT_LOAD_DWORDX4:
595     return AMDGPU::FLAT_LOAD_DWORD;
596   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
597   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
598   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
599   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
600     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
601   case AMDGPU::GLOBAL_STORE_DWORD:
602   case AMDGPU::GLOBAL_STORE_DWORDX2:
603   case AMDGPU::GLOBAL_STORE_DWORDX3:
604   case AMDGPU::GLOBAL_STORE_DWORDX4:
605   case AMDGPU::FLAT_STORE_DWORD:
606   case AMDGPU::FLAT_STORE_DWORDX2:
607   case AMDGPU::FLAT_STORE_DWORDX3:
608   case AMDGPU::FLAT_STORE_DWORDX4:
609     return AMDGPU::FLAT_STORE_DWORD;
610   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
611   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
612   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
613   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
614     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
615   }
616 }
617 
618 // GLOBAL loads and stores are classified as FLAT initially. If both combined
619 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
620 // GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
621 // the resulting combined operation will be FLAT, potentially promoting one of
622 // the GLOBAL operations to FLAT.
623 // For other instructions, return the original, unmodified class.
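// For example, two GLOBAL_LOAD_DWORDs (classified as FLAT_LOAD above) report
// GLOBAL_LOAD here, while a GLOBAL_LOAD_DWORD paired with a plain
// FLAT_LOAD_DWORD stays FLAT_LOAD.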
624 InstClassEnum
625 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
626                                          const CombineInfo &Paired) {
627   assert(CI.InstClass == Paired.InstClass);
628 
629   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
630       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
631     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
632 
633   return CI.InstClass;
634 }
635 
636 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
637   AddressRegs Result;
638 
639   if (TII.isMUBUF(Opc)) {
640     if (AMDGPU::getMUBUFHasVAddr(Opc))
641       Result.VAddr = true;
642     if (AMDGPU::getMUBUFHasSrsrc(Opc))
643       Result.SRsrc = true;
644     if (AMDGPU::getMUBUFHasSoffset(Opc))
645       Result.SOffset = true;
646 
647     return Result;
648   }
649 
650   if (TII.isImage(Opc)) {
651     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
652     if (VAddr0Idx >= 0) {
653       int RsrcName =
654           TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
655       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
656       Result.NumVAddrs = RsrcIdx - VAddr0Idx;
657     } else {
658       Result.VAddr = true;
659     }
660     Result.SRsrc = true;
661     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
662     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
663       Result.SSamp = true;
664 
665     return Result;
666   }
667   if (TII.isMTBUF(Opc)) {
668     if (AMDGPU::getMTBUFHasVAddr(Opc))
669       Result.VAddr = true;
670     if (AMDGPU::getMTBUFHasSrsrc(Opc))
671       Result.SRsrc = true;
672     if (AMDGPU::getMTBUFHasSoffset(Opc))
673       Result.SOffset = true;
674 
675     return Result;
676   }
677 
678   switch (Opc) {
679   default:
680     return Result;
681   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
682   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
683   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
684   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
685   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
686     Result.SOffset = true;
687     [[fallthrough]];
688   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
689   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
690   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
691   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
692   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
693   case AMDGPU::S_LOAD_DWORD_IMM:
694   case AMDGPU::S_LOAD_DWORDX2_IMM:
695   case AMDGPU::S_LOAD_DWORDX3_IMM:
696   case AMDGPU::S_LOAD_DWORDX4_IMM:
697   case AMDGPU::S_LOAD_DWORDX8_IMM:
698     Result.SBase = true;
699     return Result;
700   case AMDGPU::DS_READ_B32:
701   case AMDGPU::DS_READ_B64:
702   case AMDGPU::DS_READ_B32_gfx9:
703   case AMDGPU::DS_READ_B64_gfx9:
704   case AMDGPU::DS_WRITE_B32:
705   case AMDGPU::DS_WRITE_B64:
706   case AMDGPU::DS_WRITE_B32_gfx9:
707   case AMDGPU::DS_WRITE_B64_gfx9:
708     Result.Addr = true;
709     return Result;
710   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
711   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
712   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
713   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
714   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
715   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
716   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
717   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
718     Result.SAddr = true;
719     [[fallthrough]];
720   case AMDGPU::GLOBAL_LOAD_DWORD:
721   case AMDGPU::GLOBAL_LOAD_DWORDX2:
722   case AMDGPU::GLOBAL_LOAD_DWORDX3:
723   case AMDGPU::GLOBAL_LOAD_DWORDX4:
724   case AMDGPU::GLOBAL_STORE_DWORD:
725   case AMDGPU::GLOBAL_STORE_DWORDX2:
726   case AMDGPU::GLOBAL_STORE_DWORDX3:
727   case AMDGPU::GLOBAL_STORE_DWORDX4:
728   case AMDGPU::FLAT_LOAD_DWORD:
729   case AMDGPU::FLAT_LOAD_DWORDX2:
730   case AMDGPU::FLAT_LOAD_DWORDX3:
731   case AMDGPU::FLAT_LOAD_DWORDX4:
732   case AMDGPU::FLAT_STORE_DWORD:
733   case AMDGPU::FLAT_STORE_DWORDX2:
734   case AMDGPU::FLAT_STORE_DWORDX3:
735   case AMDGPU::FLAT_STORE_DWORDX4:
736     Result.VAddr = true;
737     return Result;
738   }
739 }
740 
741 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
742                                               const SILoadStoreOptimizer &LSO) {
743   I = MI;
744   unsigned Opc = MI->getOpcode();
745   InstClass = getInstClass(Opc, *LSO.TII);
746 
747   if (InstClass == UNKNOWN)
748     return;
749 
750   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
751 
752   switch (InstClass) {
753   case DS_READ:
754    EltSize =
755           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
756                                                                           : 4;
757    break;
758   case DS_WRITE:
759     EltSize =
760           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
761                                                                             : 4;
762     break;
763   case S_BUFFER_LOAD_IMM:
764   case S_BUFFER_LOAD_SGPR_IMM:
765   case S_LOAD_IMM:
766     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
767     break;
768   default:
769     EltSize = 4;
770     break;
771   }
772 
773   if (InstClass == MIMG) {
774     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
775     // Offset is not considered for MIMG instructions.
776     Offset = 0;
777   } else {
778     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
779     Offset = I->getOperand(OffsetIdx).getImm();
780   }
781 
782   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
783     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
784 
785   Width = getOpcodeWidth(*I, *LSO.TII);
786 
787   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
788     Offset &= 0xffff;
789   } else if (InstClass != MIMG) {
790     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
791   }
792 
793   AddressRegs Regs = getRegs(Opc, *LSO.TII);
794   bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
795 
796   NumAddresses = 0;
797   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
798     AddrIdx[NumAddresses++] =
799         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
800   if (Regs.Addr)
801     AddrIdx[NumAddresses++] =
802         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
803   if (Regs.SBase)
804     AddrIdx[NumAddresses++] =
805         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
806   if (Regs.SRsrc)
807     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
808         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
809   if (Regs.SOffset)
810     AddrIdx[NumAddresses++] =
811         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
812   if (Regs.SAddr)
813     AddrIdx[NumAddresses++] =
814         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
815   if (Regs.VAddr)
816     AddrIdx[NumAddresses++] =
817         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
818   if (Regs.SSamp)
819     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
820         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
821   assert(NumAddresses <= MaxAddressRegs);
822 
823   for (unsigned J = 0; J < NumAddresses; J++)
824     AddrReg[J] = &I->getOperand(AddrIdx[J]);
825 }
826 
827 } // end anonymous namespace.
828 
829 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
830                       "SI Load Store Optimizer", false, false)
831 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
832 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
833                     false, false)
834 
835 char SILoadStoreOptimizer::ID = 0;
836 
837 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
838 
839 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
840   return new SILoadStoreOptimizer();
841 }
842 
843 static void addDefsUsesToList(const MachineInstr &MI,
844                               DenseSet<Register> &RegDefs,
845                               DenseSet<Register> &RegUses) {
846   for (const auto &Op : MI.operands()) {
847     if (!Op.isReg())
848       continue;
849     if (Op.isDef())
850       RegDefs.insert(Op.getReg());
851     if (Op.readsReg())
852       RegUses.insert(Op.getReg());
853   }
854 }
855 
856 bool SILoadStoreOptimizer::canSwapInstructions(
857     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
858     const MachineInstr &A, const MachineInstr &B) const {
859   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
860       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
861     return false;
862   for (const auto &BOp : B.operands()) {
863     if (!BOp.isReg())
864       continue;
865     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
866       return false;
867     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
868       return false;
869   }
870   return true;
871 }
872 
873 // Given that \p CI and \p Paired are adjacent memory operations, produce a
874 // new MMO for the combined operation with a new access size.
875 MachineMemOperand *
876 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
877                                                const CombineInfo &Paired) {
878   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
879   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
880 
881   unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
882 
883   // A base pointer for the combined operation is the same as the leading
884   // operation's pointer.
885   if (Paired < CI)
886     std::swap(MMOa, MMOb);
887 
888   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
889   // If merging FLAT and GLOBAL set address space to FLAT.
890   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
891     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
892 
893   MachineFunction *MF = CI.I->getMF();
894   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
895 }
896 
897 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
898                                                const SIInstrInfo &TII,
899                                                const CombineInfo &Paired) {
900   assert(CI.InstClass == MIMG);
901 
902   // Ignore instructions with tfe/lwe set.
903   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
904   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
905 
906   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
907     return false;
908 
909   // Check other optional immediate operands for equality.
910   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
911                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
912                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
913 
914   for (auto op : OperandsToMatch) {
915     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
916     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
917       return false;
918     if (Idx != -1 &&
919         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
920       return false;
921   }
922 
923   // Check DMask for overlaps.
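  // The dmasks are combinable when every set bit of the smaller mask lies
  // below the lowest set bit of the larger one: e.g. dmask 0x3 pairs with
  // 0xc, but not with 0x6.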
924   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
925   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
926 
927   if (!MaxMask)
928     return false;
929 
930   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
931   if ((1u << AllowedBitsForMin) <= MinMask)
932     return false;
933 
934   return true;
935 }
936 
937 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
938                                        unsigned ComponentCount,
939                                        const GCNSubtarget &STI) {
940   if (ComponentCount > 4)
941     return 0;
942 
943   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
944       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
945   if (!OldFormatInfo)
946     return 0;
947 
948   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
949       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
950                                            ComponentCount,
951                                            OldFormatInfo->NumFormat, STI);
952 
953   if (!NewFormatInfo)
954     return 0;
955 
956   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
957          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
958 
959   return NewFormatInfo->Format;
960 }
961 
962 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
963 // highest power of two. Note that the result is well defined for all inputs
964 // including corner cases like:
965 // - if Lo == Hi, return that value
966 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
967 // - if Lo > Hi, return 0 (as if the range wrapped around)
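// For example, mostAlignedValueInRange(5, 11) == 8 and
// mostAlignedValueInRange(9, 15) == 12.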
968 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
969   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
970 }
971 
972 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
973                                                 const GCNSubtarget &STI,
974                                                 CombineInfo &Paired,
975                                                 bool Modify) {
976   assert(CI.InstClass != MIMG);
977 
978   // XXX - Would the same offset be OK? Is there any reason this would happen or
979   // be useful?
980   if (CI.Offset == Paired.Offset)
981     return false;
982 
983   // This won't be valid if the offset isn't aligned.
984   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
985     return false;
986 
987   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
988 
989     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
990         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
991     if (!Info0)
992       return false;
993     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
994         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
995     if (!Info1)
996       return false;
997 
998     if (Info0->BitsPerComp != Info1->BitsPerComp ||
999         Info0->NumFormat != Info1->NumFormat)
1000       return false;
1001 
1002     // TODO: Should be possible to support more formats, but if format loads
1003     // are not dword-aligned, the merged load might not be valid.
1004     if (Info0->BitsPerComp != 32)
1005       return false;
1006 
1007     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1008       return false;
1009   }
1010 
1011   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1012   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1013   CI.UseST64 = false;
1014   CI.BaseOff = 0;
1015 
1016   // Handle all non-DS instructions.
1017   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1018     if (EltOffset0 + CI.Width != EltOffset1 &&
1019             EltOffset1 + Paired.Width != EltOffset0)
1020       return false;
1021     if (CI.CPol != Paired.CPol)
1022       return false;
1023     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1024         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1025       // Reject cases like:
1026       //   dword + dwordx2 -> dwordx3
1027       //   dword + dwordx3 -> dwordx4
1028       // If we tried to combine these cases, we would fail to extract a subreg
1029       // for the result of the second load due to SGPR alignment requirements.
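      // E.g. a dword at offset 0 followed by a dwordx2 at offset 4 would put
      // the dwordx2 portion of the dwordx3 result at an odd dword, so its
      // value could not be extracted as an aligned 64-bit subregister.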
1030       if (CI.Width != Paired.Width &&
1031           (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1032         return false;
1033     }
1034     return true;
1035   }
1036 
1037   // If the offset in elements doesn't fit in 8 bits, we might be able to use
1038   // the stride 64 versions.
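  // E.g. two ds_read_b32 at byte offsets 0x400 and 0x600 (element offsets 256
  // and 384) can use ds_read2st64_b32 with offset0:4 and offset1:6.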
1039   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1040       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1041     if (Modify) {
1042       CI.Offset = EltOffset0 / 64;
1043       Paired.Offset = EltOffset1 / 64;
1044       CI.UseST64 = true;
1045     }
1046     return true;
1047   }
1048 
1049   // Check if the new offsets fit in the reduced 8-bit range.
1050   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1051     if (Modify) {
1052       CI.Offset = EltOffset0;
1053       Paired.Offset = EltOffset1;
1054     }
1055     return true;
1056   }
1057 
1058   // Try to shift base address to decrease offsets.
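  // E.g. two ds_read_b32 at byte offsets 1200 and 1240 (element offsets 300
  // and 310) hit the second case below: BaseOff becomes 256 elements (1024
  // bytes) and the merged read2 uses offset0:44 and offset1:54.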
1059   uint32_t Min = std::min(EltOffset0, EltOffset1);
1060   uint32_t Max = std::max(EltOffset0, EltOffset1);
1061 
1062   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1063   if (((Max - Min) & ~Mask) == 0) {
1064     if (Modify) {
1065       // From the range of values we could use for BaseOff, choose the one that
1066       // is aligned to the highest power of two, to maximise the chance that
1067       // the same offset can be reused for other load/store pairs.
1068       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1069       // Copy the low bits of the offsets, so that when we adjust them by
1070       // subtracting BaseOff they will be multiples of 64.
1071       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1072       CI.BaseOff = BaseOff * CI.EltSize;
1073       CI.Offset = (EltOffset0 - BaseOff) / 64;
1074       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1075       CI.UseST64 = true;
1076     }
1077     return true;
1078   }
1079 
1080   if (isUInt<8>(Max - Min)) {
1081     if (Modify) {
1082       // From the range of values we could use for BaseOff, choose the one that
1083       // is aligned to the highest power of two, to maximise the chance that
1084       // the same offset can be reused for other load/store pairs.
1085       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1086       CI.BaseOff = BaseOff * CI.EltSize;
1087       CI.Offset = EltOffset0 - BaseOff;
1088       Paired.Offset = EltOffset1 - BaseOff;
1089     }
1090     return true;
1091   }
1092 
1093   return false;
1094 }
1095 
1096 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1097                                      const CombineInfo &CI,
1098                                      const CombineInfo &Paired) {
1099   const unsigned Width = (CI.Width + Paired.Width);
1100   switch (CI.InstClass) {
1101   default:
1102     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1103   case S_BUFFER_LOAD_IMM:
1104   case S_BUFFER_LOAD_SGPR_IMM:
1105   case S_LOAD_IMM:
1106     switch (Width) {
1107     default:
1108       return false;
1109     case 2:
1110     case 4:
1111     case 8:
1112       return true;
1113     case 3:
1114       return STM.hasScalarDwordx3Loads();
1115     }
1116   }
1117 }
1118 
1119 const TargetRegisterClass *
1120 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1121   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1122     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1123   }
1124   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1125     return TRI->getRegClassForReg(*MRI, Src->getReg());
1126   }
1127   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1128     return TRI->getRegClassForReg(*MRI, Src->getReg());
1129   }
1130   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1131     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1132   }
1133   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1134     return TRI->getRegClassForReg(*MRI, Src->getReg());
1135   }
1136   return nullptr;
1137 }
1138 
1139 /// This function assumes that CI comes before Paired in a basic block. Return
1140 /// an insertion point for the merged instruction or nullptr on failure.
1141 SILoadStoreOptimizer::CombineInfo *
1142 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1143                                            CombineInfo &Paired) {
1144   // If another instruction has already been merged into CI, it may now be a
1145   // type that we can't do any further merging into.
1146   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1147     return nullptr;
1148   assert(CI.InstClass == Paired.InstClass);
1149 
1150   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1151       getInstSubclass(Paired.I->getOpcode(), *TII))
1152     return nullptr;
1153 
1154   // Check both offsets (or masks for MIMG) can be combined and fit in the
1155   // reduced range.
1156   if (CI.InstClass == MIMG) {
1157     if (!dmasksCanBeCombined(CI, *TII, Paired))
1158       return nullptr;
1159   } else {
1160     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1161       return nullptr;
1162   }
1163 
1164   DenseSet<Register> RegDefs;
1165   DenseSet<Register> RegUses;
1166   CombineInfo *Where;
1167   if (CI.I->mayLoad()) {
1168     // Try to hoist Paired up to CI.
1169     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1170     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1171       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1172         return nullptr;
1173     }
1174     Where = &CI;
1175   } else {
1176     // Try to sink CI down to Paired.
1177     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1178     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1179       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1180         return nullptr;
1181     }
1182     Where = &Paired;
1183   }
1184 
1185   // Call offsetsCanBeCombined with modify = true so that the offsets are
1186   // correct for the new instruction.  This should return true, because
1187   // this function should only be called on CombineInfo objects that
1188   // have already been confirmed to be mergeable.
1189   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1190     offsetsCanBeCombined(CI, *STM, Paired, true);
1191   return Where;
1192 }
1193 
1194 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1195   if (STM->ldsRequiresM0Init())
1196     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1197   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1198 }
1199 
1200 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1201   if (STM->ldsRequiresM0Init())
1202     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1203 
1204   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1205                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1206 }
1207 
1208 MachineBasicBlock::iterator
1209 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1210                                      MachineBasicBlock::iterator InsertBefore) {
1211   MachineBasicBlock *MBB = CI.I->getParent();
1212 
1213   // Be careful, since the addresses could be subregisters themselves in weird
1214   // cases, like vectors of pointers.
1215   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1216 
1217   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1218   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1219 
1220   unsigned NewOffset0 = CI.Offset;
1221   unsigned NewOffset1 = Paired.Offset;
1222   unsigned Opc =
1223       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1224 
1225   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1226   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1227 
1228   if (NewOffset0 > NewOffset1) {
1229     // Canonicalize the merged instruction so the smaller offset comes first.
1230     std::swap(NewOffset0, NewOffset1);
1231     std::swap(SubRegIdx0, SubRegIdx1);
1232   }
1233 
1234   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1235          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1236 
1237   const MCInstrDesc &Read2Desc = TII->get(Opc);
1238 
1239   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1240   Register DestReg = MRI->createVirtualRegister(SuperRC);
1241 
1242   DebugLoc DL = CI.I->getDebugLoc();
1243 
1244   Register BaseReg = AddrReg->getReg();
1245   unsigned BaseSubReg = AddrReg->getSubReg();
1246   unsigned BaseRegFlags = 0;
1247   if (CI.BaseOff) {
1248     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1249     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1250         .addImm(CI.BaseOff);
1251 
1252     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1253     BaseRegFlags = RegState::Kill;
1254 
1255     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1256         .addReg(ImmReg)
1257         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1258         .addImm(0); // clamp bit
1259     BaseSubReg = 0;
1260   }
1261 
1262   MachineInstrBuilder Read2 =
1263       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1264           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1265           .addImm(NewOffset0)                        // offset0
1266           .addImm(NewOffset1)                        // offset1
1267           .addImm(0)                                 // gds
1268           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1269 
1270   (void)Read2;
1271 
1272   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1273 
1274   // Copy to the old destination registers.
1275   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1276       .add(*Dest0) // Copy to same destination including flags and sub reg.
1277       .addReg(DestReg, 0, SubRegIdx0);
1278   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1279       .add(*Dest1)
1280       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1281 
1282   CI.I->eraseFromParent();
1283   Paired.I->eraseFromParent();
1284 
1285   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1286   return Read2;
1287 }
1288 
1289 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1290   if (STM->ldsRequiresM0Init())
1291     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1292   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1293                         : AMDGPU::DS_WRITE2_B64_gfx9;
1294 }
1295 
1296 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1297   if (STM->ldsRequiresM0Init())
1298     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1299                           : AMDGPU::DS_WRITE2ST64_B64;
1300 
1301   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1302                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1303 }
1304 
1305 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1306     CombineInfo &CI, CombineInfo &Paired,
1307     MachineBasicBlock::iterator InsertBefore) {
1308   MachineBasicBlock *MBB = CI.I->getParent();
1309 
1310   // Be sure to use .add(), and not .addReg(), with these. We want to be sure
1311   // we preserve the subregister index and any register flags set on them.
1312   const MachineOperand *AddrReg =
1313       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1314   const MachineOperand *Data0 =
1315       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1316   const MachineOperand *Data1 =
1317       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1318 
1319   unsigned NewOffset0 = CI.Offset;
1320   unsigned NewOffset1 = Paired.Offset;
1321   unsigned Opc =
1322       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1323 
1324   if (NewOffset0 > NewOffset1) {
1325     // Canonicalize the merged instruction so the smaller offset comes first.
1326     std::swap(NewOffset0, NewOffset1);
1327     std::swap(Data0, Data1);
1328   }
1329 
1330   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1331          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1332 
1333   const MCInstrDesc &Write2Desc = TII->get(Opc);
1334   DebugLoc DL = CI.I->getDebugLoc();
1335 
1336   Register BaseReg = AddrReg->getReg();
1337   unsigned BaseSubReg = AddrReg->getSubReg();
1338   unsigned BaseRegFlags = 0;
1339   if (CI.BaseOff) {
1340     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1341     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1342         .addImm(CI.BaseOff);
1343 
1344     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1345     BaseRegFlags = RegState::Kill;
1346 
1347     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1348         .addReg(ImmReg)
1349         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1350         .addImm(0); // clamp bit
1351     BaseSubReg = 0;
1352   }
1353 
1354   MachineInstrBuilder Write2 =
1355       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1356           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1357           .add(*Data0)                               // data0
1358           .add(*Data1)                               // data1
1359           .addImm(NewOffset0)                        // offset0
1360           .addImm(NewOffset1)                        // offset1
1361           .addImm(0)                                 // gds
1362           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1363 
1364   CI.I->eraseFromParent();
1365   Paired.I->eraseFromParent();
1366 
1367   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1368   return Write2;
1369 }
1370 
1371 MachineBasicBlock::iterator
1372 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1373                                      MachineBasicBlock::iterator InsertBefore) {
1374   MachineBasicBlock *MBB = CI.I->getParent();
1375   DebugLoc DL = CI.I->getDebugLoc();
1376   const unsigned Opcode = getNewOpcode(CI, Paired);
1377 
1378   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1379 
1380   Register DestReg = MRI->createVirtualRegister(SuperRC);
1381   unsigned MergedDMask = CI.DMask | Paired.DMask;
1382   unsigned DMaskIdx =
1383       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1384 
1385   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1386   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1387     if (I == DMaskIdx)
1388       MIB.addImm(MergedDMask);
1389     else
1390       MIB.add((*CI.I).getOperand(I));
1391   }
1392 
1393   // It shouldn't be possible to get this far if the two instructions
1394   // don't have a single memoperand, because MachineInstr::mayAlias()
1395   // will return true if this is the case.
1396   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1397 
1398   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1399 
1400   unsigned SubRegIdx0, SubRegIdx1;
1401   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1402 
1403   // Copy to the old destination registers.
1404   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1405   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1406   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1407 
1408   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1409       .add(*Dest0) // Copy to same destination including flags and sub reg.
1410       .addReg(DestReg, 0, SubRegIdx0);
1411   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1412       .add(*Dest1)
1413       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1414 
1415   CI.I->eraseFromParent();
1416   Paired.I->eraseFromParent();
1417   return New;
1418 }
1419 
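// Merge two SMEM loads (S_LOAD / S_BUFFER_LOAD variants) into one wider load
// using the smaller of the two immediate offsets, then copy the relevant
// subregisters of the wide result back into the original destinations.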
1420 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1421     CombineInfo &CI, CombineInfo &Paired,
1422     MachineBasicBlock::iterator InsertBefore) {
1423   MachineBasicBlock *MBB = CI.I->getParent();
1424   DebugLoc DL = CI.I->getDebugLoc();
1425   const unsigned Opcode = getNewOpcode(CI, Paired);
1426 
1427   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1428 
1429   Register DestReg = MRI->createVirtualRegister(SuperRC);
1430   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1431 
1432   // It shouldn't be possible to get this far if the two instructions
1433   // don't have a single memoperand, because MachineInstr::mayAlias()
1434   // will return true if this is the case.
1435   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1436 
1437   MachineInstrBuilder New =
1438       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1439           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1440   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1441     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1442   New.addImm(MergedOffset);
1443   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1444 
1445   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1446   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1447   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1448 
1449   // Copy to the old destination registers.
1450   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1451   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1452   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1453 
1454   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1455       .add(*Dest0) // Copy to same destination including flags and sub reg.
1456       .addReg(DestReg, 0, SubRegIdx0);
1457   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1458       .add(*Dest1)
1459       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1460 
1461   CI.I->eraseFromParent();
1462   Paired.I->eraseFromParent();
1463   return New;
1464 }
1465 
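// Merge two MUBUF buffer loads into one wider load at the smaller offset and
// copy each half of the merged result back to the original vdata registers.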
1466 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1467     CombineInfo &CI, CombineInfo &Paired,
1468     MachineBasicBlock::iterator InsertBefore) {
1469   MachineBasicBlock *MBB = CI.I->getParent();
1470   DebugLoc DL = CI.I->getDebugLoc();
1471 
1472   const unsigned Opcode = getNewOpcode(CI, Paired);
1473 
1474   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1475 
1476   // Copy to the new source register.
1477   Register DestReg = MRI->createVirtualRegister(SuperRC);
1478   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1479 
1480   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1481 
1482   AddressRegs Regs = getRegs(Opcode, *TII);
1483 
1484   if (Regs.VAddr)
1485     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1486 
1487   // It shouldn't be possible to get this far if the two instructions
1488   // don't have a single memoperand, because MachineInstr::mayAlias()
1489   // will return true if this is the case.
1490   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1491 
1492   MachineInstr *New =
1493     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1494         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1495         .addImm(MergedOffset) // offset
1496         .addImm(CI.CPol)      // cpol
1497         .addImm(0)            // swz
1498         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1499 
1500   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1501   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1502   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1503 
1504   // Copy to the old destination registers.
1505   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1506   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1507   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1508 
1509   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1510       .add(*Dest0) // Copy to same destination including flags and sub reg.
1511       .addReg(DestReg, 0, SubRegIdx0);
1512   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1513       .add(*Dest1)
1514       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1515 
1516   CI.I->eraseFromParent();
1517   Paired.I->eraseFromParent();
1518   return New;
1519 }
1520 
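// Merge two MTBUF (typed buffer) loads. In addition to widening the load, the
// buffer format is rewritten for the combined component count via
// getBufferFormatWithCompCount().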
1521 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1522     CombineInfo &CI, CombineInfo &Paired,
1523     MachineBasicBlock::iterator InsertBefore) {
1524   MachineBasicBlock *MBB = CI.I->getParent();
1525   DebugLoc DL = CI.I->getDebugLoc();
1526 
1527   const unsigned Opcode = getNewOpcode(CI, Paired);
1528 
1529   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1530 
1531   // Copy to the new source register.
1532   Register DestReg = MRI->createVirtualRegister(SuperRC);
1533   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1534 
1535   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1536 
1537   AddressRegs Regs = getRegs(Opcode, *TII);
1538 
1539   if (Regs.VAddr)
1540     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1541 
1542   unsigned JoinedFormat =
1543       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1544 
1545   // It shouldn't be possible to get this far if the two instructions
1546   // don't have a single memoperand, because MachineInstr::mayAlias()
1547   // will return true if this is the case.
1548   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1549 
1550   MachineInstr *New =
1551       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1552           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1553           .addImm(MergedOffset) // offset
1554           .addImm(JoinedFormat) // format
1555           .addImm(CI.CPol)      // cpol
1556           .addImm(0)            // swz
1557           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1558 
1559   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1560   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1561   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1562 
1563   // Copy to the old destination registers.
1564   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1565   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1566   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1567 
1568   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1569       .add(*Dest0) // Copy to same destination including flags and sub reg.
1570       .addReg(DestReg, 0, SubRegIdx0);
1571   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1572       .add(*Dest1)
1573       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1574 
1575   CI.I->eraseFromParent();
1576   Paired.I->eraseFromParent();
1577   return New;
1578 }
1579 
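// Merge two MTBUF (typed buffer) stores: build the wide source value with a
// REG_SEQUENCE of the two vdata operands, rewrite the buffer format for the
// combined component count, and emit a single store at the smaller offset.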
1580 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1581     CombineInfo &CI, CombineInfo &Paired,
1582     MachineBasicBlock::iterator InsertBefore) {
1583   MachineBasicBlock *MBB = CI.I->getParent();
1584   DebugLoc DL = CI.I->getDebugLoc();
1585 
1586   const unsigned Opcode = getNewOpcode(CI, Paired);
1587 
1588   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1589   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1590   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1591 
1592   // Copy to the new source register.
1593   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1594   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1595 
1596   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1597   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1598 
1599   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1600       .add(*Src0)
1601       .addImm(SubRegIdx0)
1602       .add(*Src1)
1603       .addImm(SubRegIdx1);
1604 
1605   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1606                  .addReg(SrcReg, RegState::Kill);
1607 
1608   AddressRegs Regs = getRegs(Opcode, *TII);
1609 
1610   if (Regs.VAddr)
1611     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1612 
1613   unsigned JoinedFormat =
1614       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1615 
1616   // It shouldn't be possible to get this far if the two instructions
1617   // don't have a single memoperand, because MachineInstr::mayAlias()
1618   // will return true if this is the case.
1619   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1620 
1621   MachineInstr *New =
1622       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1623           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1624           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1625           .addImm(JoinedFormat)                     // format
1626           .addImm(CI.CPol)                          // cpol
1627           .addImm(0)                                // swz
1628           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1629 
1630   CI.I->eraseFromParent();
1631   Paired.I->eraseFromParent();
1632   return New;
1633 }
1634 
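// Merge two FLAT/GLOBAL loads (including the SADDR forms) into one wider load
// at the smaller offset, then copy the subregisters of the merged result back
// to the original vdst registers.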
1635 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1636     CombineInfo &CI, CombineInfo &Paired,
1637     MachineBasicBlock::iterator InsertBefore) {
1638   MachineBasicBlock *MBB = CI.I->getParent();
1639   DebugLoc DL = CI.I->getDebugLoc();
1640 
1641   const unsigned Opcode = getNewOpcode(CI, Paired);
1642 
1643   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1644   Register DestReg = MRI->createVirtualRegister(SuperRC);
1645 
1646   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1647 
1648   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1649     MIB.add(*SAddr);
1650 
1651   MachineInstr *New =
1652     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1653        .addImm(std::min(CI.Offset, Paired.Offset))
1654        .addImm(CI.CPol)
1655        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1656 
1657   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1658   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1659   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1660 
1661   // Copy to the old destination registers.
1662   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1663   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1664   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1665 
1666   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1667       .add(*Dest0) // Copy to same destination including flags and sub reg.
1668       .addReg(DestReg, 0, SubRegIdx0);
1669   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1670       .add(*Dest1)
1671       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1672 
1673   CI.I->eraseFromParent();
1674   Paired.I->eraseFromParent();
1675   return New;
1676 }
1677 
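// Merge two FLAT/GLOBAL stores: combine the two vdata operands into a wide
// REG_SEQUENCE value and emit a single store at the smaller offset.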
1678 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1679     CombineInfo &CI, CombineInfo &Paired,
1680     MachineBasicBlock::iterator InsertBefore) {
1681   MachineBasicBlock *MBB = CI.I->getParent();
1682   DebugLoc DL = CI.I->getDebugLoc();
1683 
1684   const unsigned Opcode = getNewOpcode(CI, Paired);
1685 
1686   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1687   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1688   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1689 
1690   // Copy to the new source register.
1691   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1692   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1693 
1694   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1695   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1696 
1697   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1698       .add(*Src0)
1699       .addImm(SubRegIdx0)
1700       .add(*Src1)
1701       .addImm(SubRegIdx1);
1702 
1703   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1704                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1705                  .addReg(SrcReg, RegState::Kill);
1706 
1707   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1708     MIB.add(*SAddr);
1709 
1710   MachineInstr *New =
1711     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1712        .addImm(CI.CPol)
1713        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1714 
1715   CI.I->eraseFromParent();
1716   Paired.I->eraseFromParent();
1717   return New;
1718 }
1719 
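// Return the opcode of the merged instruction for the common instruction
// class of CI and Paired and their combined width (in dwords for most
// classes), or 0 if no merged opcode of that width exists.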
1720 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1721                                             const CombineInfo &Paired) {
1722   const unsigned Width = CI.Width + Paired.Width;
1723 
1724   switch (getCommonInstClass(CI, Paired)) {
1725   default:
1726     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1727     // FIXME: Handle d16 correctly
1728     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1729                                   Width);
1730   case TBUFFER_LOAD:
1731   case TBUFFER_STORE:
1732     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1733                                   Width);
1734 
1735   case UNKNOWN:
1736     llvm_unreachable("Unknown instruction class");
1737   case S_BUFFER_LOAD_IMM:
1738     switch (Width) {
1739     default:
1740       return 0;
1741     case 2:
1742       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1743     case 3:
1744       return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1745     case 4:
1746       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1747     case 8:
1748       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1749     }
1750   case S_BUFFER_LOAD_SGPR_IMM:
1751     switch (Width) {
1752     default:
1753       return 0;
1754     case 2:
1755       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1756     case 3:
1757       return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1758     case 4:
1759       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1760     case 8:
1761       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1762     }
1763   case S_LOAD_IMM:
1764     switch (Width) {
1765     default:
1766       return 0;
1767     case 2:
1768       return AMDGPU::S_LOAD_DWORDX2_IMM;
1769     case 3:
1770       return AMDGPU::S_LOAD_DWORDX3_IMM;
1771     case 4:
1772       return AMDGPU::S_LOAD_DWORDX4_IMM;
1773     case 8:
1774       return AMDGPU::S_LOAD_DWORDX8_IMM;
1775     }
1776   case GLOBAL_LOAD:
1777     switch (Width) {
1778     default:
1779       return 0;
1780     case 2:
1781       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1782     case 3:
1783       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1784     case 4:
1785       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1786     }
1787   case GLOBAL_LOAD_SADDR:
1788     switch (Width) {
1789     default:
1790       return 0;
1791     case 2:
1792       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1793     case 3:
1794       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1795     case 4:
1796       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1797     }
1798   case GLOBAL_STORE:
1799     switch (Width) {
1800     default:
1801       return 0;
1802     case 2:
1803       return AMDGPU::GLOBAL_STORE_DWORDX2;
1804     case 3:
1805       return AMDGPU::GLOBAL_STORE_DWORDX3;
1806     case 4:
1807       return AMDGPU::GLOBAL_STORE_DWORDX4;
1808     }
1809   case GLOBAL_STORE_SADDR:
1810     switch (Width) {
1811     default:
1812       return 0;
1813     case 2:
1814       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1815     case 3:
1816       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1817     case 4:
1818       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1819     }
1820   case FLAT_LOAD:
1821     switch (Width) {
1822     default:
1823       return 0;
1824     case 2:
1825       return AMDGPU::FLAT_LOAD_DWORDX2;
1826     case 3:
1827       return AMDGPU::FLAT_LOAD_DWORDX3;
1828     case 4:
1829       return AMDGPU::FLAT_LOAD_DWORDX4;
1830     }
1831   case FLAT_STORE:
1832     switch (Width) {
1833     default:
1834       return 0;
1835     case 2:
1836       return AMDGPU::FLAT_STORE_DWORDX2;
1837     case 3:
1838       return AMDGPU::FLAT_STORE_DWORDX3;
1839     case 4:
1840       return AMDGPU::FLAT_STORE_DWORDX4;
1841     }
1842   case MIMG:
1843     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1844            "No overlaps");
1845     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1846   }
1847 }
1848 
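// Return the subregister indices covering CI's and Paired's portions of the
// merged super-register, with the lower-offset access taking the low
// subregisters. E.g. a one-dword access followed by a two-dword access yields
// (sub0, sub1_sub2).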
1849 std::pair<unsigned, unsigned>
1850 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1851                                     const CombineInfo &Paired) {
1852   assert((CI.InstClass != MIMG ||
1853           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1854            CI.Width + Paired.Width)) &&
1855          "No overlaps");
1856 
1857   unsigned Idx0;
1858   unsigned Idx1;
1859 
1860   static const unsigned Idxs[5][4] = {
1861       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1862       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1863       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1864       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1865       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1866   };
1867 
1868   assert(CI.Width >= 1 && CI.Width <= 4);
1869   assert(Paired.Width >= 1 && Paired.Width <= 4);
1870 
1871   if (Paired < CI) {
1872     Idx1 = Idxs[0][Paired.Width - 1];
1873     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1874   } else {
1875     Idx0 = Idxs[0][CI.Width - 1];
1876     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1877   }
1878 
1879   return std::pair(Idx0, Idx1);
1880 }
1881 
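// Pick the register class for the merged value: an SGPR class sized by the
// combined dword width for scalar loads, otherwise an AGPR or VGPR class of
// 32 * (CI.Width + Paired.Width) bits depending on the original data class.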
1882 const TargetRegisterClass *
1883 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1884                                              const CombineInfo &Paired) {
1885   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1886       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1887     switch (CI.Width + Paired.Width) {
1888     default:
1889       return nullptr;
1890     case 2:
1891       return &AMDGPU::SReg_64_XEXECRegClass;
1892     case 3:
1893       return &AMDGPU::SGPR_96RegClass;
1894     case 4:
1895       return &AMDGPU::SGPR_128RegClass;
1896     case 8:
1897       return &AMDGPU::SGPR_256RegClass;
1898     case 16:
1899       return &AMDGPU::SGPR_512RegClass;
1900     }
1901   }
1902 
1903   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1904   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1905              ? TRI->getAGPRClassForBitWidth(BitWidth)
1906              : TRI->getVGPRClassForBitWidth(BitWidth);
1907 }
1908 
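// Merge two MUBUF buffer stores: combine the two vdata operands into a wide
// REG_SEQUENCE value and emit a single store at the smaller offset.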
1909 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1910     CombineInfo &CI, CombineInfo &Paired,
1911     MachineBasicBlock::iterator InsertBefore) {
1912   MachineBasicBlock *MBB = CI.I->getParent();
1913   DebugLoc DL = CI.I->getDebugLoc();
1914 
1915   const unsigned Opcode = getNewOpcode(CI, Paired);
1916 
1917   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1918   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1919   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1920 
1921   // Copy to the new source register.
1922   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1923   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1924 
1925   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1926   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1927 
1928   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1929       .add(*Src0)
1930       .addImm(SubRegIdx0)
1931       .add(*Src1)
1932       .addImm(SubRegIdx1);
1933 
1934   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1935                  .addReg(SrcReg, RegState::Kill);
1936 
1937   AddressRegs Regs = getRegs(Opcode, *TII);
1938 
1939   if (Regs.VAddr)
1940     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1941 
1942 
1943   // It shouldn't be possible to get this far if the two instructions
1944   // don't have a single memoperand, because MachineInstr::mayAlias()
1945   // will return true if this is the case.
1946   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1947 
1948   MachineInstr *New =
1949     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1950         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1951         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1952         .addImm(CI.CPol)      // cpol
1953         .addImm(0)            // swz
1954         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1955 
1956   CI.I->eraseFromParent();
1957   Paired.I->eraseFromParent();
1958   return New;
1959 }
1960 
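// Return Val as an immediate operand if it is a legal inline constant,
// otherwise materialize it into a new SGPR with S_MOV_B32 and return that
// register.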
1961 MachineOperand
1962 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1963   APInt V(32, Val, true);
1964   if (TII->isInlineConstant(V))
1965     return MachineOperand::CreateImm(Val);
1966 
1967   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1968   MachineInstr *Mov =
1969   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1970           TII->get(AMDGPU::S_MOV_B32), Reg)
1971     .addImm(Val);
1972   (void)Mov;
1973   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1974   return MachineOperand::CreateReg(Reg, false);
1975 }
1976 
1977 // Compute base address using Addr and return the final register.
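// The 64-bit add of Addr.Base and Addr.Offset is expanded into a
// V_ADD_CO_U32 / V_ADDC_U32 pair whose results are combined with a
// REG_SEQUENCE into a new 64-bit VGPR base.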
1978 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1979                                            const MemAddress &Addr) const {
1980   MachineBasicBlock *MBB = MI.getParent();
1981   MachineBasicBlock::iterator MBBI = MI.getIterator();
1982   DebugLoc DL = MI.getDebugLoc();
1983 
1984   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1985           Addr.Base.LoSubReg) &&
1986          "Expected 32-bit Base-Register-Low!!");
1987 
1988   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1989           Addr.Base.HiSubReg) &&
1990          "Expected 32-bit Base-Register-Hi!!");
1991 
1992   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1993   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1994   MachineOperand OffsetHi =
1995     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1996 
1997   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1998   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1999   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2000 
2001   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2002   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2003   MachineInstr *LoHalf =
2004     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2005       .addReg(CarryReg, RegState::Define)
2006       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2007       .add(OffsetLo)
2008       .addImm(0); // clamp bit
2009   (void)LoHalf;
2010   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
2011 
2012   MachineInstr *HiHalf =
2013   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2014     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2015     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2016     .add(OffsetHi)
2017     .addReg(CarryReg, RegState::Kill)
2018     .addImm(0); // clamp bit
2019   (void)HiHalf;
2020   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
2021 
2022   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2023   MachineInstr *FullBase =
2024     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2025       .addReg(DestSub0)
2026       .addImm(AMDGPU::sub0)
2027       .addReg(DestSub1)
2028       .addImm(AMDGPU::sub1);
2029   (void)FullBase;
2030   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
2031 
2032   return FullDestReg;
2033 }
2034 
2035 // Update MI's base register (vaddr) to NewBase and its immediate offset to NewOffset.
2036 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2037                                                Register NewBase,
2038                                                int32_t NewOffset) const {
2039   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2040   Base->setReg(NewBase);
2041   Base->setIsKill(false);
2042   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2043 }
2044 
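// If Op is an immediate, or a register defined by an S_MOV_B32 of an
// immediate, return that constant; otherwise return std::nullopt.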
2045 std::optional<int32_t>
2046 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2047   if (Op.isImm())
2048     return Op.getImm();
2049 
2050   if (!Op.isReg())
2051     return std::nullopt;
2052 
2053   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2054   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2055       !Def->getOperand(1).isImm())
2056     return std::nullopt;
2057 
2058   return Def->getOperand(1).getImm();
2059 }
2060 
2061 // Analyzes Base and extracts:
2062 //  - the 32-bit base registers and subregisters
2063 //  - the 64-bit constant offset
2064 // Expecting base computation as:
2065 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2066 //   %LO:vgpr_32, %c:sreg_64_xexec =
2067 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
2068 //   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2069 //   %Base:vreg_64 =
2070 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2071 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2072                                                       MemAddress &Addr) const {
2073   if (!Base.isReg())
2074     return;
2075 
2076   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2077   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2078       || Def->getNumOperands() != 5)
2079     return;
2080 
2081   MachineOperand BaseLo = Def->getOperand(1);
2082   MachineOperand BaseHi = Def->getOperand(3);
2083   if (!BaseLo.isReg() || !BaseHi.isReg())
2084     return;
2085 
2086   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2087   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2088 
2089   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2090       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2091     return;
2092 
2093   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2094   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2095 
2096   auto Offset0P = extractConstOffset(*Src0);
2097   if (Offset0P)
2098     BaseLo = *Src1;
2099   else {
2100     if (!(Offset0P = extractConstOffset(*Src1)))
2101       return;
2102     BaseLo = *Src0;
2103   }
2104 
2105   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2106   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2107 
2108   if (Src0->isImm())
2109     std::swap(Src0, Src1);
2110 
2111   if (!Src1->isImm())
2112     return;
2113 
2114   uint64_t Offset1 = Src1->getImm();
2115   BaseHi = *Src0;
2116 
2117   Addr.Base.LoReg = BaseLo.getReg();
2118   Addr.Base.HiReg = BaseHi.getReg();
2119   Addr.Base.LoSubReg = BaseLo.getSubReg();
2120   Addr.Base.HiSubReg = BaseHi.getSubReg();
2121   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2122 }
2123 
2124 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2125     MachineInstr &MI,
2126     MemInfoMap &Visited,
2127     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2128 
2129   if (!(MI.mayLoad() ^ MI.mayStore()))
2130     return false;
2131 
2132   // TODO: Support flat and scratch.
2133   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2134     return false;
2135 
2136   if (MI.mayLoad() &&
2137       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2138     return false;
2139 
2140   if (AnchorList.count(&MI))
2141     return false;
2142 
2143   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2144 
2145   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2146     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2147     return false;
2148   }
2149 
2150   // Step1: Find the base registers and a 64-bit constant offset.
2151   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2152   MemAddress MAddr;
2153   if (!Visited.contains(&MI)) {
2154     processBaseWithConstOffset(Base, MAddr);
2155     Visited[&MI] = MAddr;
2156   } else
2157     MAddr = Visited[&MI];
2158 
2159   if (MAddr.Offset == 0) {
2160     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2161                          " constant offsets that can be promoted.\n";);
2162     return false;
2163   }
2164 
2165   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2166              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2167 
2168   // Step2: Traverse through MI's basic block and find an anchor (one that has
2169   // the same base registers) with the highest 13-bit distance from MI's offset.
2170   // E.g. (64bit loads)
2171   // bb:
2172   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2173   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2174   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2175   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2176   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2177   //
2178   // Starting from the first load, the optimization will try to find a new base
2179   // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
2180   // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
2181   // as the new-base(anchor) because of the maximum distance which can
2182   // accommodate more intermediate bases presumably.
2183   //
2184   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2185   // (&a + 8192) for load1, load2, load4.
2186   //   addr = &a + 8192
2187   //   load1 = load(addr,       -4096)
2188   //   load2 = load(addr,       -2048)
2189   //   load3 = load(addr,       0)
2190   //   load4 = load(addr,       2048)
2191   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2192   //
2193   MachineInstr *AnchorInst = nullptr;
2194   MemAddress AnchorAddr;
2195   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2196   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2197 
2198   MachineBasicBlock *MBB = MI.getParent();
2199   MachineBasicBlock::iterator E = MBB->end();
2200   MachineBasicBlock::iterator MBBI = MI.getIterator();
2201   ++MBBI;
2202   const SITargetLowering *TLI =
2203     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2204 
2205   for ( ; MBBI != E; ++MBBI) {
2206     MachineInstr &MINext = *MBBI;
2207     // TODO: Support finding an anchor(with same base) from store addresses or
2208     // any other load addresses where the opcodes are different.
2209     if (MINext.getOpcode() != MI.getOpcode() ||
2210         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2211       continue;
2212 
2213     const MachineOperand &BaseNext =
2214       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2215     MemAddress MAddrNext;
2216     if (!Visited.contains(&MINext)) {
2217       processBaseWithConstOffset(BaseNext, MAddrNext);
2218       Visited[&MINext] = MAddrNext;
2219     } else
2220       MAddrNext = Visited[&MINext];
2221 
2222     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2223         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2224         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2225         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2226       continue;
2227 
2228     InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2229 
2230     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2231     TargetLoweringBase::AddrMode AM;
2232     AM.HasBaseReg = true;
2233     AM.BaseOffs = Dist;
2234     if (TLI->isLegalGlobalAddressingMode(AM) &&
2235         (uint32_t)std::abs(Dist) > MaxDist) {
2236       MaxDist = std::abs(Dist);
2237 
2238       AnchorAddr = MAddrNext;
2239       AnchorInst = &MINext;
2240     }
2241   }
2242 
2243   if (AnchorInst) {
2244     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2245                AnchorInst->dump());
2246     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2247                <<  AnchorAddr.Offset << "\n\n");
2248 
2249     // Instead of moving up, just re-compute anchor-instruction's base address.
2250     Register Base = computeBase(MI, AnchorAddr);
2251 
2252     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2253     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2254 
2255     for (auto P : InstsWCommonBase) {
2256       TargetLoweringBase::AddrMode AM;
2257       AM.HasBaseReg = true;
2258       AM.BaseOffs = P.second - AnchorAddr.Offset;
2259 
2260       if (TLI->isLegalGlobalAddressingMode(AM)) {
2261         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
2262                    dbgs() << ")"; P.first->dump());
2263         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2264         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
2265       }
2266     }
2267     AnchorList.insert(AnchorInst);
2268     return true;
2269   }
2270 
2271   return false;
2272 }
2273 
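// Append CI to the existing list whose entries share its instruction class,
// AGPR-ness and base address; start a new single-element list if none matches.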
2274 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2275                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2276   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2277     if (AddrList.front().InstClass == CI.InstClass &&
2278         AddrList.front().IsAGPR == CI.IsAGPR &&
2279         AddrList.front().hasSameBaseAddress(CI)) {
2280       AddrList.emplace_back(CI);
2281       return;
2282     }
2283   }
2284 
2285   // Base address not found, so add a new list.
2286   MergeableInsts.emplace_back(1, CI);
2287 }
2288 
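// Scan [Begin, End) once: promote constant offsets where profitable and group
// the instructions we know how to merge into per-base-address lists. The scan
// stops early at instructions that act as memory barriers; the returned
// iterator is where a later scan should resume, paired with whether any
// instruction was modified.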
2289 std::pair<MachineBasicBlock::iterator, bool>
2290 SILoadStoreOptimizer::collectMergeableInsts(
2291     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2292     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2293     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2294   bool Modified = false;
2295 
2296   // Sort potentially mergeable instructions into lists, one list per base address.
2297   unsigned Order = 0;
2298   MachineBasicBlock::iterator BlockI = Begin;
2299   for (; BlockI != End; ++BlockI) {
2300     MachineInstr &MI = *BlockI;
2301 
2302     // We run this before checking if an address is mergeable, because it can produce
2303     // better code even if the instructions aren't mergeable.
2304     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2305       Modified = true;
2306 
2307     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2308     // barriers. We can keep looking for merges after such a barrier.
2309     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2310       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2311 
2312       // Search will resume after this instruction in a separate merge list.
2313       ++BlockI;
2314       break;
2315     }
2316 
2317     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2318     if (InstClass == UNKNOWN)
2319       continue;
2320 
2321     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2322     int Swizzled =
2323         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2324     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2325       continue;
2326 
2327     CombineInfo CI;
2328     CI.setMI(MI, *this);
2329     CI.Order = Order++;
2330 
2331     if (!CI.hasMergeableAddress(*MRI))
2332       continue;
2333 
2334     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2335       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2336       //        operands. However, we report that ds_write2 takes only VGPR
2337       //        data so that machine copy propagation does not create an
2338       //        illegal instruction with VGPR and AGPR sources.
2339       //        Consequently, if we created such an instruction the verifier
2340       //        would complain.
2341       continue;
2342     }
2343 
2344     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2345 
2346     addInstToMergeableList(CI, MergeableInsts);
2347   }
2348 
2349   // At this point we have lists of Mergeable instructions.
2350   //
2351   // Part 2: Sort each list by offset so that merge candidates become adjacent,
2352   // and discard lists with fewer than two entries, since a merge needs at
2353   // least a pair. The actual pairing and merging happen later, in
2354   // optimizeInstsWithSameBaseAddr().
2355 
2356   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2357                                                    E = MergeableInsts.end(); I != E;) {
2358 
2359     std::list<CombineInfo> &MergeList = *I;
2360     if (MergeList.size() <= 1) {
2361       // This means we have found only one instruction with a given address
2362       // that can be merged, and we need at least 2 instructions to do a merge,
2363       // so this list can be discarded.
2364       I = MergeableInsts.erase(I);
2365       continue;
2366     }
2367 
2368     // Sort the lists by offsets, this way mergeable instructions will be
2369     // adjacent to each other in the list, which will make it easier to find
2370     // matches.
2371     MergeList.sort(
2372         [] (const CombineInfo &A, const CombineInfo &B) {
2373           return A.Offset < B.Offset;
2374         });
2375     ++I;
2376   }
2377 
2378   return std::pair(BlockI, Modified);
2379 }
2380 
2381 // Scan through looking for adjacent LDS operations with constant offsets from
2382 // the same base register. We rely on the scheduler to do the hard work of
2383 // clustering nearby loads, and assume these are all adjacent.
2384 bool SILoadStoreOptimizer::optimizeBlock(
2385                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2386   bool Modified = false;
2387 
2388   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2389                                                    E = MergeableInsts.end(); I != E;) {
2390     std::list<CombineInfo> &MergeList = *I;
2391 
2392     bool OptimizeListAgain = false;
2393     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2394       // We weren't able to make any changes, so delete the list so we don't
2395       // process the same instructions the next time we try to optimize this
2396       // block.
2397       I = MergeableInsts.erase(I);
2398       continue;
2399     }
2400 
2401     Modified = true;
2402 
2403     // We made changes, but also determined that there were no more optimization
2404     // opportunities, so we don't need to reprocess the list.
2405     if (!OptimizeListAgain) {
2406       I = MergeableInsts.erase(I);
2407       continue;
2408     }
2409     OptimizeAgain = true;
2410   }
2411   return Modified;
2412 }
2413 
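// Walk the offset-sorted MergeList pairwise and merge adjacent CombineInfos
// with the class-specific merge routine. A merged result stays in the list
// (rebound via setMI) so it can be considered for further merging;
// OptimizeListAgain reports whether another pass over the list is worthwhile.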
2414 bool
2415 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2416                                           std::list<CombineInfo> &MergeList,
2417                                           bool &OptimizeListAgain) {
2418   if (MergeList.empty())
2419     return false;
2420 
2421   bool Modified = false;
2422 
2423   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2424        Next = std::next(I)) {
2425 
2426     auto First = I;
2427     auto Second = Next;
2428 
2429     if ((*First).Order > (*Second).Order)
2430       std::swap(First, Second);
2431     CombineInfo &CI = *First;
2432     CombineInfo &Paired = *Second;
2433 
2434     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2435     if (!Where) {
2436       ++I;
2437       continue;
2438     }
2439 
2440     Modified = true;
2441 
2442     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2443 
2444     MachineBasicBlock::iterator NewMI;
2445     switch (CI.InstClass) {
2446     default:
2447       llvm_unreachable("unknown InstClass");
2448       break;
2449     case DS_READ:
2450       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2451       break;
2452     case DS_WRITE:
2453       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2454       break;
2455     case S_BUFFER_LOAD_IMM:
2456     case S_BUFFER_LOAD_SGPR_IMM:
2457     case S_LOAD_IMM:
2458       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2459       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2460       break;
2461     case BUFFER_LOAD:
2462       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2463       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2464       break;
2465     case BUFFER_STORE:
2466       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2467       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2468       break;
2469     case MIMG:
2470       NewMI = mergeImagePair(CI, Paired, Where->I);
2471       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2472       break;
2473     case TBUFFER_LOAD:
2474       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2475       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2476       break;
2477     case TBUFFER_STORE:
2478       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2479       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2480       break;
2481     case FLAT_LOAD:
2482     case GLOBAL_LOAD:
2483     case GLOBAL_LOAD_SADDR:
2484       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2485       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2486       break;
2487     case FLAT_STORE:
2488     case GLOBAL_STORE:
2489     case GLOBAL_STORE_SADDR:
2490       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2491       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2492       break;
2493     }
2494     CI.setMI(NewMI, *this);
2495     CI.Order = Where->Order;
2496     if (I == Second)
2497       I = Next;
2498 
2499     MergeList.erase(Second);
2500   }
2501 
2502   return Modified;
2503 }
2504 
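// Entry point: split each basic block into sections at memory barriers,
// collect mergeable instructions per section, and rerun the merge step on each
// section until no further merges are found.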
2505 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2506   if (skipFunction(MF.getFunction()))
2507     return false;
2508 
2509   STM = &MF.getSubtarget<GCNSubtarget>();
2510   if (!STM->loadStoreOptEnabled())
2511     return false;
2512 
2513   TII = STM->getInstrInfo();
2514   TRI = &TII->getRegisterInfo();
2515 
2516   MRI = &MF.getRegInfo();
2517   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2518 
2519   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2520 
2521   bool Modified = false;
2522 
2523   // Contains the list of instructions for which constant offsets are being
2524   // promoted to the IMM. This is tracked for an entire block at a time.
2525   SmallPtrSet<MachineInstr *, 4> AnchorList;
2526   MemInfoMap Visited;
2527 
2528   for (MachineBasicBlock &MBB : MF) {
2529     MachineBasicBlock::iterator SectionEnd;
2530     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2531          I = SectionEnd) {
2532       bool CollectModified;
2533       std::list<std::list<CombineInfo>> MergeableInsts;
2534 
2535       // First pass: Collect list of all instructions we know how to merge in a
2536       // subset of the block.
2537       std::tie(SectionEnd, CollectModified) =
2538           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2539 
2540       Modified |= CollectModified;
2541 
2542       do {
2543         OptimizeAgain = false;
2544         Modified |= optimizeBlock(MergeableInsts);
2545       } while (OptimizeAgain);
2546     }
2547 
2548     Visited.clear();
2549     AnchorList.clear();
2550   }
2551 
2552   return Modified;
2553 }
2554