1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from a nearby instruction that
24 // allows it to have a 13-bit constant offset, and then promotes that 13-bit
25 // offset to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - Merging stores of constants is currently missed because the instruction
46 //   loading the constant into the data register is placed between the stores,
47 //   although this is arguably a scheduling problem.
48 //
49 // - Recomputing live intervals seems inefficient. The pass currently matches
50 //   one pair, recomputes live intervals, and then moves on to the next pair.
51 //   It would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields but are close enough together, we can add to the base
56 //   pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   S_BUFFER_LOAD_SGPR_IMM,
78   S_LOAD_IMM,
79   BUFFER_LOAD,
80   BUFFER_STORE,
81   MIMG,
82   TBUFFER_LOAD,
83   TBUFFER_STORE,
84   GLOBAL_LOAD_SADDR,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE,
88   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89   GLOBAL_STORE // any CombineInfo, they are only ever returned by
90                // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94   unsigned char NumVAddrs = 0;
95   bool SBase = false;
96   bool SRsrc = false;
97   bool SOffset = false;
98   bool SAddr = false;
99   bool VAddr = false;
100   bool Addr = false;
101   bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
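    // Position of this instruction in the order mergeable candidates were
    // collected, i.e. program order within the block.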
123     unsigned Order;
124 
125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145           return false;
146         }
147       }
148       return true;
149     }
150 
151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that are neither immediates nor registers.
159         // TODO: Should be possible to merge FrameIndexes and maybe some other
160         // non-register operands.
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge instructions with other physical reg
165         // addresses too.
166         if (AddrOp->getReg().isPhysical() &&
167             AddrOp->getReg() != AMDGPU::SGPR_NULL)
168           return false;
169 
170         // If an address has only one use then there will be no other
171         // instructions with the same address, so we can't merge this one.
172         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173           return false;
174       }
175       return true;
176     }
177 
178     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179 
180     // Order by DMask for MIMG, otherwise by offset (i.e. address order).
181     bool operator<(const CombineInfo& Other) const {
182       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183     }
184   };
185 
186   struct BaseRegisters {
187     Register LoReg;
188     Register HiReg;
189 
190     unsigned LoSubReg = 0;
191     unsigned HiSubReg = 0;
192   };
193 
194   struct MemAddress {
195     BaseRegisters Base;
196     int64_t Offset = 0;
197   };
198 
199   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200 
201 private:
202   const GCNSubtarget *STM = nullptr;
203   const SIInstrInfo *TII = nullptr;
204   const SIRegisterInfo *TRI = nullptr;
205   MachineRegisterInfo *MRI = nullptr;
206   AliasAnalysis *AA = nullptr;
207   bool OptimizeAgain;
208 
209   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210                            const DenseSet<Register> &ARegUses,
211                            const MachineInstr &A, const MachineInstr &B) const;
212   static bool dmasksCanBeCombined(const CombineInfo &CI,
213                                   const SIInstrInfo &TII,
214                                   const CombineInfo &Paired);
215   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216                                    CombineInfo &Paired, bool Modify = false);
217   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218                         const CombineInfo &Paired);
219   unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221                                                      const CombineInfo &Paired);
222   const TargetRegisterClass *
223   getTargetRegisterClass(const CombineInfo &CI,
224                          const CombineInfo &Paired) const;
225   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
226 
227   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
228 
229   void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
230                       MachineBasicBlock::iterator InsertBefore, int OpName,
231                       Register DestReg) const;
232   Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
233                            MachineBasicBlock::iterator InsertBefore,
234                            int OpName) const;
235 
236   unsigned read2Opcode(unsigned EltSize) const;
237   unsigned read2ST64Opcode(unsigned EltSize) const;
238   MachineBasicBlock::iterator
239   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
240                  MachineBasicBlock::iterator InsertBefore);
241 
242   unsigned write2Opcode(unsigned EltSize) const;
243   unsigned write2ST64Opcode(unsigned EltSize) const;
244   MachineBasicBlock::iterator
245   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
246                   MachineBasicBlock::iterator InsertBefore);
247   MachineBasicBlock::iterator
248   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
249                  MachineBasicBlock::iterator InsertBefore);
250   MachineBasicBlock::iterator
251   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
252                        MachineBasicBlock::iterator InsertBefore);
253   MachineBasicBlock::iterator
254   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
255                       MachineBasicBlock::iterator InsertBefore);
256   MachineBasicBlock::iterator
257   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
258                        MachineBasicBlock::iterator InsertBefore);
259   MachineBasicBlock::iterator
260   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
261                        MachineBasicBlock::iterator InsertBefore);
262   MachineBasicBlock::iterator
263   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
264                         MachineBasicBlock::iterator InsertBefore);
265   MachineBasicBlock::iterator
266   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
267                     MachineBasicBlock::iterator InsertBefore);
268   MachineBasicBlock::iterator
269   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
270                      MachineBasicBlock::iterator InsertBefore);
271 
272   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
273                            int32_t NewOffset) const;
274   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
275   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
276   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
277   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
278   /// Promotes a constant offset to the immediate by adjusting the base. It
279   /// tries to use a base from a nearby instruction that allows it to have a
280   /// 13-bit constant offset which gets promoted to the immediate.
281   bool promoteConstantOffsetToImm(MachineInstr &CI,
282                                   MemInfoMap &Visited,
283                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
284   void addInstToMergeableList(const CombineInfo &CI,
285                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
286 
287   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
288       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
289       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
290       std::list<std::list<CombineInfo>> &MergeableInsts) const;
291 
292   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
293                                                      const CombineInfo &Paired);
294 
295   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
296                                           const CombineInfo &Paired);
297 
298 public:
299   static char ID;
300 
301   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
302     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
303   }
304 
305   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
306                                      bool &OptimizeListAgain);
307   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
308 
309   bool runOnMachineFunction(MachineFunction &MF) override;
310 
311   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
312 
313   void getAnalysisUsage(AnalysisUsage &AU) const override {
314     AU.setPreservesCFG();
315     AU.addRequired<AAResultsWrapperPass>();
316 
317     MachineFunctionPass::getAnalysisUsage(AU);
318   }
319 
320   MachineFunctionProperties getRequiredProperties() const override {
321     return MachineFunctionProperties()
322       .set(MachineFunctionProperties::Property::IsSSA);
323   }
324 };
325 
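// Return the access width of \p MI in elements (dwords for most classes,
// dmask components for image instructions); 0 for opcodes this pass does not
// handle.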
326 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
327   const unsigned Opc = MI.getOpcode();
328 
329   if (TII.isMUBUF(Opc)) {
330     // FIXME: Handle d16 correctly
331     return AMDGPU::getMUBUFElements(Opc);
332   }
333   if (TII.isImage(MI)) {
334     uint64_t DMaskImm =
335         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
336     return llvm::popcount(DMaskImm);
337   }
338   if (TII.isMTBUF(Opc)) {
339     return AMDGPU::getMTBUFElements(Opc);
340   }
341 
342   switch (Opc) {
343   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
344   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
345   case AMDGPU::S_LOAD_DWORD_IMM:
346   case AMDGPU::GLOBAL_LOAD_DWORD:
347   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
348   case AMDGPU::GLOBAL_STORE_DWORD:
349   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
350   case AMDGPU::FLAT_LOAD_DWORD:
351   case AMDGPU::FLAT_STORE_DWORD:
352     return 1;
353   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
356   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
357   case AMDGPU::S_LOAD_DWORDX2_IMM:
358   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
359   case AMDGPU::GLOBAL_LOAD_DWORDX2:
360   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
361   case AMDGPU::GLOBAL_STORE_DWORDX2:
362   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
363   case AMDGPU::FLAT_LOAD_DWORDX2:
364   case AMDGPU::FLAT_STORE_DWORDX2:
365     return 2;
366   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
367   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
368   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
369   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
370   case AMDGPU::S_LOAD_DWORDX3_IMM:
371   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
372   case AMDGPU::GLOBAL_LOAD_DWORDX3:
373   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
374   case AMDGPU::GLOBAL_STORE_DWORDX3:
375   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
376   case AMDGPU::FLAT_LOAD_DWORDX3:
377   case AMDGPU::FLAT_STORE_DWORDX3:
378     return 3;
379   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
380   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
381   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
382   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
383   case AMDGPU::S_LOAD_DWORDX4_IMM:
384   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
385   case AMDGPU::GLOBAL_LOAD_DWORDX4:
386   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
387   case AMDGPU::GLOBAL_STORE_DWORDX4:
388   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
389   case AMDGPU::FLAT_LOAD_DWORDX4:
390   case AMDGPU::FLAT_STORE_DWORDX4:
391     return 4;
392   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
393   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
394   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
395   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
396   case AMDGPU::S_LOAD_DWORDX8_IMM:
397   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
398     return 8;
399   case AMDGPU::DS_READ_B32:
400   case AMDGPU::DS_READ_B32_gfx9:
401   case AMDGPU::DS_WRITE_B32:
402   case AMDGPU::DS_WRITE_B32_gfx9:
403     return 1;
404   case AMDGPU::DS_READ_B64:
405   case AMDGPU::DS_READ_B64_gfx9:
406   case AMDGPU::DS_WRITE_B64:
407   case AMDGPU::DS_WRITE_B64_gfx9:
408     return 2;
409   default:
410     return 0;
411   }
412 }
413 
414 /// Maps instruction opcode to enum InstClassEnum.
415 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
416   switch (Opc) {
417   default:
418     if (TII.isMUBUF(Opc)) {
419       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
420       default:
421         return UNKNOWN;
422       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
423       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
424       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
425       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
426       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
427       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
428       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
429       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
430       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
431       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
432       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
433       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
434       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
435       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
436       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
437       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
438         return BUFFER_LOAD;
439       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
440       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
441       case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
442       case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
443       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
444       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
445       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
446       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
447       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
448       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
449       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
450       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
451       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
452       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
453       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
454       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
455         return BUFFER_STORE;
456       }
457     }
458     if (TII.isImage(Opc)) {
459       // Ignore instructions encoded without vaddr.
460       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
461           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
462         return UNKNOWN;
463       // Ignore BVH instructions
464       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
465         return UNKNOWN;
466       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
467       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
468           TII.isGather4(Opc))
469         return UNKNOWN;
470       return MIMG;
471     }
472     if (TII.isMTBUF(Opc)) {
473       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
474       default:
475         return UNKNOWN;
476       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
477       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
478       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
479       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
480       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
481       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
482       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
483       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
484       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
485       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
486       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
487       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
488       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
489       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
490       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
491       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
492         return TBUFFER_LOAD;
493       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
494       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
495       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
496       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
497       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
498       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
499       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
500       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
501         return TBUFFER_STORE;
502       }
503     }
504     return UNKNOWN;
505   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
506   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
507   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
508   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
509   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
510   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
511   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
512   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
513   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
514     return S_BUFFER_LOAD_IMM;
515   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
516   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
517   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
518   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
519   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
520   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
521   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
522   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
523   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
524     return S_BUFFER_LOAD_SGPR_IMM;
525   case AMDGPU::S_LOAD_DWORD_IMM:
526   case AMDGPU::S_LOAD_DWORDX2_IMM:
527   case AMDGPU::S_LOAD_DWORDX3_IMM:
528   case AMDGPU::S_LOAD_DWORDX4_IMM:
529   case AMDGPU::S_LOAD_DWORDX8_IMM:
530   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
531   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
532   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
533   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
534     return S_LOAD_IMM;
535   case AMDGPU::DS_READ_B32:
536   case AMDGPU::DS_READ_B32_gfx9:
537   case AMDGPU::DS_READ_B64:
538   case AMDGPU::DS_READ_B64_gfx9:
539     return DS_READ;
540   case AMDGPU::DS_WRITE_B32:
541   case AMDGPU::DS_WRITE_B32_gfx9:
542   case AMDGPU::DS_WRITE_B64:
543   case AMDGPU::DS_WRITE_B64_gfx9:
544     return DS_WRITE;
545   case AMDGPU::GLOBAL_LOAD_DWORD:
546   case AMDGPU::GLOBAL_LOAD_DWORDX2:
547   case AMDGPU::GLOBAL_LOAD_DWORDX3:
548   case AMDGPU::GLOBAL_LOAD_DWORDX4:
549   case AMDGPU::FLAT_LOAD_DWORD:
550   case AMDGPU::FLAT_LOAD_DWORDX2:
551   case AMDGPU::FLAT_LOAD_DWORDX3:
552   case AMDGPU::FLAT_LOAD_DWORDX4:
553     return FLAT_LOAD;
554   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
555   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
556   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
557   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
558     return GLOBAL_LOAD_SADDR;
559   case AMDGPU::GLOBAL_STORE_DWORD:
560   case AMDGPU::GLOBAL_STORE_DWORDX2:
561   case AMDGPU::GLOBAL_STORE_DWORDX3:
562   case AMDGPU::GLOBAL_STORE_DWORDX4:
563   case AMDGPU::FLAT_STORE_DWORD:
564   case AMDGPU::FLAT_STORE_DWORDX2:
565   case AMDGPU::FLAT_STORE_DWORDX3:
566   case AMDGPU::FLAT_STORE_DWORDX4:
567     return FLAT_STORE;
568   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
569   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
570   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
571   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
572     return GLOBAL_STORE_SADDR;
573   }
574 }
575 
576 /// Determines instruction subclass from opcode. Only instructions
577 /// of the same subclass can be merged together. The merged instruction may have
578 /// a different subclass but must have the same class.
579 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
580   switch (Opc) {
581   default:
582     if (TII.isMUBUF(Opc))
583       return AMDGPU::getMUBUFBaseOpcode(Opc);
584     if (TII.isImage(Opc)) {
585       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
586       assert(Info);
587       return Info->BaseOpcode;
588     }
589     if (TII.isMTBUF(Opc))
590       return AMDGPU::getMTBUFBaseOpcode(Opc);
591     return -1;
592   case AMDGPU::DS_READ_B32:
593   case AMDGPU::DS_READ_B32_gfx9:
594   case AMDGPU::DS_READ_B64:
595   case AMDGPU::DS_READ_B64_gfx9:
596   case AMDGPU::DS_WRITE_B32:
597   case AMDGPU::DS_WRITE_B32_gfx9:
598   case AMDGPU::DS_WRITE_B64:
599   case AMDGPU::DS_WRITE_B64_gfx9:
600     return Opc;
601   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
602   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
603   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
604   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
605   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
606   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
607   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
608   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
609   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
610     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
611   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
612   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
613   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
614   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
615   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
616   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
617   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
618   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
619   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
620     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
621   case AMDGPU::S_LOAD_DWORD_IMM:
622   case AMDGPU::S_LOAD_DWORDX2_IMM:
623   case AMDGPU::S_LOAD_DWORDX3_IMM:
624   case AMDGPU::S_LOAD_DWORDX4_IMM:
625   case AMDGPU::S_LOAD_DWORDX8_IMM:
626   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
627   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
628   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
629   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
630     return AMDGPU::S_LOAD_DWORD_IMM;
631   case AMDGPU::GLOBAL_LOAD_DWORD:
632   case AMDGPU::GLOBAL_LOAD_DWORDX2:
633   case AMDGPU::GLOBAL_LOAD_DWORDX3:
634   case AMDGPU::GLOBAL_LOAD_DWORDX4:
635   case AMDGPU::FLAT_LOAD_DWORD:
636   case AMDGPU::FLAT_LOAD_DWORDX2:
637   case AMDGPU::FLAT_LOAD_DWORDX3:
638   case AMDGPU::FLAT_LOAD_DWORDX4:
639     return AMDGPU::FLAT_LOAD_DWORD;
640   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
641   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
642   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
643   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
644     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
645   case AMDGPU::GLOBAL_STORE_DWORD:
646   case AMDGPU::GLOBAL_STORE_DWORDX2:
647   case AMDGPU::GLOBAL_STORE_DWORDX3:
648   case AMDGPU::GLOBAL_STORE_DWORDX4:
649   case AMDGPU::FLAT_STORE_DWORD:
650   case AMDGPU::FLAT_STORE_DWORDX2:
651   case AMDGPU::FLAT_STORE_DWORDX3:
652   case AMDGPU::FLAT_STORE_DWORDX4:
653     return AMDGPU::FLAT_STORE_DWORD;
654   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
655   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
656   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
657   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
658     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
659   }
660 }
661 
662 // GLOBAL loads and stores are initially classified as FLAT. If both combined
663 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
664 // If either or both instructions are non-segment-specific FLAT, the resulting
665 // combined operation will be FLAT, potentially promoting one of the GLOBAL
666 // operations to FLAT.
667 // For other instructions, return the original class unmodified.
668 InstClassEnum
669 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
670                                          const CombineInfo &Paired) {
671   assert(CI.InstClass == Paired.InstClass);
672 
673   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
674       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
675     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
676 
677   return CI.InstClass;
678 }
679 
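// Determine which named address operands (vaddr, srsrc, soffset, sbase, saddr,
// addr, ssamp) an instruction with opcode \p Opc carries, so setMI() can
// record them for later comparison.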
680 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
681   AddressRegs Result;
682 
683   if (TII.isMUBUF(Opc)) {
684     if (AMDGPU::getMUBUFHasVAddr(Opc))
685       Result.VAddr = true;
686     if (AMDGPU::getMUBUFHasSrsrc(Opc))
687       Result.SRsrc = true;
688     if (AMDGPU::getMUBUFHasSoffset(Opc))
689       Result.SOffset = true;
690 
691     return Result;
692   }
693 
694   if (TII.isImage(Opc)) {
695     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
696     if (VAddr0Idx >= 0) {
697       int RsrcName =
698           TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
699       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
700       Result.NumVAddrs = RsrcIdx - VAddr0Idx;
701     } else {
702       Result.VAddr = true;
703     }
704     Result.SRsrc = true;
705     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
706     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
707       Result.SSamp = true;
708 
709     return Result;
710   }
711   if (TII.isMTBUF(Opc)) {
712     if (AMDGPU::getMTBUFHasVAddr(Opc))
713       Result.VAddr = true;
714     if (AMDGPU::getMTBUFHasSrsrc(Opc))
715       Result.SRsrc = true;
716     if (AMDGPU::getMTBUFHasSoffset(Opc))
717       Result.SOffset = true;
718 
719     return Result;
720   }
721 
722   switch (Opc) {
723   default:
724     return Result;
725   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
726   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
727   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
728   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
729   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
730   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
731   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
732   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
733   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
734     Result.SOffset = true;
735     [[fallthrough]];
736   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
737   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
738   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
739   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
740   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
741   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
742   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
743   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
744   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
745   case AMDGPU::S_LOAD_DWORD_IMM:
746   case AMDGPU::S_LOAD_DWORDX2_IMM:
747   case AMDGPU::S_LOAD_DWORDX3_IMM:
748   case AMDGPU::S_LOAD_DWORDX4_IMM:
749   case AMDGPU::S_LOAD_DWORDX8_IMM:
750   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
751   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
752   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
753   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
754     Result.SBase = true;
755     return Result;
756   case AMDGPU::DS_READ_B32:
757   case AMDGPU::DS_READ_B64:
758   case AMDGPU::DS_READ_B32_gfx9:
759   case AMDGPU::DS_READ_B64_gfx9:
760   case AMDGPU::DS_WRITE_B32:
761   case AMDGPU::DS_WRITE_B64:
762   case AMDGPU::DS_WRITE_B32_gfx9:
763   case AMDGPU::DS_WRITE_B64_gfx9:
764     Result.Addr = true;
765     return Result;
766   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
767   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
768   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
769   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
770   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
771   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
772   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
773   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
774     Result.SAddr = true;
775     [[fallthrough]];
776   case AMDGPU::GLOBAL_LOAD_DWORD:
777   case AMDGPU::GLOBAL_LOAD_DWORDX2:
778   case AMDGPU::GLOBAL_LOAD_DWORDX3:
779   case AMDGPU::GLOBAL_LOAD_DWORDX4:
780   case AMDGPU::GLOBAL_STORE_DWORD:
781   case AMDGPU::GLOBAL_STORE_DWORDX2:
782   case AMDGPU::GLOBAL_STORE_DWORDX3:
783   case AMDGPU::GLOBAL_STORE_DWORDX4:
784   case AMDGPU::FLAT_LOAD_DWORD:
785   case AMDGPU::FLAT_LOAD_DWORDX2:
786   case AMDGPU::FLAT_LOAD_DWORDX3:
787   case AMDGPU::FLAT_LOAD_DWORDX4:
788   case AMDGPU::FLAT_STORE_DWORD:
789   case AMDGPU::FLAT_STORE_DWORDX2:
790   case AMDGPU::FLAT_STORE_DWORDX3:
791   case AMDGPU::FLAT_STORE_DWORDX4:
792     Result.VAddr = true;
793     return Result;
794   }
795 }
796 
797 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
798                                               const SILoadStoreOptimizer &LSO) {
799   I = MI;
800   unsigned Opc = MI->getOpcode();
801   InstClass = getInstClass(Opc, *LSO.TII);
802 
803   if (InstClass == UNKNOWN)
804     return;
805 
806   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
807 
808   switch (InstClass) {
809   case DS_READ:
810     EltSize =
811           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
812                                                                           : 4;
813     break;
814   case DS_WRITE:
815     EltSize =
816           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
817                                                                             : 4;
818     break;
819   case S_BUFFER_LOAD_IMM:
820   case S_BUFFER_LOAD_SGPR_IMM:
821   case S_LOAD_IMM:
822     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
823     break;
824   default:
825     EltSize = 4;
826     break;
827   }
828 
829   if (InstClass == MIMG) {
830     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
831     // Offset is not considered for MIMG instructions.
832     Offset = 0;
833   } else {
834     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
835     Offset = I->getOperand(OffsetIdx).getImm();
836   }
837 
838   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
839     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
840 
841   Width = getOpcodeWidth(*I, *LSO.TII);
842 
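  // DS offsets are 16-bit unsigned immediates; for the other classes (except
  // MIMG) record the cache policy so merging can require it to match.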
843   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
844     Offset &= 0xffff;
845   } else if (InstClass != MIMG) {
846     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
847   }
848 
849   AddressRegs Regs = getRegs(Opc, *LSO.TII);
850   bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
851 
852   NumAddresses = 0;
853   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
854     AddrIdx[NumAddresses++] =
855         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
856   if (Regs.Addr)
857     AddrIdx[NumAddresses++] =
858         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
859   if (Regs.SBase)
860     AddrIdx[NumAddresses++] =
861         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
862   if (Regs.SRsrc)
863     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
864         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
865   if (Regs.SOffset)
866     AddrIdx[NumAddresses++] =
867         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
868   if (Regs.SAddr)
869     AddrIdx[NumAddresses++] =
870         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
871   if (Regs.VAddr)
872     AddrIdx[NumAddresses++] =
873         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
874   if (Regs.SSamp)
875     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
876         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
877   assert(NumAddresses <= MaxAddressRegs);
878 
879   for (unsigned J = 0; J < NumAddresses; J++)
880     AddrReg[J] = &I->getOperand(AddrIdx[J]);
881 }
882 
883 } // end anonymous namespace.
884 
885 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
886                       "SI Load Store Optimizer", false, false)
887 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
888 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
889                     false, false)
890 
891 char SILoadStoreOptimizer::ID = 0;
892 
893 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
894 
895 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
896   return new SILoadStoreOptimizer();
897 }
898 
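// Record every register defined and every register read by \p MI into RegDefs
// and RegUses respectively.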
899 static void addDefsUsesToList(const MachineInstr &MI,
900                               DenseSet<Register> &RegDefs,
901                               DenseSet<Register> &RegUses) {
902   for (const auto &Op : MI.operands()) {
903     if (!Op.isReg())
904       continue;
905     if (Op.isDef())
906       RegDefs.insert(Op.getReg());
907     if (Op.readsReg())
908       RegUses.insert(Op.getReg());
909   }
910 }
911 
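// Return true if instructions A and B can safely be reordered past each other.
// ARegDefs and ARegUses must contain the registers defined and read by A, as
// collected by addDefsUsesToList.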
912 bool SILoadStoreOptimizer::canSwapInstructions(
913     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
914     const MachineInstr &A, const MachineInstr &B) const {
915   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
916       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
917     return false;
918   for (const auto &BOp : B.operands()) {
919     if (!BOp.isReg())
920       continue;
921     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
922       return false;
923     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
924       return false;
925   }
926   return true;
927 }
928 
929 // Given that \p CI and \p Paired are adjacent memory operations produce a new
930 // MMO for the combined operation with a new access size.
931 MachineMemOperand *
932 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
933                                                const CombineInfo &Paired) {
934   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
935   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
936 
937   unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
938 
939   // A base pointer for the combined operation is the same as the leading
940   // operation's pointer.
941   if (Paired < CI)
942     std::swap(MMOa, MMOb);
943 
944   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
945   // If merging FLAT and GLOBAL, set the address space to FLAT.
946   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
947     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
948 
949   MachineFunction *MF = CI.I->getMF();
950   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
951 }
952 
953 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
954                                                const SIInstrInfo &TII,
955                                                const CombineInfo &Paired) {
956   assert(CI.InstClass == MIMG);
957 
958   // Ignore instructions with tfe/lwe set.
959   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
960   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
961 
962   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
963     return false;
964 
965   // Check other optional immediate operands for equality.
966   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
967                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
968                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
969 
970   for (auto op : OperandsToMatch) {
971     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
972     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
973       return false;
974     if (Idx != -1 &&
975         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
976       return false;
977   }
978 
979   // Check DMask for overlaps.
980   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
981   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
982 
983   if (!MaxMask)
984     return false;
985 
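  // Only allow the merge if every set bit of the smaller dmask lies below the
  // lowest set bit of the larger one, i.e. the masks are disjoint and the
  // smaller mask's components come first (e.g. 0x3 pairs with 0xc, not 0x6).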
986   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
987   if ((1u << AllowedBitsForMin) <= MinMask)
988     return false;
989 
990   return true;
991 }
992 
993 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
994                                        unsigned ComponentCount,
995                                        const GCNSubtarget &STI) {
996   if (ComponentCount > 4)
997     return 0;
998 
999   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1000       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
1001   if (!OldFormatInfo)
1002     return 0;
1003 
1004   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1005       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
1006                                            ComponentCount,
1007                                            OldFormatInfo->NumFormat, STI);
1008 
1009   if (!NewFormatInfo)
1010     return 0;
1011 
1012   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1013          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1014 
1015   return NewFormatInfo->Format;
1016 }
1017 
1018 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
1019 // highest power of two. Note that the result is well defined for all inputs
1020 // including corner cases like:
1021 // - if Lo == Hi, return that value
1022 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
1023 // - if Lo > Hi, return 0 (as if the range wrapped around)
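// For example, mostAlignedValueInRange(5, 11) == 8, since 8 is divisible by a
// higher power of two than any other value in that range.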
1024 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1025   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
1026 }
1027 
1028 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1029                                                 const GCNSubtarget &STI,
1030                                                 CombineInfo &Paired,
1031                                                 bool Modify) {
1032   assert(CI.InstClass != MIMG);
1033 
1034   // XXX - Would the same offset be OK? Is there any reason this would happen or
1035   // be useful?
1036   if (CI.Offset == Paired.Offset)
1037     return false;
1038 
1039   // This won't be valid if the offset isn't aligned.
1040   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1041     return false;
1042 
1043   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1044 
1045     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1046         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1047     if (!Info0)
1048       return false;
1049     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1050         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1051     if (!Info1)
1052       return false;
1053 
1054     if (Info0->BitsPerComp != Info1->BitsPerComp ||
1055         Info0->NumFormat != Info1->NumFormat)
1056       return false;
1057 
1058     // TODO: Should be possible to support more formats, but if format loads
1059     // are not dword-aligned, the merged load might not be valid.
1060     if (Info0->BitsPerComp != 32)
1061       return false;
1062 
1063     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1064       return false;
1065   }
1066 
1067   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1068   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1069   CI.UseST64 = false;
1070   CI.BaseOff = 0;
1071 
1072   // Handle all non-DS instructions.
1073   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1074     if (EltOffset0 + CI.Width != EltOffset1 &&
1075             EltOffset1 + Paired.Width != EltOffset0)
1076       return false;
1077     if (CI.CPol != Paired.CPol)
1078       return false;
1079     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1080         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1081       // Reject cases like:
1082       //   dword + dwordx2 -> dwordx3
1083       //   dword + dwordx3 -> dwordx4
1084       // If we tried to combine these cases, we would fail to extract a subreg
1085       // for the result of the second load due to SGPR alignment requirements.
1086       if (CI.Width != Paired.Width &&
1087           (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1088         return false;
1089     }
1090     return true;
1091   }
1092 
1093   // If the offset in elements doesn't fit in 8 bits, we might be able to use
1094   // the stride 64 versions.
1095   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1096       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1097     if (Modify) {
1098       CI.Offset = EltOffset0 / 64;
1099       Paired.Offset = EltOffset1 / 64;
1100       CI.UseST64 = true;
1101     }
1102     return true;
1103   }
1104 
1105   // Check if the new offsets fit in the reduced 8-bit range.
1106   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1107     if (Modify) {
1108       CI.Offset = EltOffset0;
1109       Paired.Offset = EltOffset1;
1110     }
1111     return true;
1112   }
1113 
1114   // Try to shift base address to decrease offsets.
1115   uint32_t Min = std::min(EltOffset0, EltOffset1);
1116   uint32_t Max = std::max(EltOffset0, EltOffset1);
1117 
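  // ST64 case: this requires the two offsets to differ by a multiple of 64
  // elements and by no more than 255 * 64, so that after rebasing both reduced
  // offsets fit in the 8-bit fields of the ST64 form.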
1118   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1119   if (((Max - Min) & ~Mask) == 0) {
1120     if (Modify) {
1121       // From the range of values we could use for BaseOff, choose the one that
1122       // is aligned to the highest power of two, to maximise the chance that
1123       // the same offset can be reused for other load/store pairs.
1124       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1125       // Copy the low bits of the offsets, so that when we adjust them by
1126       // subtracting BaseOff they will be multiples of 64.
1127       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1128       CI.BaseOff = BaseOff * CI.EltSize;
1129       CI.Offset = (EltOffset0 - BaseOff) / 64;
1130       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1131       CI.UseST64 = true;
1132     }
1133     return true;
1134   }
1135 
1136   if (isUInt<8>(Max - Min)) {
1137     if (Modify) {
1138       // From the range of values we could use for BaseOff, choose the one that
1139       // is aligned to the highest power of two, to maximise the chance that
1140       // the same offset can be reused for other load/store pairs.
1141       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1142       CI.BaseOff = BaseOff * CI.EltSize;
1143       CI.Offset = EltOffset0 - BaseOff;
1144       Paired.Offset = EltOffset1 - BaseOff;
1145     }
1146     return true;
1147   }
1148 
1149   return false;
1150 }
1151 
1152 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1153                                      const CombineInfo &CI,
1154                                      const CombineInfo &Paired) {
1155   const unsigned Width = (CI.Width + Paired.Width);
1156   switch (CI.InstClass) {
1157   default:
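    // Non-scalar merges produce at most a dwordx4 result; a three-dword
    // result additionally requires subtarget support for dwordx3 loads and
    // stores.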
1158     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1159   case S_BUFFER_LOAD_IMM:
1160   case S_BUFFER_LOAD_SGPR_IMM:
1161   case S_LOAD_IMM:
1162     switch (Width) {
1163     default:
1164       return false;
1165     case 2:
1166     case 4:
1167     case 8:
1168       return true;
1169     case 3:
1170       return STM.hasScalarDwordx3Loads();
1171     }
1172   }
1173 }
1174 
1175 const TargetRegisterClass *
1176 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1177   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1178     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1179   }
1180   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1181     return TRI->getRegClassForReg(*MRI, Src->getReg());
1182   }
1183   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1184     return TRI->getRegClassForReg(*MRI, Src->getReg());
1185   }
1186   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1187     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1188   }
1189   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1190     return TRI->getRegClassForReg(*MRI, Src->getReg());
1191   }
1192   return nullptr;
1193 }
1194 
1195 /// This function assumes that CI comes before Paired in a basic block. Return
1196 /// an insertion point for the merged instruction or nullptr on failure.
1197 SILoadStoreOptimizer::CombineInfo *
1198 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1199                                            CombineInfo &Paired) {
1200   // If another instruction has already been merged into CI, it may now be a
1201   // type that we can't do any further merging into.
1202   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1203     return nullptr;
1204   assert(CI.InstClass == Paired.InstClass);
1205 
1206   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1207       getInstSubclass(Paired.I->getOpcode(), *TII))
1208     return nullptr;
1209 
1210   // Check both offsets (or masks for MIMG) can be combined and fit in the
1211   // reduced range.
1212   if (CI.InstClass == MIMG) {
1213     if (!dmasksCanBeCombined(CI, *TII, Paired))
1214       return nullptr;
1215   } else {
1216     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1217       return nullptr;
1218   }
1219 
1220   DenseSet<Register> RegDefs;
1221   DenseSet<Register> RegUses;
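  // The merged instruction is inserted at the position of one of the two
  // originals: loads are merged at CI by hoisting Paired up, stores at Paired
  // by sinking CI down.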
1222   CombineInfo *Where;
1223   if (CI.I->mayLoad()) {
1224     // Try to hoist Paired up to CI.
1225     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1226     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1227       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1228         return nullptr;
1229     }
1230     Where = &CI;
1231   } else {
1232     // Try to sink CI down to Paired.
1233     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1234     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1235       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1236         return nullptr;
1237     }
1238     Where = &Paired;
1239   }
1240 
1241   // Call offsetsCanBeCombined with modify = true so that the offsets are
1242   // correct for the new instruction.  This should return true, because
1243   // this function should only be called on CombineInfo objects that
1244   // have already been confirmed to be mergeable.
1245   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1246     offsetsCanBeCombined(CI, *STM, Paired, true);
1247   return Where;
1248 }
1249 
1250 // Copy the merged load result from DestReg to the original dest regs of CI and
1251 // Paired.
1252 void SILoadStoreOptimizer::copyToDestRegs(
1253     CombineInfo &CI, CombineInfo &Paired,
1254     MachineBasicBlock::iterator InsertBefore, int OpName,
1255     Register DestReg) const {
1256   MachineBasicBlock *MBB = CI.I->getParent();
1257   DebugLoc DL = CI.I->getDebugLoc();
1258 
1259   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1260 
1261   // Copy to the old destination registers.
1262   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1263   auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1264   auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1265 
1266   // The constrained sload instructions in the S_LOAD_IMM class will have the
1267   // `early-clobber` flag set on the dst operand. Remove the flag before using
1268   // the MOs in copies.
1269   Dest0->setIsEarlyClobber(false);
1270   Dest1->setIsEarlyClobber(false);
1271 
1272   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1273       .add(*Dest0) // Copy to same destination including flags and sub reg.
1274       .addReg(DestReg, 0, SubRegIdx0);
1275   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1276       .add(*Dest1)
1277       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1278 }
1279 
1280 // Return a register for the source of the merged store after copying the
1281 // original source regs of CI and Paired into it.
1282 Register
1283 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1284                                       MachineBasicBlock::iterator InsertBefore,
1285                                       int OpName) const {
1286   MachineBasicBlock *MBB = CI.I->getParent();
1287   DebugLoc DL = CI.I->getDebugLoc();
1288 
1289   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1290 
1291   // Copy to the new source register.
1292   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1293   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1294 
1295   const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1296   const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1297 
1298   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1299       .add(*Src0)
1300       .addImm(SubRegIdx0)
1301       .add(*Src1)
1302       .addImm(SubRegIdx1);
1303 
1304   return SrcReg;
1305 }
1306 
1307 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1308   if (STM->ldsRequiresM0Init())
1309     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1310   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1311 }
1312 
1313 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1314   if (STM->ldsRequiresM0Init())
1315     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1316 
1317   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1318                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1319 }
1320 
1321 MachineBasicBlock::iterator
1322 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1323                                      MachineBasicBlock::iterator InsertBefore) {
1324   MachineBasicBlock *MBB = CI.I->getParent();
1325 
1326   // Be careful, since the addresses could be subregisters themselves in weird
1327   // cases, like vectors of pointers.
1328   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1329 
1330   unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1331   unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1332   unsigned Opc =
1333       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1334 
1335   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1336          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1337 
1338   const MCInstrDesc &Read2Desc = TII->get(Opc);
1339 
1340   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1341   Register DestReg = MRI->createVirtualRegister(SuperRC);
1342 
1343   DebugLoc DL = CI.I->getDebugLoc();
1344 
1345   Register BaseReg = AddrReg->getReg();
1346   unsigned BaseSubReg = AddrReg->getSubReg();
1347   unsigned BaseRegFlags = 0;
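  // If offsetsCanBeCombined() chose a nonzero BaseOff, materialize it and add
  // it to the original address so the reduced offsets fit the read2 encoding.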
1348   if (CI.BaseOff) {
1349     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1350     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1351         .addImm(CI.BaseOff);
1352 
1353     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1354     BaseRegFlags = RegState::Kill;
1355 
1356     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1357         .addReg(ImmReg)
1358         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1359         .addImm(0); // clamp bit
1360     BaseSubReg = 0;
1361   }
1362 
1363   MachineInstrBuilder Read2 =
1364       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1365           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1366           .addImm(NewOffset0)                        // offset0
1367           .addImm(NewOffset1)                        // offset1
1368           .addImm(0)                                 // gds
1369           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1370 
1371   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1372 
1373   CI.I->eraseFromParent();
1374   Paired.I->eraseFromParent();
1375 
1376   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1377   return Read2;
1378 }
1379 
1380 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1381   if (STM->ldsRequiresM0Init())
1382     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1383   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1384                         : AMDGPU::DS_WRITE2_B64_gfx9;
1385 }
1386 
1387 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1388   if (STM->ldsRequiresM0Init())
1389     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1390                           : AMDGPU::DS_WRITE2ST64_B64;
1391 
1392   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1393                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1394 }
1395 
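// An illustrative sketch of the write2 merge (register numbers hypothetical):
//   ds_write_b32 v2, v0 offset:16
//   ds_write_b32 v2, v1 offset:20
// ==>
//   ds_write2_b32 v2, v0, v1 offset0:4 offset1:5
// with offset0/offset1 expressed in EltSize (here 4-byte) units.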
1396 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1397     CombineInfo &CI, CombineInfo &Paired,
1398     MachineBasicBlock::iterator InsertBefore) {
1399   MachineBasicBlock *MBB = CI.I->getParent();
1400 
1401   // Be sure to use .add(), and not .addReg(), with these. We want to be sure
1402   // we preserve the subregister index and any register flags set on them.
1403   const MachineOperand *AddrReg =
1404       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1405   const MachineOperand *Data0 =
1406       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1407   const MachineOperand *Data1 =
1408       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1409 
1410   unsigned NewOffset0 = CI.Offset;
1411   unsigned NewOffset1 = Paired.Offset;
1412   unsigned Opc =
1413       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1414 
1415   if (NewOffset0 > NewOffset1) {
1416     // Canonicalize the merged instruction so the smaller offset comes first.
1417     std::swap(NewOffset0, NewOffset1);
1418     std::swap(Data0, Data1);
1419   }
1420 
1421   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1422          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1423 
1424   const MCInstrDesc &Write2Desc = TII->get(Opc);
1425   DebugLoc DL = CI.I->getDebugLoc();
1426 
1427   Register BaseReg = AddrReg->getReg();
1428   unsigned BaseSubReg = AddrReg->getSubReg();
1429   unsigned BaseRegFlags = 0;
1430   if (CI.BaseOff) {
1431     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1432     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1433         .addImm(CI.BaseOff);
1434 
1435     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1436     BaseRegFlags = RegState::Kill;
1437 
1438     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1439         .addReg(ImmReg)
1440         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1441         .addImm(0); // clamp bit
1442     BaseSubReg = 0;
1443   }
1444 
1445   MachineInstrBuilder Write2 =
1446       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1447           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1448           .add(*Data0)                               // data0
1449           .add(*Data1)                               // data1
1450           .addImm(NewOffset0)                        // offset0
1451           .addImm(NewOffset1)                        // offset1
1452           .addImm(0)                                 // gds
1453           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1454 
1455   CI.I->eraseFromParent();
1456   Paired.I->eraseFromParent();
1457 
1458   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1459   return Write2;
1460 }
1461 
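// A sketch of the image-load merge (values are illustrative): two loads of
// disjoint channels, e.g. dmask:0x1 and dmask:0x2, are combined into a single
// load with the union dmask:0x3 and a correspondingly wider destination, then
// the original destinations are copied out of sub-registers of the result.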
1462 MachineBasicBlock::iterator
1463 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1464                                      MachineBasicBlock::iterator InsertBefore) {
1465   MachineBasicBlock *MBB = CI.I->getParent();
1466   DebugLoc DL = CI.I->getDebugLoc();
1467   const unsigned Opcode = getNewOpcode(CI, Paired);
1468 
1469   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1470 
1471   Register DestReg = MRI->createVirtualRegister(SuperRC);
1472   unsigned MergedDMask = CI.DMask | Paired.DMask;
1473   unsigned DMaskIdx =
1474       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1475 
1476   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1477   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1478     if (I == DMaskIdx)
1479       MIB.addImm(MergedDMask);
1480     else
1481       MIB.add((*CI.I).getOperand(I));
1482   }
1483 
1484   // It shouldn't be possible to get this far if the two instructions
1485   // don't have a single memoperand, because MachineInstr::mayAlias()
1486   // will return true if this is the case.
1487   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1488 
1489   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1490 
1491   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1492 
1493   CI.I->eraseFromParent();
1494   Paired.I->eraseFromParent();
1495   return New;
1496 }
1497 
1498 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1499     CombineInfo &CI, CombineInfo &Paired,
1500     MachineBasicBlock::iterator InsertBefore) {
1501   MachineBasicBlock *MBB = CI.I->getParent();
1502   DebugLoc DL = CI.I->getDebugLoc();
1503   const unsigned Opcode = getNewOpcode(CI, Paired);
1504 
1505   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1506 
1507   Register DestReg = MRI->createVirtualRegister(SuperRC);
1508   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1509 
1510   // It shouldn't be possible to get this far if the two instructions
1511   // don't have a single memoperand, because MachineInstr::mayAlias()
1512   // will return true if this is the case.
1513   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1514 
1515   MachineInstrBuilder New =
1516       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1517           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1518   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1519     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1520   New.addImm(MergedOffset);
1521   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1522 
1523   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1524 
1525   CI.I->eraseFromParent();
1526   Paired.I->eraseFromParent();
1527   return New;
1528 }
1529 
1530 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1531     CombineInfo &CI, CombineInfo &Paired,
1532     MachineBasicBlock::iterator InsertBefore) {
1533   MachineBasicBlock *MBB = CI.I->getParent();
1534   DebugLoc DL = CI.I->getDebugLoc();
1535 
1536   const unsigned Opcode = getNewOpcode(CI, Paired);
1537 
1538   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1539 
1540   // Create the destination register for the merged load.
1541   Register DestReg = MRI->createVirtualRegister(SuperRC);
1542   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1543 
1544   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1545 
1546   AddressRegs Regs = getRegs(Opcode, *TII);
1547 
1548   if (Regs.VAddr)
1549     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1550 
1551   // It shouldn't be possible to get this far if the two instructions
1552   // don't have a single memoperand, because MachineInstr::mayAlias()
1553   // will return true if this is the case.
1554   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1555 
1556   MachineInstr *New =
1557     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1558         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1559         .addImm(MergedOffset) // offset
1560         .addImm(CI.CPol)      // cpol
1561         .addImm(0)            // swz
1562         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1563 
1564   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1565 
1566   CI.I->eraseFromParent();
1567   Paired.I->eraseFromParent();
1568   return New;
1569 }
1570 
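// A sketch of the tbuffer merge (format names illustrative): two single-
// component loads, e.g. both using a one-component 32-bit data format, are
// combined into one load whose format is recomputed for the summed component
// count by getBufferFormatWithCompCount (e.g. a two-component 32_32 format).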
1571 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1572     CombineInfo &CI, CombineInfo &Paired,
1573     MachineBasicBlock::iterator InsertBefore) {
1574   MachineBasicBlock *MBB = CI.I->getParent();
1575   DebugLoc DL = CI.I->getDebugLoc();
1576 
1577   const unsigned Opcode = getNewOpcode(CI, Paired);
1578 
1579   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1580 
1581   // Create the destination register for the merged load.
1582   Register DestReg = MRI->createVirtualRegister(SuperRC);
1583   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1584 
1585   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1586 
1587   AddressRegs Regs = getRegs(Opcode, *TII);
1588 
1589   if (Regs.VAddr)
1590     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1591 
1592   unsigned JoinedFormat =
1593       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1594 
1595   // It shouldn't be possible to get this far if the two instructions
1596   // don't have a single memoperand, because MachineInstr::mayAlias()
1597   // will return true if this is the case.
1598   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1599 
1600   MachineInstr *New =
1601       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1602           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1603           .addImm(MergedOffset) // offset
1604           .addImm(JoinedFormat) // format
1605           .addImm(CI.CPol)      // cpol
1606           .addImm(0)            // swz
1607           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1608 
1609   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1610 
1611   CI.I->eraseFromParent();
1612   Paired.I->eraseFromParent();
1613   return New;
1614 }
1615 
1616 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1617     CombineInfo &CI, CombineInfo &Paired,
1618     MachineBasicBlock::iterator InsertBefore) {
1619   MachineBasicBlock *MBB = CI.I->getParent();
1620   DebugLoc DL = CI.I->getDebugLoc();
1621 
1622   const unsigned Opcode = getNewOpcode(CI, Paired);
1623 
1624   Register SrcReg =
1625       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1626 
1627   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1628                  .addReg(SrcReg, RegState::Kill);
1629 
1630   AddressRegs Regs = getRegs(Opcode, *TII);
1631 
1632   if (Regs.VAddr)
1633     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1634 
1635   unsigned JoinedFormat =
1636       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1637 
1638   // It shouldn't be possible to get this far if the two instructions
1639   // don't have a single memoperand, because MachineInstr::mayAlias()
1640   // will return true if this is the case.
1641   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1642 
1643   MachineInstr *New =
1644       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1645           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1646           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1647           .addImm(JoinedFormat)                     // format
1648           .addImm(CI.CPol)                          // cpol
1649           .addImm(0)                                // swz
1650           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1651 
1652   CI.I->eraseFromParent();
1653   Paired.I->eraseFromParent();
1654   return New;
1655 }
1656 
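// An illustrative sketch of the flat/global load merge (registers hypothetical):
//   global_load_dword v0, v[2:3], off offset:8
//   global_load_dword v1, v[2:3], off offset:12
// ==>
//   global_load_dwordx2 v[0:1], v[2:3], off offset:8
// The merged instruction uses the smaller of the two offsets.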
1657 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1658     CombineInfo &CI, CombineInfo &Paired,
1659     MachineBasicBlock::iterator InsertBefore) {
1660   MachineBasicBlock *MBB = CI.I->getParent();
1661   DebugLoc DL = CI.I->getDebugLoc();
1662 
1663   const unsigned Opcode = getNewOpcode(CI, Paired);
1664 
1665   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1666   Register DestReg = MRI->createVirtualRegister(SuperRC);
1667 
1668   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1669 
1670   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1671     MIB.add(*SAddr);
1672 
1673   MachineInstr *New =
1674     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1675        .addImm(std::min(CI.Offset, Paired.Offset))
1676        .addImm(CI.CPol)
1677        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1678 
1679   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1680 
1681   CI.I->eraseFromParent();
1682   Paired.I->eraseFromParent();
1683   return New;
1684 }
1685 
1686 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1687     CombineInfo &CI, CombineInfo &Paired,
1688     MachineBasicBlock::iterator InsertBefore) {
1689   MachineBasicBlock *MBB = CI.I->getParent();
1690   DebugLoc DL = CI.I->getDebugLoc();
1691 
1692   const unsigned Opcode = getNewOpcode(CI, Paired);
1693 
1694   Register SrcReg =
1695       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1696 
1697   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1698                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1699                  .addReg(SrcReg, RegState::Kill);
1700 
1701   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1702     MIB.add(*SAddr);
1703 
1704   MachineInstr *New =
1705     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1706        .addImm(CI.CPol)
1707        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1708 
1709   CI.I->eraseFromParent();
1710   Paired.I->eraseFromParent();
1711   return New;
1712 }
1713 
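// Worked example (a sketch): merging two dword loads gives Width == 2, i.e. an
// 8-byte access. On an XNACK-enabled subtarget, if the known MMO alignment is
// below 8 bytes (or there is not exactly one MMO to inspect), the constrained
// "_ec" opcode variant is selected by getNewOpcode below.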
1714 static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1715                                    ArrayRef<MachineMemOperand *> MMOs,
1716                                    unsigned Width) {
1717   // Conservatively return true if the MMO cannot be found.
1718   return STM.isXNACKEnabled() &&
1719          (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1720 }
1721 
1722 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1723                                             const CombineInfo &Paired) {
1724   const unsigned Width = CI.Width + Paired.Width;
1725 
1726   switch (getCommonInstClass(CI, Paired)) {
1727   default:
1728     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1729     // FIXME: Handle d16 correctly
1730     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1731                                   Width);
1732   case TBUFFER_LOAD:
1733   case TBUFFER_STORE:
1734     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1735                                   Width);
1736 
1737   case UNKNOWN:
1738     llvm_unreachable("Unknown instruction class");
1739   case S_BUFFER_LOAD_IMM: {
1740     // If XNACK is enabled, use the constrained opcodes when the first load is
1741     // under-aligned.
1742     bool NeedsConstrainedOpc =
1743         needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1744     switch (Width) {
1745     default:
1746       return 0;
1747     case 2:
1748       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1749                                  : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1750     case 3:
1751       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1752                                  : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1753     case 4:
1754       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1755                                  : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1756     case 8:
1757       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1758                                  : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1759     }
1760   }
1761   case S_BUFFER_LOAD_SGPR_IMM: {
1762     // If XNACK is enabled, use the constrained opcodes when the first load is
1763     // under-aligned.
1764     bool NeedsConstrainedOpc =
1765         needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1766     switch (Width) {
1767     default:
1768       return 0;
1769     case 2:
1770       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1771                                  : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1772     case 3:
1773       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1774                                  : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1775     case 4:
1776       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1777                                  : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1778     case 8:
1779       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1780                                  : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1781     }
1782   }
1783   case S_LOAD_IMM: {
1784     // If XNACK is enabled, use the constrained opcodes when the first load is
1785     // under-aligned.
1786     bool NeedsConstrainedOpc =
1787         needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1788     switch (Width) {
1789     default:
1790       return 0;
1791     case 2:
1792       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1793                                  : AMDGPU::S_LOAD_DWORDX2_IMM;
1794     case 3:
1795       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1796                                  : AMDGPU::S_LOAD_DWORDX3_IMM;
1797     case 4:
1798       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1799                                  : AMDGPU::S_LOAD_DWORDX4_IMM;
1800     case 8:
1801       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1802                                  : AMDGPU::S_LOAD_DWORDX8_IMM;
1803     }
1804   }
1805   case GLOBAL_LOAD:
1806     switch (Width) {
1807     default:
1808       return 0;
1809     case 2:
1810       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1811     case 3:
1812       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1813     case 4:
1814       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1815     }
1816   case GLOBAL_LOAD_SADDR:
1817     switch (Width) {
1818     default:
1819       return 0;
1820     case 2:
1821       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1822     case 3:
1823       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1824     case 4:
1825       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1826     }
1827   case GLOBAL_STORE:
1828     switch (Width) {
1829     default:
1830       return 0;
1831     case 2:
1832       return AMDGPU::GLOBAL_STORE_DWORDX2;
1833     case 3:
1834       return AMDGPU::GLOBAL_STORE_DWORDX3;
1835     case 4:
1836       return AMDGPU::GLOBAL_STORE_DWORDX4;
1837     }
1838   case GLOBAL_STORE_SADDR:
1839     switch (Width) {
1840     default:
1841       return 0;
1842     case 2:
1843       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1844     case 3:
1845       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1846     case 4:
1847       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1848     }
1849   case FLAT_LOAD:
1850     switch (Width) {
1851     default:
1852       return 0;
1853     case 2:
1854       return AMDGPU::FLAT_LOAD_DWORDX2;
1855     case 3:
1856       return AMDGPU::FLAT_LOAD_DWORDX3;
1857     case 4:
1858       return AMDGPU::FLAT_LOAD_DWORDX4;
1859     }
1860   case FLAT_STORE:
1861     switch (Width) {
1862     default:
1863       return 0;
1864     case 2:
1865       return AMDGPU::FLAT_STORE_DWORDX2;
1866     case 3:
1867       return AMDGPU::FLAT_STORE_DWORDX3;
1868     case 4:
1869       return AMDGPU::FLAT_STORE_DWORDX4;
1870     }
1871   case MIMG:
1872     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1873            "No overlaps");
1874     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1875   }
1876 }
1877 
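// Illustration of the sub-register index selection below: when CI occupies the
// low part of the merged result, two one-dword accesses yield {sub0, sub1} and
// two two-dword accesses yield {sub0_sub1, sub2_sub3}; if Paired compares
// lower, the low/high roles are swapped accordingly.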
1878 std::pair<unsigned, unsigned>
1879 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1880                                     const CombineInfo &Paired) {
1881   assert((CI.InstClass != MIMG ||
1882           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1883            CI.Width + Paired.Width)) &&
1884          "No overlaps");
1885 
1886   unsigned Idx0;
1887   unsigned Idx1;
1888 
1889   static const unsigned Idxs[5][4] = {
1890       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1891       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1892       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1893       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1894       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1895   };
1896 
1897   assert(CI.Width >= 1 && CI.Width <= 4);
1898   assert(Paired.Width >= 1 && Paired.Width <= 4);
1899 
1900   if (Paired < CI) {
1901     Idx1 = Idxs[0][Paired.Width - 1];
1902     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1903   } else {
1904     Idx0 = Idxs[0][CI.Width - 1];
1905     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1906   }
1907 
1908   return {Idx0, Idx1};
1909 }
1910 
1911 const TargetRegisterClass *
1912 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1913                                              const CombineInfo &Paired) const {
1914   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1915       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1916     switch (CI.Width + Paired.Width) {
1917     default:
1918       return nullptr;
1919     case 2:
1920       return &AMDGPU::SReg_64_XEXECRegClass;
1921     case 3:
1922       return &AMDGPU::SGPR_96RegClass;
1923     case 4:
1924       return &AMDGPU::SGPR_128RegClass;
1925     case 8:
1926       return &AMDGPU::SGPR_256RegClass;
1927     case 16:
1928       return &AMDGPU::SGPR_512RegClass;
1929     }
1930   }
1931 
1932   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1933   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1934              ? TRI->getAGPRClassForBitWidth(BitWidth)
1935              : TRI->getVGPRClassForBitWidth(BitWidth);
1936 }
1937 
1938 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1939     CombineInfo &CI, CombineInfo &Paired,
1940     MachineBasicBlock::iterator InsertBefore) {
1941   MachineBasicBlock *MBB = CI.I->getParent();
1942   DebugLoc DL = CI.I->getDebugLoc();
1943 
1944   const unsigned Opcode = getNewOpcode(CI, Paired);
1945 
1946   Register SrcReg =
1947       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1948 
1949   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1950                  .addReg(SrcReg, RegState::Kill);
1951 
1952   AddressRegs Regs = getRegs(Opcode, *TII);
1953 
1954   if (Regs.VAddr)
1955     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1956 
1958   // It shouldn't be possible to get this far if the two instructions
1959   // don't have a single memoperand, because MachineInstr::mayAlias()
1960   // will return true if this is the case.
1961   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1962 
1963   MachineInstr *New =
1964     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1965         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1966         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1967         .addImm(CI.CPol)      // cpol
1968         .addImm(0)            // swz
1969         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1970 
1971   CI.I->eraseFromParent();
1972   Paired.I->eraseFromParent();
1973   return New;
1974 }
1975 
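// Descriptive note: if Val can be encoded as an inline constant it is used as
// an immediate operand directly; otherwise it is materialized into an SGPR
// with S_MOV_B32 and a register operand is returned instead.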
1976 MachineOperand
1977 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1978   APInt V(32, Val, true);
1979   if (TII->isInlineConstant(V))
1980     return MachineOperand::CreateImm(Val);
1981 
1982   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1983   MachineInstr *Mov =
1984   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1985           TII->get(AMDGPU::S_MOV_B32), Reg)
1986     .addImm(Val);
1987   (void)Mov;
1988   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1989   return MachineOperand::CreateReg(Reg, false);
1990 }
1991 
1992 // Compute base address using Addr and return the final register.
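// A sketch of the emitted sequence (virtual register names are illustrative):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %Addr.Base.LoReg, OffsetLo
//   %hi:vgpr_32 = V_ADDC_U32_e64 %Addr.Base.HiReg, OffsetHi, %carry
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1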
1993 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1994                                            const MemAddress &Addr) const {
1995   MachineBasicBlock *MBB = MI.getParent();
1996   MachineBasicBlock::iterator MBBI = MI.getIterator();
1997   DebugLoc DL = MI.getDebugLoc();
1998 
1999   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2000           Addr.Base.LoSubReg) &&
2001          "Expected 32-bit Base-Register-Low!!");
2002 
2003   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2004           Addr.Base.HiSubReg) &&
2005          "Expected 32-bit Base-Register-Hi!!");
2006 
2007   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
2008   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
2009   MachineOperand OffsetHi =
2010     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2011 
2012   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
2013   Register CarryReg = MRI->createVirtualRegister(CarryRC);
2014   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2015 
2016   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2017   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2018   MachineInstr *LoHalf =
2019     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2020       .addReg(CarryReg, RegState::Define)
2021       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2022       .add(OffsetLo)
2023       .addImm(0); // clamp bit
2024   (void)LoHalf;
2025   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
2026 
2027   MachineInstr *HiHalf =
2028   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2029     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2030     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2031     .add(OffsetHi)
2032     .addReg(CarryReg, RegState::Kill)
2033     .addImm(0); // clamp bit
2034   (void)HiHalf;
2035   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
2036 
2037   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2038   MachineInstr *FullBase =
2039     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2040       .addReg(DestSub0)
2041       .addImm(AMDGPU::sub0)
2042       .addReg(DestSub1)
2043       .addImm(AMDGPU::sub1);
2044   (void)FullBase;
2045   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
2046 
2047   return FullDestReg;
2048 }
2049 
2050 // Update the base register and offset of MI with NewBase and NewOffset.
2051 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2052                                                Register NewBase,
2053                                                int32_t NewOffset) const {
2054   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2055   Base->setReg(NewBase);
2056   Base->setIsKill(false);
2057   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2058 }
2059 
2060 std::optional<int32_t>
2061 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2062   if (Op.isImm())
2063     return Op.getImm();
2064 
2065   if (!Op.isReg())
2066     return std::nullopt;
2067 
2068   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2069   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2070       !Def->getOperand(1).isImm())
2071     return std::nullopt;
2072 
2073   return Def->getOperand(1).getImm();
2074 }
2075 
2076 // Analyze Base and extract:
2077 //  - the 32-bit base registers and subregisters
2078 //  - the 64-bit constant offset
2079 // Expecting base computation as:
2080 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2081 //   %LO:vgpr_32, %c:sreg_64_xexec =
2082 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2083 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2084 //   %Base:vreg_64 =
2085 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2086 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2087                                                       MemAddress &Addr) const {
2088   if (!Base.isReg())
2089     return;
2090 
2091   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2092   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2093       || Def->getNumOperands() != 5)
2094     return;
2095 
2096   MachineOperand BaseLo = Def->getOperand(1);
2097   MachineOperand BaseHi = Def->getOperand(3);
2098   if (!BaseLo.isReg() || !BaseHi.isReg())
2099     return;
2100 
2101   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2102   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2103 
2104   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2105       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2106     return;
2107 
2108   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2109   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2110 
2111   auto Offset0P = extractConstOffset(*Src0);
2112   if (Offset0P)
2113     BaseLo = *Src1;
2114   else {
2115     if (!(Offset0P = extractConstOffset(*Src1)))
2116       return;
2117     BaseLo = *Src0;
2118   }
2119 
2120   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2121   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2122 
2123   if (Src0->isImm())
2124     std::swap(Src0, Src1);
2125 
2126   if (!Src1->isImm() || Src0->isImm())
2127     return;
2128 
2129   uint64_t Offset1 = Src1->getImm();
2130   BaseHi = *Src0;
2131 
2132   Addr.Base.LoReg = BaseLo.getReg();
2133   Addr.Base.HiReg = BaseHi.getReg();
2134   Addr.Base.LoSubReg = BaseLo.getSubReg();
2135   Addr.Base.HiSubReg = BaseHi.getSubReg();
2136   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2137 }
2138 
2139 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2140     MachineInstr &MI,
2141     MemInfoMap &Visited,
2142     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2143 
2144   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2145     return false;
2146 
2147   // TODO: Support FLAT_SCRATCH. Currently the code expects 64-bit pointers.
2148   if (SIInstrInfo::isFLATScratch(MI))
2149     return false;
2150 
2151   unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2152                                               : AMDGPUAS::FLAT_ADDRESS;
2153 
2154   if (AnchorList.count(&MI))
2155     return false;
2156 
2157   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2158 
2159   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2160     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2161     return false;
2162   }
2163 
2164   // Step1: Find the base-registers and a 64bit constant offset.
2165   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2166   MemAddress MAddr;
2167   if (!Visited.contains(&MI)) {
2168     processBaseWithConstOffset(Base, MAddr);
2169     Visited[&MI] = MAddr;
2170   } else
2171     MAddr = Visited[&MI];
2172 
2173   if (MAddr.Offset == 0) {
2174     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2175                          " constant offsets that can be promoted.\n";);
2176     return false;
2177   }
2178 
2179   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2180              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2181 
2182   // Step2: Traverse through MI's basic block and find an anchor (one that has
2183   // the same base registers) with the highest 13-bit distance from MI's offset.
2184   // E.g. (64bit loads)
2185   // bb:
2186   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2187   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2188   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2189   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2190   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2191   //
2192   // Starting from the first load, the optimization will try to find a new base
2193   // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2194   // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2195   // &a + 8192 as the new base (anchor) because the maximum distance can
2196   // presumably accommodate more intermediate bases.
2197   //
2198   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2199   // (&a + 8192) for load1, load2, load4.
2200   //   addr = &a + 8192
2201   //   load1 = load(addr,       -4096)
2202   //   load2 = load(addr,       -2048)
2203   //   load3 = load(addr,       0)
2204   //   load4 = load(addr,       2048)
2205   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2206   //
2207   MachineInstr *AnchorInst = nullptr;
2208   MemAddress AnchorAddr;
2209   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2210   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2211 
2212   MachineBasicBlock *MBB = MI.getParent();
2213   MachineBasicBlock::iterator E = MBB->end();
2214   MachineBasicBlock::iterator MBBI = MI.getIterator();
2215   ++MBBI;
2216   const SITargetLowering *TLI =
2217     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2218 
2219   for ( ; MBBI != E; ++MBBI) {
2220     MachineInstr &MINext = *MBBI;
2221     // TODO: Support finding an anchor (with the same base) from store addresses or
2222     // any other load addresses where the opcodes are different.
2223     if (MINext.getOpcode() != MI.getOpcode() ||
2224         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2225       continue;
2226 
2227     const MachineOperand &BaseNext =
2228       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2229     MemAddress MAddrNext;
2230     if (!Visited.contains(&MINext)) {
2231       processBaseWithConstOffset(BaseNext, MAddrNext);
2232       Visited[&MINext] = MAddrNext;
2233     } else
2234       MAddrNext = Visited[&MINext];
2235 
2236     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2237         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2238         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2239         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2240       continue;
2241 
2242     InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2243 
2244     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2245     TargetLoweringBase::AddrMode AM;
2246     AM.HasBaseReg = true;
2247     AM.BaseOffs = Dist;
2248     if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2249         (uint32_t)std::abs(Dist) > MaxDist) {
2250       MaxDist = std::abs(Dist);
2251 
2252       AnchorAddr = MAddrNext;
2253       AnchorInst = &MINext;
2254     }
2255   }
2256 
2257   if (AnchorInst) {
2258     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2259                AnchorInst->dump());
2260     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2261                <<  AnchorAddr.Offset << "\n\n");
2262 
2263     // Instead of moving up, just re-compute anchor-instruction's base address.
2264     Register Base = computeBase(MI, AnchorAddr);
2265 
2266     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2267     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2268 
2269     for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2270       TargetLoweringBase::AddrMode AM;
2271       AM.HasBaseReg = true;
2272       AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2273 
2274       if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2275         LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
2276                    OtherMI->dump());
2277         updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2278         LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
2279       }
2280     }
2281     AnchorList.insert(AnchorInst);
2282     return true;
2283   }
2284 
2285   return false;
2286 }
2287 
2288 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2289                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2290   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2291     if (AddrList.front().InstClass == CI.InstClass &&
2292         AddrList.front().IsAGPR == CI.IsAGPR &&
2293         AddrList.front().hasSameBaseAddress(CI)) {
2294       AddrList.emplace_back(CI);
2295       return;
2296     }
2297   }
2298 
2299   // Base address not found, so add a new list.
2300   MergeableInsts.emplace_back(1, CI);
2301 }
2302 
2303 std::pair<MachineBasicBlock::iterator, bool>
2304 SILoadStoreOptimizer::collectMergeableInsts(
2305     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2306     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2307     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2308   bool Modified = false;
2309 
2310   // Sort potentially mergeable instructions into lists, one list per base address.
2311   unsigned Order = 0;
2312   MachineBasicBlock::iterator BlockI = Begin;
2313   for (; BlockI != End; ++BlockI) {
2314     MachineInstr &MI = *BlockI;
2315 
2316     // We run this before checking if an address is mergeable, because it can produce
2317     // better code even if the instructions aren't mergeable.
2318     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2319       Modified = true;
2320 
2321     // Treat volatile accesses, ordered accesses, and unmodeled side effects as
2322     // barriers. We can still look for separate merges after such a barrier.
2323     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2324       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2325 
2326       // Search will resume after this instruction in a separate merge list.
2327       ++BlockI;
2328       break;
2329     }
2330 
2331     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2332     if (InstClass == UNKNOWN)
2333       continue;
2334 
2335     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2336     int Swizzled =
2337         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2338     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2339       continue;
2340 
2341     CombineInfo CI;
2342     CI.setMI(MI, *this);
2343     CI.Order = Order++;
2344 
2345     if (!CI.hasMergeableAddress(*MRI))
2346       continue;
2347 
2348     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2349       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2350       //        operands. However, we report that ds_write2 shall have
2351       //        only VGPR data so that machine copy propagation does not
2352       //        create an illegal instruction with VGPR and AGPR sources.
2353       //        Consequently, if we create such an instruction, the verifier
2354       //        will complain.
2355       continue;
2356     }
2357 
2358     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2359 
2360     addInstToMergeableList(CI, MergeableInsts);
2361   }
2362 
2363   // At this point we have lists of mergeable instructions.
2364   //
2365   // Part 2: Sort each list by offset so that candidate pairs end up adjacent
2366   // in the list (the actual pairing is done later, in
2367   // optimizeInstsWithSameBaseAddr). Lists containing fewer than two
2368   // instructions are discarded, since a merge needs at least two candidates.
2369 
2370   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2371                                                    E = MergeableInsts.end(); I != E;) {
2372 
2373     std::list<CombineInfo> &MergeList = *I;
2374     if (MergeList.size() <= 1) {
2375       // This means we have found only one instruction with a given address
2376       // that can be merged, and we need at least 2 instructions to do a merge,
2377       // so this list can be discarded.
2378       I = MergeableInsts.erase(I);
2379       continue;
2380     }
2381 
2382     // Sort the lists by offsets, this way mergeable instructions will be
2383     // adjacent to each other in the list, which will make it easier to find
2384     // matches.
2385     MergeList.sort(
2386         [] (const CombineInfo &A, const CombineInfo &B) {
2387           return A.Offset < B.Offset;
2388         });
2389     ++I;
2390   }
2391 
2392   return {BlockI, Modified};
2393 }
2394 
2395 // Scan through looking for adjacent LDS operations with constant offsets from
2396 // the same base register. We rely on the scheduler to do the hard work of
2397 // clustering nearby loads, and assume these are all adjacent.
2398 bool SILoadStoreOptimizer::optimizeBlock(
2399                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2400   bool Modified = false;
2401 
2402   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2403                                                    E = MergeableInsts.end(); I != E;) {
2404     std::list<CombineInfo> &MergeList = *I;
2405 
2406     bool OptimizeListAgain = false;
2407     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2408       // We weren't able to make any changes, so delete the list so we don't
2409       // process the same instructions the next time we try to optimize this
2410       // block.
2411       I = MergeableInsts.erase(I);
2412       continue;
2413     }
2414 
2415     Modified = true;
2416 
2417     // We made changes, but also determined that there were no more optimization
2418     // opportunities, so we don't need to reprocess the list.
2419     if (!OptimizeListAgain) {
2420       I = MergeableInsts.erase(I);
2421       continue;
2422     }
2423     OptimizeAgain = true;
2424   }
2425   return Modified;
2426 }
2427 
2428 bool
2429 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2430                                           std::list<CombineInfo> &MergeList,
2431                                           bool &OptimizeListAgain) {
2432   if (MergeList.empty())
2433     return false;
2434 
2435   bool Modified = false;
2436 
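  // Descriptive note: adjacent list entries are considered as a pair; the one
  // that comes earlier in program order (smaller Order) becomes CI and the
  // later one Paired. After a successful merge the new instruction replaces CI
  // in the list, so it can be merged again on a later pass (e.g. growing a
  // 2-dword result into a 4-dword one), as signalled via OptimizeListAgain.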
2437   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2438        Next = std::next(I)) {
2439 
2440     auto First = I;
2441     auto Second = Next;
2442 
2443     if ((*First).Order > (*Second).Order)
2444       std::swap(First, Second);
2445     CombineInfo &CI = *First;
2446     CombineInfo &Paired = *Second;
2447 
2448     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2449     if (!Where) {
2450       ++I;
2451       continue;
2452     }
2453 
2454     Modified = true;
2455 
2456     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2457 
2458     MachineBasicBlock::iterator NewMI;
2459     switch (CI.InstClass) {
2460     default:
2461       llvm_unreachable("unknown InstClass");
2462       break;
2463     case DS_READ:
2464       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2465       break;
2466     case DS_WRITE:
2467       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2468       break;
2469     case S_BUFFER_LOAD_IMM:
2470     case S_BUFFER_LOAD_SGPR_IMM:
2471     case S_LOAD_IMM:
2472       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2473       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2474       break;
2475     case BUFFER_LOAD:
2476       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2477       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2478       break;
2479     case BUFFER_STORE:
2480       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2481       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2482       break;
2483     case MIMG:
2484       NewMI = mergeImagePair(CI, Paired, Where->I);
2485       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2486       break;
2487     case TBUFFER_LOAD:
2488       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2489       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2490       break;
2491     case TBUFFER_STORE:
2492       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2493       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2494       break;
2495     case FLAT_LOAD:
2496     case GLOBAL_LOAD:
2497     case GLOBAL_LOAD_SADDR:
2498       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2499       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2500       break;
2501     case FLAT_STORE:
2502     case GLOBAL_STORE:
2503     case GLOBAL_STORE_SADDR:
2504       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2505       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2506       break;
2507     }
2508     CI.setMI(NewMI, *this);
2509     CI.Order = Where->Order;
2510     if (I == Second)
2511       I = Next;
2512 
2513     MergeList.erase(Second);
2514   }
2515 
2516   return Modified;
2517 }
2518 
2519 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2520   if (skipFunction(MF.getFunction()))
2521     return false;
2522 
2523   STM = &MF.getSubtarget<GCNSubtarget>();
2524   if (!STM->loadStoreOptEnabled())
2525     return false;
2526 
2527   TII = STM->getInstrInfo();
2528   TRI = &TII->getRegisterInfo();
2529 
2530   MRI = &MF.getRegInfo();
2531   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2532 
2533   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2534 
2535   bool Modified = false;
2536 
2537   // Contains the list of instructions for which constant offsets are being
2538   // promoted to the IMM. This is tracked for an entire block at a time.
2539   SmallPtrSet<MachineInstr *, 4> AnchorList;
2540   MemInfoMap Visited;
2541 
2542   for (MachineBasicBlock &MBB : MF) {
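  // Each block is processed in sections: collectMergeableInsts stops at the
  // first instruction with ordered memory semantics or unmodeled side effects,
  // and the next section resumes after it. Every section is re-optimized until
  // no further merges are found (OptimizeAgain).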
2543     MachineBasicBlock::iterator SectionEnd;
2544     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2545          I = SectionEnd) {
2546       bool CollectModified;
2547       std::list<std::list<CombineInfo>> MergeableInsts;
2548 
2549       // First pass: Collect list of all instructions we know how to merge in a
2550       // subset of the block.
2551       std::tie(SectionEnd, CollectModified) =
2552           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2553 
2554       Modified |= CollectModified;
2555 
2556       do {
2557         OptimizeAgain = false;
2558         Modified |= optimizeBlock(MergeableInsts);
2559       } while (OptimizeAgain);
2560     }
2561 
2562     Visited.clear();
2563     AnchorList.clear();
2564   }
2565 
2566   return Modified;
2567 }
2568