xref: /llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (revision f6e70ed1c73a2f3ac15eb6650423c1c10d278f50)
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //  ds_read2_b32 v[0:1], v2 offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25 // to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - Merging is currently missed for stores of constants because the load of
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Recomputing live intervals seems inefficient. The pass currently matches
50 //   one pair, recomputes live intervals, and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets too large to fit in the 8-bit offset field,
55 //   but close enough to each other that their deltas do fit, we can add to
56 //   the base pointer and use the new, smaller offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   S_BUFFER_LOAD_SGPR_IMM,
78   S_LOAD_IMM,
79   BUFFER_LOAD,
80   BUFFER_STORE,
81   MIMG,
82   TBUFFER_LOAD,
83   TBUFFER_STORE,
84   GLOBAL_LOAD_SADDR,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE,
88   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89   GLOBAL_STORE // any CombineInfo, they are only ever returned by
90                // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94   unsigned char NumVAddrs = 0;
95   bool SBase = false;
96   bool SRsrc = false;
97   bool SOffset = false;
98   bool SAddr = false;
99   bool VAddr = false;
100   bool Addr = false;
101   bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
123     unsigned Order;
124 
125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145           return false;
146         }
147       }
148       return true;
149     }
150 
151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that aren't either immediates or
159         // registers. TODO: Should be possible to merge FrameIndexes and maybe
160         // some other non-register operands.
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge physical reg addresses.
165         if (AddrOp->getReg().isPhysical())
166           return false;
167 
168         // If an address has only one use then there will be no other
169         // instructions with the same address, so we can't merge this one.
170         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
171           return false;
172       }
173       return true;
174     }
175 
176     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
177 
178     // Compare by offset, or by dmask for MIMG instructions.
179     bool operator<(const CombineInfo& Other) const {
180       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
181     }
182   };
183 
184   struct BaseRegisters {
185     Register LoReg;
186     Register HiReg;
187 
188     unsigned LoSubReg = 0;
189     unsigned HiSubReg = 0;
190   };
191 
192   struct MemAddress {
193     BaseRegisters Base;
194     int64_t Offset = 0;
195   };
196 
197   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
198 
199 private:
200   const GCNSubtarget *STM = nullptr;
201   const SIInstrInfo *TII = nullptr;
202   const SIRegisterInfo *TRI = nullptr;
203   MachineRegisterInfo *MRI = nullptr;
204   AliasAnalysis *AA = nullptr;
205   bool OptimizeAgain;
206 
207   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
208                            const DenseSet<Register> &ARegUses,
209                            const MachineInstr &A, const MachineInstr &B) const;
210   static bool dmasksCanBeCombined(const CombineInfo &CI,
211                                   const SIInstrInfo &TII,
212                                   const CombineInfo &Paired);
213   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
214                                    CombineInfo &Paired, bool Modify = false);
215   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
216                         const CombineInfo &Paired);
217   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
218   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
219                                                      const CombineInfo &Paired);
220   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
221                                                     const CombineInfo &Paired);
222   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
223 
224   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
225 
226   unsigned read2Opcode(unsigned EltSize) const;
227   unsigned read2ST64Opcode(unsigned EltSize) const;
228   MachineBasicBlock::iterator
229   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
230                  MachineBasicBlock::iterator InsertBefore);
231 
232   unsigned write2Opcode(unsigned EltSize) const;
233   unsigned write2ST64Opcode(unsigned EltSize) const;
234   MachineBasicBlock::iterator
235   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
236                   MachineBasicBlock::iterator InsertBefore);
237   MachineBasicBlock::iterator
238   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
239                  MachineBasicBlock::iterator InsertBefore);
240   MachineBasicBlock::iterator
241   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
242                        MachineBasicBlock::iterator InsertBefore);
243   MachineBasicBlock::iterator
244   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
245                       MachineBasicBlock::iterator InsertBefore);
246   MachineBasicBlock::iterator
247   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
248                        MachineBasicBlock::iterator InsertBefore);
249   MachineBasicBlock::iterator
250   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
251                        MachineBasicBlock::iterator InsertBefore);
252   MachineBasicBlock::iterator
253   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
254                         MachineBasicBlock::iterator InsertBefore);
255   MachineBasicBlock::iterator
256   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
257                     MachineBasicBlock::iterator InsertBefore);
258   MachineBasicBlock::iterator
259   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
260                      MachineBasicBlock::iterator InsertBefore);
261 
262   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
263                            int32_t NewOffset) const;
264   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
265   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
266   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
267   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
268   /// Promotes constant offset to the immediate by adjusting the base. It
269   /// tries to use a base from the nearby instructions that allows it to have
270 /// a 13-bit constant offset which gets promoted to the immediate.
271   bool promoteConstantOffsetToImm(MachineInstr &CI,
272                                   MemInfoMap &Visited,
273                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
274   void addInstToMergeableList(const CombineInfo &CI,
275                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
276 
277   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
278       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
279       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
280       std::list<std::list<CombineInfo>> &MergeableInsts) const;
281 
282   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
283                                                      const CombineInfo &Paired);
284 
285   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
286                                           const CombineInfo &Paired);
287 
288 public:
289   static char ID;
290 
291   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
292     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
293   }
294 
295   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
296                                      bool &OptimizeListAgain);
297   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
298 
299   bool runOnMachineFunction(MachineFunction &MF) override;
300 
301   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
302 
303   void getAnalysisUsage(AnalysisUsage &AU) const override {
304     AU.setPreservesCFG();
305     AU.addRequired<AAResultsWrapperPass>();
306 
307     MachineFunctionPass::getAnalysisUsage(AU);
308   }
309 
310   MachineFunctionProperties getRequiredProperties() const override {
311     return MachineFunctionProperties()
312       .set(MachineFunctionProperties::Property::IsSSA);
313   }
314 };
315 
316 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
317   const unsigned Opc = MI.getOpcode();
318 
319   if (TII.isMUBUF(Opc)) {
320     // FIXME: Handle d16 correctly
321     return AMDGPU::getMUBUFElements(Opc);
322   }
323   if (TII.isMIMG(MI)) {
324     uint64_t DMaskImm =
325         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
326     return llvm::popcount(DMaskImm);
327   }
328   if (TII.isMTBUF(Opc)) {
329     return AMDGPU::getMTBUFElements(Opc);
330   }
331 
332   switch (Opc) {
333   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
334   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
335   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
336   case AMDGPU::S_LOAD_DWORD_IMM:
337   case AMDGPU::GLOBAL_LOAD_DWORD:
338   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
339   case AMDGPU::GLOBAL_STORE_DWORD:
340   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
341   case AMDGPU::FLAT_LOAD_DWORD:
342   case AMDGPU::FLAT_STORE_DWORD:
343     return 1;
344   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
345   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
346   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347   case AMDGPU::S_LOAD_DWORDX2_IMM:
348   case AMDGPU::GLOBAL_LOAD_DWORDX2:
349   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350   case AMDGPU::GLOBAL_STORE_DWORDX2:
351   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352   case AMDGPU::FLAT_LOAD_DWORDX2:
353   case AMDGPU::FLAT_STORE_DWORDX2:
354     return 2;
355   case AMDGPU::GLOBAL_LOAD_DWORDX3:
356   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
357   case AMDGPU::GLOBAL_STORE_DWORDX3:
358   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
359   case AMDGPU::FLAT_LOAD_DWORDX3:
360   case AMDGPU::FLAT_STORE_DWORDX3:
361     return 3;
362   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
363   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
364   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
365   case AMDGPU::S_LOAD_DWORDX4_IMM:
366   case AMDGPU::GLOBAL_LOAD_DWORDX4:
367   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
368   case AMDGPU::GLOBAL_STORE_DWORDX4:
369   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
370   case AMDGPU::FLAT_LOAD_DWORDX4:
371   case AMDGPU::FLAT_STORE_DWORDX4:
372     return 4;
373   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
374   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
375   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
376   case AMDGPU::S_LOAD_DWORDX8_IMM:
377     return 8;
378   case AMDGPU::DS_READ_B32:      [[fallthrough]];
379   case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
380   case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
381   case AMDGPU::DS_WRITE_B32_gfx9:
382     return 1;
383   case AMDGPU::DS_READ_B64:      [[fallthrough]];
384   case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
385   case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
386   case AMDGPU::DS_WRITE_B64_gfx9:
387     return 2;
388   default:
389     return 0;
390   }
391 }
392 
393 /// Maps instruction opcode to enum InstClassEnum.
394 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
395   switch (Opc) {
396   default:
397     if (TII.isMUBUF(Opc)) {
398       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
399       default:
400         return UNKNOWN;
401       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
402       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
403       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
404       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
405         return BUFFER_LOAD;
406       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
407       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
408       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
409       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
410         return BUFFER_STORE;
411       }
412     }
413     if (TII.isMIMG(Opc)) {
414       // Ignore instructions encoded without vaddr.
415       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
416           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
417         return UNKNOWN;
418       // Ignore BVH instructions
419       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
420         return UNKNOWN;
421       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
422       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
423           TII.isGather4(Opc))
424         return UNKNOWN;
425       return MIMG;
426     }
427     if (TII.isMTBUF(Opc)) {
428       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
429       default:
430         return UNKNOWN;
431       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
432       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
433       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
434       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
435       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
436       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
437       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
438       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
439         return TBUFFER_LOAD;
440       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
441       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
442       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
443       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
444         return TBUFFER_STORE;
445       }
446     }
447     return UNKNOWN;
448   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
449   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
450   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
451   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
452     return S_BUFFER_LOAD_IMM;
453   // For the purposes of this optimization, SGPR variants of buffer loads are
454   // considered to be zero-offset SGPR_IMM loads.
455   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
456   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
457   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
458   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
459   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
460   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
461   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
462   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
463     return S_BUFFER_LOAD_SGPR_IMM;
464   case AMDGPU::S_LOAD_DWORD_IMM:
465   case AMDGPU::S_LOAD_DWORDX2_IMM:
466   case AMDGPU::S_LOAD_DWORDX4_IMM:
467   case AMDGPU::S_LOAD_DWORDX8_IMM:
468     return S_LOAD_IMM;
469   case AMDGPU::DS_READ_B32:
470   case AMDGPU::DS_READ_B32_gfx9:
471   case AMDGPU::DS_READ_B64:
472   case AMDGPU::DS_READ_B64_gfx9:
473     return DS_READ;
474   case AMDGPU::DS_WRITE_B32:
475   case AMDGPU::DS_WRITE_B32_gfx9:
476   case AMDGPU::DS_WRITE_B64:
477   case AMDGPU::DS_WRITE_B64_gfx9:
478     return DS_WRITE;
479   case AMDGPU::GLOBAL_LOAD_DWORD:
480   case AMDGPU::GLOBAL_LOAD_DWORDX2:
481   case AMDGPU::GLOBAL_LOAD_DWORDX3:
482   case AMDGPU::GLOBAL_LOAD_DWORDX4:
483   case AMDGPU::FLAT_LOAD_DWORD:
484   case AMDGPU::FLAT_LOAD_DWORDX2:
485   case AMDGPU::FLAT_LOAD_DWORDX3:
486   case AMDGPU::FLAT_LOAD_DWORDX4:
487     return FLAT_LOAD;
488   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
489   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
490   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
491   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
492     return GLOBAL_LOAD_SADDR;
493   case AMDGPU::GLOBAL_STORE_DWORD:
494   case AMDGPU::GLOBAL_STORE_DWORDX2:
495   case AMDGPU::GLOBAL_STORE_DWORDX3:
496   case AMDGPU::GLOBAL_STORE_DWORDX4:
497   case AMDGPU::FLAT_STORE_DWORD:
498   case AMDGPU::FLAT_STORE_DWORDX2:
499   case AMDGPU::FLAT_STORE_DWORDX3:
500   case AMDGPU::FLAT_STORE_DWORDX4:
501     return FLAT_STORE;
502   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
503   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
504   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
505   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
506     return GLOBAL_STORE_SADDR;
507   }
508 }
509 
510 /// Determines instruction subclass from opcode. Only instructions
511 /// of the same subclass can be merged together. The merged instruction may have
512 /// a different subclass but must have the same class.
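/// For example, GLOBAL_LOAD_DWORD and GLOBAL_LOAD_DWORDX2 both map to the
/// FLAT_LOAD_DWORD subclass below, so the two may be merged even though the
/// merged instruction uses a wider opcode.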
513 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
514   switch (Opc) {
515   default:
516     if (TII.isMUBUF(Opc))
517       return AMDGPU::getMUBUFBaseOpcode(Opc);
518     if (TII.isMIMG(Opc)) {
519       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
520       assert(Info);
521       return Info->BaseOpcode;
522     }
523     if (TII.isMTBUF(Opc))
524       return AMDGPU::getMTBUFBaseOpcode(Opc);
525     return -1;
526   case AMDGPU::DS_READ_B32:
527   case AMDGPU::DS_READ_B32_gfx9:
528   case AMDGPU::DS_READ_B64:
529   case AMDGPU::DS_READ_B64_gfx9:
530   case AMDGPU::DS_WRITE_B32:
531   case AMDGPU::DS_WRITE_B32_gfx9:
532   case AMDGPU::DS_WRITE_B64:
533   case AMDGPU::DS_WRITE_B64_gfx9:
534     return Opc;
535   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
536   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
537   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
538   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
539     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
540   // For the purposes of this optimization, SGPR variants of buffer loads are
541   // considered to be zero-offset SGPR_IMM loads.
542   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
543   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
544   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
545   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
546   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
547   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
548   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
549   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
550     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
551   case AMDGPU::S_LOAD_DWORD_IMM:
552   case AMDGPU::S_LOAD_DWORDX2_IMM:
553   case AMDGPU::S_LOAD_DWORDX4_IMM:
554   case AMDGPU::S_LOAD_DWORDX8_IMM:
555     return AMDGPU::S_LOAD_DWORD_IMM;
556   case AMDGPU::GLOBAL_LOAD_DWORD:
557   case AMDGPU::GLOBAL_LOAD_DWORDX2:
558   case AMDGPU::GLOBAL_LOAD_DWORDX3:
559   case AMDGPU::GLOBAL_LOAD_DWORDX4:
560   case AMDGPU::FLAT_LOAD_DWORD:
561   case AMDGPU::FLAT_LOAD_DWORDX2:
562   case AMDGPU::FLAT_LOAD_DWORDX3:
563   case AMDGPU::FLAT_LOAD_DWORDX4:
564     return AMDGPU::FLAT_LOAD_DWORD;
565   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
566   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
567   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
568   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
569     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
570   case AMDGPU::GLOBAL_STORE_DWORD:
571   case AMDGPU::GLOBAL_STORE_DWORDX2:
572   case AMDGPU::GLOBAL_STORE_DWORDX3:
573   case AMDGPU::GLOBAL_STORE_DWORDX4:
574   case AMDGPU::FLAT_STORE_DWORD:
575   case AMDGPU::FLAT_STORE_DWORDX2:
576   case AMDGPU::FLAT_STORE_DWORDX3:
577   case AMDGPU::FLAT_STORE_DWORDX4:
578     return AMDGPU::FLAT_STORE_DWORD;
579   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
580   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
581   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
582   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
583     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
584   }
585 }
586 
587 // GLOBAL loads and stores are classified as FLAT initially. If both combined
588 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
589 // If either or both instructions are non-segment-specific FLAT, the resulting
590 // combined operation will be FLAT, potentially promoting one of the GLOBAL
591 // operations to FLAT.
592 // For other instructions, return the original class unmodified.
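// E.g. merging two FLAT GLOBAL loads yields GLOBAL_LOAD, while merging a
// GLOBAL load with a plain FLAT load keeps the FLAT_LOAD class.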
593 InstClassEnum
594 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
595                                          const CombineInfo &Paired) {
596   assert(CI.InstClass == Paired.InstClass);
597 
598   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
599       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
600     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
601 
602   return CI.InstClass;
603 }
604 
605 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
606   AddressRegs Result;
607 
608   if (TII.isMUBUF(Opc)) {
609     if (AMDGPU::getMUBUFHasVAddr(Opc))
610       Result.VAddr = true;
611     if (AMDGPU::getMUBUFHasSrsrc(Opc))
612       Result.SRsrc = true;
613     if (AMDGPU::getMUBUFHasSoffset(Opc))
614       Result.SOffset = true;
615 
616     return Result;
617   }
618 
619   if (TII.isMIMG(Opc)) {
620     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
621     if (VAddr0Idx >= 0) {
622       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
623       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
624     } else {
625       Result.VAddr = true;
626     }
627     Result.SRsrc = true;
628     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
629     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
630       Result.SSamp = true;
631 
632     return Result;
633   }
634   if (TII.isMTBUF(Opc)) {
635     if (AMDGPU::getMTBUFHasVAddr(Opc))
636       Result.VAddr = true;
637     if (AMDGPU::getMTBUFHasSrsrc(Opc))
638       Result.SRsrc = true;
639     if (AMDGPU::getMTBUFHasSoffset(Opc))
640       Result.SOffset = true;
641 
642     return Result;
643   }
644 
645   switch (Opc) {
646   default:
647     return Result;
648   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
649   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
650   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
651   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
652   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
653   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
654   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
655   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
656     Result.SOffset = true;
657     [[fallthrough]];
658   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
659   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
660   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
661   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
662   case AMDGPU::S_LOAD_DWORD_IMM:
663   case AMDGPU::S_LOAD_DWORDX2_IMM:
664   case AMDGPU::S_LOAD_DWORDX4_IMM:
665   case AMDGPU::S_LOAD_DWORDX8_IMM:
666     Result.SBase = true;
667     return Result;
668   case AMDGPU::DS_READ_B32:
669   case AMDGPU::DS_READ_B64:
670   case AMDGPU::DS_READ_B32_gfx9:
671   case AMDGPU::DS_READ_B64_gfx9:
672   case AMDGPU::DS_WRITE_B32:
673   case AMDGPU::DS_WRITE_B64:
674   case AMDGPU::DS_WRITE_B32_gfx9:
675   case AMDGPU::DS_WRITE_B64_gfx9:
676     Result.Addr = true;
677     return Result;
678   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
679   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
680   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
681   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
682   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
683   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
684   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
685   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
686     Result.SAddr = true;
687     [[fallthrough]];
688   case AMDGPU::GLOBAL_LOAD_DWORD:
689   case AMDGPU::GLOBAL_LOAD_DWORDX2:
690   case AMDGPU::GLOBAL_LOAD_DWORDX3:
691   case AMDGPU::GLOBAL_LOAD_DWORDX4:
692   case AMDGPU::GLOBAL_STORE_DWORD:
693   case AMDGPU::GLOBAL_STORE_DWORDX2:
694   case AMDGPU::GLOBAL_STORE_DWORDX3:
695   case AMDGPU::GLOBAL_STORE_DWORDX4:
696   case AMDGPU::FLAT_LOAD_DWORD:
697   case AMDGPU::FLAT_LOAD_DWORDX2:
698   case AMDGPU::FLAT_LOAD_DWORDX3:
699   case AMDGPU::FLAT_LOAD_DWORDX4:
700   case AMDGPU::FLAT_STORE_DWORD:
701   case AMDGPU::FLAT_STORE_DWORDX2:
702   case AMDGPU::FLAT_STORE_DWORDX3:
703   case AMDGPU::FLAT_STORE_DWORDX4:
704     Result.VAddr = true;
705     return Result;
706   }
707 }
708 
709 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
710                                               const SILoadStoreOptimizer &LSO) {
711   I = MI;
712   unsigned Opc = MI->getOpcode();
713   InstClass = getInstClass(Opc, *LSO.TII);
714 
715   if (InstClass == UNKNOWN)
716     return;
717 
718   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
719 
720   switch (InstClass) {
721   case DS_READ:
722     EltSize =
723           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
724                                                                           : 4;
725     break;
726   case DS_WRITE:
727     EltSize =
728           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
729                                                                             : 4;
730     break;
731   case S_BUFFER_LOAD_IMM:
732   case S_BUFFER_LOAD_SGPR_IMM:
733   case S_LOAD_IMM:
734     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
735     break;
736   default:
737     EltSize = 4;
738     break;
739   }
740 
741   if (InstClass == MIMG) {
742     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
743     // Offset is not considered for MIMG instructions.
744     Offset = 0;
745   } else {
746     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
747     Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
748   }
749 
750   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
751     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
752 
753   Width = getOpcodeWidth(*I, *LSO.TII);
754 
755   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
756     Offset &= 0xffff;
757   } else if (InstClass != MIMG) {
758     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
759   }
760 
761   AddressRegs Regs = getRegs(Opc, *LSO.TII);
762 
763   NumAddresses = 0;
764   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
765     AddrIdx[NumAddresses++] =
766         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
767   if (Regs.Addr)
768     AddrIdx[NumAddresses++] =
769         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
770   if (Regs.SBase)
771     AddrIdx[NumAddresses++] =
772         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
773   if (Regs.SRsrc)
774     AddrIdx[NumAddresses++] =
775         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
776   if (Regs.SOffset)
777     AddrIdx[NumAddresses++] =
778         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
779   if (Regs.SAddr)
780     AddrIdx[NumAddresses++] =
781         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
782   if (Regs.VAddr)
783     AddrIdx[NumAddresses++] =
784         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
785   if (Regs.SSamp)
786     AddrIdx[NumAddresses++] =
787         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
788   assert(NumAddresses <= MaxAddressRegs);
789 
790   for (unsigned J = 0; J < NumAddresses; J++)
791     AddrReg[J] = &I->getOperand(AddrIdx[J]);
792 }
793 
794 } // end anonymous namespace.
795 
796 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
797                       "SI Load Store Optimizer", false, false)
798 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
799 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
800                     false, false)
801 
802 char SILoadStoreOptimizer::ID = 0;
803 
804 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
805 
806 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
807   return new SILoadStoreOptimizer();
808 }
809 
810 static void addDefsUsesToList(const MachineInstr &MI,
811                               DenseSet<Register> &RegDefs,
812                               DenseSet<Register> &RegUses) {
813   for (const auto &Op : MI.operands()) {
814     if (!Op.isReg())
815       continue;
816     if (Op.isDef())
817       RegDefs.insert(Op.getReg());
818     if (Op.readsReg())
819       RegUses.insert(Op.getReg());
820   }
821 }
822 
823 bool SILoadStoreOptimizer::canSwapInstructions(
824     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
825     const MachineInstr &A, const MachineInstr &B) const {
826   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
827       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
828     return false;
829   for (const auto &BOp : B.operands()) {
830     if (!BOp.isReg())
831       continue;
832     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
833       return false;
834     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
835       return false;
836   }
837   return true;
838 }
839 
840 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
841 // MMO for the combined operation with a new access size.
842 MachineMemOperand *
843 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
844                                                const CombineInfo &Paired) {
845   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
846   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
847 
848   unsigned Size = MMOa->getSize() + MMOb->getSize();
849 
850   // A base pointer for the combined operation is the same as the leading
851   // operation's pointer.
852   if (Paired < CI)
853     std::swap(MMOa, MMOb);
854 
855   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
856   // If merging FLAT and GLOBAL set address space to FLAT.
857   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
858     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
859 
860   MachineFunction *MF = CI.I->getMF();
861   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
862 }
863 
864 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
865                                                const SIInstrInfo &TII,
866                                                const CombineInfo &Paired) {
867   assert(CI.InstClass == MIMG);
868 
869   // Ignore instructions with tfe/lwe set.
870   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
871   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
872 
873   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
874     return false;
875 
876   // Check other optional immediate operands for equality.
877   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
878                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
879                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
880 
881   for (auto op : OperandsToMatch) {
882     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
883     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
884       return false;
885     if (Idx != -1 &&
886         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
887       return false;
888   }
889 
890   // Check DMask for overlaps.
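  // E.g. dmasks 0b0011 and 0b1100 can be combined because the lower mask lies
  // entirely below the lowest set bit of the higher mask, but 0b0101 and
  // 0b0110 cannot.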
891   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
892   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
893 
894   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
895   if ((1u << AllowedBitsForMin) <= MinMask)
896     return false;
897 
898   return true;
899 }
900 
901 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
902                                        unsigned ComponentCount,
903                                        const GCNSubtarget &STI) {
904   if (ComponentCount > 4)
905     return 0;
906 
907   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
908       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
909   if (!OldFormatInfo)
910     return 0;
911 
912   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
913       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
914                                            ComponentCount,
915                                            OldFormatInfo->NumFormat, STI);
916 
917   if (!NewFormatInfo)
918     return 0;
919 
920   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
921          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
922 
923   return NewFormatInfo->Format;
924 }
925 
926 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
927 // highest power of two. Note that the result is well defined for all inputs
928 // including corner cases like:
929 // - if Lo == Hi, return that value
930 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
931 // - if Lo > Hi, return 0 (as if the range wrapped around)
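//
// For example, mostAlignedValueInRange(100, 200) returns 128, the only
// multiple of 128 in that range.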
932 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
933   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
934 }
935 
936 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
937                                                 const GCNSubtarget &STI,
938                                                 CombineInfo &Paired,
939                                                 bool Modify) {
940   assert(CI.InstClass != MIMG);
941 
942   // XXX - Would the same offset be OK? Is there any reason this would happen or
943   // be useful?
944   if (CI.Offset == Paired.Offset)
945     return false;
946 
947   // This won't be valid if either offset isn't aligned to the element size.
948   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
949     return false;
950 
951   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
952 
953     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
954         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
955     if (!Info0)
956       return false;
957     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
958         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
959     if (!Info1)
960       return false;
961 
962     if (Info0->BitsPerComp != Info1->BitsPerComp ||
963         Info0->NumFormat != Info1->NumFormat)
964       return false;
965 
966     // TODO: Should be possible to support more formats, but if format loads
967     // are not dword-aligned, the merged load might not be valid.
968     if (Info0->BitsPerComp != 32)
969       return false;
970 
971     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
972       return false;
973   }
974 
975   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
976   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
977   CI.UseST64 = false;
978   CI.BaseOff = 0;
979 
980   // Handle all non-DS instructions.
981   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
982     return (EltOffset0 + CI.Width == EltOffset1 ||
983             EltOffset1 + Paired.Width == EltOffset0) &&
984            CI.CPol == Paired.CPol;
985   }
986 
987   // If the offsets in elements don't fit in 8 bits, we might be able to use
988   // the stride 64 versions.
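  // E.g. element offsets 128 and 192 are both multiples of 64, so they can be
  // encoded as ST64 offsets 2 and 3.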
989   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
990       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
991     if (Modify) {
992       CI.Offset = EltOffset0 / 64;
993       Paired.Offset = EltOffset1 / 64;
994       CI.UseST64 = true;
995     }
996     return true;
997   }
998 
999   // Check if the new offsets fit in the reduced 8-bit range.
1000   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1001     if (Modify) {
1002       CI.Offset = EltOffset0;
1003       Paired.Offset = EltOffset1;
1004     }
1005     return true;
1006   }
1007 
1008   // Try to shift base address to decrease offsets.
1009   uint32_t Min = std::min(EltOffset0, EltOffset1);
1010   uint32_t Max = std::max(EltOffset0, EltOffset1);
1011 
1012   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
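  // The base can be adjusted so that both offsets fit the 8-bit ST64 encoding
  // if the offsets differ by a multiple of 64 that is at most 0xff * 64.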
1013   if (((Max - Min) & ~Mask) == 0) {
1014     if (Modify) {
1015       // From the range of values we could use for BaseOff, choose the one that
1016       // is aligned to the highest power of two, to maximise the chance that
1017       // the same offset can be reused for other load/store pairs.
1018       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1019       // Copy the low bits of the offsets, so that when we adjust them by
1020       // subtracting BaseOff they will be multiples of 64.
1021       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1022       CI.BaseOff = BaseOff * CI.EltSize;
1023       CI.Offset = (EltOffset0 - BaseOff) / 64;
1024       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1025       CI.UseST64 = true;
1026     }
1027     return true;
1028   }
1029 
1030   if (isUInt<8>(Max - Min)) {
1031     if (Modify) {
1032       // From the range of values we could use for BaseOff, choose the one that
1033       // is aligned to the highest power of two, to maximise the chance that
1034       // the same offset can be reused for other load/store pairs.
1035       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1036       CI.BaseOff = BaseOff * CI.EltSize;
1037       CI.Offset = EltOffset0 - BaseOff;
1038       Paired.Offset = EltOffset1 - BaseOff;
1039     }
1040     return true;
1041   }
1042 
1043   return false;
1044 }
1045 
1046 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1047                                      const CombineInfo &CI,
1048                                      const CombineInfo &Paired) {
1049   const unsigned Width = (CI.Width + Paired.Width);
1050   switch (CI.InstClass) {
1051   default:
1052     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1053   case S_BUFFER_LOAD_IMM:
1054   case S_BUFFER_LOAD_SGPR_IMM:
1055   case S_LOAD_IMM:
1056     switch (Width) {
1057     default:
1058       return false;
1059     case 2:
1060     case 4:
1061     case 8:
1062       return true;
1063     }
1064   }
1065 }
1066 
1067 const TargetRegisterClass *
1068 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1069   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1070     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1071   }
1072   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1073     return TRI->getRegClassForReg(*MRI, Src->getReg());
1074   }
1075   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1076     return TRI->getRegClassForReg(*MRI, Src->getReg());
1077   }
1078   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1079     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1080   }
1081   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1082     return TRI->getRegClassForReg(*MRI, Src->getReg());
1083   }
1084   return nullptr;
1085 }
1086 
1087 /// This function assumes that CI comes before Paired in a basic block. Return
1088 /// an insertion point for the merged instruction or nullptr on failure.
1089 SILoadStoreOptimizer::CombineInfo *
1090 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1091                                            CombineInfo &Paired) {
1092   // If another instruction has already been merged into CI, it may now be a
1093   // type that we can't do any further merging into.
1094   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1095     return nullptr;
1096   assert(CI.InstClass == Paired.InstClass);
1097 
1098   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1099       getInstSubclass(Paired.I->getOpcode(), *TII))
1100     return nullptr;
1101 
1102   // Check both offsets (or masks for MIMG) can be combined and fit in the
1103   // reduced range.
1104   if (CI.InstClass == MIMG) {
1105     if (!dmasksCanBeCombined(CI, *TII, Paired))
1106       return nullptr;
1107   } else {
1108     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1109       return nullptr;
1110   }
1111 
1112   DenseSet<Register> RegDefs;
1113   DenseSet<Register> RegUses;
1114   CombineInfo *Where;
1115   if (CI.I->mayLoad()) {
1116     // Try to hoist Paired up to CI.
1117     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1118     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1119       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1120         return nullptr;
1121     }
1122     Where = &CI;
1123   } else {
1124     // Try to sink CI down to Paired.
1125     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1126     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1127       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1128         return nullptr;
1129     }
1130     Where = &Paired;
1131   }
1132 
1133   // Call offsetsCanBeCombined with Modify = true so that the offsets are
1134   // correct for the new instruction. This should return true, because
1135   // this function should only be called on CombineInfo objects that
1136   // have already been confirmed to be mergeable.
1137   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1138     offsetsCanBeCombined(CI, *STM, Paired, true);
1139   return Where;
1140 }
1141 
1142 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1143   if (STM->ldsRequiresM0Init())
1144     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1145   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1146 }
1147 
1148 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1149   if (STM->ldsRequiresM0Init())
1150     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1151 
1152   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1153                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1154 }
1155 
1156 MachineBasicBlock::iterator
1157 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1158                                      MachineBasicBlock::iterator InsertBefore) {
1159   MachineBasicBlock *MBB = CI.I->getParent();
1160 
1161   // Be careful, since the addresses could be subregisters themselves in weird
1162   // cases, like vectors of pointers.
1163   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1164 
1165   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1166   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1167 
1168   unsigned NewOffset0 = CI.Offset;
1169   unsigned NewOffset1 = Paired.Offset;
1170   unsigned Opc =
1171       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1172 
1173   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1174   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1175 
1176   if (NewOffset0 > NewOffset1) {
1177     // Canonicalize the merged instruction so the smaller offset comes first.
1178     std::swap(NewOffset0, NewOffset1);
1179     std::swap(SubRegIdx0, SubRegIdx1);
1180   }
1181 
1182   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1183          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1184 
1185   const MCInstrDesc &Read2Desc = TII->get(Opc);
1186 
1187   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1188   Register DestReg = MRI->createVirtualRegister(SuperRC);
1189 
1190   DebugLoc DL = CI.I->getDebugLoc();
1191 
1192   Register BaseReg = AddrReg->getReg();
1193   unsigned BaseSubReg = AddrReg->getSubReg();
1194   unsigned BaseRegFlags = 0;
1195   if (CI.BaseOff) {
1196     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1197     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1198         .addImm(CI.BaseOff);
1199 
1200     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1201     BaseRegFlags = RegState::Kill;
1202 
1203     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1204         .addReg(ImmReg)
1205         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1206         .addImm(0); // clamp bit
1207     BaseSubReg = 0;
1208   }
1209 
1210   MachineInstrBuilder Read2 =
1211       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1212           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1213           .addImm(NewOffset0)                        // offset0
1214           .addImm(NewOffset1)                        // offset1
1215           .addImm(0)                                 // gds
1216           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1217 
1218   (void)Read2;
1219 
1220   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1221 
1222   // Copy to the old destination registers.
1223   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1224       .add(*Dest0) // Copy to same destination including flags and sub reg.
1225       .addReg(DestReg, 0, SubRegIdx0);
1226   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1227       .add(*Dest1)
1228       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1229 
1230   CI.I->eraseFromParent();
1231   Paired.I->eraseFromParent();
1232 
1233   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1234   return Read2;
1235 }
1236 
1237 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1238   if (STM->ldsRequiresM0Init())
1239     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1240   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1241                         : AMDGPU::DS_WRITE2_B64_gfx9;
1242 }
1243 
1244 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1245   if (STM->ldsRequiresM0Init())
1246     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1247                           : AMDGPU::DS_WRITE2ST64_B64;
1248 
1249   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1250                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1251 }
1252 
1253 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1254     CombineInfo &CI, CombineInfo &Paired,
1255     MachineBasicBlock::iterator InsertBefore) {
1256   MachineBasicBlock *MBB = CI.I->getParent();
1257 
1258   // Be sure to use .add(), and not .addReg(), with these. We want to be sure
1259   // we preserve the subregister index and any register flags set on them.
1260   const MachineOperand *AddrReg =
1261       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1262   const MachineOperand *Data0 =
1263       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1264   const MachineOperand *Data1 =
1265       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1266 
1267   unsigned NewOffset0 = CI.Offset;
1268   unsigned NewOffset1 = Paired.Offset;
1269   unsigned Opc =
1270       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1271 
1272   if (NewOffset0 > NewOffset1) {
1273     // Canonicalize the merged instruction so the smaller offset comes first.
1274     std::swap(NewOffset0, NewOffset1);
1275     std::swap(Data0, Data1);
1276   }
1277 
1278   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1279          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1280 
1281   const MCInstrDesc &Write2Desc = TII->get(Opc);
1282   DebugLoc DL = CI.I->getDebugLoc();
1283 
1284   Register BaseReg = AddrReg->getReg();
1285   unsigned BaseSubReg = AddrReg->getSubReg();
1286   unsigned BaseRegFlags = 0;
1287   if (CI.BaseOff) {
1288     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1289     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1290         .addImm(CI.BaseOff);
1291 
1292     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1293     BaseRegFlags = RegState::Kill;
1294 
1295     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1296         .addReg(ImmReg)
1297         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1298         .addImm(0); // clamp bit
1299     BaseSubReg = 0;
1300   }
1301 
1302   MachineInstrBuilder Write2 =
1303       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1304           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1305           .add(*Data0)                               // data0
1306           .add(*Data1)                               // data1
1307           .addImm(NewOffset0)                        // offset0
1308           .addImm(NewOffset1)                        // offset1
1309           .addImm(0)                                 // gds
1310           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1311 
1312   CI.I->eraseFromParent();
1313   Paired.I->eraseFromParent();
1314 
1315   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1316   return Write2;
1317 }
1318 
1319 MachineBasicBlock::iterator
1320 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1321                                      MachineBasicBlock::iterator InsertBefore) {
1322   MachineBasicBlock *MBB = CI.I->getParent();
1323   DebugLoc DL = CI.I->getDebugLoc();
1324   const unsigned Opcode = getNewOpcode(CI, Paired);
1325 
1326   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1327 
1328   Register DestReg = MRI->createVirtualRegister(SuperRC);
1329   unsigned MergedDMask = CI.DMask | Paired.DMask;
1330   unsigned DMaskIdx =
1331       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1332 
1333   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1334   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1335     if (I == DMaskIdx)
1336       MIB.addImm(MergedDMask);
1337     else
1338       MIB.add((*CI.I).getOperand(I));
1339   }
1340 
1341   // It shouldn't be possible to get this far if the two instructions
1342   // don't have a single memoperand, because MachineInstr::mayAlias()
1343   // will return true if this is the case.
1344   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1345 
1346   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1347 
1348   unsigned SubRegIdx0, SubRegIdx1;
1349   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1350 
1351   // Copy to the old destination registers.
1352   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1353   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1354   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1355 
1356   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1357       .add(*Dest0) // Copy to same destination including flags and sub reg.
1358       .addReg(DestReg, 0, SubRegIdx0);
1359   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1360       .add(*Dest1)
1361       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1362 
1363   CI.I->eraseFromParent();
1364   Paired.I->eraseFromParent();
1365   return New;
1366 }
1367 
1368 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1369     CombineInfo &CI, CombineInfo &Paired,
1370     MachineBasicBlock::iterator InsertBefore) {
1371   MachineBasicBlock *MBB = CI.I->getParent();
1372   DebugLoc DL = CI.I->getDebugLoc();
1373   const unsigned Opcode = getNewOpcode(CI, Paired);
1374 
1375   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1376 
1377   Register DestReg = MRI->createVirtualRegister(SuperRC);
1378   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1379 
1380   // It shouldn't be possible to get this far if the two instructions
1381   // don't have a single memoperand, because MachineInstr::mayAlias()
1382   // will return true if this is the case.
1383   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1384 
1385   MachineInstrBuilder New =
1386       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1387           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1388   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1389     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1390   // For convenience, when SGPR_IMM buffer loads are merged into a
1391   // zero-offset load, we generate its SGPR variant.
1392   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset))
1393     New.addImm(MergedOffset);
1394   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1395 
1396   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1397   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1398   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1399 
1400   // Copy to the old destination registers.
1401   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1402   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1403   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1404 
1405   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1406       .add(*Dest0) // Copy to same destination including flags and sub reg.
1407       .addReg(DestReg, 0, SubRegIdx0);
1408   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1409       .add(*Dest1)
1410       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1411 
1412   CI.I->eraseFromParent();
1413   Paired.I->eraseFromParent();
1414   return New;
1415 }
1416 
1417 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1418     CombineInfo &CI, CombineInfo &Paired,
1419     MachineBasicBlock::iterator InsertBefore) {
1420   MachineBasicBlock *MBB = CI.I->getParent();
1421   DebugLoc DL = CI.I->getDebugLoc();
1422 
1423   const unsigned Opcode = getNewOpcode(CI, Paired);
1424 
1425   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1426 
1427   // Copy to the new source register.
1428   Register DestReg = MRI->createVirtualRegister(SuperRC);
1429   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1430 
1431   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1432 
1433   AddressRegs Regs = getRegs(Opcode, *TII);
1434 
1435   if (Regs.VAddr)
1436     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1437 
1438   // It shouldn't be possible to get this far if the two instructions
1439   // don't have a single memoperand, because MachineInstr::mayAlias()
1440   // will return true if this is the case.
1441   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1442 
1443   MachineInstr *New =
1444     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1445         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1446         .addImm(MergedOffset) // offset
1447         .addImm(CI.CPol)      // cpol
1448         .addImm(0)            // swz
1449         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1450 
1451   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1452   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1453   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1454 
1455   // Copy to the old destination registers.
1456   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1457   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1458   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1459 
1460   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1461       .add(*Dest0) // Copy to same destination including flags and sub reg.
1462       .addReg(DestReg, 0, SubRegIdx0);
1463   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1464       .add(*Dest1)
1465       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1466 
1467   CI.I->eraseFromParent();
1468   Paired.I->eraseFromParent();
1469   return New;
1470 }
1471 
1472 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1473     CombineInfo &CI, CombineInfo &Paired,
1474     MachineBasicBlock::iterator InsertBefore) {
1475   MachineBasicBlock *MBB = CI.I->getParent();
1476   DebugLoc DL = CI.I->getDebugLoc();
1477 
1478   const unsigned Opcode = getNewOpcode(CI, Paired);
1479 
1480   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1481 
1482   // Copy to the new source register.
1483   Register DestReg = MRI->createVirtualRegister(SuperRC);
1484   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1485 
1486   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1487 
1488   AddressRegs Regs = getRegs(Opcode, *TII);
1489 
1490   if (Regs.VAddr)
1491     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1492 
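  // The merged access reads CI.Width + Paired.Width components, so rebuild
  // the buffer format descriptor with the combined component count.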
1493   unsigned JoinedFormat =
1494       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1495 
1496   // It shouldn't be possible to get this far if the two instructions
1497   // don't have a single memoperand, because MachineInstr::mayAlias()
1498   // will return true if this is the case.
1499   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1500 
1501   MachineInstr *New =
1502       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1503           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1504           .addImm(MergedOffset) // offset
1505           .addImm(JoinedFormat) // format
1506           .addImm(CI.CPol)      // cpol
1507           .addImm(0)            // swz
1508           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1509 
1510   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1511   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1512   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1513 
1514   // Copy to the old destination registers.
1515   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1516   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1517   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1518 
1519   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1520       .add(*Dest0) // Copy to same destination including flags and sub reg.
1521       .addReg(DestReg, 0, SubRegIdx0);
1522   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1523       .add(*Dest1)
1524       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1525 
1526   CI.I->eraseFromParent();
1527   Paired.I->eraseFromParent();
1528   return New;
1529 }
1530 
1531 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1532     CombineInfo &CI, CombineInfo &Paired,
1533     MachineBasicBlock::iterator InsertBefore) {
1534   MachineBasicBlock *MBB = CI.I->getParent();
1535   DebugLoc DL = CI.I->getDebugLoc();
1536 
1537   const unsigned Opcode = getNewOpcode(CI, Paired);
1538 
1539   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1540   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1541   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1542 
1543   // Copy to the new source register.
1544   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1545   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1546 
1547   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1548   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1549 
1550   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1551       .add(*Src0)
1552       .addImm(SubRegIdx0)
1553       .add(*Src1)
1554       .addImm(SubRegIdx1);
1555 
1556   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1557                  .addReg(SrcReg, RegState::Kill);
1558 
1559   AddressRegs Regs = getRegs(Opcode, *TII);
1560 
1561   if (Regs.VAddr)
1562     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1563 
1564   unsigned JoinedFormat =
1565       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1566 
1567   // It shouldn't be possible to get this far if the two instructions
1568   // don't have a single memoperand, because MachineInstr::mayAlias()
1569   // will return true if this is the case.
1570   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1571 
1572   MachineInstr *New =
1573       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1574           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1575           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1576           .addImm(JoinedFormat)                     // format
1577           .addImm(CI.CPol)                          // cpol
1578           .addImm(0)                                // swz
1579           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1580 
1581   CI.I->eraseFromParent();
1582   Paired.I->eraseFromParent();
1583   return New;
1584 }
1585 
1586 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1587     CombineInfo &CI, CombineInfo &Paired,
1588     MachineBasicBlock::iterator InsertBefore) {
1589   MachineBasicBlock *MBB = CI.I->getParent();
1590   DebugLoc DL = CI.I->getDebugLoc();
1591 
1592   const unsigned Opcode = getNewOpcode(CI, Paired);
1593 
1594   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1595   Register DestReg = MRI->createVirtualRegister(SuperRC);
1596 
1597   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1598 
1599   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1600     MIB.add(*SAddr);
1601 
1602   MachineInstr *New =
1603     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1604        .addImm(std::min(CI.Offset, Paired.Offset))
1605        .addImm(CI.CPol)
1606        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1607 
1608   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1609   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1610   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1611 
1612   // Copy to the old destination registers.
1613   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1614   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1615   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1616 
1617   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1618       .add(*Dest0) // Copy to same destination including flags and sub reg.
1619       .addReg(DestReg, 0, SubRegIdx0);
1620   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1621       .add(*Dest1)
1622       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1623 
1624   CI.I->eraseFromParent();
1625   Paired.I->eraseFromParent();
1626   return New;
1627 }
1628 
1629 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1630     CombineInfo &CI, CombineInfo &Paired,
1631     MachineBasicBlock::iterator InsertBefore) {
1632   MachineBasicBlock *MBB = CI.I->getParent();
1633   DebugLoc DL = CI.I->getDebugLoc();
1634 
1635   const unsigned Opcode = getNewOpcode(CI, Paired);
1636 
1637   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1638   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1639   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1640 
1641   // Copy to the new source register.
1642   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1643   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1644 
1645   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1646   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1647 
1648   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1649       .add(*Src0)
1650       .addImm(SubRegIdx0)
1651       .add(*Src1)
1652       .addImm(SubRegIdx1);
1653 
1654   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1655                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1656                  .addReg(SrcReg, RegState::Kill);
1657 
1658   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1659     MIB.add(*SAddr);
1660 
1661   MachineInstr *New =
1662     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1663        .addImm(CI.CPol)
1664        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1665 
1666   CI.I->eraseFromParent();
1667   Paired.I->eraseFromParent();
1668   return New;
1669 }
1670 
1671 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1672                                             const CombineInfo &Paired) {
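  // Width is the total number of dwords (for MIMG, the number of enabled
  // dmask channels) that the merged instruction must cover.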
1673   const unsigned Width = CI.Width + Paired.Width;
1674 
1675   switch (getCommonInstClass(CI, Paired)) {
1676   default:
1677     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1678     // FIXME: Handle d16 correctly
1679     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1680                                   Width);
1681   case TBUFFER_LOAD:
1682   case TBUFFER_STORE:
1683     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1684                                   Width);
1685 
1686   case UNKNOWN:
1687     llvm_unreachable("Unknown instruction class");
1688   case S_BUFFER_LOAD_IMM:
1689     switch (Width) {
1690     default:
1691       return 0;
1692     case 2:
1693       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1694     case 4:
1695       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1696     case 8:
1697       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1698     }
1699   case S_BUFFER_LOAD_SGPR_IMM:
1700     switch (Width) {
1701     default:
1702       return 0;
1703     case 2:
1704       return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
1705                             : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1706     case 4:
1707       return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
1708                             : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1709     case 8:
1710       return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
1711                             : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1712     }
1713   case S_LOAD_IMM:
1714     switch (Width) {
1715     default:
1716       return 0;
1717     case 2:
1718       return AMDGPU::S_LOAD_DWORDX2_IMM;
1719     case 4:
1720       return AMDGPU::S_LOAD_DWORDX4_IMM;
1721     case 8:
1722       return AMDGPU::S_LOAD_DWORDX8_IMM;
1723     }
1724   case GLOBAL_LOAD:
1725     switch (Width) {
1726     default:
1727       return 0;
1728     case 2:
1729       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1730     case 3:
1731       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1732     case 4:
1733       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1734     }
1735   case GLOBAL_LOAD_SADDR:
1736     switch (Width) {
1737     default:
1738       return 0;
1739     case 2:
1740       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1741     case 3:
1742       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1743     case 4:
1744       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1745     }
1746   case GLOBAL_STORE:
1747     switch (Width) {
1748     default:
1749       return 0;
1750     case 2:
1751       return AMDGPU::GLOBAL_STORE_DWORDX2;
1752     case 3:
1753       return AMDGPU::GLOBAL_STORE_DWORDX3;
1754     case 4:
1755       return AMDGPU::GLOBAL_STORE_DWORDX4;
1756     }
1757   case GLOBAL_STORE_SADDR:
1758     switch (Width) {
1759     default:
1760       return 0;
1761     case 2:
1762       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1763     case 3:
1764       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1765     case 4:
1766       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1767     }
1768   case FLAT_LOAD:
1769     switch (Width) {
1770     default:
1771       return 0;
1772     case 2:
1773       return AMDGPU::FLAT_LOAD_DWORDX2;
1774     case 3:
1775       return AMDGPU::FLAT_LOAD_DWORDX3;
1776     case 4:
1777       return AMDGPU::FLAT_LOAD_DWORDX4;
1778     }
1779   case FLAT_STORE:
1780     switch (Width) {
1781     default:
1782       return 0;
1783     case 2:
1784       return AMDGPU::FLAT_STORE_DWORDX2;
1785     case 3:
1786       return AMDGPU::FLAT_STORE_DWORDX3;
1787     case 4:
1788       return AMDGPU::FLAT_STORE_DWORDX4;
1789     }
1790   case MIMG:
1791     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1792            "No overlaps");
1793     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1794   }
1795 }
1796 
1797 std::pair<unsigned, unsigned>
1798 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1799                                     const CombineInfo &Paired) {
1800   assert((CI.InstClass != MIMG ||
1801           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1802            CI.Width + Paired.Width)) &&
1803          "No overlaps");
1804 
1805   unsigned Idx0;
1806   unsigned Idx1;
1807 
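  // Idxs[Row][Col] is the subregister index spanning (Col + 1) consecutive
  // 32-bit lanes starting at lane Row, e.g. Idxs[1][2] == sub1_sub2_sub3.
  // The element of the pair that compares lower takes the low lanes of the
  // merged register; the other element starts immediately above it.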
1808   static const unsigned Idxs[5][4] = {
1809       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1810       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1811       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1812       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1813       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1814   };
1815 
1816   assert(CI.Width >= 1 && CI.Width <= 4);
1817   assert(Paired.Width >= 1 && Paired.Width <= 4);
1818 
1819   if (Paired < CI) {
1820     Idx1 = Idxs[0][Paired.Width - 1];
1821     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1822   } else {
1823     Idx0 = Idxs[0][CI.Width - 1];
1824     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1825   }
1826 
1827   return std::pair(Idx0, Idx1);
1828 }
1829 
1830 const TargetRegisterClass *
1831 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1832                                              const CombineInfo &Paired) {
1833   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1834       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1835     switch (CI.Width + Paired.Width) {
1836     default:
1837       return nullptr;
1838     case 2:
1839       return &AMDGPU::SReg_64_XEXECRegClass;
1840     case 4:
1841       return &AMDGPU::SGPR_128RegClass;
1842     case 8:
1843       return &AMDGPU::SGPR_256RegClass;
1844     case 16:
1845       return &AMDGPU::SGPR_512RegClass;
1846     }
1847   }
1848 
1849   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1850   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1851              ? TRI->getAGPRClassForBitWidth(BitWidth)
1852              : TRI->getVGPRClassForBitWidth(BitWidth);
1853 }
1854 
1855 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1856     CombineInfo &CI, CombineInfo &Paired,
1857     MachineBasicBlock::iterator InsertBefore) {
1858   MachineBasicBlock *MBB = CI.I->getParent();
1859   DebugLoc DL = CI.I->getDebugLoc();
1860 
1861   const unsigned Opcode = getNewOpcode(CI, Paired);
1862 
1863   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1864   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1865   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1866 
1867   // Copy to the new source register.
1868   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1869   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1870 
1871   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1872   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1873 
1874   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1875       .add(*Src0)
1876       .addImm(SubRegIdx0)
1877       .add(*Src1)
1878       .addImm(SubRegIdx1);
1879 
1880   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1881                  .addReg(SrcReg, RegState::Kill);
1882 
1883   AddressRegs Regs = getRegs(Opcode, *TII);
1884 
1885   if (Regs.VAddr)
1886     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1887 
1888 
1889   // It shouldn't be possible to get this far if the two instructions
1890   // don't have a single memoperand, because MachineInstr::mayAlias()
1891   // will return true if this is the case.
1892   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1893 
1894   MachineInstr *New =
1895     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1896         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1897         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1898         .addImm(CI.CPol)      // cpol
1899         .addImm(0)            // swz
1900         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1901 
1902   CI.I->eraseFromParent();
1903   Paired.I->eraseFromParent();
1904   return New;
1905 }
1906 
1907 MachineOperand
1908 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1909   APInt V(32, Val, true);
1910   if (TII->isInlineConstant(V))
1911     return MachineOperand::CreateImm(Val);
1912 
1913   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1914   MachineInstr *Mov =
1915   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1916           TII->get(AMDGPU::S_MOV_B32), Reg)
1917     .addImm(Val);
1918   (void)Mov;
1919   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1920   return MachineOperand::CreateReg(Reg, false);
1921 }
1922 
1923 // Compute base address using Addr and return the final register.
1924 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1925                                            const MemAddress &Addr) const {
1926   MachineBasicBlock *MBB = MI.getParent();
1927   MachineBasicBlock::iterator MBBI = MI.getIterator();
1928   DebugLoc DL = MI.getDebugLoc();
1929 
1930   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1931           Addr.Base.LoSubReg) &&
1932          "Expected 32-bit Base-Register-Low!!");
1933 
1934   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1935           Addr.Base.HiSubReg) &&
1936          "Expected 32-bit Base-Register-Hi!!");
1937 
1938   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1939   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1940   MachineOperand OffsetHi =
1941     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1942 
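  // Materialize Base + Offset as a 64-bit add split into a low 32-bit add
  // with carry-out and a high 32-bit add-with-carry, then recombine the two
  // halves with a REG_SEQUENCE.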
1943   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1944   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1945   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1946 
1947   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1948   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1949   MachineInstr *LoHalf =
1950     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1951       .addReg(CarryReg, RegState::Define)
1952       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1953       .add(OffsetLo)
1954       .addImm(0); // clamp bit
1955   (void)LoHalf;
1956   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1957 
1958   MachineInstr *HiHalf =
1959   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1960     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1961     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1962     .add(OffsetHi)
1963     .addReg(CarryReg, RegState::Kill)
1964     .addImm(0); // clamp bit
1965   (void)HiHalf;
1966   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1967 
1968   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1969   MachineInstr *FullBase =
1970     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1971       .addReg(DestSub0)
1972       .addImm(AMDGPU::sub0)
1973       .addReg(DestSub1)
1974       .addImm(AMDGPU::sub1);
1975   (void)FullBase;
1976   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1977 
1978   return FullDestReg;
1979 }
1980 
1981 // Update base and offset with the NewBase and NewOffset in MI.
1982 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1983                                                Register NewBase,
1984                                                int32_t NewOffset) const {
1985   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1986   Base->setReg(NewBase);
1987   Base->setIsKill(false);
1988   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1989 }
1990 
1991 std::optional<int32_t>
1992 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1993   if (Op.isImm())
1994     return Op.getImm();
1995 
1996   if (!Op.isReg())
1997     return std::nullopt;
1998 
1999   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2000   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2001       !Def->getOperand(1).isImm())
2002     return std::nullopt;
2003 
2004   return Def->getOperand(1).getImm();
2005 }
2006 
2007 // Analyze Base and extract:
2008 //  - the 32-bit base registers and subregisters
2009 //  - the 64-bit constant offset
2010 // Expecting base computation as:
2011 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2012 //   %LO:vgpr_32, %c:sreg_64_xexec =
2013 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
2014 //   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2015 //   %Base:vreg_64 =
2016 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2017 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2018                                                       MemAddress &Addr) const {
2019   if (!Base.isReg())
2020     return;
2021 
2022   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2023   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2024       || Def->getNumOperands() != 5)
2025     return;
2026 
2027   MachineOperand BaseLo = Def->getOperand(1);
2028   MachineOperand BaseHi = Def->getOperand(3);
2029   if (!BaseLo.isReg() || !BaseHi.isReg())
2030     return;
2031 
2032   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2033   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2034 
2035   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2036       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2037     return;
2038 
2039   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2040   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2041 
2042   auto Offset0P = extractConstOffset(*Src0);
2043   if (Offset0P)
2044     BaseLo = *Src1;
2045   else {
2046     if (!(Offset0P = extractConstOffset(*Src1)))
2047       return;
2048     BaseLo = *Src0;
2049   }
2050 
2051   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2052   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2053 
2054   if (Src0->isImm())
2055     std::swap(Src0, Src1);
2056 
2057   if (!Src1->isImm())
2058     return;
2059 
2060   uint64_t Offset1 = Src1->getImm();
2061   BaseHi = *Src0;
2062 
2063   Addr.Base.LoReg = BaseLo.getReg();
2064   Addr.Base.HiReg = BaseHi.getReg();
2065   Addr.Base.LoSubReg = BaseLo.getSubReg();
2066   Addr.Base.HiSubReg = BaseHi.getSubReg();
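  // Reassemble the 64-bit constant: the immediate added to the low half forms
  // the low 32 bits, and the immediate added to the high half the high 32
  // bits.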
2067   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2068 }
2069 
2070 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2071     MachineInstr &MI,
2072     MemInfoMap &Visited,
2073     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2074 
2075   if (!(MI.mayLoad() ^ MI.mayStore()))
2076     return false;
2077 
2078   // TODO: Support flat and scratch.
2079   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2080     return false;
2081 
2082   if (MI.mayLoad() &&
2083       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2084     return false;
2085 
2086   if (AnchorList.count(&MI))
2087     return false;
2088 
2089   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2090 
2091   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2092     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2093     return false;
2094   }
2095 
2096   // Step1: Find the base-registers and a 64bit constant offset.
2097   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2098   MemAddress MAddr;
2099   if (!Visited.contains(&MI)) {
2100     processBaseWithConstOffset(Base, MAddr);
2101     Visited[&MI] = MAddr;
2102   } else
2103     MAddr = Visited[&MI];
2104 
2105   if (MAddr.Offset == 0) {
2106     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2107                          " constant offsets that can be promoted.\n";);
2108     return false;
2109   }
2110 
2111   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2112              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2113 
2114   // Step2: Traverse MI's basic block and find an anchor (an access with
2115   // the same base registers) at the largest 13-bit distance from MI's offset.
2116   // E.g. (64bit loads)
2117   // bb:
2118   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2119   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2120   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2121   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2122   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2123   //
2124   // Starting from the first load, the optimization tries to find a new base
2125   // from which (&a + 4096) is within 13-bit reach. Both &a + 6144 and
2126   // &a + 8192 are within 13-bit reach of &a + 4096. The heuristic picks
2127   // &a + 8192 as the new base (anchor) because it is the farthest, which
2128   // presumably lets it cover the most intermediate bases.
2129   //
2130   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2131   // (&a + 8192) for load1, load2, load4.
2132   //   addr = &a + 8192
2133   //   load1 = load(addr,       -4096)
2134   //   load2 = load(addr,       -2048)
2135   //   load3 = load(addr,       0)
2136   //   load4 = load(addr,       2048)
2137   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2138   //
2139   MachineInstr *AnchorInst = nullptr;
2140   MemAddress AnchorAddr;
2141   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2142   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2143 
2144   MachineBasicBlock *MBB = MI.getParent();
2145   MachineBasicBlock::iterator E = MBB->end();
2146   MachineBasicBlock::iterator MBBI = MI.getIterator();
2147   ++MBBI;
2148   const SITargetLowering *TLI =
2149     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2150 
2151   for ( ; MBBI != E; ++MBBI) {
2152     MachineInstr &MINext = *MBBI;
2153     // TODO: Support finding an anchor(with same base) from store addresses or
2154     // any other load addresses where the opcodes are different.
2155     if (MINext.getOpcode() != MI.getOpcode() ||
2156         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2157       continue;
2158 
2159     const MachineOperand &BaseNext =
2160       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2161     MemAddress MAddrNext;
2162     if (!Visited.contains(&MINext)) {
2163       processBaseWithConstOffset(BaseNext, MAddrNext);
2164       Visited[&MINext] = MAddrNext;
2165     } else
2166       MAddrNext = Visited[&MINext];
2167 
2168     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2169         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2170         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2171         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2172       continue;
2173 
2174     InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2175 
2176     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2177     TargetLoweringBase::AddrMode AM;
2178     AM.HasBaseReg = true;
2179     AM.BaseOffs = Dist;
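    // Among the legal candidates, remember the farthest one; re-anchoring on
    // it leaves the largest range of offsets reachable (see Step2 above).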
2180     if (TLI->isLegalGlobalAddressingMode(AM) &&
2181         (uint32_t)std::abs(Dist) > MaxDist) {
2182       MaxDist = std::abs(Dist);
2183 
2184       AnchorAddr = MAddrNext;
2185       AnchorInst = &MINext;
2186     }
2187   }
2188 
2189   if (AnchorInst) {
2190     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2191                AnchorInst->dump());
2192     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2193                <<  AnchorAddr.Offset << "\n\n");
2194 
2195     // Instead of moving up, just re-compute anchor-instruction's base address.
2196     Register Base = computeBase(MI, AnchorAddr);
2197 
2198     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2199     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2200 
2201     for (auto P : InstsWCommonBase) {
2202       TargetLoweringBase::AddrMode AM;
2203       AM.HasBaseReg = true;
2204       AM.BaseOffs = P.second - AnchorAddr.Offset;
2205 
2206       if (TLI->isLegalGlobalAddressingMode(AM)) {
2207         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
2208                    dbgs() << ")"; P.first->dump());
2209         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2210         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
2211       }
2212     }
2213     AnchorList.insert(AnchorInst);
2214     return true;
2215   }
2216 
2217   return false;
2218 }
2219 
2220 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2221                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2222   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2223     if (AddrList.front().InstClass == CI.InstClass &&
2224         AddrList.front().IsAGPR == CI.IsAGPR &&
2225         AddrList.front().hasSameBaseAddress(CI)) {
2226       AddrList.emplace_back(CI);
2227       return;
2228     }
2229   }
2230 
2231   // Base address not found, so add a new list.
2232   MergeableInsts.emplace_back(1, CI);
2233 }
2234 
2235 std::pair<MachineBasicBlock::iterator, bool>
2236 SILoadStoreOptimizer::collectMergeableInsts(
2237     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2238     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2239     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2240   bool Modified = false;
2241 
2242   // Sort potentially mergeable instructions into lists, one per base address.
2243   unsigned Order = 0;
2244   MachineBasicBlock::iterator BlockI = Begin;
2245   for (; BlockI != End; ++BlockI) {
2246     MachineInstr &MI = *BlockI;
2247 
2248     // We run this before checking if an address is mergeable, because it can produce
2249     // better code even if the instructions aren't mergeable.
2250     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2251       Modified = true;
2252 
2253     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2254     // barriers. We can look after this barrier for separate merges.
2255     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2256       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2257 
2258       // Search will resume after this instruction in a separate merge list.
2259       ++BlockI;
2260       break;
2261     }
2262 
2263     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2264     if (InstClass == UNKNOWN)
2265       continue;
2266 
2267     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2268     int Swizzled =
2269         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2270     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2271       continue;
2272 
2273     CombineInfo CI;
2274     CI.setMI(MI, *this);
2275     CI.Order = Order++;
2276 
2277     if (!CI.hasMergeableAddress(*MRI))
2278       continue;
2279 
2280     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2281       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2282       //        operands. However, we report that ds_write2 shall have only
2283       //        VGPR data so that machine copy propagation does not create
2284       //        an illegal instruction with VGPR and AGPR sources.
2285       //        Consequently, if we create such an instruction, the verifier
2286       //        will complain.
2287       continue;
2288     }
2289 
2290     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2291 
2292     addInstToMergeableList(CI, MergeableInsts);
2293   }
2294 
2295   // At this point we have lists of Mergeable instructions.
2296   //
2297   // Part 2: Sort lists by offset and then for each CombineInfo object in the
2298   // list try to find an instruction that can be merged with I.  If an instruction
2299   // is found, it is stored in the Paired field.  If no instructions are found, then
2300   // the CombineInfo object is deleted from the list.
2301 
2302   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2303                                                    E = MergeableInsts.end(); I != E;) {
2304 
2305     std::list<CombineInfo> &MergeList = *I;
2306     if (MergeList.size() <= 1) {
2307       // This means we have found only one instruction with a given address
2308       // that can be merged, and we need at least 2 instructions to do a merge,
2309       // so this list can be discarded.
2310       I = MergeableInsts.erase(I);
2311       continue;
2312     }
2313 
2314     // Sort the lists by offsets, this way mergeable instructions will be
2315     // adjacent to each other in the list, which will make it easier to find
2316     // matches.
2317     MergeList.sort(
2318         [] (const CombineInfo &A, const CombineInfo &B) {
2319           return A.Offset < B.Offset;
2320         });
2321     ++I;
2322   }
2323 
2324   return std::pair(BlockI, Modified);
2325 }
2326 
2327 // Scan through looking for adjacent LDS operations with constant offsets from
2328 // the same base register. We rely on the scheduler to do the hard work of
2329 // clustering nearby loads, and assume these are all adjacent.
2330 bool SILoadStoreOptimizer::optimizeBlock(
2331                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2332   bool Modified = false;
2333 
2334   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2335                                                    E = MergeableInsts.end(); I != E;) {
2336     std::list<CombineInfo> &MergeList = *I;
2337 
2338     bool OptimizeListAgain = false;
2339     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2340       // We weren't able to make any changes, so delete the list so we don't
2341       // process the same instructions the next time we try to optimize this
2342       // block.
2343       I = MergeableInsts.erase(I);
2344       continue;
2345     }
2346 
2347     Modified = true;
2348 
2349     // We made changes, but also determined that there were no more optimization
2350     // opportunities, so we don't need to reprocess the list
2351     if (!OptimizeListAgain) {
2352       I = MergeableInsts.erase(I);
2353       continue;
2354     }
2355     OptimizeAgain = true;
2356   }
2357   return Modified;
2358 }
2359 
2360 bool
2361 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2362                                           std::list<CombineInfo> &MergeList,
2363                                           bool &OptimizeListAgain) {
2364   if (MergeList.empty())
2365     return false;
2366 
2367   bool Modified = false;
2368 
2369   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2370        Next = std::next(I)) {
2371 
2372     auto First = I;
2373     auto Second = Next;
2374 
2375     if ((*First).Order > (*Second).Order)
2376       std::swap(First, Second);
2377     CombineInfo &CI = *First;
2378     CombineInfo &Paired = *Second;
2379 
2380     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2381     if (!Where) {
2382       ++I;
2383       continue;
2384     }
2385 
2386     Modified = true;
2387 
2388     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2389 
2390     MachineBasicBlock::iterator NewMI;
2391     switch (CI.InstClass) {
2392     default:
2393       llvm_unreachable("unknown InstClass");
2394       break;
2395     case DS_READ:
2396       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2397       break;
2398     case DS_WRITE:
2399       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2400       break;
2401     case S_BUFFER_LOAD_IMM:
2402     case S_BUFFER_LOAD_SGPR_IMM:
2403     case S_LOAD_IMM:
2404       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2405       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2406       break;
2407     case BUFFER_LOAD:
2408       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2409       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2410       break;
2411     case BUFFER_STORE:
2412       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2413       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2414       break;
2415     case MIMG:
2416       NewMI = mergeImagePair(CI, Paired, Where->I);
2417       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2418       break;
2419     case TBUFFER_LOAD:
2420       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2421       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2422       break;
2423     case TBUFFER_STORE:
2424       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2425       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2426       break;
2427     case FLAT_LOAD:
2428     case GLOBAL_LOAD:
2429     case GLOBAL_LOAD_SADDR:
2430       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2431       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2432       break;
2433     case FLAT_STORE:
2434     case GLOBAL_STORE:
2435     case GLOBAL_STORE_SADDR:
2436       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2437       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2438       break;
2439     }
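    // CI now describes the merged instruction and remains in the list so it
    // can be merged again into an even wider access; the entry that was
    // folded into it is erased below.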
2440     CI.setMI(NewMI, *this);
2441     CI.Order = Where->Order;
2442     if (I == Second)
2443       I = Next;
2444 
2445     MergeList.erase(Second);
2446   }
2447 
2448   return Modified;
2449 }
2450 
2451 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2452   if (skipFunction(MF.getFunction()))
2453     return false;
2454 
2455   STM = &MF.getSubtarget<GCNSubtarget>();
2456   if (!STM->loadStoreOptEnabled())
2457     return false;
2458 
2459   TII = STM->getInstrInfo();
2460   TRI = &TII->getRegisterInfo();
2461 
2462   MRI = &MF.getRegInfo();
2463   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2464 
2465   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2466 
2467   bool Modified = false;
2468 
2469   // Contains the list of instructions for which constant offsets are being
2470   // promoted to the IMM. This is tracked for an entire block at a time.
2471   SmallPtrSet<MachineInstr *, 4> AnchorList;
2472   MemInfoMap Visited;
2473 
2474   for (MachineBasicBlock &MBB : MF) {
2475     MachineBasicBlock::iterator SectionEnd;
2476     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2477          I = SectionEnd) {
2478       bool CollectModified;
2479       std::list<std::list<CombineInfo>> MergeableInsts;
2480 
2481       // First pass: Collect list of all instructions we know how to merge in a
2482       // subset of the block.
2483       std::tie(SectionEnd, CollectModified) =
2484           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2485 
2486       Modified |= CollectModified;
2487 
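      // Second pass: keep re-running the merge until a fixed point, since a
      // merged access can itself become one half of a wider merge.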
2488       do {
2489         OptimizeAgain = false;
2490         Modified |= optimizeBlock(MergeableInsts);
2491       } while (OptimizeAgain);
2492     }
2493 
2494     Visited.clear();
2495     AnchorList.clear();
2496   }
2497 
2498   return Modified;
2499 }
2500