1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //  ds_read2_b32 v[0:1], v2 offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset into the immediate by
23 // adjusting the base. It tries to reuse a base from nearby instructions that
24 // leaves a 13-bit constant offset, which is then folded into the immediate.
25 //
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Recomputing live intervals seems inefficient. This currently matches one
50 //   pair at a time, recomputes live intervals, and moves on to the next pair.
51 //   It would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset field, but are close enough together that their differences fit in
56 //   8 bits, we can add to the base pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   S_BUFFER_LOAD_SGPR_IMM,
78   S_LOAD_IMM,
79   BUFFER_LOAD,
80   BUFFER_STORE,
81   MIMG,
82   TBUFFER_LOAD,
83   TBUFFER_STORE,
84   GLOBAL_LOAD_SADDR,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE,
88   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89   GLOBAL_STORE // any CombineInfo, they are only ever returned by
90                // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94   unsigned char NumVAddrs = 0;
95   bool SBase = false;
96   bool SRsrc = false;
97   bool SOffset = false;
98   bool SAddr = false;
99   bool VAddr = false;
100   bool Addr = false;
101   bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
123     unsigned Order;
124 
125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145          return false;
146         }
147       }
148       return true;
149     }
150 
151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that aren't immediates or registers.
159         // TODO: Should be possible to merge FrameIndexes and maybe some other
160         // non-register operands.
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge instructions with other physical reg
165         // addresses too.
166         if (AddrOp->getReg().isPhysical() &&
167             AddrOp->getReg() != AMDGPU::SGPR_NULL)
168           return false;
169 
170         // If an address has only one use then there will be no other
171         // instructions with the same address, so we can't merge this one.
172         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173           return false;
174       }
175       return true;
176     }
177 
178     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179 
180     // Compare by pointer order.
181     bool operator<(const CombineInfo& Other) const {
182       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183     }
184   };
185 
186   struct BaseRegisters {
187     Register LoReg;
188     Register HiReg;
189 
190     unsigned LoSubReg = 0;
191     unsigned HiSubReg = 0;
192   };
193 
194   struct MemAddress {
195     BaseRegisters Base;
196     int64_t Offset = 0;
197   };
198 
199   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200 
201 private:
202   const GCNSubtarget *STM = nullptr;
203   const SIInstrInfo *TII = nullptr;
204   const SIRegisterInfo *TRI = nullptr;
205   MachineRegisterInfo *MRI = nullptr;
206   AliasAnalysis *AA = nullptr;
207   bool OptimizeAgain;
208 
209   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210                            const DenseSet<Register> &ARegUses,
211                            const MachineInstr &A, const MachineInstr &B) const;
212   static bool dmasksCanBeCombined(const CombineInfo &CI,
213                                   const SIInstrInfo &TII,
214                                   const CombineInfo &Paired);
215   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216                                    CombineInfo &Paired, bool Modify = false);
217   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218                         const CombineInfo &Paired);
219   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221                                                      const CombineInfo &Paired);
222   const TargetRegisterClass *
223   getTargetRegisterClass(const CombineInfo &CI,
224                          const CombineInfo &Paired) const;
225   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
226 
227   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
228 
229   void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
230                       MachineBasicBlock::iterator InsertBefore, int OpName,
231                       Register DestReg) const;
232   Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
233                            MachineBasicBlock::iterator InsertBefore,
234                            int OpName) const;
235 
236   unsigned read2Opcode(unsigned EltSize) const;
237   unsigned read2ST64Opcode(unsigned EltSize) const;
238   MachineBasicBlock::iterator
239   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
240                  MachineBasicBlock::iterator InsertBefore);
241 
242   unsigned write2Opcode(unsigned EltSize) const;
243   unsigned write2ST64Opcode(unsigned EltSize) const;
244   MachineBasicBlock::iterator
245   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
246                   MachineBasicBlock::iterator InsertBefore);
247   MachineBasicBlock::iterator
248   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
249                  MachineBasicBlock::iterator InsertBefore);
250   MachineBasicBlock::iterator
251   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
252                        MachineBasicBlock::iterator InsertBefore);
253   MachineBasicBlock::iterator
254   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
255                       MachineBasicBlock::iterator InsertBefore);
256   MachineBasicBlock::iterator
257   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
258                        MachineBasicBlock::iterator InsertBefore);
259   MachineBasicBlock::iterator
260   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
261                        MachineBasicBlock::iterator InsertBefore);
262   MachineBasicBlock::iterator
263   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
264                         MachineBasicBlock::iterator InsertBefore);
265   MachineBasicBlock::iterator
266   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
267                     MachineBasicBlock::iterator InsertBefore);
268   MachineBasicBlock::iterator
269   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
270                      MachineBasicBlock::iterator InsertBefore);
271 
272   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
273                            int32_t NewOffset) const;
274   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
275   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
276   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
277   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
278   /// Promotes a constant offset into the immediate by adjusting the base. It
279   /// tries to reuse a base from nearby instructions that leaves a 13-bit
280   /// constant offset, which is then folded into the immediate.
281   bool promoteConstantOffsetToImm(MachineInstr &CI,
282                                   MemInfoMap &Visited,
283                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
284   void addInstToMergeableList(const CombineInfo &CI,
285                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
286 
287   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
288       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
289       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
290       std::list<std::list<CombineInfo>> &MergeableInsts) const;
291 
292   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
293                                                      const CombineInfo &Paired);
294 
295   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
296                                           const CombineInfo &Paired);
297 
298 public:
299   static char ID;
300 
301   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
302     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
303   }
304 
305   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
306                                      bool &OptimizeListAgain);
307   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
308 
309   bool runOnMachineFunction(MachineFunction &MF) override;
310 
311   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
312 
313   void getAnalysisUsage(AnalysisUsage &AU) const override {
314     AU.setPreservesCFG();
315     AU.addRequired<AAResultsWrapperPass>();
316 
317     MachineFunctionPass::getAnalysisUsage(AU);
318   }
319 
320   MachineFunctionProperties getRequiredProperties() const override {
321     return MachineFunctionProperties()
322       .set(MachineFunctionProperties::Property::IsSSA);
323   }
324 };
325 
326 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
327   const unsigned Opc = MI.getOpcode();
328 
329   if (TII.isMUBUF(Opc)) {
330     // FIXME: Handle d16 correctly
331     return AMDGPU::getMUBUFElements(Opc);
332   }
333   if (TII.isImage(MI)) {
334     uint64_t DMaskImm =
335         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
336     return llvm::popcount(DMaskImm);
337   }
338   if (TII.isMTBUF(Opc)) {
339     return AMDGPU::getMTBUFElements(Opc);
340   }
341 
342   switch (Opc) {
343   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
344   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
345   case AMDGPU::S_LOAD_DWORD_IMM:
346   case AMDGPU::GLOBAL_LOAD_DWORD:
347   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
348   case AMDGPU::GLOBAL_STORE_DWORD:
349   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
350   case AMDGPU::FLAT_LOAD_DWORD:
351   case AMDGPU::FLAT_STORE_DWORD:
352     return 1;
353   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355   case AMDGPU::S_LOAD_DWORDX2_IMM:
356   case AMDGPU::GLOBAL_LOAD_DWORDX2:
357   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
358   case AMDGPU::GLOBAL_STORE_DWORDX2:
359   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
360   case AMDGPU::FLAT_LOAD_DWORDX2:
361   case AMDGPU::FLAT_STORE_DWORDX2:
362     return 2;
363   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
364   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
365   case AMDGPU::S_LOAD_DWORDX3_IMM:
366   case AMDGPU::GLOBAL_LOAD_DWORDX3:
367   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
368   case AMDGPU::GLOBAL_STORE_DWORDX3:
369   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
370   case AMDGPU::FLAT_LOAD_DWORDX3:
371   case AMDGPU::FLAT_STORE_DWORDX3:
372     return 3;
373   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
374   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
375   case AMDGPU::S_LOAD_DWORDX4_IMM:
376   case AMDGPU::GLOBAL_LOAD_DWORDX4:
377   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
378   case AMDGPU::GLOBAL_STORE_DWORDX4:
379   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
380   case AMDGPU::FLAT_LOAD_DWORDX4:
381   case AMDGPU::FLAT_STORE_DWORDX4:
382     return 4;
383   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
384   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
385   case AMDGPU::S_LOAD_DWORDX8_IMM:
386     return 8;
387   case AMDGPU::DS_READ_B32:
388   case AMDGPU::DS_READ_B32_gfx9:
389   case AMDGPU::DS_WRITE_B32:
390   case AMDGPU::DS_WRITE_B32_gfx9:
391     return 1;
392   case AMDGPU::DS_READ_B64:
393   case AMDGPU::DS_READ_B64_gfx9:
394   case AMDGPU::DS_WRITE_B64:
395   case AMDGPU::DS_WRITE_B64_gfx9:
396     return 2;
397   default:
398     return 0;
399   }
400 }
401 
402 /// Maps instruction opcode to enum InstClassEnum.
403 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
404   switch (Opc) {
405   default:
406     if (TII.isMUBUF(Opc)) {
407       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
408       default:
409         return UNKNOWN;
410       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
411       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
412       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
413       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
414       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
415       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
416       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
417       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
418       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
419       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
420       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
421       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
422       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
423       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
424       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
425       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
426         return BUFFER_LOAD;
427       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
428       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
429       case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
430       case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
431       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
432       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
433       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
434       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
435       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
436       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
437       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
438       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
439       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
440       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
441       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
442       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
443         return BUFFER_STORE;
444       }
445     }
446     if (TII.isImage(Opc)) {
447       // Ignore instructions encoded without vaddr.
448       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
449           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
450         return UNKNOWN;
451       // Ignore BVH instructions
452       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
453         return UNKNOWN;
454       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
455       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
456           TII.isGather4(Opc))
457         return UNKNOWN;
458       return MIMG;
459     }
460     if (TII.isMTBUF(Opc)) {
461       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
462       default:
463         return UNKNOWN;
464       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
465       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
466       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
467       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
468       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
469       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
470       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
471       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
472       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
473       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
474       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
475       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
476       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
477       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
478       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
479       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
480         return TBUFFER_LOAD;
481       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
482       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
483       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
484       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
485       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
486       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
487       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
488       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
489         return TBUFFER_STORE;
490       }
491     }
492     return UNKNOWN;
493   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
494   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
495   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
496   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
497   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
498     return S_BUFFER_LOAD_IMM;
499   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
500   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
501   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
502   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
503   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
504     return S_BUFFER_LOAD_SGPR_IMM;
505   case AMDGPU::S_LOAD_DWORD_IMM:
506   case AMDGPU::S_LOAD_DWORDX2_IMM:
507   case AMDGPU::S_LOAD_DWORDX3_IMM:
508   case AMDGPU::S_LOAD_DWORDX4_IMM:
509   case AMDGPU::S_LOAD_DWORDX8_IMM:
510     return S_LOAD_IMM;
511   case AMDGPU::DS_READ_B32:
512   case AMDGPU::DS_READ_B32_gfx9:
513   case AMDGPU::DS_READ_B64:
514   case AMDGPU::DS_READ_B64_gfx9:
515     return DS_READ;
516   case AMDGPU::DS_WRITE_B32:
517   case AMDGPU::DS_WRITE_B32_gfx9:
518   case AMDGPU::DS_WRITE_B64:
519   case AMDGPU::DS_WRITE_B64_gfx9:
520     return DS_WRITE;
521   case AMDGPU::GLOBAL_LOAD_DWORD:
522   case AMDGPU::GLOBAL_LOAD_DWORDX2:
523   case AMDGPU::GLOBAL_LOAD_DWORDX3:
524   case AMDGPU::GLOBAL_LOAD_DWORDX4:
525   case AMDGPU::FLAT_LOAD_DWORD:
526   case AMDGPU::FLAT_LOAD_DWORDX2:
527   case AMDGPU::FLAT_LOAD_DWORDX3:
528   case AMDGPU::FLAT_LOAD_DWORDX4:
529     return FLAT_LOAD;
530   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
531   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
532   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
533   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
534     return GLOBAL_LOAD_SADDR;
535   case AMDGPU::GLOBAL_STORE_DWORD:
536   case AMDGPU::GLOBAL_STORE_DWORDX2:
537   case AMDGPU::GLOBAL_STORE_DWORDX3:
538   case AMDGPU::GLOBAL_STORE_DWORDX4:
539   case AMDGPU::FLAT_STORE_DWORD:
540   case AMDGPU::FLAT_STORE_DWORDX2:
541   case AMDGPU::FLAT_STORE_DWORDX3:
542   case AMDGPU::FLAT_STORE_DWORDX4:
543     return FLAT_STORE;
544   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
545   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
546   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
547   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
548     return GLOBAL_STORE_SADDR;
549   }
550 }
551 
552 /// Determines instruction subclass from opcode. Only instructions
553 /// of the same subclass can be merged together. The merged instruction may have
554 /// a different subclass but must have the same class.
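/// E.g. an IMAGE_SAMPLE and an IMAGE_LOAD are both class MIMG but have
/// different MIMG base opcodes, so they are never paired, while merging two
/// TBUFFER_LOAD_FORMAT_X loads yields a TBUFFER_LOAD_FORMAT_XY, which is a
/// different subclass of the same TBUFFER_LOAD class.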
555 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
556   switch (Opc) {
557   default:
558     if (TII.isMUBUF(Opc))
559       return AMDGPU::getMUBUFBaseOpcode(Opc);
560     if (TII.isImage(Opc)) {
561       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
562       assert(Info);
563       return Info->BaseOpcode;
564     }
565     if (TII.isMTBUF(Opc))
566       return AMDGPU::getMTBUFBaseOpcode(Opc);
567     return -1;
568   case AMDGPU::DS_READ_B32:
569   case AMDGPU::DS_READ_B32_gfx9:
570   case AMDGPU::DS_READ_B64:
571   case AMDGPU::DS_READ_B64_gfx9:
572   case AMDGPU::DS_WRITE_B32:
573   case AMDGPU::DS_WRITE_B32_gfx9:
574   case AMDGPU::DS_WRITE_B64:
575   case AMDGPU::DS_WRITE_B64_gfx9:
576     return Opc;
577   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
578   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
579   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
580   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
581   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
582     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
583   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
584   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
585   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
586   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
587   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
588     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
589   case AMDGPU::S_LOAD_DWORD_IMM:
590   case AMDGPU::S_LOAD_DWORDX2_IMM:
591   case AMDGPU::S_LOAD_DWORDX3_IMM:
592   case AMDGPU::S_LOAD_DWORDX4_IMM:
593   case AMDGPU::S_LOAD_DWORDX8_IMM:
594     return AMDGPU::S_LOAD_DWORD_IMM;
595   case AMDGPU::GLOBAL_LOAD_DWORD:
596   case AMDGPU::GLOBAL_LOAD_DWORDX2:
597   case AMDGPU::GLOBAL_LOAD_DWORDX3:
598   case AMDGPU::GLOBAL_LOAD_DWORDX4:
599   case AMDGPU::FLAT_LOAD_DWORD:
600   case AMDGPU::FLAT_LOAD_DWORDX2:
601   case AMDGPU::FLAT_LOAD_DWORDX3:
602   case AMDGPU::FLAT_LOAD_DWORDX4:
603     return AMDGPU::FLAT_LOAD_DWORD;
604   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
605   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
606   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
607   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
608     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
609   case AMDGPU::GLOBAL_STORE_DWORD:
610   case AMDGPU::GLOBAL_STORE_DWORDX2:
611   case AMDGPU::GLOBAL_STORE_DWORDX3:
612   case AMDGPU::GLOBAL_STORE_DWORDX4:
613   case AMDGPU::FLAT_STORE_DWORD:
614   case AMDGPU::FLAT_STORE_DWORDX2:
615   case AMDGPU::FLAT_STORE_DWORDX3:
616   case AMDGPU::FLAT_STORE_DWORDX4:
617     return AMDGPU::FLAT_STORE_DWORD;
618   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
619   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
620   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
621   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
622     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
623   }
624 }
625 
626 // GLOBAL loads and stores are classified as FLAT initially. If both combined
627 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
628 // GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
629 // the resulting combined operation will be FLAT, potentially promoting one of
630 // the GLOBAL operations to FLAT.
631 // For other instructions return the original unmodified class.
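// E.g. two GLOBAL_LOAD_DWORDs (class FLAT_LOAD) combine as a GLOBAL load,
// while a GLOBAL_LOAD_DWORD paired with a FLAT_LOAD_DWORD stays FLAT.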
632 InstClassEnum
633 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
634                                          const CombineInfo &Paired) {
635   assert(CI.InstClass == Paired.InstClass);
636 
637   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
638       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
639     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
640 
641   return CI.InstClass;
642 }
643 
644 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
645   AddressRegs Result;
646 
647   if (TII.isMUBUF(Opc)) {
648     if (AMDGPU::getMUBUFHasVAddr(Opc))
649       Result.VAddr = true;
650     if (AMDGPU::getMUBUFHasSrsrc(Opc))
651       Result.SRsrc = true;
652     if (AMDGPU::getMUBUFHasSoffset(Opc))
653       Result.SOffset = true;
654 
655     return Result;
656   }
657 
658   if (TII.isImage(Opc)) {
659     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
660     if (VAddr0Idx >= 0) {
661       int RsrcName =
662           TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
663       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
664       Result.NumVAddrs = RsrcIdx - VAddr0Idx;
665     } else {
666       Result.VAddr = true;
667     }
668     Result.SRsrc = true;
669     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
670     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
671       Result.SSamp = true;
672 
673     return Result;
674   }
675   if (TII.isMTBUF(Opc)) {
676     if (AMDGPU::getMTBUFHasVAddr(Opc))
677       Result.VAddr = true;
678     if (AMDGPU::getMTBUFHasSrsrc(Opc))
679       Result.SRsrc = true;
680     if (AMDGPU::getMTBUFHasSoffset(Opc))
681       Result.SOffset = true;
682 
683     return Result;
684   }
685 
686   switch (Opc) {
687   default:
688     return Result;
689   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
690   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
691   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
692   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
693   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
694     Result.SOffset = true;
695     [[fallthrough]];
696   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
697   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
698   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
699   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
700   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
701   case AMDGPU::S_LOAD_DWORD_IMM:
702   case AMDGPU::S_LOAD_DWORDX2_IMM:
703   case AMDGPU::S_LOAD_DWORDX3_IMM:
704   case AMDGPU::S_LOAD_DWORDX4_IMM:
705   case AMDGPU::S_LOAD_DWORDX8_IMM:
706     Result.SBase = true;
707     return Result;
708   case AMDGPU::DS_READ_B32:
709   case AMDGPU::DS_READ_B64:
710   case AMDGPU::DS_READ_B32_gfx9:
711   case AMDGPU::DS_READ_B64_gfx9:
712   case AMDGPU::DS_WRITE_B32:
713   case AMDGPU::DS_WRITE_B64:
714   case AMDGPU::DS_WRITE_B32_gfx9:
715   case AMDGPU::DS_WRITE_B64_gfx9:
716     Result.Addr = true;
717     return Result;
718   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
719   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
720   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
721   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
722   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
723   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
724   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
725   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
726     Result.SAddr = true;
727     [[fallthrough]];
728   case AMDGPU::GLOBAL_LOAD_DWORD:
729   case AMDGPU::GLOBAL_LOAD_DWORDX2:
730   case AMDGPU::GLOBAL_LOAD_DWORDX3:
731   case AMDGPU::GLOBAL_LOAD_DWORDX4:
732   case AMDGPU::GLOBAL_STORE_DWORD:
733   case AMDGPU::GLOBAL_STORE_DWORDX2:
734   case AMDGPU::GLOBAL_STORE_DWORDX3:
735   case AMDGPU::GLOBAL_STORE_DWORDX4:
736   case AMDGPU::FLAT_LOAD_DWORD:
737   case AMDGPU::FLAT_LOAD_DWORDX2:
738   case AMDGPU::FLAT_LOAD_DWORDX3:
739   case AMDGPU::FLAT_LOAD_DWORDX4:
740   case AMDGPU::FLAT_STORE_DWORD:
741   case AMDGPU::FLAT_STORE_DWORDX2:
742   case AMDGPU::FLAT_STORE_DWORDX3:
743   case AMDGPU::FLAT_STORE_DWORDX4:
744     Result.VAddr = true;
745     return Result;
746   }
747 }
748 
749 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
750                                               const SILoadStoreOptimizer &LSO) {
751   I = MI;
752   unsigned Opc = MI->getOpcode();
753   InstClass = getInstClass(Opc, *LSO.TII);
754 
755   if (InstClass == UNKNOWN)
756     return;
757 
758   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
759 
760   switch (InstClass) {
761   case DS_READ:
762     EltSize =
763           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
764                                                                           : 4;
765     break;
766   case DS_WRITE:
767     EltSize =
768           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
769                                                                             : 4;
770     break;
771   case S_BUFFER_LOAD_IMM:
772   case S_BUFFER_LOAD_SGPR_IMM:
773   case S_LOAD_IMM:
774     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
775     break;
776   default:
777     EltSize = 4;
778     break;
779   }
780 
781   if (InstClass == MIMG) {
782     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
783     // Offset is not considered for MIMG instructions.
784     Offset = 0;
785   } else {
786     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
787     Offset = I->getOperand(OffsetIdx).getImm();
788   }
789 
790   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
791     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
792 
793   Width = getOpcodeWidth(*I, *LSO.TII);
794 
795   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
796     Offset &= 0xffff;
797   } else if (InstClass != MIMG) {
798     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
799   }
800 
801   AddressRegs Regs = getRegs(Opc, *LSO.TII);
802   bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
803 
804   NumAddresses = 0;
805   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
806     AddrIdx[NumAddresses++] =
807         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
808   if (Regs.Addr)
809     AddrIdx[NumAddresses++] =
810         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
811   if (Regs.SBase)
812     AddrIdx[NumAddresses++] =
813         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
814   if (Regs.SRsrc)
815     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
816         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
817   if (Regs.SOffset)
818     AddrIdx[NumAddresses++] =
819         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
820   if (Regs.SAddr)
821     AddrIdx[NumAddresses++] =
822         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
823   if (Regs.VAddr)
824     AddrIdx[NumAddresses++] =
825         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
826   if (Regs.SSamp)
827     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
828         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
829   assert(NumAddresses <= MaxAddressRegs);
830 
831   for (unsigned J = 0; J < NumAddresses; J++)
832     AddrReg[J] = &I->getOperand(AddrIdx[J]);
833 }
834 
835 } // end anonymous namespace.
836 
837 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
838                       "SI Load Store Optimizer", false, false)
839 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
840 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
841                     false, false)
842 
843 char SILoadStoreOptimizer::ID = 0;
844 
845 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
846 
847 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
848   return new SILoadStoreOptimizer();
849 }
850 
851 static void addDefsUsesToList(const MachineInstr &MI,
852                               DenseSet<Register> &RegDefs,
853                               DenseSet<Register> &RegUses) {
854   for (const auto &Op : MI.operands()) {
855     if (!Op.isReg())
856       continue;
857     if (Op.isDef())
858       RegDefs.insert(Op.getReg());
859     if (Op.readsReg())
860       RegUses.insert(Op.getReg());
861   }
862 }
863 
864 bool SILoadStoreOptimizer::canSwapInstructions(
865     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
866     const MachineInstr &A, const MachineInstr &B) const {
867   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
868       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
869     return false;
870   for (const auto &BOp : B.operands()) {
871     if (!BOp.isReg())
872       continue;
873     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
874       return false;
875     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
876       return false;
877   }
878   return true;
879 }
880 
881 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
882 // MMO for the combined operation with a new access size.
883 MachineMemOperand *
884 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
885                                                const CombineInfo &Paired) {
886   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
887   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
888 
889   unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
890 
891   // A base pointer for the combined operation is the same as the leading
892   // operation's pointer.
893   if (Paired < CI)
894     std::swap(MMOa, MMOb);
895 
896   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
897   // If merging FLAT and GLOBAL set address space to FLAT.
898   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
899     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
900 
901   MachineFunction *MF = CI.I->getMF();
902   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
903 }
904 
905 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
906                                                const SIInstrInfo &TII,
907                                                const CombineInfo &Paired) {
908   assert(CI.InstClass == MIMG);
909 
910   // Ignore instructions with tfe/lwe set.
911   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
912   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
913 
914   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
915     return false;
916 
917   // Check other optional immediate operands for equality.
918   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
919                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
920                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
921 
922   for (auto op : OperandsToMatch) {
923     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
924     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
925       return false;
926     if (Idx != -1 &&
927         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
928       return false;
929   }
930 
931   // Check DMask for overlaps.
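  // The dmasks must select disjoint components, with every component selected
  // by the smaller mask below the lowest component of the larger one, e.g.
  // 0b0011 and 0b1100 can merge into 0b1111 while 0b0101 and 0b1010 cannot.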
932   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
933   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
934 
935   if (!MaxMask)
936     return false;
937 
938   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
939   if ((1u << AllowedBitsForMin) <= MinMask)
940     return false;
941 
942   return true;
943 }
944 
945 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
946                                              unsigned ComponentCount,
947                                              const GCNSubtarget &STI) {
948   if (ComponentCount > 4)
949     return 0;
950 
951   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
952       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
953   if (!OldFormatInfo)
954     return 0;
955 
956   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
957       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
958                                            ComponentCount,
959                                            OldFormatInfo->NumFormat, STI);
960 
961   if (!NewFormatInfo)
962     return 0;
963 
964   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
965          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
966 
967   return NewFormatInfo->Format;
968 }
969 
970 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
971 // highest power of two. Note that the result is well defined for all inputs
972 // including corner cases like:
973 // - if Lo == Hi, return that value
974 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
975 // - if Lo > Hi, return 0 (as if the range wrapped around)
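// For example, mostAlignedValueInRange(5, 15) == 8 and
// mostAlignedValueInRange(0, 100) == 0 (the underflow case above).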
976 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
977   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
978 }
979 
980 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
981                                                 const GCNSubtarget &STI,
982                                                 CombineInfo &Paired,
983                                                 bool Modify) {
984   assert(CI.InstClass != MIMG);
985 
986   // XXX - Would the same offset be OK? Is there any reason this would happen or
987   // be useful?
988   if (CI.Offset == Paired.Offset)
989     return false;
990 
991   // This won't be valid if the offset isn't aligned.
992   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
993     return false;
994 
995   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
996 
997     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
998         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
999     if (!Info0)
1000       return false;
1001     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1002         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1003     if (!Info1)
1004       return false;
1005 
1006     if (Info0->BitsPerComp != Info1->BitsPerComp ||
1007         Info0->NumFormat != Info1->NumFormat)
1008       return false;
1009 
1010     // TODO: Should be possible to support more formats, but if format loads
1011     // are not dword-aligned, the merged load might not be valid.
1012     if (Info0->BitsPerComp != 32)
1013       return false;
1014 
1015     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1016       return false;
1017   }
1018 
1019   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1020   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1021   CI.UseST64 = false;
1022   CI.BaseOff = 0;
1023 
1024   // Handle all non-DS instructions.
1025   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1026     if (EltOffset0 + CI.Width != EltOffset1 &&
1027             EltOffset1 + Paired.Width != EltOffset0)
1028       return false;
1029     if (CI.CPol != Paired.CPol)
1030       return false;
1031     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1032         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1033       // Reject cases like:
1034       //   dword + dwordx2 -> dwordx3
1035       //   dword + dwordx3 -> dwordx4
1036       // If we tried to combine these cases, we would fail to extract a subreg
1037       // for the result of the second load due to SGPR alignment requirements.
1038       if (CI.Width != Paired.Width &&
1039           (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1040         return false;
1041     }
1042     return true;
1043   }
1044 
1045   // If the offset in elements doesn't fit in 8 bits, we might be able to use
1046   // the stride 64 versions.
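  // E.g. two ds_read_b32 at byte offsets 0 and 32768 have element offsets 0
  // and 8192; 8192 does not fit in 8 bits, but both are multiples of 64 and
  // 8192 / 64 == 128 does fit, so they can form a single ds_read2st64_b32
  // with offset0:0 offset1:128.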
1047   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1048       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1049     if (Modify) {
1050       CI.Offset = EltOffset0 / 64;
1051       Paired.Offset = EltOffset1 / 64;
1052       CI.UseST64 = true;
1053     }
1054     return true;
1055   }
1056 
1057   // Check if the new offsets fit in the reduced 8-bit range.
1058   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1059     if (Modify) {
1060       CI.Offset = EltOffset0;
1061       Paired.Offset = EltOffset1;
1062     }
1063     return true;
1064   }
1065 
1066   // Try to shift base address to decrease offsets.
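  // E.g. two ds_read_b32 at byte offsets 0x11000 and 0x11100 (element offsets
  // 0x4400 and 0x4440) are too large even for the ST64 forms, but after
  // materializing a new base 0x10000 bytes above the old one they become a
  // single ds_read2st64_b32 with offset0:16 offset1:17.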
1067   uint32_t Min = std::min(EltOffset0, EltOffset1);
1068   uint32_t Max = std::max(EltOffset0, EltOffset1);
1069 
1070   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1071   if (((Max - Min) & ~Mask) == 0) {
1072     if (Modify) {
1073       // From the range of values we could use for BaseOff, choose the one that
1074       // is aligned to the highest power of two, to maximise the chance that
1075       // the same offset can be reused for other load/store pairs.
1076       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1077       // Copy the low bits of the offsets, so that when we adjust them by
1078       // subtracting BaseOff they will be multiples of 64.
1079       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1080       CI.BaseOff = BaseOff * CI.EltSize;
1081       CI.Offset = (EltOffset0 - BaseOff) / 64;
1082       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1083       CI.UseST64 = true;
1084     }
1085     return true;
1086   }
1087 
1088   if (isUInt<8>(Max - Min)) {
1089     if (Modify) {
1090       // From the range of values we could use for BaseOff, choose the one that
1091       // is aligned to the highest power of two, to maximise the chance that
1092       // the same offset can be reused for other load/store pairs.
1093       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1094       CI.BaseOff = BaseOff * CI.EltSize;
1095       CI.Offset = EltOffset0 - BaseOff;
1096       Paired.Offset = EltOffset1 - BaseOff;
1097     }
1098     return true;
1099   }
1100 
1101   return false;
1102 }
1103 
1104 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1105                                      const CombineInfo &CI,
1106                                      const CombineInfo &Paired) {
1107   const unsigned Width = (CI.Width + Paired.Width);
1108   switch (CI.InstClass) {
1109   default:
1110     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1111   case S_BUFFER_LOAD_IMM:
1112   case S_BUFFER_LOAD_SGPR_IMM:
1113   case S_LOAD_IMM:
1114     switch (Width) {
1115     default:
1116       return false;
1117     case 2:
1118     case 4:
1119     case 8:
1120       return true;
1121     case 3:
1122       return STM.hasScalarDwordx3Loads();
1123     }
1124   }
1125 }
1126 
1127 const TargetRegisterClass *
1128 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1129   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1130     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1131   }
1132   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1133     return TRI->getRegClassForReg(*MRI, Src->getReg());
1134   }
1135   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1136     return TRI->getRegClassForReg(*MRI, Src->getReg());
1137   }
1138   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1139     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1140   }
1141   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1142     return TRI->getRegClassForReg(*MRI, Src->getReg());
1143   }
1144   return nullptr;
1145 }
1146 
1147 /// This function assumes that CI comes before Paired in a basic block. Return
1148 /// an insertion point for the merged instruction or nullptr on failure.
1149 SILoadStoreOptimizer::CombineInfo *
1150 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1151                                            CombineInfo &Paired) {
1152   // If another instruction has already been merged into CI, it may now be a
1153   // type that we can't do any further merging into.
1154   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1155     return nullptr;
1156   assert(CI.InstClass == Paired.InstClass);
1157 
1158   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1159       getInstSubclass(Paired.I->getOpcode(), *TII))
1160     return nullptr;
1161 
1162   // Check both offsets (or masks for MIMG) can be combined and fit in the
1163   // reduced range.
1164   if (CI.InstClass == MIMG) {
1165     if (!dmasksCanBeCombined(CI, *TII, Paired))
1166       return nullptr;
1167   } else {
1168     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1169       return nullptr;
1170   }
1171 
1172   DenseSet<Register> RegDefs;
1173   DenseSet<Register> RegUses;
1174   CombineInfo *Where;
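  // For loads the merged instruction is inserted at CI (the earlier of the
  // two), so Paired must be hoistable across everything in between; for
  // stores it is inserted at Paired (the later one), so CI must be sinkable
  // instead.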
1175   if (CI.I->mayLoad()) {
1176     // Try to hoist Paired up to CI.
1177     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1178     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1179       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1180         return nullptr;
1181     }
1182     Where = &CI;
1183   } else {
1184     // Try to sink CI down to Paired.
1185     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1186     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1187       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1188         return nullptr;
1189     }
1190     Where = &Paired;
1191   }
1192 
1193   // Call offsetsCanBeCombined with modify = true so that the offsets are
1194   // correct for the new instruction.  This should return true, because
1195   // this function should only be called on CombineInfo objects that
1196   // have already been confirmed to be mergeable.
1197   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1198     offsetsCanBeCombined(CI, *STM, Paired, true);
1199   return Where;
1200 }
1201 
1202 // Copy the merged load result from DestReg to the original dest regs of CI and
1203 // Paired.
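// Roughly, for a two-dword merge:
//   %dest0 = COPY %DestReg.sub0
//   %dest1 = COPY killed %DestReg.sub1
// with the subregister indices supplied by getSubRegIdxs().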
1204 void SILoadStoreOptimizer::copyToDestRegs(
1205     CombineInfo &CI, CombineInfo &Paired,
1206     MachineBasicBlock::iterator InsertBefore, int OpName,
1207     Register DestReg) const {
1208   MachineBasicBlock *MBB = CI.I->getParent();
1209   DebugLoc DL = CI.I->getDebugLoc();
1210 
1211   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1212 
1213   // Copy to the old destination registers.
1214   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1215   const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1216   const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1217 
1218   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1219       .add(*Dest0) // Copy to same destination including flags and sub reg.
1220       .addReg(DestReg, 0, SubRegIdx0);
1221   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1222       .add(*Dest1)
1223       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1224 }
1225 
1226 // Return a register for the source of the merged store after copying the
1227 // original source regs of CI and Paired into it.
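// Roughly, for a two-dword merge:
//   %SrcReg = REG_SEQUENCE %src0, %subreg.sub0, %src1, %subreg.sub1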
1228 Register
1229 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1230                                       MachineBasicBlock::iterator InsertBefore,
1231                                       int OpName) const {
1232   MachineBasicBlock *MBB = CI.I->getParent();
1233   DebugLoc DL = CI.I->getDebugLoc();
1234 
1235   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1236 
1237   // Copy to the new source register.
1238   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1239   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1240 
1241   const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1242   const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1243 
1244   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1245       .add(*Src0)
1246       .addImm(SubRegIdx0)
1247       .add(*Src1)
1248       .addImm(SubRegIdx1);
1249 
1250   return SrcReg;
1251 }
1252 
1253 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1254   if (STM->ldsRequiresM0Init())
1255     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1256   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1257 }
1258 
1259 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1260   if (STM->ldsRequiresM0Init())
1261     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1262 
1263   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1264                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1265 }
1266 
1267 MachineBasicBlock::iterator
1268 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1269                                      MachineBasicBlock::iterator InsertBefore) {
1270   MachineBasicBlock *MBB = CI.I->getParent();
1271 
1272   // Be careful, since the addresses could be subregisters themselves in weird
1273   // cases, like vectors of pointers.
1274   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1275 
1276   unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1277   unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1278   unsigned Opc =
1279       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1280 
1281   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1282          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1283 
1284   const MCInstrDesc &Read2Desc = TII->get(Opc);
1285 
1286   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1287   Register DestReg = MRI->createVirtualRegister(SuperRC);
1288 
1289   DebugLoc DL = CI.I->getDebugLoc();
1290 
1291   Register BaseReg = AddrReg->getReg();
1292   unsigned BaseSubReg = AddrReg->getSubReg();
1293   unsigned BaseRegFlags = 0;
1294   if (CI.BaseOff) {
1295     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1296     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1297         .addImm(CI.BaseOff);
1298 
1299     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1300     BaseRegFlags = RegState::Kill;
1301 
1302     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1303         .addReg(ImmReg)
1304         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1305         .addImm(0); // clamp bit
1306     BaseSubReg = 0;
1307   }
1308 
1309   MachineInstrBuilder Read2 =
1310       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1311           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1312           .addImm(NewOffset0)                        // offset0
1313           .addImm(NewOffset1)                        // offset1
1314           .addImm(0)                                 // gds
1315           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1316 
1317   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1318 
1319   CI.I->eraseFromParent();
1320   Paired.I->eraseFromParent();
1321 
1322   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1323   return Read2;
1324 }
1325 
1326 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1327   if (STM->ldsRequiresM0Init())
1328     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1329   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1330                         : AMDGPU::DS_WRITE2_B64_gfx9;
1331 }
1332 
1333 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1334   if (STM->ldsRequiresM0Init())
1335     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1336                           : AMDGPU::DS_WRITE2ST64_B64;
1337 
1338   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1339                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1340 }
1341 
1342 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1343     CombineInfo &CI, CombineInfo &Paired,
1344     MachineBasicBlock::iterator InsertBefore) {
1345   MachineBasicBlock *MBB = CI.I->getParent();
1346 
1347   // Be sure to use .add(), and not .addReg(), with these. We want to be sure
1348   // we preserve the subregister index and any register flags set on them.
1349   const MachineOperand *AddrReg =
1350       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1351   const MachineOperand *Data0 =
1352       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1353   const MachineOperand *Data1 =
1354       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1355 
1356   unsigned NewOffset0 = CI.Offset;
1357   unsigned NewOffset1 = Paired.Offset;
1358   unsigned Opc =
1359       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1360 
1361   if (NewOffset0 > NewOffset1) {
1362     // Canonicalize the merged instruction so the smaller offset comes first.
1363     std::swap(NewOffset0, NewOffset1);
1364     std::swap(Data0, Data1);
1365   }
1366 
1367   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1368          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1369 
1370   const MCInstrDesc &Write2Desc = TII->get(Opc);
1371   DebugLoc DL = CI.I->getDebugLoc();
1372 
1373   Register BaseReg = AddrReg->getReg();
1374   unsigned BaseSubReg = AddrReg->getSubReg();
1375   unsigned BaseRegFlags = 0;
1376   if (CI.BaseOff) {
1377     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1378     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1379         .addImm(CI.BaseOff);
1380 
1381     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1382     BaseRegFlags = RegState::Kill;
1383 
1384     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1385         .addReg(ImmReg)
1386         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1387         .addImm(0); // clamp bit
1388     BaseSubReg = 0;
1389   }
1390 
1391   MachineInstrBuilder Write2 =
1392       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1393           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1394           .add(*Data0)                               // data0
1395           .add(*Data1)                               // data1
1396           .addImm(NewOffset0)                        // offset0
1397           .addImm(NewOffset1)                        // offset1
1398           .addImm(0)                                 // gds
1399           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1400 
1401   CI.I->eraseFromParent();
1402   Paired.I->eraseFromParent();
1403 
1404   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1405   return Write2;
1406 }
1407 
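// Merge two image (MIMG) loads by OR'ing their dmasks and loading into one
// wider register, whose pieces are then copied back to the original
// destination registers.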
1408 MachineBasicBlock::iterator
1409 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1410                                      MachineBasicBlock::iterator InsertBefore) {
1411   MachineBasicBlock *MBB = CI.I->getParent();
1412   DebugLoc DL = CI.I->getDebugLoc();
1413   const unsigned Opcode = getNewOpcode(CI, Paired);
1414 
1415   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1416 
1417   Register DestReg = MRI->createVirtualRegister(SuperRC);
1418   unsigned MergedDMask = CI.DMask | Paired.DMask;
1419   unsigned DMaskIdx =
1420       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1421 
1422   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1423   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1424     if (I == DMaskIdx)
1425       MIB.addImm(MergedDMask);
1426     else
1427       MIB.add((*CI.I).getOperand(I));
1428   }
1429 
1430   // It shouldn't be possible to get this far if the two instructions
1431   // don't have a single memoperand, because MachineInstr::mayAlias()
1432   // will return true if this is the case.
1433   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1434 
1435   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1436 
1437   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1438 
1439   CI.I->eraseFromParent();
1440   Paired.I->eraseFromParent();
1441   return New;
1442 }
1443 
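// Merge two scalar memory loads (S_LOAD_*_IMM / S_BUFFER_LOAD_*_IMM /
// S_BUFFER_LOAD_*_SGPR_IMM) into a single wider load starting at the smaller
// of the two offsets.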
1444 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1445     CombineInfo &CI, CombineInfo &Paired,
1446     MachineBasicBlock::iterator InsertBefore) {
1447   MachineBasicBlock *MBB = CI.I->getParent();
1448   DebugLoc DL = CI.I->getDebugLoc();
1449   const unsigned Opcode = getNewOpcode(CI, Paired);
1450 
1451   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1452 
1453   Register DestReg = MRI->createVirtualRegister(SuperRC);
1454   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1455 
1456   // It shouldn't be possible to get this far if the two instructions
1457   // don't have a single memoperand, because MachineInstr::mayAlias()
1458   // will return true if this is the case.
1459   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1460 
1461   MachineInstrBuilder New =
1462       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1463           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1464   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1465     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1466   New.addImm(MergedOffset);
1467   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1468 
1469   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1470 
1471   CI.I->eraseFromParent();
1472   Paired.I->eraseFromParent();
1473   return New;
1474 }
1475 
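// Merge two MUBUF buffer loads into one wider load starting at the smaller
// offset; the result is then split back into the original destinations.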
1476 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1477     CombineInfo &CI, CombineInfo &Paired,
1478     MachineBasicBlock::iterator InsertBefore) {
1479   MachineBasicBlock *MBB = CI.I->getParent();
1480   DebugLoc DL = CI.I->getDebugLoc();
1481 
1482   const unsigned Opcode = getNewOpcode(CI, Paired);
1483 
1484   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1485 
1486   // Create the destination register for the merged load.
1487   Register DestReg = MRI->createVirtualRegister(SuperRC);
1488   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1489 
1490   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1491 
1492   AddressRegs Regs = getRegs(Opcode, *TII);
1493 
1494   if (Regs.VAddr)
1495     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1496 
1497   // It shouldn't be possible to get this far if the two instructions
1498   // don't have a single memoperand, because MachineInstr::mayAlias()
1499   // will return true if this is the case.
1500   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1501 
1502   MachineInstr *New =
1503     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1504         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1505         .addImm(MergedOffset) // offset
1506         .addImm(CI.CPol)      // cpol
1507         .addImm(0)            // swz
1508         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1509 
1510   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1511 
1512   CI.I->eraseFromParent();
1513   Paired.I->eraseFromParent();
1514   return New;
1515 }
1516 
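// Merge two MTBUF (tbuffer) loads. In addition to widening the access, the
// buffer format is recomputed for the combined component count.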
1517 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1518     CombineInfo &CI, CombineInfo &Paired,
1519     MachineBasicBlock::iterator InsertBefore) {
1520   MachineBasicBlock *MBB = CI.I->getParent();
1521   DebugLoc DL = CI.I->getDebugLoc();
1522 
1523   const unsigned Opcode = getNewOpcode(CI, Paired);
1524 
1525   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1526 
1527   // Create the destination register for the merged load.
1528   Register DestReg = MRI->createVirtualRegister(SuperRC);
1529   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1530 
1531   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1532 
1533   AddressRegs Regs = getRegs(Opcode, *TII);
1534 
1535   if (Regs.VAddr)
1536     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1537 
1538   unsigned JoinedFormat =
1539       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1540 
1541   // It shouldn't be possible to get this far if the two instructions
1542   // don't have a single memoperand, because MachineInstr::mayAlias()
1543   // will return true if this is the case.
1544   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1545 
1546   MachineInstr *New =
1547       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1548           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1549           .addImm(MergedOffset) // offset
1550           .addImm(JoinedFormat) // format
1551           .addImm(CI.CPol)      // cpol
1552           .addImm(0)            // swz
1553           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1554 
1555   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1556 
1557   CI.I->eraseFromParent();
1558   Paired.I->eraseFromParent();
1559   return New;
1560 }
1561 
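// Merge two MTBUF (tbuffer) stores: the two data sources are first combined
// into a single wide register, then written with one store.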
1562 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1563     CombineInfo &CI, CombineInfo &Paired,
1564     MachineBasicBlock::iterator InsertBefore) {
1565   MachineBasicBlock *MBB = CI.I->getParent();
1566   DebugLoc DL = CI.I->getDebugLoc();
1567 
1568   const unsigned Opcode = getNewOpcode(CI, Paired);
1569 
1570   Register SrcReg =
1571       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1572 
1573   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1574                  .addReg(SrcReg, RegState::Kill);
1575 
1576   AddressRegs Regs = getRegs(Opcode, *TII);
1577 
1578   if (Regs.VAddr)
1579     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1580 
1581   unsigned JoinedFormat =
1582       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1583 
1584   // It shouldn't be possible to get this far if the two instructions
1585   // don't have a single memoperand, because MachineInstr::mayAlias()
1586   // will return true if this is the case.
1587   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1588 
1589   MachineInstr *New =
1590       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1591           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1592           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1593           .addImm(JoinedFormat)                     // format
1594           .addImm(CI.CPol)                          // cpol
1595           .addImm(0)                                // swz
1596           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1597 
1598   CI.I->eraseFromParent();
1599   Paired.I->eraseFromParent();
1600   return New;
1601 }
1602 
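// Merge two FLAT or GLOBAL loads; the saddr operand is carried over when the
// original instructions have one (the SADDR forms).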
1603 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1604     CombineInfo &CI, CombineInfo &Paired,
1605     MachineBasicBlock::iterator InsertBefore) {
1606   MachineBasicBlock *MBB = CI.I->getParent();
1607   DebugLoc DL = CI.I->getDebugLoc();
1608 
1609   const unsigned Opcode = getNewOpcode(CI, Paired);
1610 
1611   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1612   Register DestReg = MRI->createVirtualRegister(SuperRC);
1613 
1614   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1615 
1616   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1617     MIB.add(*SAddr);
1618 
1619   MachineInstr *New =
1620     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1621        .addImm(std::min(CI.Offset, Paired.Offset))
1622        .addImm(CI.CPol)
1623        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1624 
1625   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1626 
1627   CI.I->eraseFromParent();
1628   Paired.I->eraseFromParent();
1629   return New;
1630 }
1631 
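// Merge two FLAT or GLOBAL stores, mirroring mergeFlatLoadPair.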
1632 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1633     CombineInfo &CI, CombineInfo &Paired,
1634     MachineBasicBlock::iterator InsertBefore) {
1635   MachineBasicBlock *MBB = CI.I->getParent();
1636   DebugLoc DL = CI.I->getDebugLoc();
1637 
1638   const unsigned Opcode = getNewOpcode(CI, Paired);
1639 
1640   Register SrcReg =
1641       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1642 
1643   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1644                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1645                  .addReg(SrcReg, RegState::Kill);
1646 
1647   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1648     MIB.add(*SAddr);
1649 
1650   MachineInstr *New =
1651     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1652        .addImm(CI.CPol)
1653        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1654 
1655   CI.I->eraseFromParent();
1656   Paired.I->eraseFromParent();
1657   return New;
1658 }
1659 
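// Pick the opcode that implements the merged access for the combined width.
// The explicit switches return 0 for widths that have no wide variant.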
1660 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1661                                             const CombineInfo &Paired) {
1662   const unsigned Width = CI.Width + Paired.Width;
1663 
1664   switch (getCommonInstClass(CI, Paired)) {
1665   default:
1666     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1667     // FIXME: Handle d16 correctly
1668     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1669                                   Width);
1670   case TBUFFER_LOAD:
1671   case TBUFFER_STORE:
1672     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1673                                   Width);
1674 
1675   case UNKNOWN:
1676     llvm_unreachable("Unknown instruction class");
1677   case S_BUFFER_LOAD_IMM:
1678     switch (Width) {
1679     default:
1680       return 0;
1681     case 2:
1682       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1683     case 3:
1684       return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1685     case 4:
1686       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1687     case 8:
1688       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1689     }
1690   case S_BUFFER_LOAD_SGPR_IMM:
1691     switch (Width) {
1692     default:
1693       return 0;
1694     case 2:
1695       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1696     case 3:
1697       return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1698     case 4:
1699       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1700     case 8:
1701       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1702     }
1703   case S_LOAD_IMM:
1704     switch (Width) {
1705     default:
1706       return 0;
1707     case 2:
1708       return AMDGPU::S_LOAD_DWORDX2_IMM;
1709     case 3:
1710       return AMDGPU::S_LOAD_DWORDX3_IMM;
1711     case 4:
1712       return AMDGPU::S_LOAD_DWORDX4_IMM;
1713     case 8:
1714       return AMDGPU::S_LOAD_DWORDX8_IMM;
1715     }
1716   case GLOBAL_LOAD:
1717     switch (Width) {
1718     default:
1719       return 0;
1720     case 2:
1721       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1722     case 3:
1723       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1724     case 4:
1725       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1726     }
1727   case GLOBAL_LOAD_SADDR:
1728     switch (Width) {
1729     default:
1730       return 0;
1731     case 2:
1732       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1733     case 3:
1734       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1735     case 4:
1736       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1737     }
1738   case GLOBAL_STORE:
1739     switch (Width) {
1740     default:
1741       return 0;
1742     case 2:
1743       return AMDGPU::GLOBAL_STORE_DWORDX2;
1744     case 3:
1745       return AMDGPU::GLOBAL_STORE_DWORDX3;
1746     case 4:
1747       return AMDGPU::GLOBAL_STORE_DWORDX4;
1748     }
1749   case GLOBAL_STORE_SADDR:
1750     switch (Width) {
1751     default:
1752       return 0;
1753     case 2:
1754       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1755     case 3:
1756       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1757     case 4:
1758       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1759     }
1760   case FLAT_LOAD:
1761     switch (Width) {
1762     default:
1763       return 0;
1764     case 2:
1765       return AMDGPU::FLAT_LOAD_DWORDX2;
1766     case 3:
1767       return AMDGPU::FLAT_LOAD_DWORDX3;
1768     case 4:
1769       return AMDGPU::FLAT_LOAD_DWORDX4;
1770     }
1771   case FLAT_STORE:
1772     switch (Width) {
1773     default:
1774       return 0;
1775     case 2:
1776       return AMDGPU::FLAT_STORE_DWORDX2;
1777     case 3:
1778       return AMDGPU::FLAT_STORE_DWORDX3;
1779     case 4:
1780       return AMDGPU::FLAT_STORE_DWORDX4;
1781     }
1782   case MIMG:
1783     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1784            "No overlaps");
1785     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1786   }
1787 }
1788 
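// Return the sub-register indices used to copy CI's and Paired's parts out of
// (or into) the merged wide register. E.g. with CI.Width == 1 and
// Paired.Width == 2 (and Paired not ordered before CI) this returns
// {sub0, sub1_sub2}.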
1789 std::pair<unsigned, unsigned>
1790 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1791                                     const CombineInfo &Paired) {
1792   assert((CI.InstClass != MIMG ||
1793           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1794            CI.Width + Paired.Width)) &&
1795          "No overlaps");
1796 
1797   unsigned Idx0;
1798   unsigned Idx1;
1799 
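  // Idxs[Start][Width - 1] is the sub-register index covering Width
  // consecutive 32-bit lanes starting at lane Start.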
1800   static const unsigned Idxs[5][4] = {
1801       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1802       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1803       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1804       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1805       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1806   };
1807 
1808   assert(CI.Width >= 1 && CI.Width <= 4);
1809   assert(Paired.Width >= 1 && Paired.Width <= 4);
1810 
1811   if (Paired < CI) {
1812     Idx1 = Idxs[0][Paired.Width - 1];
1813     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1814   } else {
1815     Idx0 = Idxs[0][CI.Width - 1];
1816     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1817   }
1818 
1819   return {Idx0, Idx1};
1820 }
1821 
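// Select the register class for the merged result: an SGPR class for scalar
// loads, otherwise an AGPR or VGPR class sized for the combined width.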
1822 const TargetRegisterClass *
1823 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1824                                              const CombineInfo &Paired) const {
1825   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1826       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1827     switch (CI.Width + Paired.Width) {
1828     default:
1829       return nullptr;
1830     case 2:
1831       return &AMDGPU::SReg_64_XEXECRegClass;
1832     case 3:
1833       return &AMDGPU::SGPR_96RegClass;
1834     case 4:
1835       return &AMDGPU::SGPR_128RegClass;
1836     case 8:
1837       return &AMDGPU::SGPR_256RegClass;
1838     case 16:
1839       return &AMDGPU::SGPR_512RegClass;
1840     }
1841   }
1842 
1843   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1844   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1845              ? TRI->getAGPRClassForBitWidth(BitWidth)
1846              : TRI->getVGPRClassForBitWidth(BitWidth);
1847 }
1848 
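// Merge two MUBUF buffer stores; the data operands are first combined into a
// single wide source register.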
1849 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1850     CombineInfo &CI, CombineInfo &Paired,
1851     MachineBasicBlock::iterator InsertBefore) {
1852   MachineBasicBlock *MBB = CI.I->getParent();
1853   DebugLoc DL = CI.I->getDebugLoc();
1854 
1855   const unsigned Opcode = getNewOpcode(CI, Paired);
1856 
1857   Register SrcReg =
1858       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1859 
1860   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1861                  .addReg(SrcReg, RegState::Kill);
1862 
1863   AddressRegs Regs = getRegs(Opcode, *TII);
1864 
1865   if (Regs.VAddr)
1866     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1867 
1869   // It shouldn't be possible to get this far if the two instructions
1870   // don't have a single memoperand, because MachineInstr::mayAlias()
1871   // will return true if this is the case.
1872   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1873 
1874   MachineInstr *New =
1875     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1876         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1877         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1878         .addImm(CI.CPol)      // cpol
1879         .addImm(0)            // swz
1880         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1881 
1882   CI.I->eraseFromParent();
1883   Paired.I->eraseFromParent();
1884   return New;
1885 }
1886 
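// Return Val as an immediate operand if it is an inline constant; otherwise
// materialize it into a fresh SGPR with S_MOV_B32 and return that register.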
1887 MachineOperand
1888 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1889   APInt V(32, Val, true);
1890   if (TII->isInlineConstant(V))
1891     return MachineOperand::CreateImm(Val);
1892 
1893   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1894   MachineInstr *Mov =
1895   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1896           TII->get(AMDGPU::S_MOV_B32), Reg)
1897     .addImm(Val);
1898   (void)Mov;
1899   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1900   return MachineOperand::CreateReg(Reg, false);
1901 }
1902 
1903 // Compute base address using Addr and return the final register.
1904 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1905                                            const MemAddress &Addr) const {
1906   MachineBasicBlock *MBB = MI.getParent();
1907   MachineBasicBlock::iterator MBBI = MI.getIterator();
1908   DebugLoc DL = MI.getDebugLoc();
1909 
1910   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1911           Addr.Base.LoSubReg) &&
1912          "Expected 32-bit Base-Register-Low!!");
1913 
1914   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1915           Addr.Base.HiSubReg) &&
1916          "Expected 32-bit Base-Register-Hi!!");
1917 
1918   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1919   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1920   MachineOperand OffsetHi =
1921     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1922 
1923   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1924   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1925   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1926 
1927   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1928   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1929   MachineInstr *LoHalf =
1930     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1931       .addReg(CarryReg, RegState::Define)
1932       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1933       .add(OffsetLo)
1934       .addImm(0); // clamp bit
1935   (void)LoHalf;
1936   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1937 
1938   MachineInstr *HiHalf =
1939   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1940     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1941     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1942     .add(OffsetHi)
1943     .addReg(CarryReg, RegState::Kill)
1944     .addImm(0); // clamp bit
1945   (void)HiHalf;
1946   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1947 
1948   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1949   MachineInstr *FullBase =
1950     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1951       .addReg(DestSub0)
1952       .addImm(AMDGPU::sub0)
1953       .addReg(DestSub1)
1954       .addImm(AMDGPU::sub1);
1955   (void)FullBase;
1956   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1957 
1958   return FullDestReg;
1959 }
1960 
1961 // Update MI's base register and offset to NewBase and NewOffset.
1962 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1963                                                Register NewBase,
1964                                                int32_t NewOffset) const {
1965   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1966   Base->setReg(NewBase);
1967   Base->setIsKill(false);
1968   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1969 }
1970 
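// Return the constant represented by Op: either an immediate operand, or the
// immediate of the S_MOV_B32 that uniquely defines Op's register.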
1971 std::optional<int32_t>
1972 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1973   if (Op.isImm())
1974     return Op.getImm();
1975 
1976   if (!Op.isReg())
1977     return std::nullopt;
1978 
1979   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1980   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1981       !Def->getOperand(1).isImm())
1982     return std::nullopt;
1983 
1984   return Def->getOperand(1).getImm();
1985 }
1986 
1987 // Analyze Base and extract:
1988 //  - the 32-bit base registers and subregisters
1989 //  - a 64-bit constant offset
1990 // The base computation is expected to look like:
1991 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1992 //   %LO:vgpr_32, %c:sreg_64_xexec =
1993 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1994 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1995 //   %Base:vreg_64 =
1996 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1997 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1998                                                       MemAddress &Addr) const {
1999   if (!Base.isReg())
2000     return;
2001 
2002   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2003   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2004       || Def->getNumOperands() != 5)
2005     return;
2006 
2007   MachineOperand BaseLo = Def->getOperand(1);
2008   MachineOperand BaseHi = Def->getOperand(3);
2009   if (!BaseLo.isReg() || !BaseHi.isReg())
2010     return;
2011 
2012   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2013   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2014 
2015   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2016       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2017     return;
2018 
2019   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2020   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2021 
2022   auto Offset0P = extractConstOffset(*Src0);
2023   if (Offset0P)
2024     BaseLo = *Src1;
2025   else {
2026     if (!(Offset0P = extractConstOffset(*Src1)))
2027       return;
2028     BaseLo = *Src0;
2029   }
2030 
2031   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2032   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2033 
2034   if (Src0->isImm())
2035     std::swap(Src0, Src1);
2036 
2037   if (!Src1->isImm() || Src0->isImm())
2038     return;
2039 
2040   uint64_t Offset1 = Src1->getImm();
2041   BaseHi = *Src0;
2042 
2043   Addr.Base.LoReg = BaseLo.getReg();
2044   Addr.Base.HiReg = BaseHi.getReg();
2045   Addr.Base.LoSubReg = BaseLo.getSubReg();
2046   Addr.Base.HiSubReg = BaseHi.getSubReg();
2047   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2048 }
2049 
2050 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2051     MachineInstr &MI,
2052     MemInfoMap &Visited,
2053     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2054 
2055   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2056     return false;
2057 
2058   // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2059   if (SIInstrInfo::isFLATScratch(MI))
2060     return false;
2061 
2062   unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2063                                               : AMDGPUAS::FLAT_ADDRESS;
2064 
2065   if (AnchorList.count(&MI))
2066     return false;
2067 
2068   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2069 
2070   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2071     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2072     return false;
2073   }
2074 
2075   // Step 1: Find the base registers and a 64-bit constant offset.
2076   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2077   MemAddress MAddr;
2078   if (!Visited.contains(&MI)) {
2079     processBaseWithConstOffset(Base, MAddr);
2080     Visited[&MI] = MAddr;
2081   } else
2082     MAddr = Visited[&MI];
2083 
2084   if (MAddr.Offset == 0) {
2085     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2086                          " constant offsets that can be promoted.\n";);
2087     return false;
2088   }
2089 
2090   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2091              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2092 
2093   // Step 2: Traverse MI's basic block and find an anchor (an access with the
2094   // same base registers) with the largest 13-bit distance from MI's offset.
2095   // E.g. (64-bit loads):
2096   // bb:
2097   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2098   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2099   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2100   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2101   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2102   //
2103   // Starting from the first load, the optimization tries to find a new base
2104   // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2105   // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2106   // &a + 8192 as the new base (anchor) because the larger distance can
2107   // presumably accommodate more intermediate bases.
2108   //
2109   // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
2110   // (&a + 8192) for load1, load2 and load4.
2111   //   addr = &a + 8192
2112   //   load1 = load(addr,       -4096)
2113   //   load2 = load(addr,       -2048)
2114   //   load3 = load(addr,       0)
2115   //   load4 = load(addr,       2048)
2116   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2117   //
2118   MachineInstr *AnchorInst = nullptr;
2119   MemAddress AnchorAddr;
2120   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2121   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2122 
2123   MachineBasicBlock *MBB = MI.getParent();
2124   MachineBasicBlock::iterator E = MBB->end();
2125   MachineBasicBlock::iterator MBBI = MI.getIterator();
2126   ++MBBI;
2127   const SITargetLowering *TLI =
2128     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2129 
2130   for ( ; MBBI != E; ++MBBI) {
2131     MachineInstr &MINext = *MBBI;
2132     // TODO: Support finding an anchor (with the same base) from store addresses
2133     // or any other load addresses where the opcodes are different.
2134     if (MINext.getOpcode() != MI.getOpcode() ||
2135         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2136       continue;
2137 
2138     const MachineOperand &BaseNext =
2139       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2140     MemAddress MAddrNext;
2141     if (!Visited.contains(&MINext)) {
2142       processBaseWithConstOffset(BaseNext, MAddrNext);
2143       Visited[&MINext] = MAddrNext;
2144     } else
2145       MAddrNext = Visited[&MINext];
2146 
2147     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2148         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2149         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2150         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2151       continue;
2152 
2153     InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2154 
2155     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2156     TargetLoweringBase::AddrMode AM;
2157     AM.HasBaseReg = true;
2158     AM.BaseOffs = Dist;
2159     if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2160         (uint32_t)std::abs(Dist) > MaxDist) {
2161       MaxDist = std::abs(Dist);
2162 
2163       AnchorAddr = MAddrNext;
2164       AnchorInst = &MINext;
2165     }
2166   }
2167 
2168   if (AnchorInst) {
2169     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2170                AnchorInst->dump());
2171     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2172                <<  AnchorAddr.Offset << "\n\n");
2173 
2174     // Instead of moving up, just re-compute anchor-instruction's base address.
2175     Register Base = computeBase(MI, AnchorAddr);
2176 
2177     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2178     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2179 
2180     for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2181       TargetLoweringBase::AddrMode AM;
2182       AM.HasBaseReg = true;
2183       AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2184 
2185       if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2186         LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
2187                    OtherMI->dump());
2188         updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2189         LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
2190       }
2191     }
2192     AnchorList.insert(AnchorInst);
2193     return true;
2194   }
2195 
2196   return false;
2197 }
2198 
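// Append CI to the existing list whose entries share its instruction class,
// AGPR-ness and base address; otherwise start a new list for it.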
2199 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2200                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2201   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2202     if (AddrList.front().InstClass == CI.InstClass &&
2203         AddrList.front().IsAGPR == CI.IsAGPR &&
2204         AddrList.front().hasSameBaseAddress(CI)) {
2205       AddrList.emplace_back(CI);
2206       return;
2207     }
2208   }
2209 
2210   // Base address not found, so add a new list.
2211   MergeableInsts.emplace_back(1, CI);
2212 }
2213 
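// Scan [Begin, End), promoting constant offsets where possible and bucketing
// mergeable instructions into per-base-address lists. The scan stops early at
// a barrier (ordered memory reference or unmodeled side effects). Returns the
// iterator at which to resume and whether anything was modified.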
2214 std::pair<MachineBasicBlock::iterator, bool>
2215 SILoadStoreOptimizer::collectMergeableInsts(
2216     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2217     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2218     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2219   bool Modified = false;
2220 
2221   // Sort potentially mergeable instructions into lists, one list per base address.
2222   unsigned Order = 0;
2223   MachineBasicBlock::iterator BlockI = Begin;
2224   for (; BlockI != End; ++BlockI) {
2225     MachineInstr &MI = *BlockI;
2226 
2227     // We run this before checking if an address is mergeable, because it can produce
2228     // better code even if the instructions aren't mergeable.
2229     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2230       Modified = true;
2231 
2232     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2233     // barriers. The search can resume after such a barrier for separate merges.
2234     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2235       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2236 
2237       // Search will resume after this instruction in a separate merge list.
2238       ++BlockI;
2239       break;
2240     }
2241 
2242     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2243     if (InstClass == UNKNOWN)
2244       continue;
2245 
2246     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2247     int Swizzled =
2248         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2249     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2250       continue;
2251 
2252     CombineInfo CI;
2253     CI.setMI(MI, *this);
2254     CI.Order = Order++;
2255 
2256     if (!CI.hasMergeableAddress(*MRI))
2257       continue;
2258 
2259     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2260       // FIXME: Nothing is illegal in a ds_write2 opcode with two AGPR data
2261       //        operands. However, we report that ds_write2 takes only VGPR
2262       //        data so that machine copy propagation does not create an
2263       //        illegal instruction with mixed VGPR and AGPR sources.
2264       //        Consequently, if we were to create such an instruction, the
2265       //        verifier would complain.
2266       continue;
2267     }
2268 
2269     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2270 
2271     addInstToMergeableList(CI, MergeableInsts);
2272   }
2273 
2274   // At this point we have lists of Mergeable instructions.
2275   //
2276   // Part 2: Sort each list by offset so that mergeable instructions end up
2277   // adjacent to each other, and discard lists with fewer than two entries,
2278   // since a merge needs at least two instructions. The actual pairing is done
2279   // later, in optimizeInstsWithSameBaseAddr().
2280 
2281   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2282                                                    E = MergeableInsts.end(); I != E;) {
2283 
2284     std::list<CombineInfo> &MergeList = *I;
2285     if (MergeList.size() <= 1) {
2286       // This means we have found only one instruction with a given address
2287       // that can be merged, and we need at least 2 instructions to do a merge,
2288       // so this list can be discarded.
2289       I = MergeableInsts.erase(I);
2290       continue;
2291     }
2292 
2293     // Sort the lists by offsets, this way mergeable instructions will be
2294     // adjacent to each other in the list, which will make it easier to find
2295     // matches.
2296     MergeList.sort(
2297         [] (const CombineInfo &A, const CombineInfo &B) {
2298           return A.Offset < B.Offset;
2299         });
2300     ++I;
2301   }
2302 
2303   return {BlockI, Modified};
2304 }
2305 
2306 // Scan through looking for adjacent LDS operations with constant offsets from
2307 // the same base register. We rely on the scheduler to do the hard work of
2308 // clustering nearby loads, and assume these are all adjacent.
2309 bool SILoadStoreOptimizer::optimizeBlock(
2310                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2311   bool Modified = false;
2312 
2313   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2314                                                    E = MergeableInsts.end(); I != E;) {
2315     std::list<CombineInfo> &MergeList = *I;
2316 
2317     bool OptimizeListAgain = false;
2318     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2319       // We weren't able to make any changes, so delete the list so we don't
2320       // process the same instructions the next time we try to optimize this
2321       // block.
2322       I = MergeableInsts.erase(I);
2323       continue;
2324     }
2325 
2326     Modified = true;
2327 
2328     // We made changes, but also determined that there were no more optimization
2329     // opportunities, so we don't need to reprocess the list.
2330     if (!OptimizeListAgain) {
2331       I = MergeableInsts.erase(I);
2332       continue;
2333     }
2334     OptimizeAgain = true;
2335   }
2336   return Modified;
2337 }
2338 
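// Try to merge adjacent pairs from a list of instructions sharing a base
// address. OptimizeListAgain is set when a merged instruction is still narrow
// enough to potentially be merged again.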
2339 bool
2340 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2341                                           std::list<CombineInfo> &MergeList,
2342                                           bool &OptimizeListAgain) {
2343   if (MergeList.empty())
2344     return false;
2345 
2346   bool Modified = false;
2347 
2348   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2349        Next = std::next(I)) {
2350 
2351     auto First = I;
2352     auto Second = Next;
2353 
2354     if ((*First).Order > (*Second).Order)
2355       std::swap(First, Second);
2356     CombineInfo &CI = *First;
2357     CombineInfo &Paired = *Second;
2358 
2359     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2360     if (!Where) {
2361       ++I;
2362       continue;
2363     }
2364 
2365     Modified = true;
2366 
2367     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2368 
2369     MachineBasicBlock::iterator NewMI;
2370     switch (CI.InstClass) {
2371     default:
2372       llvm_unreachable("unknown InstClass");
2373       break;
2374     case DS_READ:
2375       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2376       break;
2377     case DS_WRITE:
2378       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2379       break;
2380     case S_BUFFER_LOAD_IMM:
2381     case S_BUFFER_LOAD_SGPR_IMM:
2382     case S_LOAD_IMM:
2383       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2384       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2385       break;
2386     case BUFFER_LOAD:
2387       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2388       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2389       break;
2390     case BUFFER_STORE:
2391       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2392       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2393       break;
2394     case MIMG:
2395       NewMI = mergeImagePair(CI, Paired, Where->I);
2396       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2397       break;
2398     case TBUFFER_LOAD:
2399       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2400       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2401       break;
2402     case TBUFFER_STORE:
2403       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2404       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2405       break;
2406     case FLAT_LOAD:
2407     case GLOBAL_LOAD:
2408     case GLOBAL_LOAD_SADDR:
2409       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2410       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2411       break;
2412     case FLAT_STORE:
2413     case GLOBAL_STORE:
2414     case GLOBAL_STORE_SADDR:
2415       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2416       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2417       break;
2418     }
2419     CI.setMI(NewMI, *this);
2420     CI.Order = Where->Order;
2421     if (I == Second)
2422       I = Next;
2423 
2424     MergeList.erase(Second);
2425   }
2426 
2427   return Modified;
2428 }
2429 
2430 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2431   if (skipFunction(MF.getFunction()))
2432     return false;
2433 
2434   STM = &MF.getSubtarget<GCNSubtarget>();
2435   if (!STM->loadStoreOptEnabled())
2436     return false;
2437 
2438   TII = STM->getInstrInfo();
2439   TRI = &TII->getRegisterInfo();
2440 
2441   MRI = &MF.getRegInfo();
2442   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2443 
2444   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2445 
2446   bool Modified = false;
2447 
2448   // Contains the list of instructions for which constant offsets are being
2449   // promoted to the immediate. This is tracked for an entire block at a time.
2450   SmallPtrSet<MachineInstr *, 4> AnchorList;
2451   MemInfoMap Visited;
2452 
2453   for (MachineBasicBlock &MBB : MF) {
2454     MachineBasicBlock::iterator SectionEnd;
2455     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2456          I = SectionEnd) {
2457       bool CollectModified;
2458       std::list<std::list<CombineInfo>> MergeableInsts;
2459 
2460       // First pass: Collect a list of all instructions we know how to merge in a
2461       // subset of the block.
2462       std::tie(SectionEnd, CollectModified) =
2463           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2464 
2465       Modified |= CollectModified;
2466 
2467       do {
2468         OptimizeAgain = false;
2469         Modified |= optimizeBlock(MergeableInsts);
2470       } while (OptimizeAgain);
2471     }
2472 
2473     Visited.clear();
2474     AnchorList.clear();
2475   }
2476 
2477   return Modified;
2478 }
2479