//===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Merge the offset of address calculation into the offset field
// of instructions in a global address lowering sequence.
//
//===----------------------------------------------------------------------===//

#include "LoongArch.h"
#include "LoongArchTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "loongarch-merge-base-offset"
#define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset"

namespace {

class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
  const LoongArchSubtarget *ST = nullptr;
  MachineRegisterInfo *MRI;

public:
  static char ID;
  bool runOnMachineFunction(MachineFunction &Fn) override;
  bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,
                      MachineInstr *&Lo20, MachineInstr *&Hi12,
                      MachineInstr *&Last);

  bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,
                           MachineInstr *&Lo20, MachineInstr *&Hi12,
                           MachineInstr *&Last);
  void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
                  MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
                  int64_t Offset);
  bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12,
                       MachineInstr *&Lo20, MachineInstr *&Hi12,
                       MachineInstr *&Last, MachineInstr &TailAdd,
                       Register GAReg);

  bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12,
                         MachineInstr *&Lo20, MachineInstr *&Hi12,
                         MachineInstr *&Last);

  LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return LoongArch_MERGE_BASE_OFFSET_NAME;
  }
};
} // end anonymous namespace

char LoongArchMergeBaseOffsetOpt::ID = 0;
INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE,
                LoongArch_MERGE_BASE_OFFSET_NAME, false, false)

// Detect either of the patterns:
//
// 1. (small/medium):
//   pcalau12i vreg1, %pc_hi20(s)
//   addi.d    vreg2, vreg1, %pc_lo12(s)
//
// 2. (large):
//   pcalau12i vreg1, %pc_hi20(s)
//   addi.d    vreg2, $zero, %pc_lo12(s)
//   lu32i.d   vreg3, vreg2, %pc64_lo20(s)
//   lu52i.d   vreg4, vreg3, %pc64_hi12(s)
//   add.d     vreg5, vreg4, vreg1
// The pattern is only accepted if:
//    1) For the small/medium pattern, the first instruction has exactly one
//       use, which is the ADDI.
//    2) For the large pattern, the first four instructions each have exactly
//       one use, and the user of the fourth instruction is the ADD.
//    3) The address operands have the appropriate type, reflecting the
//       lowering of a global address or constant pool using the pattern.
//    4) The offset value in the Global Address or Constant Pool is 0.
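//
// For example (illustrative): on LA64 with the medium code model, the
// `la.pcrel $a0, g` pseudo expands to exactly the small/medium sequence:
//   pcalau12i $a0, %pc_hi20(g)
//   addi.d    $a0, $a0, %pc_lo12(g)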
bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
                                                 MachineInstr *&Lo12,
                                                 MachineInstr *&Lo20,
                                                 MachineInstr *&Hi12,
                                                 MachineInstr *&Last) {
  if (Hi20.getOpcode() != LoongArch::PCALAU12I)
    return false;

  const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
  if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI)
    return false;

  auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) {
    return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress();
  };

  if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0)
    return false;

  Register HiDestReg = Hi20.getOperand(0).getReg();
  if (!MRI->hasOneUse(HiDestReg))
    return false;

  MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg);
  if (UseInst->getOpcode() != LoongArch::ADD_D) {
    Lo12 = UseInst;
    if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
        (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
      return false;
  } else {
    assert(ST->is64Bit());
    Last = UseInst;

    Register LastOp1Reg = Last->getOperand(1).getReg();
    if (!LastOp1Reg.isVirtual())
      return false;
    Hi12 = MRI->getVRegDef(LastOp1Reg);
    const MachineOperand &Hi12Op2 = Hi12->getOperand(2);
    if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI)
      return false;
    if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0)
      return false;
    if (!MRI->hasOneUse(Hi12->getOperand(0).getReg()))
      return false;

    Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg());
    const MachineOperand &Lo20Op2 = Lo20->getOperand(2);
    if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO)
      return false;
    if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0)
      return false;
    if (!MRI->hasOneUse(Lo20->getOperand(0).getReg()))
      return false;

    Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg());
    if (!MRI->hasOneUse(Lo12->getOperand(0).getReg()))
      return false;
  }

  const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
  assert(Hi20.getOpcode() == LoongArch::PCALAU12I);
  if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO ||
      !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
      Lo12Op2.getOffset() != 0)
    return false;

  if (Hi20Op1.isGlobal()) {
    LLVM_DEBUG(dbgs() << "  Found lowered global address: "
                      << *Hi20Op1.getGlobal() << "\n");
  } else if (Hi20Op1.isBlockAddress()) {
    LLVM_DEBUG(dbgs() << "  Found lowered block address: "
                      << *Hi20Op1.getBlockAddress() << "\n");
  } else if (Hi20Op1.isCPI()) {
    LLVM_DEBUG(dbgs() << "  Found lowered constant pool: " << Hi20Op1.getIndex()
                      << "\n");
  }

  return true;
}

// Update the offset in the Hi20, Lo12, Lo20 and Hi12 instructions.
// Delete the tail instruction and update all the uses to use the
// output from Last.
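//
// For example (illustrative, small/medium pattern, Tail being
// `addi.d vreg3, vreg2, 8`):
//   pcalau12i vreg1, %pc_hi20(s)
//   addi.d    vreg2, vreg1, %pc_lo12(s)
//   addi.d    vreg3, vreg2, 8
// becomes
//   pcalau12i vreg1, %pc_hi20(s+8)
//   addi.d    vreg2, vreg1, %pc_lo12(s+8)
// with every use of vreg3 rewritten to use vreg2.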
void LoongArchMergeBaseOffsetOpt::foldOffset(
    MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
    MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
    int64_t Offset) {
  // Put the offset back into the Hi and Lo operands.
  Hi20.getOperand(1).setOffset(Offset);
  Lo12.getOperand(2).setOffset(Offset);
  if (Lo20 && Hi12) {
    Lo20->getOperand(2).setOffset(Offset);
    Hi12->getOperand(2).setOffset(Offset);
  }
  // Delete the tail instruction.
  MachineInstr *Def = Last ? Last : &Lo12;
  MRI->constrainRegClass(Def->getOperand(0).getReg(),
                         MRI->getRegClass(Tail.getOperand(0).getReg()));
  MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg());
  Tail.eraseFromParent();
  LLVM_DEBUG(dbgs() << "  Merged offset " << Offset << " into base.\n"
                    << "     " << Hi20 << "     " << Lo12;);
  if (Lo20 && Hi12) {
    LLVM_DEBUG(dbgs() << "     " << *Lo20 << "     " << *Hi12;);
  }
}

// Detect patterns for large offsets that are passed into an ADD instruction.
// If the pattern is found, update the offset in the Hi20, Lo12, Lo20 and Hi12
// instructions and delete TailAdd and the instructions that produced the
// offset.
//
//   (The instructions marked with "!" are not necessarily present)
//
//        Base address lowering is of the form:
//           Hi20:  pcalau12i vreg1, %pc_hi20(s)
//        +- Lo12:  addi.d vreg2, vreg1, %pc_lo12(s)
//        |  Lo20:  lu32i.d vreg2, %pc64_lo20(s) !
//        +- Hi12:  lu52i.d vreg2, vreg2, %pc64_hi12(s) !
//        |
//        | The large offset can be one of the forms:
//        |
//        +-> 1) Offset that has non-zero bits in Hi20 and Lo12 bits:
//        |     OffsetHi20: lu12i.w vreg3, 4
//        |     OffsetLo12: ori voff, vreg3, 188    ------------------+
//        |                                                           |
//        +-> 2) Offset that has non-zero bits in Hi20 bits only:     |
//        |     OffsetHi20: lu12i.w voff, 128       ------------------+
//        |                                                           |
//        +-> 3) Offset that has non-zero bits in Lo20 bits:          |
//        |     OffsetHi20: lu12i.w vreg3, 121 !                      |
//        |     OffsetLo12: ori voff, vreg3, 122 !                    |
//        |     OffsetLo20: lu32i.d voff, 123       ------------------+
//        +-> 4) Offset that has non-zero bits in Hi12 bits:          |
//              OffsetHi20: lu12i.w vreg3, 121 !                      |
//              OffsetLo12: ori voff, vreg3, 122 !                    |
//              OffsetLo20: lu32i.d vreg3, 123 !                      |
//              OffsetHi12: lu52i.d voff, vreg3, 124 -----------------+
//                                                                    |
//        TailAdd: add.d  vreg4, vreg2, voff       <------------------+
//
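// As a worked example using the illustrative numbers from form 1 above, an
// offset of 0x40bc is materialized as
//   lu12i.w vreg3, 4          ; vreg3 = 4 << 12      = 0x4000
//   ori     voff,  vreg3, 188 ; voff  = 0x4000 | 188 = 0x40bc
// and the loop below recovers Offset = (4 << 12) + 188 = 0x40bc.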
bool LoongArchMergeBaseOffsetOpt::foldLargeOffset(
    MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
    MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd,
    Register GAReg) {
  assert((TailAdd.getOpcode() == LoongArch::ADD_W ||
          TailAdd.getOpcode() == LoongArch::ADD_D) &&
         "Expected ADD instruction!");
  Register Rs = TailAdd.getOperand(1).getReg();
  Register Rt = TailAdd.getOperand(2).getReg();
  Register Reg = Rs == GAReg ? Rt : Rs;
  SmallVector<MachineInstr *, 4> Instrs;
  int64_t Offset = 0;
  int64_t Mask = -1;
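  // The def-use chain is walked backwards from the ADD operand, so the last
  // instruction of the materialization sequence (LU52I.D, then LU32I.D) is
  // visited first. Mask clears the bit ranges those instructions have already
  // supplied, so that the sign-extension of an earlier instruction's
  // immediate (LU12I.W, LU32I.D) does not double-count into bits that were
  // overwritten later. For example (illustrative), if LU32I.D supplied bits
  // 32-51, a negative LU12I.W immediate sign-extends into those bits and the
  // `& Mask` strips them off again.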

  // Reg can point to one of [ORI, LU12I.W, LU32I.D, LU52I.D]:
  for (int i = 0; i < 4; i++) {
    // Stop when the chain terminates at R0.
    if (Reg == LoongArch::R0)
      break;

    // Can't fold if the register has more than one use.
    if (!Reg.isVirtual() || !MRI->hasOneUse(Reg))
      return false;

    MachineInstr *Curr = MRI->getVRegDef(Reg);
    if (!Curr)
      break;

    switch (Curr->getOpcode()) {
    default:
      // Can't fold if the instruction opcode is unexpected.
      return false;
    case LoongArch::ORI: {
      MachineOperand ImmOp = Curr->getOperand(2);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
        return false;
      Offset += ImmOp.getImm();
      Reg = Curr->getOperand(1).getReg();
      Instrs.push_back(Curr);
      break;
    }
    case LoongArch::LU12I_W: {
      MachineOperand ImmOp = Curr->getOperand(1);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
        return false;
      Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask;
      Reg = LoongArch::R0;
      Instrs.push_back(Curr);
      break;
    }
    case LoongArch::LU32I_D: {
      MachineOperand ImmOp = Curr->getOperand(2);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20)
        return false;
      Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask;
      Mask ^= 0x000FFFFF00000000ULL;
      Reg = Curr->getOperand(1).getReg();
      Instrs.push_back(Curr);
      break;
    }
    case LoongArch::LU52I_D: {
      MachineOperand ImmOp = Curr->getOperand(2);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12)
        return false;
      Offset += ImmOp.getImm() << 52;
      Mask ^= 0xFFF0000000000000ULL;
      Reg = Curr->getOperand(1).getReg();
      Instrs.push_back(Curr);
      break;
    }
    }
  }

  // Can't fold if no offset could be extracted.
  if (!Offset)
    return false;

  foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);
  LLVM_DEBUG(dbgs() << "  Offset Instrs:\n");
  for (auto I : Instrs) {
    LLVM_DEBUG(dbgs() << "                 " << *I);
    I->eraseFromParent();
  }

  return true;
}

bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
                                                      MachineInstr &Lo12,
                                                      MachineInstr *&Lo20,
                                                      MachineInstr *&Hi12,
                                                      MachineInstr *&Last) {
  Register DestReg =
      Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();

  // Look for arithmetic instructions we can get an offset from.
  // We might be able to remove the arithmetic instructions by folding the
  // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I).
  if (!MRI->hasOneUse(DestReg))
    return false;

  // DestReg has only one use.
  MachineInstr &Tail = *MRI->use_instr_begin(DestReg);
  switch (Tail.getOpcode()) {
  default:
    LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
                      << Tail);
    break;
  case LoongArch::ADDI_W:
    if (ST->is64Bit())
      return false;
    [[fallthrough]];
  case LoongArch::ADDI_D:
  case LoongArch::ADDU16I_D: {
    // Offset is simply an immediate operand.
    int64_t Offset = Tail.getOperand(2).getImm();
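    // ADDU16I_D adds its immediate shifted left by 16, sign-extended from 32
    // bits; e.g. (illustrative) an immediate of 2 contributes an offset of
    // 0x20000.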
    if (Tail.getOpcode() == LoongArch::ADDU16I_D)
      Offset = SignExtend64<32>(Offset << 16);

    // We might have two ADDIs in a row.
    Register TailDestReg = Tail.getOperand(0).getReg();
    if (MRI->hasOneUse(TailDestReg)) {
      MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg);
      if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W)
        return false;
      if (TailTail.getOpcode() == LoongArch::ADDI_W ||
          TailTail.getOpcode() == LoongArch::ADDI_D) {
        Offset += TailTail.getOperand(2).getImm();
        LLVM_DEBUG(dbgs() << "  Offset Instrs: " << Tail << TailTail);
        foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset);
        Tail.eraseFromParent();
        return true;
      }
    }

    LLVM_DEBUG(dbgs() << "  Offset Instr: " << Tail);
    foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset);
    return true;
  }
  case LoongArch::ADD_W:
    if (ST->is64Bit())
      return false;
    [[fallthrough]];
  case LoongArch::ADD_D:
    // The offset is too large to fit in the immediate field of ADDI.
    return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg);
  }

  return false;
}

// Memory access opcode mapping for transforms.
static unsigned getNewOpc(unsigned Op, bool isLarge) {
  switch (Op) {
  case LoongArch::LD_B:
    return isLarge ? LoongArch::LDX_B : LoongArch::LD_B;
  case LoongArch::LD_H:
    return isLarge ? LoongArch::LDX_H : LoongArch::LD_H;
  case LoongArch::LD_W:
  case LoongArch::LDPTR_W:
    return isLarge ? LoongArch::LDX_W : LoongArch::LD_W;
  case LoongArch::LD_D:
  case LoongArch::LDPTR_D:
    return isLarge ? LoongArch::LDX_D : LoongArch::LD_D;
  case LoongArch::LD_BU:
    return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU;
  case LoongArch::LD_HU:
    return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU;
  case LoongArch::LD_WU:
    return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU;
  case LoongArch::FLD_S:
    return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S;
  case LoongArch::FLD_D:
    return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D;
  case LoongArch::VLD:
    return isLarge ? LoongArch::VLDX : LoongArch::VLD;
  case LoongArch::XVLD:
    return isLarge ? LoongArch::XVLDX : LoongArch::XVLD;
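  // The vector load-and-replicate instructions below have no register-offset
  // (LDX-style) forms; foldIntoMemoryOps() rejects the large pattern for
  // them, so their opcode is left unchanged.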
  case LoongArch::VLDREPL_B:
    return LoongArch::VLDREPL_B;
  case LoongArch::XVLDREPL_B:
    return LoongArch::XVLDREPL_B;
  case LoongArch::ST_B:
    return isLarge ? LoongArch::STX_B : LoongArch::ST_B;
  case LoongArch::ST_H:
    return isLarge ? LoongArch::STX_H : LoongArch::ST_H;
  case LoongArch::ST_W:
  case LoongArch::STPTR_W:
    return isLarge ? LoongArch::STX_W : LoongArch::ST_W;
  case LoongArch::ST_D:
  case LoongArch::STPTR_D:
    return isLarge ? LoongArch::STX_D : LoongArch::ST_D;
  case LoongArch::FST_S:
    return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S;
  case LoongArch::FST_D:
    return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D;
  case LoongArch::VST:
    return isLarge ? LoongArch::VSTX : LoongArch::VST;
  case LoongArch::XVST:
    return isLarge ? LoongArch::XVSTX : LoongArch::XVST;
  default:
    llvm_unreachable("Unexpected opcode for replacement");
  }
}

bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
                                                    MachineInstr &Lo12,
                                                    MachineInstr *&Lo20,
                                                    MachineInstr *&Hi12,
                                                    MachineInstr *&Last) {
  Register DestReg =
      Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();

  // If all the uses are memory ops with the same offset, we can transform:
  //
  // 1. (small/medium):
  //   pcalau12i vreg1, %pc_hi20(s)
  //   addi.d    vreg2, vreg1, %pc_lo12(s)
  //   ld.w      vreg3, vreg2, 8
  //
  //   =>
  //
  //   pcalau12i vreg1, %pc_hi20(s+8)
  //   ld.w      vreg3, vreg1, %pc_lo12(s+8)
  //
  // 2. (large):
  //   pcalau12i vreg1, %pc_hi20(s)
  //   addi.d    vreg2, $zero, %pc_lo12(s)
  //   lu32i.d   vreg3, vreg2, %pc64_lo20(s)
  //   lu52i.d   vreg4, vreg3, %pc64_hi12(s)
  //   add.d     vreg5, vreg4, vreg1
  //   ld.w      vreg6, vreg5, 8
  //
  //   =>
  //
  //   pcalau12i vreg1, %pc_hi20(s+8)
  //   addi.d    vreg2, $zero, %pc_lo12(s+8)
  //   lu32i.d   vreg3, vreg2, %pc64_lo20(s+8)
  //   lu52i.d   vreg4, vreg3, %pc64_hi12(s+8)
  //   ldx.w     vreg6, vreg4, vreg1

  std::optional<int64_t> CommonOffset;
  DenseMap<const MachineInstr *, SmallVector<unsigned>>
      InlineAsmMemoryOpIndexesMap;
  for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) {
    switch (UseMI.getOpcode()) {
    default:
      LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI);
      return false;
    case LoongArch::VLDREPL_B:
    case LoongArch::XVLDREPL_B:
      // We can't do this for the large pattern.
      if (Last)
        return false;
      [[fallthrough]];
    case LoongArch::LD_B:
    case LoongArch::LD_H:
    case LoongArch::LD_W:
    case LoongArch::LD_D:
    case LoongArch::LD_BU:
    case LoongArch::LD_HU:
    case LoongArch::LD_WU:
    case LoongArch::LDPTR_W:
    case LoongArch::LDPTR_D:
    case LoongArch::FLD_S:
    case LoongArch::FLD_D:
    case LoongArch::VLD:
    case LoongArch::XVLD:
    case LoongArch::ST_B:
    case LoongArch::ST_H:
    case LoongArch::ST_W:
    case LoongArch::ST_D:
    case LoongArch::STPTR_W:
    case LoongArch::STPTR_D:
    case LoongArch::FST_S:
    case LoongArch::FST_D:
    case LoongArch::VST:
    case LoongArch::XVST: {
      if (UseMI.getOperand(1).isFI())
        return false;
      // The register defined by the lowering must not be the value register.
      if (DestReg == UseMI.getOperand(0).getReg())
        return false;
      assert(DestReg == UseMI.getOperand(1).getReg() &&
             "Expected base address use");
      // All load/store instructions must use the same offset.
      int64_t Offset = UseMI.getOperand(2).getImm();
      if (CommonOffset && Offset != CommonOffset)
        return false;
      CommonOffset = Offset;
      break;
    }
    case LoongArch::INLINEASM:
    case LoongArch::INLINEASM_BR: {
      // We can't do this for the large pattern.
      if (Last)
        return false;
      SmallVector<unsigned> InlineAsmMemoryOpIndexes;
      unsigned NumOps = 0;
      for (unsigned I = InlineAsm::MIOp_FirstOperand;
           I < UseMI.getNumOperands(); I += 1 + NumOps) {
        const MachineOperand &FlagsMO = UseMI.getOperand(I);
        // Should be an imm.
        if (!FlagsMO.isImm())
          continue;

        const InlineAsm::Flag Flags(FlagsMO.getImm());
        NumOps = Flags.getNumOperandRegisters();

        // Memory constraints have two operands.
        if (NumOps != 2 || !Flags.isMemKind()) {
          // If the register is used by something other than a memory
          // constraint, we should not fold.
          for (unsigned J = 0; J < NumOps; ++J) {
            const MachineOperand &MO = UseMI.getOperand(I + 1 + J);
            if (MO.isReg() && MO.getReg() == DestReg)
              return false;
          }
          continue;
        }

        // We can only do this for constraint m.
        if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m)
          return false;

        const MachineOperand &AddrMO = UseMI.getOperand(I + 1);
        if (!AddrMO.isReg() || AddrMO.getReg() != DestReg)
          continue;

        const MachineOperand &OffsetMO = UseMI.getOperand(I + 2);
        if (!OffsetMO.isImm())
          continue;

        // All inline asm memory operands must use the same offset.
        int64_t Offset = OffsetMO.getImm();
        if (CommonOffset && Offset != CommonOffset)
          return false;
        CommonOffset = Offset;
        InlineAsmMemoryOpIndexes.push_back(I + 1);
      }
      InlineAsmMemoryOpIndexesMap.insert(
          std::make_pair(&UseMI, InlineAsmMemoryOpIndexes));
      break;
    }
    }
  }

  // We found a common offset.
  // Update the offsets in global address lowering.
  // We may have already folded some arithmetic so we need to add to any
  // existing offset.
  int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset;
  // LA32 ignores the upper 32 bits.
  if (!ST->is64Bit())
    NewOffset = SignExtend64<32>(NewOffset);
  // We can only fold simm32 offsets.
  if (!isInt<32>(NewOffset))
    return false;
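  // The bound mirrors what the 32-bit %pc_hi20/%pc_lo12 computation can reach
  // for the small/medium pattern; it is applied conservatively to the large
  // pattern as well.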

  // If this pass optimizes the sequence successfully, the MO_RELAX bitmask
  // target-flag should be removed from it.
  //
  // For example:
  //   pcalau12i $a0, %pc_hi20(symbol)
  //   addi.d $a0, $a0, %pc_lo12(symbol)
  //   ld.w $a0, $a0, 0
  //
  //   =>
  //
  //   pcalau12i $a0, %pc_hi20(symbol)
  //   ld.w $a0, $a0, %pc_lo12(symbol)
  //
  // The original code sequence can be relaxed by the linker, but the
  // optimized sequence cannot. So the rewritten instructions must not carry
  // the MO_RELAX flag.
  Hi20.getOperand(1).setOffset(NewOffset);
  Hi20.getOperand(1).setTargetFlags(
      LoongArchII::getDirectFlags(Hi20.getOperand(1)));
  MachineOperand &ImmOp = Lo12.getOperand(2);
  ImmOp.setOffset(NewOffset);
  ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
  if (Lo20 && Hi12) {
    Lo20->getOperand(2).setOffset(NewOffset);
    Hi12->getOperand(2).setOffset(NewOffset);
  }

  // Update the immediate in the load/store instructions to add the offset.
  const LoongArchInstrInfo &TII = *ST->getInstrInfo();
  for (MachineInstr &UseMI :
       llvm::make_early_inc_range(MRI->use_instructions(DestReg))) {
    if (UseMI.getOpcode() == LoongArch::INLINEASM ||
        UseMI.getOpcode() == LoongArch::INLINEASM_BR) {
      auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI];
      for (unsigned I : InlineAsmMemoryOpIndexes) {
        MachineOperand &MO = UseMI.getOperand(I + 1);
        switch (ImmOp.getType()) {
        case MachineOperand::MO_GlobalAddress:
          MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(),
                        LoongArchII::getDirectFlags(ImmOp));
          break;
        case MachineOperand::MO_MCSymbol:
          MO.ChangeToMCSymbol(ImmOp.getMCSymbol(),
                              LoongArchII::getDirectFlags(ImmOp));
          MO.setOffset(ImmOp.getOffset());
          break;
        case MachineOperand::MO_BlockAddress:
          MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(),
                        LoongArchII::getDirectFlags(ImmOp));
          break;
        default:
          report_fatal_error("unsupported machine operand type");
          break;
        }
      }
    } else {
      UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last)));
      if (Last) {
        UseMI.removeOperand(2);
        UseMI.removeOperand(1);
        UseMI.addOperand(Last->getOperand(1));
        UseMI.addOperand(Last->getOperand(2));
        UseMI.getOperand(1).setIsKill(false);
        UseMI.getOperand(2).setIsKill(false);
      } else {
        UseMI.removeOperand(2);
        UseMI.addOperand(ImmOp);
      }
    }
  }

  if (Last) {
    Last->eraseFromParent();
    return true;
  }

  MRI->replaceRegWith(Lo12.getOperand(0).getReg(), Hi20.getOperand(0).getReg());
  Lo12.eraseFromParent();
  return true;
}

bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  ST = &Fn.getSubtarget<LoongArchSubtarget>();

  bool MadeChange = false;
  MRI = &Fn.getRegInfo();
  for (MachineBasicBlock &MBB : Fn) {
    LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
    for (MachineInstr &Hi20 : MBB) {
      MachineInstr *Lo12 = nullptr;
      MachineInstr *Lo20 = nullptr;
      MachineInstr *Hi12 = nullptr;
      MachineInstr *Last = nullptr;
      if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
        continue;
      MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
      MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
    }
  }

  return MadeChange;
}

/// Returns an instance of the Merge Base Offset Optimization pass.
FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() {
  return new LoongArchMergeBaseOffsetOpt();
}