//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains a pass that performs load / store related peephole
/// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMISelLowering.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "ThumbRegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "arm-ldst-opt"

STATISTIC(NumLDMGened,  "Number of ldm instructions generated");
STATISTIC(NumSTMGened,  "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed, "Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed, "Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");

namespace {
  /// Post-register-allocation pass that combines load / store instructions to
  /// form ldm / stm instructions.
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}

    const MachineFunction *MF;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const MachineRegisterInfo *MRI;
    const ARMSubtarget *STI;
    const TargetLowering *TL;
    ARMFunctionInfo *AFI;
    LivePhysRegs LiveRegs;
    RegisterClassInfo RegClassInfo;
    MachineBasicBlock::const_iterator LiveRegPos;
    bool LiveRegsValid;
    bool RegClassInfoValid;
    bool isThumb1, isThumb2;

    bool runOnMachineFunction(MachineFunction &Fn) override;

    const char *getPassName() const override {
      return "ARM load / store optimization pass";
    }

  private:
    /// A load/store MachineInstr with the same base register as its
    /// neighbours in a MemOpQueue, which is kept sorted by offset.
    struct MemOpQueueEntry {
      MachineInstr *MI;
      int Offset;        ///< Load/Store offset.
      unsigned Position; ///< Position as counted from end of basic block.
      MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
        : MI(MI), Offset(Offset), Position(Position) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;

    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
    /// merged into a LDM/STM.
    struct MergeCandidate {
      /// List of instructions ordered by load/store offset.
      SmallVector<MachineInstr*, 4> Instrs;
      /// Index in Instrs of the instruction scheduled latest.
      unsigned LatestMIIdx;
      /// Index in Instrs of the instruction scheduled earliest.
      unsigned EarliestMIIdx;
      /// Index into the basic block where the merged instruction will be
      /// inserted. (See MemOpQueueEntry.Position)
      unsigned InsertPos;
      /// Whether the instructions can be merged into a ldm/stm instruction.
      bool CanMergeToLSMulti;
      /// Whether the instructions can be merged into a ldrd/strd instruction.
      bool CanMergeToLSDouble;
    };
    BumpPtrAllocator Allocator;
    SmallVector<const MergeCandidate*,4> Candidates;

    void moveLiveRegsBefore(const MachineBasicBlock &MBB,
                            MachineBasicBlock::const_iterator Before);
    unsigned findFreeReg(const TargetRegisterClass &RegClass);
    void UpdateBaseRegUses(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           DebugLoc DL, unsigned Base, unsigned WordOffset,
                           ARMCC::CondCodes Pred, unsigned PredReg);
    MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
    MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
    void FormCandidates(const MemOpQueue &MemOps);
    MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineInstr *MI);
    bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

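/// Returns true if \p MI has a live (i.e. not dead) definition of CPSR.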
static bool definesCPSR(const MachineInstr *MI) {
  for (const auto &MO : MI->operands()) {
    if (!MO.isReg())
      continue;
    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
      // If the instruction has a live CPSR def, then it's not safe to fold it
      // into a load / store.
      return true;
  }

  return false;
}

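/// Returns the byte offset encoded in the addressing-mode operands of the
/// load/store \p MI (negative when the addressing mode subtracts).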
static int getMemoryOpOffset(const MachineInstr *MI) {
  unsigned Opcode = MI->getOpcode();
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
    return OffField;

  // Thumb1 immediate offsets are scaled by 4
  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi ||
      Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
    return OffField * 4;

  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
    : ARM_AM::getAM5Offset(OffField) * 4;
  ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField)
    : ARM_AM::getAM5Op(OffField);

  if (Op == ARM_AM::sub)
    return -Offset;

  return Offset;
}

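/// Returns the base register operand of a load/store instruction.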
static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
  return MI.getOperand(1);
}

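/// Returns the transferred (source/destination) register operand of a
/// load/store instruction.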
static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
  return MI.getOperand(0);
}

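/// Maps a single load/store opcode and an addressing submode to the matching
/// load/store-multiple opcode, or 0 if no such instruction exists. Bumps the
/// "instructions generated" statistics as a side effect.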
static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    }
  case ARM::STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
    }
  case ARM::tLDRi:
  case ARM::tLDRspi:
    // tLDMIA is writeback-only - unless the base register is in the input
    // reglist.
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tLDMIA;
    }
  case ARM::tSTRi:
  case ARM::tSTRspi:
    // There is no non-writeback tSTMIA either.
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tSTMIA_UPD;
    }
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    }
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    }
  case ARM::VLDRS:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
    }
  case ARM::VSTRS:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
    }
  case ARM::VLDRD:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
    }
  case ARM::VSTRD:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
    }
  }
}

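/// Returns the addressing submode (ia, ib, da or db) encoded by a
/// load/store-multiple opcode.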
static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA_RET:
  case ARM::LDMIA:
  case ARM::LDMIA_UPD:
  case ARM::STMIA:
  case ARM::STMIA_UPD:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA:
  case ARM::VSTMDIA_UPD:
    return ARM_AM::ia;

  case ARM::LDMDA:
  case ARM::LDMDA_UPD:
  case ARM::STMDA:
  case ARM::STMDA_UPD:
    return ARM_AM::da;

  case ARM::LDMDB:
  case ARM::LDMDB_UPD:
  case ARM::STMDB:
  case ARM::STMDB_UPD:
  case ARM::t2LDMDB:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB_UPD:
    return ARM_AM::db;

  case ARM::LDMIB:
  case ARM::LDMIB_UPD:
  case ARM::STMIB:
  case ARM::STMIB_UPD:
    return ARM_AM::ib;
  }
}

static bool isT1i32Load(unsigned Opc) {
  return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
}

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc);
}

static bool isT1i32Store(unsigned Opc) {
  return Opc == ARM::tSTRi || Opc == ARM::tSTRspi;
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
}

static bool isLoadSingle(unsigned Opc) {
  return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
}

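/// Returns the number of immediate-offset units per 32-bit word for a Thumb1
/// load/store opcode (1 for word, 2 for halfword, 4 for byte accesses); used
/// to convert a word offset into immediate units.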
static unsigned getImmScale(unsigned Opc) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
    return 1;
  case ARM::tLDRHi:
  case ARM::tSTRHi:
    return 2;
  case ARM::tLDRBi:
  case ARM::tSTRBi:
    return 4;
  }
}

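/// Returns the number of bytes transferred by the load/store \p MI, or 0 for
/// opcodes this pass does not handle.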
static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
  case ARM::t2STMIA:
  case ARM::t2STMDB:
  case ARM::VLDMSIA:
  case ARM::VSTMSIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
  case ARM::VLDMDIA:
  case ARM::VSTMDIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
  }
}

/// Update future uses of the base register with the offset introduced
/// due to writeback. This function only works on Thumb1.
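///
/// For example (illustrative): merging the first two loads below into a
/// writeback LDM advances the base by two words, so the trailing use must be
/// rewritten (or, where that is impossible, a SUBS resetting the base is
/// inserted instead):
///
///   ldr r1, [r0]
///   ldr r2, [r0, #4]
///   ldr r3, [r0, #8]
/// =>
///   ldmia r0!, {r1, r2}
///   ldr r3, [r0, #0]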
void
ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   DebugLoc DL, unsigned Base,
                                   unsigned WordOffset,
                                   ARMCC::CondCodes Pred, unsigned PredReg) {
  assert(isThumb1 && "Can only update base register uses for Thumb1!");
  // Start updating any instructions with immediate offsets. Insert a SUB
  // before the first non-updateable instruction (if any).
  for (; MBBI != MBB.end(); ++MBBI) {
    bool InsertSub = false;
    unsigned Opc = MBBI->getOpcode();

    if (MBBI->readsRegister(Base)) {
      int Offset;
      bool IsLoad =
        Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi;
      bool IsStore =
        Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi;

      if (IsLoad || IsStore) {
        // Loads and stores with immediate offsets can be updated, but only if
        // the new offset isn't negative.
        // The MachineOperand containing the offset immediate is the last one
        // before predicates.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        // The offsets are scaled by 1, 2 or 4 depending on the Opcode.
        Offset = MO.getImm() - WordOffset * getImmScale(Opc);

        // If storing the base register, it needs to be reset first.
        unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();

        if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
          MO.setImm(Offset);
        else
          InsertSub = true;

      } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
                 !definesCPSR(MBBI)) {
        // SUBS/ADDS using this register, with a dead def of the CPSR.
        // Merge it with the update; if the merged offset is too large,
        // insert a new sub instead.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        Offset = (Opc == ARM::tSUBi8) ?
          MO.getImm() + WordOffset * 4 :
          MO.getImm() - WordOffset * 4;
        if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) {
          // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if
          // Offset == 0.
          MO.setImm(Offset);
          // The base register has now been reset, so exit early.
          return;
        } else {
          InsertSub = true;
        }

      } else {
        // Can't update the instruction.
        InsertSub = true;
      }

    } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) {
      // Since SUBS sets the condition flags, we can't place the base reset
      // after an instruction that has a live CPSR def.
      // The base register might also contain an argument for a function call.
      InsertSub = true;
    }

    if (InsertSub) {
      // An instruction above couldn't be updated, so insert a sub.
      AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
        .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
      return;
    }

    if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base))
      // The base register was killed or redefined; stop updating.
      return;
  }

  // End of block was reached.
  if (MBB.succ_size() > 0) {
    // FIXME: Because of a bug, live registers are sometimes missing from
    // the successor blocks' live-in sets. This means we can't trust that
    // information and *always* have to reset at the end of a block.
    // See PR21029.
    if (MBBI != MBB.end()) --MBBI;
    AddDefaultT1CC(
      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
      .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
  }
}

/// Return the first register of class \p RegClass that is not currently live.
unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
  if (!RegClassInfoValid) {
    RegClassInfo.runOnMachineFunction(*MF);
    RegClassInfoValid = true;
  }

  for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
    if (!LiveRegs.contains(Reg))
      return Reg;
  return 0;
}

/// Compute live registers just before instruction \p Before (in normal
/// schedule direction). Computes backwards, so multiple queries in the same
/// block must come in reverse order.
void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
    MachineBasicBlock::const_iterator Before) {
  // Initialize if we never queried in this block.
  if (!LiveRegsValid) {
    LiveRegs.init(TRI);
    LiveRegs.addLiveOuts(&MBB, true);
    LiveRegPos = MBB.end();
    LiveRegsValid = true;
  }
  // Move backward just before the "Before" position.
  while (LiveRegPos != Before) {
    --LiveRegPos;
    LiveRegs.stepBackward(*LiveRegPos);
  }
}

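/// Returns true if \p Reg appears in the register list \p Regs.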
static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
                        unsigned Reg) {
  for (const std::pair<unsigned, bool> &R : Regs)
    if (R.first == Reg)
      return true;
  return false;
}

/// Create and insert a LDM or STM with Base as base register and registers in
/// Regs as the register operands that would be loaded / stored. Returns the
/// newly created instruction, or nullptr if the transformation is not done.
MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
  unsigned NumRegs = Regs.size();
  assert(NumRegs > 1);

  // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
  // Compute liveness information for that register to make the decision.
  bool SafeToClobberCPSR = !isThumb1 ||
    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
     MachineBasicBlock::LQR_Dead);

  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.

  // Exception: If the base register is in the input reglist, Thumb1 LDM is
  // non-writeback.
  // It's also not possible to merge an STR of the base register in Thumb1.
  if (isThumb1 && ContainsReg(Regs, Base)) {
    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
    if (Opcode == ARM::tLDRi) {
      Writeback = false;
    } else if (Opcode == ARM::tSTRi) {
      return nullptr;
    }
  }

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;

  if (Offset == 4 && haveIBAndDA) {
    Mode = ARM_AM::ib;
  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
    Mode = ARM_AM::da;
  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
    // VLDM/VSTM do not support DB mode without also updating the base reg.
    Mode = ARM_AM::db;
  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
    // Check if this is a supported opcode before inserting instructions to
    // calculate a new base register.
    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;

    // If the starting offset isn't zero, insert an instruction to materialize
    // a new base. But only do so if it is cost effective, i.e. when merging
    // more than two loads / stores.
    if (NumRegs <= 2)
      return nullptr;

    // On Thumb1, it's not worth materializing a new base register without
    // clobbering the CPSR (i.e. not using ADDS/SUBS).
    if (!SafeToClobberCPSR)
      return nullptr;

    unsigned NewBase;
    if (isi32Load(Opcode)) {
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    } else {
      // Find a free register that we can use as scratch register.
      moveLiveRegsBefore(MBB, InsertBefore);
      // The merged instruction does not exist yet but will use several Regs if
      // it is a Store.
      if (!isLoadSingle(Opcode))
        for (const std::pair<unsigned, bool> &R : Regs)
          LiveRegs.addReg(R.first);

      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
      if (NewBase == 0)
        return nullptr;
    }

    int BaseOpc =
      isThumb2 ? ARM::t2ADDri :
      (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
      (isThumb1 && Offset < 8) ? ARM::tADDi3 :
      isThumb1 ? ARM::tADDi8  : ARM::ADDri;

    if (Offset < 0) {
      Offset = -Offset;
      BaseOpc =
        isThumb2 ? ARM::t2SUBri :
        (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
        isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
    }

    if (!TL->isLegalAddImmediate(Offset))
      // FIXME: Try add with register operand?
      return nullptr; // Probably not worth it then.

    // We can only append a kill flag to the add/sub input if the value is not
    // used in the register list of the stm as well.
    bool KillOldBase = BaseKill &&
      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));

    if (isThumb1) {
      // Thumb1: depending on immediate size, use either
      //   ADDS NewBase, Base, #imm3
      // or
      //   MOV  NewBase, Base
      //   ADDS NewBase, #imm8.
      if (Base != NewBase &&
          (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
        // Need to insert a MOV to the new base first.
        if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
            !STI->hasV6Ops()) {
          // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
          if (Pred != ARMCC::AL)
            return nullptr;
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
            .addReg(Base, getKillRegState(KillOldBase));
        } else
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
            .addReg(Base, getKillRegState(KillOldBase))
            .addImm(Pred).addReg(PredReg);

        // The following ADDS/SUBS becomes an update.
        Base = NewBase;
        KillOldBase = true;
      }
      if (BaseOpc == ARM::tADDrSPi) {
        assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
          .addImm(Pred).addReg(PredReg);
      } else
        AddDefaultT1CC(
          BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
          .addImm(Pred).addReg(PredReg);
    } else {
      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
        .addImm(Pred).addReg(PredReg).addReg(0);
    }
    Base = NewBase;
    BaseKill = true; // New base is always killed straight away.
  }

  bool isDef = isLoadSingle(Opcode);

  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
  // base register writeback.
  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
  if (!Opcode)
    return nullptr;

  // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
  // - There is no writeback (LDM of base register),
  // - the base register is killed by the merged instruction,
  // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
  //   to reset the base register.
  // Otherwise, don't merge.
  // It's safe to return here since the code to materialize a new base register
  // above is also conditional on SafeToClobberCPSR.
  if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
    return nullptr;

  MachineInstrBuilder MIB;

  if (Writeback) {
    if (Opcode == ARM::tLDMIA)
      // Update tLDMIA with writeback if necessary.
      Opcode = ARM::tLDMIA_UPD;

    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));

    // Thumb1: we might need to set base writeback when building the MI.
    MIB.addReg(Base, getDefRegState(true))
       .addReg(Base, getKillRegState(BaseKill));

    // The base isn't dead after a merged instruction with writeback.
    // Insert a sub instruction after the newly formed instruction to reset.
    if (!BaseKill)
      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);

  } else {
    // No writeback, simply build the MachineInstr.
    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
    MIB.addReg(Base, getKillRegState(BaseKill));
  }

  MIB.addImm(Pred).addReg(PredReg);

  for (const std::pair<unsigned, bool> &R : Regs)
    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));

  return MIB.getInstr();
}

MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
  bool IsLoad = isi32Load(Opcode);
  assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
  unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;

  assert(Regs.size() == 2);
  MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
                                    TII->get(LoadStoreOpcode));
  if (IsLoad) {
    MIB.addReg(Regs[0].first, RegState::Define)
       .addReg(Regs[1].first, RegState::Define);
  } else {
    MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
       .addReg(Regs[1].first, getKillRegState(Regs[1].second));
  }
  MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  return MIB.getInstr();
}

/// Merge the instructions of \p Cand into a single load/store-multiple or
/// load/store-double instruction; returns the merged instruction, or nullptr
/// if the merge was not performed.
MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
  const MachineInstr *First = Cand.Instrs.front();
  unsigned Opcode = First->getOpcode();
  bool IsLoad = isLoadSingle(Opcode);
  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  SmallVector<unsigned, 4> ImpDefs;
  DenseSet<unsigned> KilledRegs;
  // Determine list of registers and list of implicit super-register defs.
  for (const MachineInstr *MI : Cand.Instrs) {
    const MachineOperand &MO = getLoadStoreRegOp(*MI);
    unsigned Reg = MO.getReg();
    bool IsKill = MO.isKill();
    if (IsKill)
      KilledRegs.insert(Reg);
    Regs.push_back(std::make_pair(Reg, IsKill));

    if (IsLoad) {
      // Collect any implicit defs of super-registers. After merging we can't
      // be sure anymore that we properly preserved these live ranges and must
      // remove these implicit operands.
      for (const MachineOperand &MO : MI->implicit_operands()) {
        if (!MO.isReg() || !MO.isDef() || MO.isDead())
          continue;
        assert(MO.isImplicit());
        unsigned DefReg = MO.getReg();

        if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
          continue;
        // We can ignore cases where the super-reg is read and written.
        if (MI->readsRegister(DefReg))
          continue;
        ImpDefs.push_back(DefReg);
      }
    }
  }

  // Attempt the merge.
  typedef MachineBasicBlock::iterator iterator;
  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
  iterator InsertBefore = std::next(iterator(LatestMI));
  MachineBasicBlock &MBB = *LatestMI->getParent();
  unsigned Offset = getMemoryOpOffset(First);
  unsigned Base = getLoadStoreBaseOp(*First).getReg();
  bool BaseKill = LatestMI->killsRegister(Base);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
  DebugLoc DL = First->getDebugLoc();
  MachineInstr *Merged = nullptr;
  if (Cand.CanMergeToLSDouble)
    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
                                   Opcode, Pred, PredReg, DL, Regs);
  if (!Merged && Cand.CanMergeToLSMulti)
    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
                                  Opcode, Pred, PredReg, DL, Regs);
  if (!Merged)
    return nullptr;

  // Determine the earliest instruction that will get removed. We then keep an
  // iterator just above it so the following erases don't invalidate it.
  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
  bool EarliestAtBegin = false;
  if (EarliestI == MBB.begin()) {
    EarliestAtBegin = true;
  } else {
    EarliestI = std::prev(EarliestI);
  }

  // Remove instructions which have been merged.
  for (MachineInstr *MI : Cand.Instrs)
    MBB.erase(MI);

  // Determine range between the earliest removed instruction and the new one.
  if (EarliestAtBegin)
    EarliestI = MBB.begin();
  else
    EarliestI = std::next(EarliestI);
  auto FixupRange = make_range(EarliestI, iterator(Merged));

  if (isLoadSingle(Opcode)) {
    // If the previous loads defined a super-reg, then we have to mark earlier
    // operands undef and replicate the super-reg def on the merged
    // instruction.
    for (MachineInstr &MI : FixupRange) {
      for (unsigned &ImpDefReg : ImpDefs) {
        for (MachineOperand &MO : MI.implicit_operands()) {
          if (!MO.isReg() || MO.getReg() != ImpDefReg)
            continue;
          if (MO.readsReg())
            MO.setIsUndef();
          else if (MO.isDef())
            ImpDefReg = 0;
        }
      }
    }

    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
    for (unsigned ImpDef : ImpDefs)
      MIB.addReg(ImpDef, RegState::ImplicitDefine);
  } else {
    // Remove kill flags: the merged instruction may store the values later
    // than the original stores did.
    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
    for (MachineInstr &MI : FixupRange) {
      for (MachineOperand &MO : MI.uses()) {
        if (!MO.isReg() || !MO.isKill())
          continue;
        if (KilledRegs.count(MO.getReg()))
          MO.setIsKill(false);
      }
    }
    assert(ImpDefs.empty());
  }

  return Merged;
}

static bool isValidLSDoubleOffset(int Offset) {
  unsigned Value = abs(Offset);
  // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally
  // multiplied by 4.
  return (Value % 4) == 0 && Value < 1024;
}

/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
  const MachineInstr *FirstMI = MemOps[0].MI;
  unsigned Opcode = FirstMI->getOpcode();
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  unsigned Size = getLSMultipleTransferSize(FirstMI);
  // vldm / vstm limits are 32 registers for S variants, 16 for D variants.
  unsigned Limit;
  switch (Opcode) {
  default:
    Limit = UINT_MAX;
    break;
  case ARM::VLDRS:
  case ARM::VSTRS:
    Limit = 32;
    break;
  case ARM::VLDRD:
  case ARM::VSTRD:
    Limit = 16;
    break;
  }

  unsigned SIndex = 0;
  unsigned EIndex = MemOps.size();
  do {
    // Look at the first instruction.
    const MachineInstr *MI = MemOps[SIndex].MI;
    int Offset = MemOps[SIndex].Offset;
    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
    unsigned PReg = PMO.getReg();
    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
    unsigned Latest = SIndex;
    unsigned Earliest = SIndex;
    unsigned Count = 1;
    bool CanMergeToLSDouble =
      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
    // ARM errata 602117: LDRD with base in list may result in incorrect base
    // register when interrupted or faulted.
    if (STI->isCortexM3() && isi32Load(Opcode) &&
        PReg == getLoadStoreBaseOp(*MI).getReg())
      CanMergeToLSDouble = false;

    bool CanMergeToLSMulti = true;
    // On Swift, avoid vldm/vstm that start at an odd register number, as they
    // need more uops than single vldrs/vstrs.
    if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
      CanMergeToLSMulti = false;

    // Merge following instructions where possible.
    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
      int NewOffset = MemOps[I].Offset;
      if (NewOffset != Offset + (int)Size)
        break;
      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
      unsigned Reg = MO.getReg();
      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);

      // See if the current load/store may be part of a multi load/store.
      bool PartOfLSMulti = CanMergeToLSMulti;
      if (PartOfLSMulti) {
        // SP cannot appear in the register list.
        if (Reg == ARM::SP)
          PartOfLSMulti = false;
        // Register numbers must be in ascending order.
        else if (RegNum <= PRegNum)
          PartOfLSMulti = false;
        // For VFP / NEON load/store multiples, the registers must be
        // consecutive and within the limit on the number of registers per
        // instruction.
        else if (!isNotVFP && RegNum != PRegNum+1)
          PartOfLSMulti = false;
      }
      // See if the current load/store may be part of a double load/store.
      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;

      if (!PartOfLSMulti && !PartOfLSDouble)
        break;
      CanMergeToLSMulti &= PartOfLSMulti;
      CanMergeToLSDouble &= PartOfLSDouble;
      // Track MemOp with latest and earliest position (Positions are
      // counted in reverse).
      unsigned Position = MemOps[I].Position;
      if (Position < MemOps[Latest].Position)
        Latest = I;
      else if (Position > MemOps[Earliest].Position)
        Earliest = I;
      // Prepare for next MemOp.
      Offset += Size;
      PRegNum = RegNum;
    }

    // Form a candidate from the Ops collected so far.
    MergeCandidate *Candidate = new(Allocator) MergeCandidate;
    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
      Candidate->Instrs.push_back(MemOps[C].MI);
    Candidate->LatestMIIdx = Latest - SIndex;
    Candidate->EarliestMIIdx = Earliest - SIndex;
    Candidate->InsertPos = MemOps[Latest].Position;
    if (Count == 1)
      CanMergeToLSMulti = CanMergeToLSDouble = false;
    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
    Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
    Candidates.push_back(Candidate);
    // Continue after the chain.
    SIndex += Count;
  } while (SIndex < EIndex);
}

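/// Returns true if \p MI subtracts exactly \p Bytes from \p Base and writes
/// the result back to \p Base, under the given predicate and without a live
/// CPSR definition. A nonzero \p Limit bounds the acceptable offset.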
static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                unsigned Bytes, unsigned Limit,
                                ARMCC::CondCodes Pred, unsigned PredReg) {
  unsigned MyPredReg = 0;
  if (!MI)
    return false;

  bool CheckCPSRDef = false;
  switch (MI->getOpcode()) {
  default: return false;
  case ARM::tSUBi8:
  case ARM::t2SUBri:
  case ARM::SUBri:
    CheckCPSRDef = true;
    break;
  case ARM::tSUBspi:
    break;
  }

  // Make sure the offset fits in 8 bits.
  if (Bytes == 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
                    MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
  if (!(MI->getOperand(0).getReg() == Base &&
        MI->getOperand(1).getReg() == Base &&
        (MI->getOperand(2).getImm() * Scale) == Bytes &&
        getInstrPredicate(MI, MyPredReg) == Pred &&
        MyPredReg == PredReg))
    return false;

  return CheckCPSRDef ? !definesCPSR(MI) : true;
}

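/// Returns true if \p MI adds exactly \p Bytes to \p Base and writes the
/// result back to \p Base, under the given predicate and without a live CPSR
/// definition. A nonzero \p Limit bounds the acceptable offset.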
static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                unsigned Bytes, unsigned Limit,
                                ARMCC::CondCodes Pred, unsigned PredReg) {
  unsigned MyPredReg = 0;
  if (!MI)
    return false;

  bool CheckCPSRDef = false;
  switch (MI->getOpcode()) {
  default: return false;
  case ARM::tADDi8:
  case ARM::t2ADDri:
  case ARM::ADDri:
    CheckCPSRDef = true;
    break;
  case ARM::tADDspi:
    break;
  }

  // Make sure the offset fits in 8 bits.
  if (Bytes == 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
                    MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
  if (!(MI->getOperand(0).getReg() == Base &&
        MI->getOperand(1).getReg() == Base &&
        (MI->getOperand(2).getImm() * Scale) == Bytes &&
        getInstrPredicate(MI, MyPredReg) == Pred &&
        MyPredReg == PredReg))
    return false;

  return CheckCPSRDef ? !definesCPSR(MI) : true;
}

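/// Maps a load/store-multiple opcode and addressing submode to the
/// corresponding base-updating (writeback) opcode.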
static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                            ARM_AM::AMSubMode Mode) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA_UPD;
    case ARM_AM::ib: return ARM::LDMIB_UPD;
    case ARM_AM::da: return ARM::LDMDA_UPD;
    case ARM_AM::db: return ARM::LDMDB_UPD;
    }
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA_UPD;
    case ARM_AM::ib: return ARM::STMIB_UPD;
    case ARM_AM::da: return ARM::STMDA_UPD;
    case ARM_AM::db: return ARM::STMDB_UPD;
    }
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
    case ARM_AM::db: return ARM::t2LDMDB_UPD;
    }
  case ARM::t2STMIA:
  case ARM::t2STMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA_UPD;
    case ARM_AM::db: return ARM::t2STMDB_UPD;
    }
  case ARM::VLDMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
    case ARM_AM::db: return ARM::VLDMSDB_UPD;
    }
  case ARM::VLDMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
    case ARM_AM::db: return ARM::VLDMDDB_UPD;
    }
  case ARM::VSTMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
    case ARM_AM::db: return ARM::VSTMSDB_UPD;
    }
  case ARM::VSTMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
    case ARM_AM::db: return ARM::VSTMDDB_UPD;
    }
  }
}

/// Fold preceding/trailing inc/dec of base register into the
/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
  // Thumb1 is already using updating loads/stores.
  if (isThumb1) return false;

  const MachineOperand &BaseOP = MI->getOperand(0);
  unsigned Base = BaseOP.getReg();
  bool BaseKill = BaseOP.isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  unsigned Opcode = MI->getOpcode();
  DebugLoc DL = MI->getDebugLoc();

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
    if (MI->getOperand(i).getReg() == Base)
      return false;

  bool DoMerge = false;
  ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);

  // Try merging with the previous instruction.
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  MachineBasicBlock::iterator MBBI(MI);
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (Mode == ARM_AM::ia &&
        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::db;
      DoMerge = true;
    } else if (Mode == ARM_AM::ib &&
               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::da;
      DoMerge = true;
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    }
    if (DoMerge)
      MBB.erase(NextMBBI);
  }

  if (!DoMerge)
    return false;

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);

  // Transfer the rest of operands.
  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  MBB.erase(MBBI);
  return true;
}

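/// Returns the pre-indexed form of a single load/store opcode. VLDR/VSTR have
/// no indexed forms, so a single-register updating VLDM/VSTM is used for the
/// VFP cases.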
static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
                                             ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_PRE_IMM;
  case ARM::STRi12:
    return ARM::STR_PRE_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

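/// Returns the post-indexed form of a single load/store opcode (again a
/// single-register updating VLDM/VSTM for the VFP cases).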
static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
                                              ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_POST_IMM;
  case ARM::STRi12:
    return ARM::STR_POST_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

/// Fold preceding/trailing inc/dec of base register into the
/// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
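///
/// ldr rn, [rm]
/// rm := rm + 4;
/// =>
/// ldr rn, [rm], #4
///
/// rm := rm - 4;
/// ldr rn, [rm]
/// =>
/// ldr rn, [rm, #-4]!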
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
  // Thumb1 doesn't have updating LDR/STR.
  // FIXME: Use LDM/STM with single register instead.
  if (isThumb1) return false;

  unsigned Base = getLoadStoreBaseOp(*MI).getReg();
  bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned Opcode = MI->getOpcode();
  DebugLoc DL = MI->getDebugLoc();
  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
  if (isi32Load(Opcode) || isi32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;
  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;

  bool isLd = isLoadSingle(Opcode);
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);

  // Try merging with the previous instruction.
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  MachineBasicBlock::iterator MBBI(MI);
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit, Pred,
                                   PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
      MBB.erase(PrevMBBI);
    }
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit, Pred,
                                   PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  if (isAM5) {
    // VLDM[SD]_UPD, VSTM[SD]_UPD
    // (There are no base-updating versions of VLDR/VSTR instructions, but the
    // updating load/store-multiple instructions can be used with only one
    // register.)
    MachineOperand &MO = MI->getOperand(0);
    BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
      .addReg(Base, getDefRegState(true)) // WB base register
      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
      .addImm(Pred).addReg(PredReg)
      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                            getKillRegState(MO.isKill())));
  } else if (isLd) {
    if (isAM2) {
      // LDR_PRE, LDR_POST
      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
        int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
      } else {
        int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
      }
    } else {
      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  } else {
    MachineOperand &MO = MI->getOperand(0);
    // FIXME: post-indexed stores use am2offset_imm, which still encodes
    // the vestigial zero-reg offset register. When that's fixed, this clause
    // can be removed entirely.
    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
      int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    } else {
      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  }
  MBB.erase(MBBI);

  return true;
}

1427 /// Returns true if instruction is a memory operation that this pass is capable
1428 /// of operating on.
1429 static bool isMemoryOp(const MachineInstr *MI) {
1430   // When no memory operands are present, conservatively assume unaligned,
1431   // volatile, unfoldable.
1432   if (!MI->hasOneMemOperand())
1433     return false;
1434 
1435   const MachineMemOperand *MMO = *MI->memoperands_begin();
1436 
1437   // Don't touch volatile memory accesses - we may be changing their order.
1438   if (MMO->isVolatile())
1439     return false;
1440 
1441   // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
1442   // not.
1443   if (MMO->getAlignment() < 4)
1444     return false;
1445 
1446   // str <undef> could probably be eliminated entirely, but for now we just want
1447   // to avoid making a mess of it.
1448   // FIXME: Use str <undef> as a wildcard to enable better stm folding.
1449   if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
1450       MI->getOperand(0).isUndef())
1451     return false;
1452 
1453   // Likewise don't mess with references to undefined addresses.
1454   if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
1455       MI->getOperand(1).isUndef())
1456     return false;
1457 
1458   unsigned Opcode = MI->getOpcode();
1459   switch (Opcode) {
1460   default: break;
1461   case ARM::VLDRS:
1462   case ARM::VSTRS:
1463   case ARM::VLDRD:
1464   case ARM::VSTRD:
1467   case ARM::LDRi12:
1468   case ARM::STRi12:
1469   case ARM::tLDRi:
1470   case ARM::tSTRi:
1471   case ARM::tLDRspi:
1472   case ARM::tSTRspi:
1473   case ARM::t2LDRi8:
1474   case ARM::t2LDRi12:
1475   case ARM::t2STRi8:
1476   case ARM::t2STRi12:
1477     return MI->getOperand(1).isReg();
1478   }
1479   return false;
1480 }
1481 
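     /// Emit a single immediate-offset LDR or STR with the given register flags
     /// and predicate; used by FixInvalidRegPairOp below to split an invalid
     /// LDRD / STRD.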
1482 static void InsertLDR_STR(MachineBasicBlock &MBB,
1483                           MachineBasicBlock::iterator &MBBI,
1484                           int Offset, bool isDef,
1485                           DebugLoc DL, unsigned NewOpc,
1486                           unsigned Reg, bool RegDeadKill, bool RegUndef,
1487                           unsigned BaseReg, bool BaseKill, bool BaseUndef,
1488                           bool OffKill, bool OffUndef,
1489                           ARMCC::CondCodes Pred, unsigned PredReg,
1490                           const TargetInstrInfo *TII, bool isT2) {
1491   // A load defines the transferred register, while a store kills or uses
1492   // it; everything else about the two forms is identical.
1493   unsigned RegFlags = isDef
1494     ? getDefRegState(true) | getDeadRegState(RegDeadKill)
1495     : getKillRegState(RegDeadKill) | getUndefRegState(RegUndef);
1496   BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
1497     .addReg(Reg, RegFlags)
1498     .addReg(BaseReg, getKillRegState(BaseKill) | getUndefRegState(BaseUndef))
1499     .addImm(Offset).addImm(Pred).addReg(PredReg);
1504 }
1505 
1506 bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
1507                                           MachineBasicBlock::iterator &MBBI) {
1508   MachineInstr *MI = &*MBBI;
1509   unsigned Opcode = MI->getOpcode();
1510   if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8)
1511     return false;
1512 
1513   const MachineOperand &BaseOp = MI->getOperand(2);
1514   unsigned BaseReg = BaseOp.getReg();
1515   unsigned EvenReg = MI->getOperand(0).getReg();
1516   unsigned OddReg  = MI->getOperand(1).getReg();
1517   unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
1518   unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
1519 
1520   // ARM errata 602117: LDRD with base in list may result in incorrect base
1521   // register when interrupted or faulted.
1522   bool Errata602117 = EvenReg == BaseReg &&
1523     (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3();
1524   // ARM LDRD/STRD needs consecutive registers.
1525   bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) &&
1526     (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum);
1527 
1528   if (!Errata602117 && !NonConsecutiveRegs)
1529     return false;
1530 
1531   bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
1532   bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
1533   bool EvenDeadKill = isLd ?
1534     MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
1535   bool EvenUndef = MI->getOperand(0).isUndef();
1536   bool OddDeadKill  = isLd ?
1537     MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
1538   bool OddUndef = MI->getOperand(1).isUndef();
1539   bool BaseKill = BaseOp.isKill();
1540   bool BaseUndef = BaseOp.isUndef();
1541   bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
1542   bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
1543   int OffImm = getMemoryOpOffset(MI);
1544   unsigned PredReg = 0;
1545   ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
1546 
1547   if (OddRegNum > EvenRegNum && OffImm == 0) {
1548     // Ascending register numbers and no offset. It's safe to change it to
1549     // an ldm or stm.
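         // e.g. the invalid "ldrd r1, r2, [r0]" (odd first register) can simply
         // become "ldm r0, {r1, r2}".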
1550     unsigned NewOpc = (isLd)
1551       ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
1552       : (isT2 ? ARM::t2STMIA : ARM::STMIA);
1553     if (isLd) {
1554       BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
1555         .addReg(BaseReg, getKillRegState(BaseKill))
1556         .addImm(Pred).addReg(PredReg)
1557         .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
1558         .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
1559       ++NumLDRD2LDM;
1560     } else {
1561       BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
1562         .addReg(BaseReg, getKillRegState(BaseKill))
1563         .addImm(Pred).addReg(PredReg)
1564         .addReg(EvenReg,
1565                 getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
1566         .addReg(OddReg,
1567                 getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
1568       ++NumSTRD2STM;
1569     }
1570   } else {
1571     // Split into two instructions.
1572     unsigned NewOpc = (isLd)
1573       ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
1574       : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
1575     // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
1576     // so adjust and use t2LDRi12 here for that.
1577     unsigned NewOpc2 = (isLd)
1578       ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
1579       : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
1580     DebugLoc dl = MBBI->getDebugLoc();
1581     // If this is a load and the base register is killed, it may have been
1582     // re-defined by the load; make sure the first load does not clobber it.
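         // e.g. when splitting "ldrd r0, r1, [r0, #4]" the high word has to be
         // loaded first:
         //   ldr r1, [r0, #8]
         //   ldr r0, [r0, #4]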
1583     if (isLd &&
1584         (BaseKill || OffKill) &&
1585         (TRI->regsOverlap(EvenReg, BaseReg))) {
1586       assert(!TRI->regsOverlap(OddReg, BaseReg));
1587       InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
1588                     OddReg, OddDeadKill, false,
1589                     BaseReg, false, BaseUndef, false, OffUndef,
1590                     Pred, PredReg, TII, isT2);
1591       InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
1592                     EvenReg, EvenDeadKill, false,
1593                     BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
1594                     Pred, PredReg, TII, isT2);
1595     } else {
1596       if (OddReg == EvenReg && EvenDeadKill) {
1597         // If the two source operands are the same, the kill marker is
1598         // probably on the first one. e.g.
1599         // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
1600         EvenDeadKill = false;
1601         OddDeadKill = true;
1602       }
1603       // Never kill the base register in the first instruction.
1604       if (EvenReg == BaseReg)
1605         EvenDeadKill = false;
1606       InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
1607                     EvenReg, EvenDeadKill, EvenUndef,
1608                     BaseReg, false, BaseUndef, false, OffUndef,
1609                     Pred, PredReg, TII, isT2);
1610       InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
1611                     OddReg, OddDeadKill, OddUndef,
1612                     BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
1613                     Pred, PredReg, TII, isT2);
1614     }
1615     if (isLd)
1616       ++NumLDRD2LDR;
1617     else
1618       ++NumSTRD2STR;
1619   }
1620 
1621   MBBI = MBB.erase(MBBI);
1622   return true;
1623 }
1624 
1625 /// An optimization pass to turn multiple LDR / STR ops of the same base and
1626 /// incrementing offset into LDM / STM ops.
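     /// e.g.
     ///   ldr r1, [r0]
     ///   ldr r2, [r0, #4]
     ///   ldr r3, [r0, #8]
     /// =>
     ///   ldmia r0, {r1, r2, r3}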
1627 bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
1628   MemOpQueue MemOps;
1629   unsigned CurrBase = 0;
1630   unsigned CurrOpc = ~0u;
1631   unsigned CurrSize = 0;
1632   ARMCC::CondCodes CurrPred = ARMCC::AL;
1633   unsigned CurrPredReg = 0;
1634   unsigned Position = 0;
1635   assert(Candidates.empty());
1636   LiveRegsValid = false;
1637 
1638   for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
1639        I = MBBI) {
1640     // The instruction in front of the iterator is the one we look at.
1641     MBBI = std::prev(I);
1642     if (FixInvalidRegPairOp(MBB, MBBI))
1643       continue;
1644     ++Position;
1645 
1646     if (isMemoryOp(MBBI)) {
1647       unsigned Opcode = MBBI->getOpcode();
1648       unsigned Size = getLSMultipleTransferSize(MBBI);
1649       const MachineOperand &MO = MBBI->getOperand(0);
1650       unsigned Reg = MO.getReg();
1651       unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
1652       unsigned PredReg = 0;
1653       ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
1654       int Offset = getMemoryOpOffset(MBBI);
1655       if (CurrBase == 0) {
1656         // Start of a new chain.
1657         CurrBase = Base;
1658         CurrOpc  = Opcode;
1659         CurrSize = Size;
1660         CurrPred = Pred;
1661         CurrPredReg = PredReg;
1662         MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
1663         continue;
1664       }
1665       // Note: No need to match PredReg in the next if.
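           // (When an instruction is predicated at all, its predicate register
           // is always CPSR, so matching the condition code alone is enough.)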
1666       if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
1667         // Watch out for:
1668         //   r4 := ldr [r0, #8]
1669         //   r4 := ldr [r0, #4]
1670         // or
1671         //   r0 := ldr [r0]
1672         // If a load overwrites the base register or a register loaded by
1673         // another load in our chain, we cannot take this instruction.
1674         bool Overlap = false;
1675         if (isLoadSingle(Opcode)) {
1676           Overlap = (Base == Reg);
1677           if (!Overlap) {
1678             for (const MemOpQueueEntry &E : MemOps) {
1679               if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
1680                 Overlap = true;
1681                 break;
1682               }
1683             }
1684           }
1685         }
1686 
1687         if (!Overlap) {
1688           // Check offset and sort memory operation into the current chain.
1689           if (Offset > MemOps.back().Offset) {
1690             MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
1691             continue;
1692           } else {
1693             MemOpQueue::iterator MI, ME;
1694             for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
1695               if (Offset < MI->Offset) {
1696                 // Found a place to insert.
1697                 break;
1698               }
1699               if (Offset == MI->Offset) {
1700                 // Collision, abort.
1701                 MI = ME;
1702                 break;
1703               }
1704             }
1705             if (MI != MemOps.end()) {
1706               MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
1707               continue;
1708             }
1709           }
1710         }
1711       }
1712 
1713       // Don't advance the iterator; the op will start a new chain next.
1714       MBBI = I;
1715       --Position;
1716       // Fall through to form merge candidates from the existing chain.
1717     } else if (MBBI->isDebugValue())
1718       continue;
1719 
1720     // If we are here then the chain is broken; extract candidates for a merge.
1721     if (!MemOps.empty()) {
1722       FormCandidates(MemOps);
1723       // Reset for the next chain.
1724       CurrBase = 0;
1725       CurrOpc = ~0u;
1726       CurrSize = 0;
1727       CurrPred = ARMCC::AL;
1728       CurrPredReg = 0;
1729       MemOps.clear();
1730     }
1731   }
1732   if (!MemOps.empty())
1733     FormCandidates(MemOps);
1734 
1735   // Sort candidates so they get processed from the end to the beginning of
1736   // the basic block later; this is necessary for the liveness calculation.
1737   auto LessThan = [](const MergeCandidate *M0, const MergeCandidate *M1) {
1738     return M0->InsertPos < M1->InsertPos;
1739   };
1740   std::sort(Candidates.begin(), Candidates.end(), LessThan);
1741 
1742   // Go through the list of candidates and merge them.
1743   bool Changed = false;
1744   for (const MergeCandidate *Candidate : Candidates) {
1745     if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
1746       MachineInstr *Merged = MergeOpsUpdate(*Candidate);
1747       // Merge preceding/trailing base inc/dec into the merged op.
1748       if (Merged) {
1749         Changed = true;
1750         unsigned Opcode = Merged->getOpcode();
1751         if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8)
1752           MergeBaseUpdateLSMultiple(Merged);
1753       } else {
1754         for (MachineInstr *MI : Candidate->Instrs) {
1755           if (MergeBaseUpdateLoadStore(MI))
1756             Changed = true;
1757         }
1758       }
1759     } else {
1760       assert(Candidate->Instrs.size() == 1);
1761       if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
1762         Changed = true;
1763     }
1764   }
1765   Candidates.clear();
1766 
1767   return Changed;
1768 }
1769 
1770 /// If this is an exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
1771 /// into the preceding stack restore so that it directly restores the value of
1772 /// LR into pc.
1773 ///   ldmfd sp!, {..., lr}
1774 ///   bx lr
1775 /// or
1776 ///   ldmfd sp!, {..., lr}
1777 ///   mov pc, lr
1778 /// =>
1779 ///   ldmfd sp!, {..., pc}
1780 bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
1781   // Thumb1 LDM doesn't allow high registers.
1782   if (isThumb1) return false;
1783   if (MBB.empty()) return false;
1784 
1785   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1786   if (MBBI != MBB.begin() &&
1787       (MBBI->getOpcode() == ARM::BX_RET ||
1788        MBBI->getOpcode() == ARM::tBX_RET ||
1789        MBBI->getOpcode() == ARM::MOVPCLR)) {
1790     MachineInstr *PrevMI = std::prev(MBBI);
1791     unsigned Opcode = PrevMI->getOpcode();
1792     if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
1793         Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
1794         Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
1795       MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
1796       if (MO.getReg() != ARM::LR)
1797         return false;
1798       unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
1799       assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
1800               Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
1801       PrevMI->setDesc(TII->get(NewOpc));
1802       MO.setReg(ARM::PC);
1803       PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
1804       MBB.erase(MBBI);
1805       return true;
1806     }
1807   }
1808   return false;
1809 }
1810 
1811 bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1812   MF = &Fn;
1813   STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
1814   TL = STI->getTargetLowering();
1815   AFI = Fn.getInfo<ARMFunctionInfo>();
1816   TII = STI->getInstrInfo();
1817   TRI = STI->getRegisterInfo();
1818   MRI = &Fn.getRegInfo();
1819   RegClassInfoValid = false;
1820   isThumb2 = AFI->isThumb2Function();
1821   isThumb1 = AFI->isThumbFunction() && !isThumb2;
1822 
1823   bool Modified = false;
1824   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1825        ++MFI) {
1826     MachineBasicBlock &MBB = *MFI;
1827     Modified |= LoadStoreMultipleOpti(MBB);
1828     if (STI->hasV5TOps())
1829       Modified |= MergeReturnIntoLDM(MBB);
1830   }
1831 
1832   Allocator.Reset();
1833   return Modified;
1834 }
1835 
1836 namespace {
1837   /// Pre- register allocation pass that moves loads / stores from consecutive
1838   /// locations closer together so they are more likely to be combined later.
1839   struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
1840     static char ID;
1841     ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
1842 
1843     const DataLayout *TD;
1844     const TargetInstrInfo *TII;
1845     const TargetRegisterInfo *TRI;
1846     const ARMSubtarget *STI;
1847     MachineRegisterInfo *MRI;
1848     MachineFunction *MF;
1849 
1850     bool runOnMachineFunction(MachineFunction &Fn) override;
1851 
1852     const char *getPassName() const override {
1853       return "ARM pre- register allocation load / store optimization pass";
1854     }
1855 
1856   private:
1857     bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
1858                           unsigned &NewOpc, unsigned &FirstReg,
1859                           unsigned &SecondReg, unsigned &BaseReg,
1860                           int &Offset,
1861                           unsigned &PredReg, ARMCC::CondCodes &Pred,
1862                           bool &isT2);
1863     bool RescheduleOps(MachineBasicBlock *MBB,
1864                        SmallVectorImpl<MachineInstr *> &Ops,
1865                        unsigned Base, bool isLd,
1866                        DenseMap<MachineInstr*, unsigned> &MI2LocMap);
1867     bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
1868   };
1869   char ARMPreAllocLoadStoreOpt::ID = 0;
1870 }
1871 
1872 bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1873   TD = Fn.getTarget().getDataLayout();
1874   STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
1875   TII = STI->getInstrInfo();
1876   TRI = STI->getRegisterInfo();
1877   MRI = &Fn.getRegInfo();
1878   MF  = &Fn;
1879 
1880   bool Modified = false;
1881   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1882        ++MFI)
1883     Modified |= RescheduleLoadStoreInstrs(MFI);
1884 
1885   return Modified;
1886 }
1887 
1888 static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
1889                                       MachineBasicBlock::iterator I,
1890                                       MachineBasicBlock::iterator E,
1891                                       SmallPtrSetImpl<MachineInstr*> &MemOps,
1892                                       SmallSet<unsigned, 4> &MemRegs,
1893                                       const TargetRegisterInfo *TRI) {
1894   // Are there stores / loads / calls between them?
1895   // FIXME: This is overly conservative. We should make use of alias information
1896   // some day.
1897   SmallSet<unsigned, 4> AddedRegPressure;
1898   while (++I != E) {
1899     if (I->isDebugValue() || MemOps.count(&*I))
1900       continue;
1901     if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
1902       return false;
1903     if (isLd && I->mayStore())
1904       return false;
1905     if (!isLd) {
1906       if (I->mayLoad())
1907         return false;
1908       // It's not safe to move the first 'str' down.
1909       // str r1, [r0]
1910       // strh r5, [r0]
1911       // str r4, [r0, #+4]
1912       if (I->mayStore())
1913         return false;
1914     }
1915     for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
1916       MachineOperand &MO = I->getOperand(j);
1917       if (!MO.isReg())
1918         continue;
1919       unsigned Reg = MO.getReg();
1920       if (MO.isDef() && TRI->regsOverlap(Reg, Base))
1921         return false;
1922       if (Reg != Base && !MemRegs.count(Reg))
1923         AddedRegPressure.insert(Reg);
1924     }
1925   }
1926 
1927   // Estimate register pressure increase due to the transformation.
1928   if (MemRegs.size() <= 4)
1929     // Ok if we are moving a small number of instructions.
1930     return true;
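       // Otherwise, as a heuristic, only move when the crossed-over region
       // references at most twice as many other registers as we transfer.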
1931   return AddedRegPressure.size() <= MemRegs.size() * 2;
1932 }
1933 
1935 /// Concatenate the memory operands of \p Op0 and \p Op1 onto \p MI.
1936 static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
1937                                    MachineInstr *Op1) {
1938   assert(MI->memoperands_empty() && "expected a new machineinstr");
1939   size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
1940     + (Op1->memoperands_end() - Op1->memoperands_begin());
1941 
1942   MachineFunction *MF = MI->getParent()->getParent();
1943   MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
1944   MachineSDNode::mmo_iterator MemEnd =
1945     std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
1946   MemEnd =
1947     std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
1948   MI->setMemRefs(MemBegin, MemEnd);
1949 }
1950 
1951 bool
1952 ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
1953                                           DebugLoc &dl, unsigned &NewOpc,
1954                                           unsigned &FirstReg,
1955                                           unsigned &SecondReg,
1956                                           unsigned &BaseReg, int &Offset,
1957                                           unsigned &PredReg,
1958                                           ARMCC::CondCodes &Pred,
1959                                           bool &isT2) {
1960   // Make sure we're allowed to generate LDRD/STRD.
1961   if (!STI->hasV5TEOps())
1962     return false;
1963 
1964   // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
1965   unsigned Scale = 1;
1966   unsigned Opcode = Op0->getOpcode();
1967   if (Opcode == ARM::LDRi12) {
1968     NewOpc = ARM::LDRD;
1969   } else if (Opcode == ARM::STRi12) {
1970     NewOpc = ARM::STRD;
1971   } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
1972     NewOpc = ARM::t2LDRDi8;
1973     Scale = 4;
1974     isT2 = true;
1975   } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
1976     NewOpc = ARM::t2STRDi8;
1977     Scale = 4;
1978     isT2 = true;
1979   } else {
1980     return false;
1981   }
1982 
1983   // Make sure the base address satisfies i64 ld / st alignment requirement.
1984   // At the moment, we ignore the memory operand's value.
1985   // If we want to use AliasAnalysis, we should check it accordingly.
1986   if (!Op0->hasOneMemOperand() ||
1987       (*Op0->memoperands_begin())->isVolatile())
1988     return false;
1989 
1990   unsigned Align = (*Op0->memoperands_begin())->getAlignment();
1991   const Function *Func = MF->getFunction();
1992   unsigned ReqAlign = STI->hasV6Ops()
1993     ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
1994     : 8;  // Pre-v6 needs 8-byte alignment.
1995   if (Align < ReqAlign)
1996     return false;
1997 
1998   // Then make sure the immediate offset fits.
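       // (t2LDRDi8 / t2STRDi8 take an 8-bit immediate scaled by 4, so offsets
       // must be multiples of 4 within roughly +/-1020; ARM LDRD / STRD use
       // addrmode3, an 8-bit immediate with a separate add/sub flag, +/-255.)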
1999   int OffImm = getMemoryOpOffset(Op0);
2000   if (isT2) {
2001     int Limit = (1 << 8) * Scale;
2002     if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
2003       return false;
2004     Offset = OffImm;
2005   } else {
2006     ARM_AM::AddrOpc AddSub = ARM_AM::add;
2007     if (OffImm < 0) {
2008       AddSub = ARM_AM::sub;
2009       OffImm = -OffImm;
2010     }
2011     int Limit = (1 << 8) * Scale;
2012     if (OffImm >= Limit || (OffImm & (Scale-1)))
2013       return false;
2014     Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
2015   }
2016   FirstReg = Op0->getOperand(0).getReg();
2017   SecondReg = Op1->getOperand(0).getReg();
2018   if (FirstReg == SecondReg)
2019     return false;
2020   BaseReg = Op0->getOperand(1).getReg();
2021   Pred = getInstrPredicate(Op0, PredReg);
2022   dl = Op0->getDebugLoc();
2023   return true;
2024 }
2025 
2026 bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
2027                                  SmallVectorImpl<MachineInstr *> &Ops,
2028                                  unsigned Base, bool isLd,
2029                                  DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
2030   bool RetVal = false;
2031 
2032   // Sort by offset (in reverse order).
2033   std::sort(Ops.begin(), Ops.end(),
2034             [](const MachineInstr *LHS, const MachineInstr *RHS) {
2035     int LOffset = getMemoryOpOffset(LHS);
2036     int ROffset = getMemoryOpOffset(RHS);
2037     assert(LHS == RHS || LOffset != ROffset);
2038     return LOffset > ROffset;
2039   });
2040 
2041   // The loads / stores of the same base are in order. Scan them from first to
2042   // last and check for the following:
2043   // 1. Any def of base.
2044   // 2. Any gaps.
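       // e.g. word accesses at [r0], [r0, #4], [r0, #8] form a single run,
       // while [r0] followed by [r0, #8] leaves a hole and ends the run.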
2045   while (Ops.size() > 1) {
2046     unsigned FirstLoc = ~0U;
2047     unsigned LastLoc = 0;
2048     MachineInstr *FirstOp = nullptr;
2049     MachineInstr *LastOp = nullptr;
2050     int LastOffset = 0;
2051     unsigned LastOpcode = 0;
2052     unsigned LastBytes = 0;
2053     unsigned NumMove = 0;
2054     for (int i = Ops.size() - 1; i >= 0; --i) {
2055       MachineInstr *Op = Ops[i];
2056       unsigned Loc = MI2LocMap[Op];
2057       if (Loc <= FirstLoc) {
2058         FirstLoc = Loc;
2059         FirstOp = Op;
2060       }
2061       if (Loc >= LastLoc) {
2062         LastLoc = Loc;
2063         LastOp = Op;
2064       }
2065 
2066       unsigned LSMOpcode
2067         = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
2068       if (LastOpcode && LSMOpcode != LastOpcode)
2069         break;
2070 
2071       int Offset = getMemoryOpOffset(Op);
2072       unsigned Bytes = getLSMultipleTransferSize(Op);
2073       if (LastBytes) {
2074         if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
2075           break;
2076       }
2077       LastOffset = Offset;
2078       LastBytes = Bytes;
2079       LastOpcode = LSMOpcode;
2080       if (++NumMove == 8) // FIXME: Tune this limit.
2081         break;
2082     }
2083 
2084     if (NumMove <= 1)
2085       Ops.pop_back();
2086     else {
2087       SmallPtrSet<MachineInstr*, 4> MemOps;
2088       SmallSet<unsigned, 4> MemRegs;
2089       for (int i = NumMove-1; i >= 0; --i) {
2090         MemOps.insert(Ops[i]);
2091         MemRegs.insert(Ops[i]->getOperand(0).getReg());
2092       }
2093 
2094       // Be conservative: if the instructions are too far apart, don't
2095       // move them. We want to limit the increase in register pressure.
2096       bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
2097       if (DoMove)
2098         DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
2099                                            MemOps, MemRegs, TRI);
2100       if (!DoMove) {
2101         for (unsigned i = 0; i != NumMove; ++i)
2102           Ops.pop_back();
2103       } else {
2104         // This is the new location for the loads / stores.
2105         MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
2106         while (InsertPos != MBB->end()
2107                && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
2108           ++InsertPos;
2109 
2110         // If we are moving a pair of loads / stores, see if it makes sense
2111         // to try to allocate registers that can form a legal register pair.
2112         MachineInstr *Op0 = Ops.back();
2113         MachineInstr *Op1 = Ops[Ops.size()-2];
2114         unsigned FirstReg = 0, SecondReg = 0;
2115         unsigned BaseReg = 0, PredReg = 0;
2116         ARMCC::CondCodes Pred = ARMCC::AL;
2117         bool isT2 = false;
2118         unsigned NewOpc = 0;
2119         int Offset = 0;
2120         DebugLoc dl;
2121         if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
2122                                              FirstReg, SecondReg, BaseReg,
2123                                              Offset, PredReg, Pred, isT2)) {
2124           Ops.pop_back();
2125           Ops.pop_back();
2126 
2127           const MCInstrDesc &MCID = TII->get(NewOpc);
2128           const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
2129           MRI->constrainRegClass(FirstReg, TRC);
2130           MRI->constrainRegClass(SecondReg, TRC);
2131 
2132           // Form the pair instruction.
2133           if (isLd) {
2134             MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
2135               .addReg(FirstReg, RegState::Define)
2136               .addReg(SecondReg, RegState::Define)
2137               .addReg(BaseReg);
2138             // FIXME: We're converting from LDRi12 to an insn that still
2139             // uses addrmode2, so we need an explicit offset reg. It should
2140             // always be reg0 since we're transforming LDRi12s.
2141             if (!isT2)
2142               MIB.addReg(0);
2143             MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
2144             concatenateMemOperands(MIB, Op0, Op1);
2145             DEBUG(dbgs() << "Formed " << *MIB << "\n");
2146             ++NumLDRDFormed;
2147           } else {
2148             MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
2149               .addReg(FirstReg)
2150               .addReg(SecondReg)
2151               .addReg(BaseReg);
2152             // FIXME: We're converting from STRi12 to an insn that still
2153             // uses addrmode2, so we need an explicit offset reg. It should
2154             // always be reg0 since we're transforming STRi12s.
2155             if (!isT2)
2156               MIB.addReg(0);
2157             MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
2158             concatenateMemOperands(MIB, Op0, Op1);
2159             DEBUG(dbgs() << "Formed " << *MIB << "\n");
2160             ++NumSTRDFormed;
2161           }
2162           MBB->erase(Op0);
2163           MBB->erase(Op1);
2164 
2165           if (!isT2) {
2166             // Add register allocation hints to form register pairs.
2167             MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg);
2168             MRI->setRegAllocationHint(SecondReg,  ARMRI::RegPairOdd, FirstReg);
2169           }
2170         } else {
2171           for (unsigned i = 0; i != NumMove; ++i) {
2172             MachineInstr *Op = Ops.back();
2173             Ops.pop_back();
2174             MBB->splice(InsertPos, MBB, Op);
2175           }
2176         }
2177 
2178         NumLdStMoved += NumMove;
2179         RetVal = true;
2180       }
2181     }
2182   }
2183 
2184   return RetVal;
2185 }
2186 
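     /// Scan a basic block, grouping loads / stores by base register until a
     /// barrier (call / terminator) or a repeated base+offset pair is seen, then
     /// try to reschedule each group via RescheduleOps.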
2187 bool
2188 ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
2189   bool RetVal = false;
2190 
2191   DenseMap<MachineInstr*, unsigned> MI2LocMap;
2192   DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
2193   DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
2194   SmallVector<unsigned, 4> LdBases;
2195   SmallVector<unsigned, 4> StBases;
2196 
2197   unsigned Loc = 0;
2198   MachineBasicBlock::iterator MBBI = MBB->begin();
2199   MachineBasicBlock::iterator E = MBB->end();
2200   while (MBBI != E) {
2201     for (; MBBI != E; ++MBBI) {
2202       MachineInstr *MI = MBBI;
2203       if (MI->isCall() || MI->isTerminator()) {
2204         // Stop at barriers.
2205         ++MBBI;
2206         break;
2207       }
2208 
2209       if (!MI->isDebugValue())
2210         MI2LocMap[MI] = ++Loc;
2211 
2212       if (!isMemoryOp(MI))
2213         continue;
2214       unsigned PredReg = 0;
2215       if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
2216         continue;
2217 
2218       int Opc = MI->getOpcode();
2219       bool isLd = isLoadSingle(Opc);
2220       unsigned Base = MI->getOperand(1).getReg();
2221       int Offset = getMemoryOpOffset(MI);
2222 
2223       bool StopHere = false;
2224       if (isLd) {
2225         DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
2226           Base2LdsMap.find(Base);
2227         if (BI != Base2LdsMap.end()) {
2228           for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
2229             if (Offset == getMemoryOpOffset(BI->second[i])) {
2230               StopHere = true;
2231               break;
2232             }
2233           }
2234           if (!StopHere)
2235             BI->second.push_back(MI);
2236         } else {
2237           Base2LdsMap[Base].push_back(MI);
2238           LdBases.push_back(Base);
2239         }
2240       } else {
2241         DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
2242           Base2StsMap.find(Base);
2243         if (BI != Base2StsMap.end()) {
2244           for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
2245             if (Offset == getMemoryOpOffset(BI->second[i])) {
2246               StopHere = true;
2247               break;
2248             }
2249           }
2250           if (!StopHere)
2251             BI->second.push_back(MI);
2252         } else {
2253           Base2StsMap[Base].push_back(MI);
2254           StBases.push_back(Base);
2255         }
2256       }
2257 
2258       if (StopHere) {
2259         // Found a duplicate (a base+offset combination that was seen
2260         // earlier); backtrack.
2261         --Loc;
2262         break;
2263       }
2264     }
2265 
2266     // Re-schedule loads.
2267     for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
2268       unsigned Base = LdBases[i];
2269       SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
2270       if (Lds.size() > 1)
2271         RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
2272     }
2273 
2274     // Re-schedule stores.
2275     for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
2276       unsigned Base = StBases[i];
2277       SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
2278       if (Sts.size() > 1)
2279         RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
2280     }
2281 
2282     if (MBBI != E) {
2283       Base2LdsMap.clear();
2284       Base2StsMap.clear();
2285       LdBases.clear();
2286       StBases.clear();
2287     }
2288   }
2289 
2290   return RetVal;
2291 }
2292 
2294 /// Returns an instance of the load / store optimization pass.
2295 FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
2296   if (PreAlloc)
2297     return new ARMPreAllocLoadStoreOpt();
2298   return new ARMLoadStoreOpt();
2299 }
2300