xref: /llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp (revision 9c91d79dadc660cb6a0ec736389341debd8cd118)
1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10 /// instructions into machine operations.
11 /// The expectation is that the loop contains three pseudo instructions:
12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13 ///   form should be in the preheader, whereas the while form should be in the
14 ///   preheaders only predecessor.
15 /// - t2LoopDec - placed within in the loop body.
16 /// - t2LoopEnd - the loop latch terminator.
17 ///
18 /// In addition to this, we also look for the presence of the VCTP instruction,
19 /// which determines whether we can generated the tail-predicated low-overhead
20 /// loop form.
21 ///
22 /// Assumptions and Dependencies:
23 /// Low-overhead loops are constructed and executed using a setup instruction:
24 /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25 /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26 /// but fixed polarity: WLS can only branch forwards and LE can only branch
27 /// backwards. These restrictions mean that this pass is dependent upon block
28 /// layout and block sizes, which is why it's the last pass to run. The same is
29 /// true for ConstantIslands, but this pass does not increase the size of the
30 /// basic blocks, nor does it change the CFG. Instructions are mainly removed
31 /// during the transform and pseudo instructions are replaced by real ones. In
32 /// some cases, when we have to revert to a 'normal' loop, we have to introduce
33 /// multiple instructions for a single pseudo (see RevertWhile and
34 /// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd
35 /// are defined to be as large as this maximum sequence of replacement
36 /// instructions.
37 ///
38 //===----------------------------------------------------------------------===//
39 
40 #include "ARM.h"
41 #include "ARMBaseInstrInfo.h"
42 #include "ARMBaseRegisterInfo.h"
43 #include "ARMBasicBlockInfo.h"
44 #include "ARMSubtarget.h"
45 #include "llvm/ADT/SetOperations.h"
46 #include "llvm/ADT/SmallSet.h"
47 #include "llvm/CodeGen/MachineFunctionPass.h"
48 #include "llvm/CodeGen/MachineLoopInfo.h"
49 #include "llvm/CodeGen/MachineLoopUtils.h"
50 #include "llvm/CodeGen/MachineRegisterInfo.h"
51 #include "llvm/CodeGen/Passes.h"
52 #include "llvm/CodeGen/ReachingDefAnalysis.h"
53 #include "llvm/MC/MCInstrDesc.h"
54 
55 using namespace llvm;
56 
57 #define DEBUG_TYPE "arm-low-overhead-loops"
58 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
59 
60 namespace {
61 
62   struct PredicatedMI {
63     MachineInstr *MI = nullptr;
64     SetVector<MachineInstr*> Predicates;
65 
66   public:
67     PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) :
68     MI(I) {
69       Predicates.insert(Preds.begin(), Preds.end());
70     }
71   };
72 
73   // Represent a VPT block, a list of instructions that begins with a VPST and
74   // has a maximum of four proceeding instructions. All instructions within the
75   // block are predicated upon the vpr and we allow instructions to define the
76   // vpr within in the block too.
77   class VPTBlock {
78     std::unique_ptr<PredicatedMI> VPST;
79     PredicatedMI *Divergent = nullptr;
80     SmallVector<PredicatedMI, 4> Insts;
81 
82   public:
83     VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
84       VPST = std::make_unique<PredicatedMI>(MI, Preds);
85     }
86 
87     void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
88       LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
89       if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) {
90         Divergent = &Insts.back();
91         LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
92       }
93       Insts.emplace_back(MI, Preds);
94       assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
95     }
96 
97     // Have we found an instruction within the block which defines the vpr? If
98     // so, not all the instructions in the block will have the same predicate.
99     bool HasNonUniformPredicate() const {
100       return Divergent != nullptr;
101     }
102 
103     // Is the given instruction part of the predicate set controlling the entry
104     // to the block.
105     bool IsPredicatedOn(MachineInstr *MI) const {
106       return VPST->Predicates.count(MI);
107     }
108 
109     // Is the given instruction the only predicate which controls the entry to
110     // the block.
111     bool IsOnlyPredicatedOn(MachineInstr *MI) const {
112       return IsPredicatedOn(MI) && VPST->Predicates.size() == 1;
113     }
114 
115     unsigned size() const { return Insts.size(); }
116     SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
117     MachineInstr *getVPST() const { return VPST->MI; }
118     PredicatedMI *getDivergent() const { return Divergent; }
119   };
120 
121   struct LowOverheadLoop {
122 
123     MachineLoop *ML = nullptr;
124     MachineFunction *MF = nullptr;
125     MachineInstr *InsertPt = nullptr;
126     MachineInstr *Start = nullptr;
127     MachineInstr *Dec = nullptr;
128     MachineInstr *End = nullptr;
129     MachineInstr *VCTP = nullptr;
130     VPTBlock *CurrentBlock = nullptr;
131     SetVector<MachineInstr*> CurrentPredicate;
132     SmallVector<VPTBlock, 4> VPTBlocks;
133     bool Revert = false;
134     bool CannotTailPredicate = false;
135 
136     LowOverheadLoop(MachineLoop *ML) : ML(ML) {
137       MF = ML->getHeader()->getParent();
138     }
139 
140     bool RecordVPTBlocks(MachineInstr *MI);
141 
142     // If this is an MVE instruction, check that we know how to use tail
143     // predication with it.
144     void AnalyseMVEInst(MachineInstr *MI) {
145       if (CannotTailPredicate)
146         return;
147 
148       if (!RecordVPTBlocks(MI)) {
149         CannotTailPredicate = true;
150         return;
151       }
152 
153       const MCInstrDesc &MCID = MI->getDesc();
154       uint64_t Flags = MCID.TSFlags;
155       if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
156         return;
157 
158       if ((Flags & ARMII::ValidForTailPredication) == 0) {
159         LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
160         CannotTailPredicate = true;
161       }
162     }
163 
164     bool IsTailPredicationLegal() const {
165       // For now, let's keep things really simple and only support a single
166       // block for tail predication.
167       return !Revert && FoundAllComponents() && VCTP &&
168              !CannotTailPredicate && ML->getNumBlocks() == 1;
169     }
170 
171     bool ValidateTailPredicate(MachineInstr *StartInsertPt,
172                                ReachingDefAnalysis *RDA,
173                                MachineLoopInfo *MLI);
174 
175     // Is it safe to define LR with DLS/WLS?
176     // LR can be defined if it is the operand to start, because it's the same
177     // value, or if it's going to be equivalent to the operand to Start.
178     MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA);
179 
180     // Check the branch targets are within range and we satisfy our
181     // restrictions.
182     void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA,
183                        MachineLoopInfo *MLI);
184 
185     bool FoundAllComponents() const {
186       return Start && Dec && End;
187     }
188 
189     SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
190 
191     // Return the loop iteration count, or the number of elements if we're tail
192     // predicating.
193     MachineOperand &getCount() {
194       return IsTailPredicationLegal() ?
195         VCTP->getOperand(1) : Start->getOperand(0);
196     }
197 
198     unsigned getStartOpcode() const {
199       bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
200       if (!IsTailPredicationLegal())
201         return IsDo ? ARM::t2DLS : ARM::t2WLS;
202 
203       return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo);
204     }
205 
206     void dump() const {
207       if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
208       if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
209       if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
210       if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP;
211       if (!FoundAllComponents())
212         dbgs() << "ARM Loops: Not a low-overhead loop.\n";
213       else if (!(Start && Dec && End))
214         dbgs() << "ARM Loops: Failed to find all loop components.\n";
215     }
216   };
217 
218   class ARMLowOverheadLoops : public MachineFunctionPass {
219     MachineFunction           *MF = nullptr;
220     MachineLoopInfo           *MLI = nullptr;
221     ReachingDefAnalysis       *RDA = nullptr;
222     const ARMBaseInstrInfo    *TII = nullptr;
223     MachineRegisterInfo       *MRI = nullptr;
224     const TargetRegisterInfo  *TRI = nullptr;
225     std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
226 
227   public:
228     static char ID;
229 
230     ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
231 
232     void getAnalysisUsage(AnalysisUsage &AU) const override {
233       AU.setPreservesCFG();
234       AU.addRequired<MachineLoopInfo>();
235       AU.addRequired<ReachingDefAnalysis>();
236       MachineFunctionPass::getAnalysisUsage(AU);
237     }
238 
239     bool runOnMachineFunction(MachineFunction &MF) override;
240 
241     MachineFunctionProperties getRequiredProperties() const override {
242       return MachineFunctionProperties().set(
243           MachineFunctionProperties::Property::NoVRegs).set(
244           MachineFunctionProperties::Property::TracksLiveness);
245     }
246 
247     StringRef getPassName() const override {
248       return ARM_LOW_OVERHEAD_LOOPS_NAME;
249     }
250 
251   private:
252     bool ProcessLoop(MachineLoop *ML);
253 
254     bool RevertNonLoops();
255 
256     void RevertWhile(MachineInstr *MI) const;
257 
258     bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const;
259 
260     void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
261 
262     void RemoveLoopUpdate(LowOverheadLoop &LoLoop);
263 
264     void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
265 
266     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
267 
268     void Expand(LowOverheadLoop &LoLoop);
269 
270   };
271 }
272 
273 char ARMLowOverheadLoops::ID = 0;
274 
275 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
276                 false, false)
277 
278 MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
279   // We can define LR because LR already contains the same value.
280   if (Start->getOperand(0).getReg() == ARM::LR)
281     return Start;
282 
283   unsigned CountReg = Start->getOperand(0).getReg();
284   auto IsMoveLR = [&CountReg](MachineInstr *MI) {
285     return MI->getOpcode() == ARM::tMOVr &&
286            MI->getOperand(0).getReg() == ARM::LR &&
287            MI->getOperand(1).getReg() == CountReg &&
288            MI->getOperand(2).getImm() == ARMCC::AL;
289    };
290 
291   MachineBasicBlock *MBB = Start->getParent();
292 
293   // Find an insertion point:
294   // - Is there a (mov lr, Count) before Start? If so, and nothing else writes
295   //   to Count before Start, we can insert at that mov.
296   if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR))
297     if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
298       return LRDef;
299 
300   // - Is there a (mov lr, Count) after Start? If so, and nothing else writes
301   //   to Count after Start, we can insert at that mov.
302   if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR))
303     if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
304       return LRDef;
305 
306   // We've found no suitable LR def and Start doesn't use LR directly. Can we
307   // just define LR anyway?
308   if (!RDA->isRegUsedAfter(Start, ARM::LR))
309     return Start;
310 
311   return nullptr;
312 }
313 
314 // Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must
315 // not define a register that is used by any instructions, after and including,
316 // 'To'. These instructions also must not redefine any of Froms operands.
317 template<typename Iterator>
318 static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) {
319   SmallSet<int, 2> Defs;
320   // First check that From would compute the same value if moved.
321   for (auto &MO : From->operands()) {
322     if (!MO.isReg() || MO.isUndef() || !MO.getReg())
323       continue;
324     if (MO.isDef())
325       Defs.insert(MO.getReg());
326     else if (!RDA->hasSameReachingDef(From, To, MO.getReg()))
327       return false;
328   }
329 
330   // Now walk checking that the rest of the instructions will compute the same
331   // value.
332   for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) {
333     for (auto &MO : I->operands())
334       if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg()))
335         return false;
336   }
337   return true;
338 }
339 
340 bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
341 		                            ReachingDefAnalysis *RDA,
342 		                            MachineLoopInfo *MLI) {
343   // All predication within the loop should be based on vctp. If the block
344   // isn't predicated on entry, check whether the vctp is within the block
345   // and that all other instructions are then predicated on it.
346   for (auto &Block : VPTBlocks) {
347     if (Block.IsPredicatedOn(VCTP))
348       continue;
349     if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI))
350       return false;
351     SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
352     for (auto &PredMI : Insts) {
353       if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
354         continue;
355       LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
356                         << " - which is predicated on:\n";
357                         for (auto *MI : PredMI.Predicates)
358                           dbgs() << "   - " << *MI;
359                  );
360       return false;
361     }
362   }
363 
364   // For tail predication, we need to provide the number of elements, instead
365   // of the iteration count, to the loop start instruction. The number of
366   // elements is provided to the vctp instruction, so we need to check that
367   // we can use this register at InsertPt.
368   Register NumElements = VCTP->getOperand(1).getReg();
369 
370   // If the register is defined within loop, then we can't perform TP.
371   // TODO: Check whether this is just a mov of a register that would be
372   // available.
373   if (RDA->getReachingDef(VCTP, NumElements) >= 0) {
374     LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
375     return false;
376   }
377 
378   // The element count register maybe defined after InsertPt, in which case we
379   // need to try to move either InsertPt or the def so that the [w|d]lstp can
380   // use the value.
381   MachineBasicBlock *InsertBB = InsertPt->getParent();
382   if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) {
383     if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) {
384       if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) {
385         ElemDef->removeFromParent();
386         InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef);
387         LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
388                    << *ElemDef);
389       } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) {
390         InsertPt->removeFromParent();
391         InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt);
392         LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
393       } else
394         return false;
395     }
396   }
397 
398   // Especially in the case of while loops, InsertBB may not be the
399   // preheader, so we need to check that the register isn't redefined
400   // before entering the loop.
401   auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB,
402                                       Register NumElements) {
403     // NumElements is redefined in this block.
404     if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0)
405       return true;
406 
407     // Don't continue searching up through multiple predecessors.
408     if (MBB->pred_size() > 1)
409       return true;
410 
411     return false;
412   };
413 
414   // First, find the block that looks like the preheader.
415   MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true);
416   if (!MBB)
417     return false;
418 
419   // Then search backwards for a def, until we get to InsertBB.
420   while (MBB != InsertBB) {
421     if (CannotProvideElements(MBB, NumElements))
422       return false;
423     MBB = *MBB->pred_begin();
424   }
425 
426   LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n");
427   return true;
428 }
429 
430 void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
431                                     ReachingDefAnalysis *RDA,
432                                     MachineLoopInfo *MLI) {
433   if (Revert)
434     return;
435 
436   if (!End->getOperand(1).isMBB())
437     report_fatal_error("Expected LoopEnd to target basic block");
438 
439   // TODO Maybe there's cases where the target doesn't have to be the header,
440   // but for now be safe and revert.
441   if (End->getOperand(1).getMBB() != ML->getHeader()) {
442     LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n");
443     Revert = true;
444     return;
445   }
446 
447   // The WLS and LE instructions have 12-bits for the label offset. WLS
448   // requires a positive offset, while LE uses negative.
449   if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
450       !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
451     LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
452     Revert = true;
453     return;
454   }
455 
456   if (Start->getOpcode() == ARM::t2WhileLoopStart &&
457       (BBUtils->getOffsetOf(Start) >
458        BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
459        !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
460     LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
461     Revert = true;
462     return;
463   }
464 
465   InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA);
466   if (!InsertPt) {
467     LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
468     Revert = true;
469     return;
470   } else
471     LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
472 
473   if (!IsTailPredicationLegal()) {
474     LLVM_DEBUG(dbgs() << "ARM Loops: Tail-predication is not valid.\n");
475     return;
476   }
477 
478   assert(ML->getBlocks().size() == 1 &&
479          "Shouldn't be processing a loop with more than one block");
480   CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI);
481   LLVM_DEBUG(if (CannotTailPredicate)
482              dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
483 }
484 
485 bool LowOverheadLoop::RecordVPTBlocks(MachineInstr* MI) {
486   // Only support a single vctp.
487   if (isVCTP(MI) && VCTP)
488     return false;
489 
490   // Start a new vpt block when we discover a vpt.
491   if (MI->getOpcode() == ARM::MVE_VPST) {
492     VPTBlocks.emplace_back(MI, CurrentPredicate);
493     CurrentBlock = &VPTBlocks.back();
494     return true;
495   }
496 
497   if (isVCTP(MI))
498     VCTP = MI;
499 
500   unsigned VPROpNum = MI->getNumOperands() - 1;
501   bool IsUse = false;
502   if (MI->getOperand(VPROpNum).isReg() &&
503       MI->getOperand(VPROpNum).getReg() == ARM::VPR &&
504       MI->getOperand(VPROpNum).isUse()) {
505     // If this instruction is predicated by VPR, it will be its last
506     // operand.  Also check that it's only 'Then' predicated.
507     if (!MI->getOperand(VPROpNum-1).isImm() ||
508         MI->getOperand(VPROpNum-1).getImm() != ARMVCC::Then) {
509       LLVM_DEBUG(dbgs() << "ARM Loops: Found unhandled predicate on: "
510                  << *MI);
511       return false;
512     }
513     CurrentBlock->addInst(MI, CurrentPredicate);
514     IsUse = true;
515   }
516 
517   bool IsDef = false;
518   for (unsigned i = 0; i < MI->getNumOperands() - 1; ++i) {
519     const MachineOperand &MO = MI->getOperand(i);
520     if (!MO.isReg() || MO.getReg() != ARM::VPR)
521       continue;
522 
523     if (MO.isDef()) {
524       CurrentPredicate.insert(MI);
525       IsDef = true;
526     } else {
527       LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
528       return false;
529     }
530   }
531 
532   // If we find a vpr def that is not already predicated on the vctp, we've
533   // got disjoint predicates that may not be equivalent when we do the
534   // conversion.
535   if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
536     LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
537     return false;
538   }
539 
540   return true;
541 }
542 
543 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
544   const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
545   if (!ST.hasLOB())
546     return false;
547 
548   MF = &mf;
549   LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
550 
551   MLI = &getAnalysis<MachineLoopInfo>();
552   RDA = &getAnalysis<ReachingDefAnalysis>();
553   MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
554   MRI = &MF->getRegInfo();
555   TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
556   TRI = ST.getRegisterInfo();
557   BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
558   BBUtils->computeAllBlockSizes();
559   BBUtils->adjustBBOffsetsAfter(&MF->front());
560 
561   bool Changed = false;
562   for (auto ML : *MLI) {
563     if (!ML->getParentLoop())
564       Changed |= ProcessLoop(ML);
565   }
566   Changed |= RevertNonLoops();
567   return Changed;
568 }
569 
570 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
571 
572   bool Changed = false;
573 
574   // Process inner loops first.
575   for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
576     Changed |= ProcessLoop(*I);
577 
578   LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n";
579              if (auto *Preheader = ML->getLoopPreheader())
580                dbgs() << " - " << Preheader->getName() << "\n";
581              else if (auto *Preheader = MLI->findLoopPreheader(ML))
582                dbgs() << " - " << Preheader->getName() << "\n";
583              for (auto *MBB : ML->getBlocks())
584                dbgs() << " - " << MBB->getName() << "\n";
585             );
586 
587   // Search the given block for a loop start instruction. If one isn't found,
588   // and there's only one predecessor block, search that one too.
589   std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
590     [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
591     for (auto &MI : *MBB) {
592       if (isLoopStart(MI))
593         return &MI;
594     }
595     if (MBB->pred_size() == 1)
596       return SearchForStart(*MBB->pred_begin());
597     return nullptr;
598   };
599 
600   LowOverheadLoop LoLoop(ML);
601   // Search the preheader for the start intrinsic.
602   // FIXME: I don't see why we shouldn't be supporting multiple predecessors
603   // with potentially multiple set.loop.iterations, so we need to enable this.
604   if (auto *Preheader = ML->getLoopPreheader())
605     LoLoop.Start = SearchForStart(Preheader);
606   else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
607     LoLoop.Start = SearchForStart(Preheader);
608   else
609     return false;
610 
611   // Find the low-overhead loop components and decide whether or not to fall
612   // back to a normal loop. Also look for a vctp instructions and decide
613   // whether we can convert that predicate using tail predication.
614   for (auto *MBB : reverse(ML->getBlocks())) {
615     for (auto &MI : *MBB) {
616       if (MI.getOpcode() == ARM::t2LoopDec)
617         LoLoop.Dec = &MI;
618       else if (MI.getOpcode() == ARM::t2LoopEnd)
619         LoLoop.End = &MI;
620       else if (isLoopStart(MI))
621         LoLoop.Start = &MI;
622       else if (MI.getDesc().isCall()) {
623         // TODO: Though the call will require LE to execute again, does this
624         // mean we should revert? Always executing LE hopefully should be
625         // faster than performing a sub,cmp,br or even subs,br.
626         LoLoop.Revert = true;
627         LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
628       } else {
629         // Record VPR defs and build up their corresponding vpt blocks.
630         // Check we know how to tail predicate any mve instructions.
631         LoLoop.AnalyseMVEInst(&MI);
632       }
633 
634       // We need to ensure that LR is not used or defined inbetween LoopDec and
635       // LoopEnd.
636       if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert)
637         continue;
638 
639       // If we find that LR has been written or read between LoopDec and
640       // LoopEnd, expect that the decremented value is being used else where.
641       // Because this value isn't actually going to be produced until the
642       // latch, by LE, we would need to generate a real sub. The value is also
643       // likely to be copied/reloaded for use of LoopEnd - in which in case
644       // we'd need to perform an add because it gets subtracted again by LE!
645       // The other option is to then generate the other form of LE which doesn't
646       // perform the sub.
647       for (auto &MO : MI.operands()) {
648         if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
649             MO.getReg() == ARM::LR) {
650           LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
651           LoLoop.Revert = true;
652           break;
653         }
654       }
655     }
656   }
657 
658   LLVM_DEBUG(LoLoop.dump());
659   if (!LoLoop.FoundAllComponents())
660     return false;
661 
662   LoLoop.CheckLegality(BBUtils.get(), RDA, MLI);
663   Expand(LoLoop);
664   return true;
665 }
666 
667 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
668 // beq that branches to the exit branch.
669 // TODO: We could also try to generate a cbz if the value in LR is also in
670 // another low register.
671 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
672   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
673   MachineBasicBlock *MBB = MI->getParent();
674   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
675                                     TII->get(ARM::t2CMPri));
676   MIB.add(MI->getOperand(0));
677   MIB.addImm(0);
678   MIB.addImm(ARMCC::AL);
679   MIB.addReg(ARM::NoRegister);
680 
681   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
682   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
683     ARM::tBcc : ARM::t2Bcc;
684 
685   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
686   MIB.add(MI->getOperand(1));   // branch target
687   MIB.addImm(ARMCC::EQ);        // condition code
688   MIB.addReg(ARM::CPSR);
689   MI->eraseFromParent();
690 }
691 
692 bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
693                                         bool SetFlags) const {
694   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
695   MachineBasicBlock *MBB = MI->getParent();
696 
697   // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
698   if (SetFlags &&
699       (RDA->isRegUsedAfter(MI, ARM::CPSR) ||
700        !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR)))
701       SetFlags = false;
702 
703   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
704                                     TII->get(ARM::t2SUBri));
705   MIB.addDef(ARM::LR);
706   MIB.add(MI->getOperand(1));
707   MIB.add(MI->getOperand(2));
708   MIB.addImm(ARMCC::AL);
709   MIB.addReg(0);
710 
711   if (SetFlags) {
712     MIB.addReg(ARM::CPSR);
713     MIB->getOperand(5).setIsDef(true);
714   } else
715     MIB.addReg(0);
716 
717   MI->eraseFromParent();
718   return SetFlags;
719 }
720 
721 // Generate a subs, or sub and cmp, and a branch instead of an LE.
722 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
723   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
724 
725   MachineBasicBlock *MBB = MI->getParent();
726   // Create cmp
727   if (!SkipCmp) {
728     MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
729                                       TII->get(ARM::t2CMPri));
730     MIB.addReg(ARM::LR);
731     MIB.addImm(0);
732     MIB.addImm(ARMCC::AL);
733     MIB.addReg(ARM::NoRegister);
734   }
735 
736   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
737   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
738     ARM::tBcc : ARM::t2Bcc;
739 
740   // Create bne
741   MachineInstrBuilder MIB =
742     BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
743   MIB.add(MI->getOperand(1));   // branch target
744   MIB.addImm(ARMCC::NE);        // condition code
745   MIB.addReg(ARM::CPSR);
746   MI->eraseFromParent();
747 }
748 
749 MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
750   MachineInstr *InsertPt = LoLoop.InsertPt;
751   MachineInstr *Start = LoLoop.Start;
752   MachineBasicBlock *MBB = InsertPt->getParent();
753   bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
754   unsigned Opc = LoLoop.getStartOpcode();
755   MachineOperand &Count = LoLoop.getCount();
756 
757   MachineInstrBuilder MIB =
758     BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
759 
760   MIB.addDef(ARM::LR);
761   MIB.add(Count);
762   if (!IsDo)
763     MIB.add(Start->getOperand(1));
764 
765   // When using tail-predication, try to delete the dead code that was used to
766   // calculate the number of loop iterations.
767   if (LoLoop.IsTailPredicationLegal()) {
768     SmallVector<MachineInstr*, 4> Killed;
769     SmallVector<MachineInstr*, 4> Dead;
770     if (auto *Def = RDA->getReachingMIDef(Start,
771                                           Start->getOperand(0).getReg())) {
772       Killed.push_back(Def);
773 
774       while (!Killed.empty()) {
775         MachineInstr *Def = Killed.back();
776         Killed.pop_back();
777         Dead.push_back(Def);
778         for (auto &MO : Def->operands()) {
779           if (!MO.isReg() || !MO.isKill())
780             continue;
781 
782           MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg());
783           if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1)
784             Killed.push_back(Kill);
785         }
786       }
787       for (auto *MI : Dead)
788         MI->eraseFromParent();
789     }
790   }
791 
792   // If we're inserting at a mov lr, then remove it as it's redundant.
793   if (InsertPt != Start)
794     InsertPt->eraseFromParent();
795   Start->eraseFromParent();
796   LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
797   return &*MIB;
798 }
799 
800 // Goal is to optimise and clean-up these loops:
801 //
802 //   vector.body:
803 //     renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
804 //     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4
805 //     ..
806 //     $lr = MVE_DLSTP_32 renamable $r3
807 //
808 // The SUB is the old update of the loop iteration count expression, which
809 // is no longer needed. This sub is removed when the element count, which is in
810 // r3 in this example, is defined by an instruction in the loop, and it has
811 // no uses.
812 //
813 void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
814   Register ElemCount = LoLoop.VCTP->getOperand(1).getReg();
815   MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back();
816 
817   LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n");
818 
819   if (LoLoop.ML->getNumBlocks() != 1) {
820     LLVM_DEBUG(dbgs() << "ARM Loops: single block loop expected\n");
821     return;
822   }
823 
824   LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing MO: ";
825              LoLoop.VCTP->getOperand(1).dump());
826 
827   // Find the definition we are interested in removing, if there is one.
828   MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount);
829   if (!Def)
830     return;
831 
832   // Bail if we define CPSR and it is not dead
833   if (!Def->registerDefIsDead(ARM::CPSR, TRI)) {
834     LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n");
835     return;
836   }
837 
838   // Bail if elemcount is used in exit blocks, i.e. if it is live-in.
839   if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) {
840     LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n");
841     return;
842   }
843 
844   // Bail if there are uses after this Def in the block.
845   SmallVector<MachineInstr*, 4> Uses;
846   RDA->getReachingLocalUses(Def, ElemCount, Uses);
847   if (Uses.size()) {
848     LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n");
849     return;
850   }
851 
852   Uses.clear();
853   RDA->getAllInstWithUseBefore(Def, ElemCount, Uses);
854 
855   // Remove Def if there are no uses, or if the only use is the VCTP
856   // instruction.
857   if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) {
858     LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";
859                Def->dump());
860     Def->eraseFromParent();
861   }
862 }
863 
864 void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
865   auto RemovePredicate = [](MachineInstr *MI) {
866     LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
867     unsigned OpNum = MI->getNumOperands() - 1;
868     assert(MI->getOperand(OpNum-1).getImm() == ARMVCC::Then &&
869            "Expected Then predicate!");
870     MI->getOperand(OpNum-1).setImm(ARMVCC::None);
871     MI->getOperand(OpNum).setReg(0);
872   };
873 
874   // There are a few scenarios which we have to fix up:
875   // 1) A VPT block with is only predicated by the vctp and has no internal vpr
876   //    defs.
877   // 2) A VPT block which is only predicated by the vctp but has an internal
878   //    vpr def.
879   // 3) A VPT block which is predicated upon the vctp as well as another vpr
880   //    def.
881   // 4) A VPT block which is not predicated upon a vctp, but contains it and
882   //    all instructions within the block are predicated upon in.
883 
884   for (auto &Block : LoLoop.getVPTBlocks()) {
885     SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
886     if (Block.HasNonUniformPredicate()) {
887       PredicatedMI *Divergent = Block.getDivergent();
888       if (isVCTP(Divergent->MI)) {
889         // The vctp will be removed, so the size of the vpt block needs to be
890         // modified.
891         uint64_t Size = getARMVPTBlockMask(Block.size() - 1);
892         Block.getVPST()->getOperand(0).setImm(Size);
893         LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n");
894       } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
895         // The VPT block has a non-uniform predicate but it's entry is guarded
896         // only by a vctp, which means we:
897         // - Need to remove the original vpst.
898         // - Then need to unpredicate any following instructions, until
899         //   we come across the divergent vpr def.
900         // - Insert a new vpst to predicate the instruction(s) that following
901         //   the divergent vpr def.
902         // TODO: We could be producing more VPT blocks than necessary and could
903         // fold the newly created one into a proceeding one.
904         for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()),
905              E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
906           RemovePredicate(&*I);
907 
908         unsigned Size = 0;
909         auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
910         auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
911         MachineInstr *InsertAt = nullptr;
912         while (I != E) {
913           InsertAt = &*I;
914           ++Size;
915           ++I;
916         }
917         MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
918                                           InsertAt->getDebugLoc(),
919                                           TII->get(ARM::MVE_VPST));
920         MIB.addImm(getARMVPTBlockMask(Size));
921         LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
922         LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
923         Block.getVPST()->eraseFromParent();
924       }
925     } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
926       // A vpt block which is only predicated upon vctp and has no internal vpr
927       // defs:
928       // - Remove vpst.
929       // - Unpredicate the remaining instructions.
930       LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
931       Block.getVPST()->eraseFromParent();
932       for (auto &PredMI : Insts)
933         RemovePredicate(PredMI.MI);
934     }
935   }
936 
937   LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
938   LoLoop.VCTP->eraseFromParent();
939 }
940 
941 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
942 
943   // Combine the LoopDec and LoopEnd instructions into LE(TP).
944   auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
945     MachineInstr *End = LoLoop.End;
946     MachineBasicBlock *MBB = End->getParent();
947     unsigned Opc = LoLoop.IsTailPredicationLegal() ?
948       ARM::MVE_LETP : ARM::t2LEUpdate;
949     MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
950                                       TII->get(Opc));
951     MIB.addDef(ARM::LR);
952     MIB.add(End->getOperand(0));
953     MIB.add(End->getOperand(1));
954     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
955 
956     LoLoop.End->eraseFromParent();
957     LoLoop.Dec->eraseFromParent();
958     return &*MIB;
959   };
960 
961   // TODO: We should be able to automatically remove these branches before we
962   // get here - probably by teaching analyzeBranch about the pseudo
963   // instructions.
964   // If there is an unconditional branch, after I, that just branches to the
965   // next block, remove it.
966   auto RemoveDeadBranch = [](MachineInstr *I) {
967     MachineBasicBlock *BB = I->getParent();
968     MachineInstr *Terminator = &BB->instr_back();
969     if (Terminator->isUnconditionalBranch() && I != Terminator) {
970       MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
971       if (BB->isLayoutSuccessor(Succ)) {
972         LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
973         Terminator->eraseFromParent();
974       }
975     }
976   };
977 
978   if (LoLoop.Revert) {
979     if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
980       RevertWhile(LoLoop.Start);
981     else
982       LoLoop.Start->eraseFromParent();
983     bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true);
984     RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
985   } else {
986     LoLoop.Start = ExpandLoopStart(LoLoop);
987     RemoveDeadBranch(LoLoop.Start);
988     LoLoop.End = ExpandLoopEnd(LoLoop);
989     RemoveDeadBranch(LoLoop.End);
990     if (LoLoop.IsTailPredicationLegal()) {
991       RemoveLoopUpdate(LoLoop);
992       ConvertVPTBlocks(LoLoop);
993     }
994   }
995 }
996 
997 bool ARMLowOverheadLoops::RevertNonLoops() {
998   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
999   bool Changed = false;
1000 
1001   for (auto &MBB : *MF) {
1002     SmallVector<MachineInstr*, 4> Starts;
1003     SmallVector<MachineInstr*, 4> Decs;
1004     SmallVector<MachineInstr*, 4> Ends;
1005 
1006     for (auto &I : MBB) {
1007       if (isLoopStart(I))
1008         Starts.push_back(&I);
1009       else if (I.getOpcode() == ARM::t2LoopDec)
1010         Decs.push_back(&I);
1011       else if (I.getOpcode() == ARM::t2LoopEnd)
1012         Ends.push_back(&I);
1013     }
1014 
1015     if (Starts.empty() && Decs.empty() && Ends.empty())
1016       continue;
1017 
1018     Changed = true;
1019 
1020     for (auto *Start : Starts) {
1021       if (Start->getOpcode() == ARM::t2WhileLoopStart)
1022         RevertWhile(Start);
1023       else
1024         Start->eraseFromParent();
1025     }
1026     for (auto *Dec : Decs)
1027       RevertLoopDec(Dec);
1028 
1029     for (auto *End : Ends)
1030       RevertLoopEnd(End);
1031   }
1032   return Changed;
1033 }
1034 
1035 FunctionPass *llvm::createARMLowOverheadLoopsPass() {
1036   return new ARMLowOverheadLoops();
1037 }
1038