xref: /llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp (revision 4569f63ae1cb520ce28f08f4800dfbcd5f255eed)
1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10 /// instructions into machine operations.
11 /// The expectation is that the loop contains three pseudo instructions:
12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13 ///   form should be in the preheader, whereas the while form should be in the
14 ///   preheaders only predecessor.
15 /// - t2LoopDec - placed within in the loop body.
16 /// - t2LoopEnd - the loop latch terminator.
17 ///
18 /// In addition to this, we also look for the presence of the VCTP instruction,
19 /// which determines whether we can generated the tail-predicated low-overhead
20 /// loop form.
21 ///
22 /// Assumptions and Dependencies:
23 /// Low-overhead loops are constructed and executed using a setup instruction:
24 /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25 /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26 /// but fixed polarity: WLS can only branch forwards and LE can only branch
27 /// backwards. These restrictions mean that this pass is dependent upon block
28 /// layout and block sizes, which is why it's the last pass to run. The same is
29 /// true for ConstantIslands, but this pass does not increase the size of the
30 /// basic blocks, nor does it change the CFG. Instructions are mainly removed
31 /// during the transform and pseudo instructions are replaced by real ones. In
32 /// some cases, when we have to revert to a 'normal' loop, we have to introduce
33 /// multiple instructions for a single pseudo (see RevertWhile and
34 /// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd
35 /// are defined to be as large as this maximum sequence of replacement
36 /// instructions.
37 ///
38 //===----------------------------------------------------------------------===//
39 
40 #include "ARM.h"
41 #include "ARMBaseInstrInfo.h"
42 #include "ARMBaseRegisterInfo.h"
43 #include "ARMBasicBlockInfo.h"
44 #include "ARMSubtarget.h"
45 #include "llvm/ADT/SetOperations.h"
46 #include "llvm/ADT/SmallSet.h"
47 #include "llvm/CodeGen/MachineFunctionPass.h"
48 #include "llvm/CodeGen/MachineLoopInfo.h"
49 #include "llvm/CodeGen/MachineLoopUtils.h"
50 #include "llvm/CodeGen/MachineRegisterInfo.h"
51 #include "llvm/CodeGen/Passes.h"
52 #include "llvm/CodeGen/ReachingDefAnalysis.h"
53 #include "llvm/MC/MCInstrDesc.h"
54 
55 using namespace llvm;
56 
57 #define DEBUG_TYPE "arm-low-overhead-loops"
58 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
59 
60 namespace {
61 
62   struct PredicatedMI {
63     MachineInstr *MI = nullptr;
64     SetVector<MachineInstr*> Predicates;
65 
66   public:
67     PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) :
68     MI(I) {
69       Predicates.insert(Preds.begin(), Preds.end());
70     }
71   };
72 
73   // Represent a VPT block, a list of instructions that begins with a VPST and
74   // has a maximum of four proceeding instructions. All instructions within the
75   // block are predicated upon the vpr and we allow instructions to define the
76   // vpr within in the block too.
77   class VPTBlock {
78     std::unique_ptr<PredicatedMI> VPST;
79     PredicatedMI *Divergent = nullptr;
80     SmallVector<PredicatedMI, 4> Insts;
81 
82   public:
83     VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
84       VPST = std::make_unique<PredicatedMI>(MI, Preds);
85     }
86 
87     void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
88       LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
89       if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) {
90         Divergent = &Insts.back();
91         LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
92       }
93       Insts.emplace_back(MI, Preds);
94       assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
95     }
96 
97     // Have we found an instruction within the block which defines the vpr? If
98     // so, not all the instructions in the block will have the same predicate.
99     bool HasNonUniformPredicate() const {
100       return Divergent != nullptr;
101     }
102 
103     // Is the given instruction part of the predicate set controlling the entry
104     // to the block.
105     bool IsPredicatedOn(MachineInstr *MI) const {
106       return VPST->Predicates.count(MI);
107     }
108 
109     // Is the given instruction the only predicate which controls the entry to
110     // the block.
111     bool IsOnlyPredicatedOn(MachineInstr *MI) const {
112       return IsPredicatedOn(MI) && VPST->Predicates.size() == 1;
113     }
114 
115     unsigned size() const { return Insts.size(); }
116     SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
117     MachineInstr *getVPST() const { return VPST->MI; }
118     PredicatedMI *getDivergent() const { return Divergent; }
119   };
120 
121   struct LowOverheadLoop {
122 
123     MachineLoop *ML = nullptr;
124     MachineFunction *MF = nullptr;
125     MachineInstr *InsertPt = nullptr;
126     MachineInstr *Start = nullptr;
127     MachineInstr *Dec = nullptr;
128     MachineInstr *End = nullptr;
129     MachineInstr *VCTP = nullptr;
130     VPTBlock *CurrentBlock = nullptr;
131     SetVector<MachineInstr*> CurrentPredicate;
132     SmallVector<VPTBlock, 4> VPTBlocks;
133     bool Revert = false;
134     bool CannotTailPredicate = false;
135 
136     LowOverheadLoop(MachineLoop *ML) : ML(ML) {
137       MF = ML->getHeader()->getParent();
138     }
139 
140     bool RecordVPTBlocks(MachineInstr *MI);
141 
142     // If this is an MVE instruction, check that we know how to use tail
143     // predication with it.
144     void AnalyseMVEInst(MachineInstr *MI) {
145       if (CannotTailPredicate)
146         return;
147 
148       if (!RecordVPTBlocks(MI)) {
149         CannotTailPredicate = true;
150         return;
151       }
152 
153       const MCInstrDesc &MCID = MI->getDesc();
154       uint64_t Flags = MCID.TSFlags;
155       if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
156         return;
157 
158       if ((Flags & ARMII::ValidForTailPredication) == 0) {
159         LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
160         CannotTailPredicate = true;
161       }
162     }
163 
164     bool IsTailPredicationLegal() const {
165       // For now, let's keep things really simple and only support a single
166       // block for tail predication.
167       return !Revert && FoundAllComponents() && VCTP &&
168              !CannotTailPredicate && ML->getNumBlocks() == 1;
169     }
170 
171     bool ValidateTailPredicate(MachineInstr *StartInsertPt,
172                                ReachingDefAnalysis *RDA,
173                                MachineLoopInfo *MLI);
174 
175     // Is it safe to define LR with DLS/WLS?
176     // LR can be defined if it is the operand to start, because it's the same
177     // value, or if it's going to be equivalent to the operand to Start.
178     MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA);
179 
180     // Check the branch targets are within range and we satisfy our
181     // restrictions.
182     void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA,
183                        MachineLoopInfo *MLI);
184 
185     bool FoundAllComponents() const {
186       return Start && Dec && End;
187     }
188 
189     SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
190 
191     // Return the loop iteration count, or the number of elements if we're tail
192     // predicating.
193     MachineOperand &getCount() {
194       return IsTailPredicationLegal() ?
195         VCTP->getOperand(1) : Start->getOperand(0);
196     }
197 
198     unsigned getStartOpcode() const {
199       bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
200       if (!IsTailPredicationLegal())
201         return IsDo ? ARM::t2DLS : ARM::t2WLS;
202 
203       return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo);
204     }
205 
206     void dump() const {
207       if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
208       if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
209       if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
210       if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP;
211       if (!FoundAllComponents())
212         dbgs() << "ARM Loops: Not a low-overhead loop.\n";
213       else if (!(Start && Dec && End))
214         dbgs() << "ARM Loops: Failed to find all loop components.\n";
215     }
216   };
217 
218   class ARMLowOverheadLoops : public MachineFunctionPass {
219     MachineFunction           *MF = nullptr;
220     MachineLoopInfo           *MLI = nullptr;
221     ReachingDefAnalysis       *RDA = nullptr;
222     const ARMBaseInstrInfo    *TII = nullptr;
223     MachineRegisterInfo       *MRI = nullptr;
224     const TargetRegisterInfo  *TRI = nullptr;
225     std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
226 
227   public:
228     static char ID;
229 
230     ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
231 
232     void getAnalysisUsage(AnalysisUsage &AU) const override {
233       AU.setPreservesCFG();
234       AU.addRequired<MachineLoopInfo>();
235       AU.addRequired<ReachingDefAnalysis>();
236       MachineFunctionPass::getAnalysisUsage(AU);
237     }
238 
239     bool runOnMachineFunction(MachineFunction &MF) override;
240 
241     MachineFunctionProperties getRequiredProperties() const override {
242       return MachineFunctionProperties().set(
243           MachineFunctionProperties::Property::NoVRegs).set(
244           MachineFunctionProperties::Property::TracksLiveness);
245     }
246 
247     StringRef getPassName() const override {
248       return ARM_LOW_OVERHEAD_LOOPS_NAME;
249     }
250 
251   private:
252     bool ProcessLoop(MachineLoop *ML);
253 
254     bool RevertNonLoops();
255 
256     void RevertWhile(MachineInstr *MI) const;
257 
258     bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const;
259 
260     void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
261 
262     void RemoveLoopUpdate(LowOverheadLoop &LoLoop);
263 
264     void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
265 
266     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
267 
268     void Expand(LowOverheadLoop &LoLoop);
269 
270   };
271 }
272 
273 char ARMLowOverheadLoops::ID = 0;
274 
275 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
276                 false, false)
277 
278 MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
279   // We can define LR because LR already contains the same value.
280   if (Start->getOperand(0).getReg() == ARM::LR)
281     return Start;
282 
283   unsigned CountReg = Start->getOperand(0).getReg();
284   auto IsMoveLR = [&CountReg](MachineInstr *MI) {
285     return MI->getOpcode() == ARM::tMOVr &&
286            MI->getOperand(0).getReg() == ARM::LR &&
287            MI->getOperand(1).getReg() == CountReg &&
288            MI->getOperand(2).getImm() == ARMCC::AL;
289    };
290 
291   MachineBasicBlock *MBB = Start->getParent();
292 
293   // Find an insertion point:
294   // - Is there a (mov lr, Count) before Start? If so, and nothing else writes
295   //   to Count before Start, we can insert at that mov.
296   if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR))
297     if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
298       return LRDef;
299 
300   // - Is there a (mov lr, Count) after Start? If so, and nothing else writes
301   //   to Count after Start, we can insert at that mov.
302   if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR))
303     if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
304       return LRDef;
305 
306   // We've found no suitable LR def and Start doesn't use LR directly. Can we
307   // just define LR anyway?
308   if (!RDA->isRegUsedAfter(Start, ARM::LR))
309     return Start;
310 
311   return nullptr;
312 }
313 
314 // Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must
315 // not define a register that is used by any instructions, after and including,
316 // 'To'. These instructions also must not redefine any of Froms operands.
317 template<typename Iterator>
318 static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) {
319   SmallSet<int, 2> Defs;
320   // First check that From would compute the same value if moved.
321   for (auto &MO : From->operands()) {
322     if (!MO.isReg() || MO.isUndef() || !MO.getReg())
323       continue;
324     if (MO.isDef())
325       Defs.insert(MO.getReg());
326     else if (!RDA->hasSameReachingDef(From, To, MO.getReg()))
327       return false;
328   }
329 
330   // Now walk checking that the rest of the instructions will compute the same
331   // value.
332   for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) {
333     for (auto &MO : I->operands())
334       if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg()))
335         return false;
336   }
337   return true;
338 }
339 
340 bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
341     ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) {
342   assert(VCTP && "VCTP instruction expected but is not set");
343   // All predication within the loop should be based on vctp. If the block
344   // isn't predicated on entry, check whether the vctp is within the block
345   // and that all other instructions are then predicated on it.
346   for (auto &Block : VPTBlocks) {
347     if (Block.IsPredicatedOn(VCTP))
348       continue;
349     if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI))
350       return false;
351     SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
352     for (auto &PredMI : Insts) {
353       if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
354         continue;
355       LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
356                         << " - which is predicated on:\n";
357                         for (auto *MI : PredMI.Predicates)
358                           dbgs() << "   - " << *MI;
359                  );
360       return false;
361     }
362   }
363 
364   // For tail predication, we need to provide the number of elements, instead
365   // of the iteration count, to the loop start instruction. The number of
366   // elements is provided to the vctp instruction, so we need to check that
367   // we can use this register at InsertPt.
368   Register NumElements = VCTP->getOperand(1).getReg();
369 
370   // If the register is defined within loop, then we can't perform TP.
371   // TODO: Check whether this is just a mov of a register that would be
372   // available.
373   if (RDA->getReachingDef(VCTP, NumElements) >= 0) {
374     LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
375     return false;
376   }
377 
378   // The element count register maybe defined after InsertPt, in which case we
379   // need to try to move either InsertPt or the def so that the [w|d]lstp can
380   // use the value.
381   MachineBasicBlock *InsertBB = InsertPt->getParent();
382   if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) {
383     if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) {
384       if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) {
385         ElemDef->removeFromParent();
386         InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef);
387         LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
388                    << *ElemDef);
389       } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) {
390         InsertPt->removeFromParent();
391         InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt);
392         LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
393       } else
394         return false;
395     }
396   }
397 
398   // Especially in the case of while loops, InsertBB may not be the
399   // preheader, so we need to check that the register isn't redefined
400   // before entering the loop.
401   auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB,
402                                       Register NumElements) {
403     // NumElements is redefined in this block.
404     if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0)
405       return true;
406 
407     // Don't continue searching up through multiple predecessors.
408     if (MBB->pred_size() > 1)
409       return true;
410 
411     return false;
412   };
413 
414   // First, find the block that looks like the preheader.
415   MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true);
416   if (!MBB)
417     return false;
418 
419   // Then search backwards for a def, until we get to InsertBB.
420   while (MBB != InsertBB) {
421     if (CannotProvideElements(MBB, NumElements))
422       return false;
423     MBB = *MBB->pred_begin();
424   }
425 
426   LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n");
427   return true;
428 }
429 
430 void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
431                                     ReachingDefAnalysis *RDA,
432                                     MachineLoopInfo *MLI) {
433   if (Revert)
434     return;
435 
436   if (!End->getOperand(1).isMBB())
437     report_fatal_error("Expected LoopEnd to target basic block");
438 
439   // TODO Maybe there's cases where the target doesn't have to be the header,
440   // but for now be safe and revert.
441   if (End->getOperand(1).getMBB() != ML->getHeader()) {
442     LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n");
443     Revert = true;
444     return;
445   }
446 
447   // The WLS and LE instructions have 12-bits for the label offset. WLS
448   // requires a positive offset, while LE uses negative.
449   if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
450       !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
451     LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
452     Revert = true;
453     return;
454   }
455 
456   if (Start->getOpcode() == ARM::t2WhileLoopStart &&
457       (BBUtils->getOffsetOf(Start) >
458        BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
459        !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
460     LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
461     Revert = true;
462     return;
463   }
464 
465   InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA);
466   if (!InsertPt) {
467     LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
468     Revert = true;
469     return;
470   } else
471     LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
472 
473   if (!IsTailPredicationLegal()) {
474     LLVM_DEBUG(if (!VCTP)
475                  dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
476                dbgs() << "ARM Loops: Tail-predication is not valid.\n");
477     return;
478   }
479 
480   assert(ML->getBlocks().size() == 1 &&
481          "Shouldn't be processing a loop with more than one block");
482   CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI);
483   LLVM_DEBUG(if (CannotTailPredicate)
484              dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
485 }
486 
487 bool LowOverheadLoop::RecordVPTBlocks(MachineInstr* MI) {
488   // Only support a single vctp.
489   if (isVCTP(MI) && VCTP)
490     return false;
491 
492   // Start a new vpt block when we discover a vpt.
493   if (MI->getOpcode() == ARM::MVE_VPST) {
494     VPTBlocks.emplace_back(MI, CurrentPredicate);
495     CurrentBlock = &VPTBlocks.back();
496     return true;
497   }
498 
499   if (isVCTP(MI))
500     VCTP = MI;
501 
502   unsigned VPROpNum = MI->getNumOperands() - 1;
503   bool IsUse = false;
504   if (MI->getOperand(VPROpNum).isReg() &&
505       MI->getOperand(VPROpNum).getReg() == ARM::VPR &&
506       MI->getOperand(VPROpNum).isUse()) {
507     // If this instruction is predicated by VPR, it will be its last
508     // operand.  Also check that it's only 'Then' predicated.
509     if (!MI->getOperand(VPROpNum-1).isImm() ||
510         MI->getOperand(VPROpNum-1).getImm() != ARMVCC::Then) {
511       LLVM_DEBUG(dbgs() << "ARM Loops: Found unhandled predicate on: "
512                  << *MI);
513       return false;
514     }
515     CurrentBlock->addInst(MI, CurrentPredicate);
516     IsUse = true;
517   }
518 
519   bool IsDef = false;
520   for (unsigned i = 0; i < MI->getNumOperands() - 1; ++i) {
521     const MachineOperand &MO = MI->getOperand(i);
522     if (!MO.isReg() || MO.getReg() != ARM::VPR)
523       continue;
524 
525     if (MO.isDef()) {
526       CurrentPredicate.insert(MI);
527       IsDef = true;
528     } else {
529       LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
530       return false;
531     }
532   }
533 
534   // If we find a vpr def that is not already predicated on the vctp, we've
535   // got disjoint predicates that may not be equivalent when we do the
536   // conversion.
537   if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
538     LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
539     return false;
540   }
541 
542   return true;
543 }
544 
545 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
546   const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
547   if (!ST.hasLOB())
548     return false;
549 
550   MF = &mf;
551   LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
552 
553   MLI = &getAnalysis<MachineLoopInfo>();
554   RDA = &getAnalysis<ReachingDefAnalysis>();
555   MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
556   MRI = &MF->getRegInfo();
557   TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
558   TRI = ST.getRegisterInfo();
559   BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
560   BBUtils->computeAllBlockSizes();
561   BBUtils->adjustBBOffsetsAfter(&MF->front());
562 
563   bool Changed = false;
564   for (auto ML : *MLI) {
565     if (!ML->getParentLoop())
566       Changed |= ProcessLoop(ML);
567   }
568   Changed |= RevertNonLoops();
569   return Changed;
570 }
571 
572 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
573 
574   bool Changed = false;
575 
576   // Process inner loops first.
577   for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
578     Changed |= ProcessLoop(*I);
579 
580   LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n";
581              if (auto *Preheader = ML->getLoopPreheader())
582                dbgs() << " - " << Preheader->getName() << "\n";
583              else if (auto *Preheader = MLI->findLoopPreheader(ML))
584                dbgs() << " - " << Preheader->getName() << "\n";
585              for (auto *MBB : ML->getBlocks())
586                dbgs() << " - " << MBB->getName() << "\n";
587             );
588 
589   // Search the given block for a loop start instruction. If one isn't found,
590   // and there's only one predecessor block, search that one too.
591   std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
592     [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
593     for (auto &MI : *MBB) {
594       if (isLoopStart(MI))
595         return &MI;
596     }
597     if (MBB->pred_size() == 1)
598       return SearchForStart(*MBB->pred_begin());
599     return nullptr;
600   };
601 
602   LowOverheadLoop LoLoop(ML);
603   // Search the preheader for the start intrinsic.
604   // FIXME: I don't see why we shouldn't be supporting multiple predecessors
605   // with potentially multiple set.loop.iterations, so we need to enable this.
606   if (auto *Preheader = ML->getLoopPreheader())
607     LoLoop.Start = SearchForStart(Preheader);
608   else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
609     LoLoop.Start = SearchForStart(Preheader);
610   else
611     return false;
612 
613   // Find the low-overhead loop components and decide whether or not to fall
614   // back to a normal loop. Also look for a vctp instructions and decide
615   // whether we can convert that predicate using tail predication.
616   for (auto *MBB : reverse(ML->getBlocks())) {
617     for (auto &MI : *MBB) {
618       if (MI.getOpcode() == ARM::t2LoopDec)
619         LoLoop.Dec = &MI;
620       else if (MI.getOpcode() == ARM::t2LoopEnd)
621         LoLoop.End = &MI;
622       else if (isLoopStart(MI))
623         LoLoop.Start = &MI;
624       else if (MI.getDesc().isCall()) {
625         // TODO: Though the call will require LE to execute again, does this
626         // mean we should revert? Always executing LE hopefully should be
627         // faster than performing a sub,cmp,br or even subs,br.
628         LoLoop.Revert = true;
629         LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
630       } else {
631         // Record VPR defs and build up their corresponding vpt blocks.
632         // Check we know how to tail predicate any mve instructions.
633         LoLoop.AnalyseMVEInst(&MI);
634       }
635 
636       // We need to ensure that LR is not used or defined inbetween LoopDec and
637       // LoopEnd.
638       if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert)
639         continue;
640 
641       // If we find that LR has been written or read between LoopDec and
642       // LoopEnd, expect that the decremented value is being used else where.
643       // Because this value isn't actually going to be produced until the
644       // latch, by LE, we would need to generate a real sub. The value is also
645       // likely to be copied/reloaded for use of LoopEnd - in which in case
646       // we'd need to perform an add because it gets subtracted again by LE!
647       // The other option is to then generate the other form of LE which doesn't
648       // perform the sub.
649       for (auto &MO : MI.operands()) {
650         if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
651             MO.getReg() == ARM::LR) {
652           LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
653           LoLoop.Revert = true;
654           break;
655         }
656       }
657     }
658   }
659 
660   LLVM_DEBUG(LoLoop.dump());
661   if (!LoLoop.FoundAllComponents()) {
662     LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
663     return false;
664   }
665 
666   LoLoop.CheckLegality(BBUtils.get(), RDA, MLI);
667   Expand(LoLoop);
668   return true;
669 }
670 
671 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
672 // beq that branches to the exit branch.
673 // TODO: We could also try to generate a cbz if the value in LR is also in
674 // another low register.
675 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
676   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
677   MachineBasicBlock *MBB = MI->getParent();
678   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
679                                     TII->get(ARM::t2CMPri));
680   MIB.add(MI->getOperand(0));
681   MIB.addImm(0);
682   MIB.addImm(ARMCC::AL);
683   MIB.addReg(ARM::NoRegister);
684 
685   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
686   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
687     ARM::tBcc : ARM::t2Bcc;
688 
689   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
690   MIB.add(MI->getOperand(1));   // branch target
691   MIB.addImm(ARMCC::EQ);        // condition code
692   MIB.addReg(ARM::CPSR);
693   MI->eraseFromParent();
694 }
695 
696 bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
697                                         bool SetFlags) const {
698   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
699   MachineBasicBlock *MBB = MI->getParent();
700 
701   // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
702   if (SetFlags &&
703       (RDA->isRegUsedAfter(MI, ARM::CPSR) ||
704        !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR)))
705       SetFlags = false;
706 
707   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
708                                     TII->get(ARM::t2SUBri));
709   MIB.addDef(ARM::LR);
710   MIB.add(MI->getOperand(1));
711   MIB.add(MI->getOperand(2));
712   MIB.addImm(ARMCC::AL);
713   MIB.addReg(0);
714 
715   if (SetFlags) {
716     MIB.addReg(ARM::CPSR);
717     MIB->getOperand(5).setIsDef(true);
718   } else
719     MIB.addReg(0);
720 
721   MI->eraseFromParent();
722   return SetFlags;
723 }
724 
725 // Generate a subs, or sub and cmp, and a branch instead of an LE.
726 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
727   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
728 
729   MachineBasicBlock *MBB = MI->getParent();
730   // Create cmp
731   if (!SkipCmp) {
732     MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
733                                       TII->get(ARM::t2CMPri));
734     MIB.addReg(ARM::LR);
735     MIB.addImm(0);
736     MIB.addImm(ARMCC::AL);
737     MIB.addReg(ARM::NoRegister);
738   }
739 
740   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
741   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
742     ARM::tBcc : ARM::t2Bcc;
743 
744   // Create bne
745   MachineInstrBuilder MIB =
746     BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
747   MIB.add(MI->getOperand(1));   // branch target
748   MIB.addImm(ARMCC::NE);        // condition code
749   MIB.addReg(ARM::CPSR);
750   MI->eraseFromParent();
751 }
752 
753 MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
754   MachineInstr *InsertPt = LoLoop.InsertPt;
755   MachineInstr *Start = LoLoop.Start;
756   MachineBasicBlock *MBB = InsertPt->getParent();
757   bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
758   unsigned Opc = LoLoop.getStartOpcode();
759   MachineOperand &Count = LoLoop.getCount();
760 
761   MachineInstrBuilder MIB =
762     BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
763 
764   MIB.addDef(ARM::LR);
765   MIB.add(Count);
766   if (!IsDo)
767     MIB.add(Start->getOperand(1));
768 
769   // When using tail-predication, try to delete the dead code that was used to
770   // calculate the number of loop iterations.
771   if (LoLoop.IsTailPredicationLegal()) {
772     SmallVector<MachineInstr*, 4> Killed;
773     SmallVector<MachineInstr*, 4> Dead;
774     if (auto *Def = RDA->getReachingMIDef(Start,
775                                           Start->getOperand(0).getReg())) {
776       Killed.push_back(Def);
777 
778       while (!Killed.empty()) {
779         MachineInstr *Def = Killed.back();
780         Killed.pop_back();
781         Dead.push_back(Def);
782         for (auto &MO : Def->operands()) {
783           if (!MO.isReg() || !MO.isKill())
784             continue;
785 
786           MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg());
787           if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1)
788             Killed.push_back(Kill);
789         }
790       }
791       for (auto *MI : Dead)
792         MI->eraseFromParent();
793     }
794   }
795 
796   // If we're inserting at a mov lr, then remove it as it's redundant.
797   if (InsertPt != Start)
798     InsertPt->eraseFromParent();
799   Start->eraseFromParent();
800   LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
801   return &*MIB;
802 }
803 
804 // Goal is to optimise and clean-up these loops:
805 //
806 //   vector.body:
807 //     renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
808 //     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4
809 //     ..
810 //     $lr = MVE_DLSTP_32 renamable $r3
811 //
812 // The SUB is the old update of the loop iteration count expression, which
813 // is no longer needed. This sub is removed when the element count, which is in
814 // r3 in this example, is defined by an instruction in the loop, and it has
815 // no uses.
816 //
817 void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
818   Register ElemCount = LoLoop.VCTP->getOperand(1).getReg();
819   MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back();
820 
821   LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n");
822 
823   if (LoLoop.ML->getNumBlocks() != 1) {
824     LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n");
825     return;
826   }
827 
828   LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: ";
829              LoLoop.VCTP->getOperand(1).dump());
830 
831   // Find the definition we are interested in removing, if there is one.
832   MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount);
833   if (!Def) {
834     LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n");
835     return;
836   }
837 
838   // Bail if we define CPSR and it is not dead
839   if (!Def->registerDefIsDead(ARM::CPSR, TRI)) {
840     LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n");
841     return;
842   }
843 
844   // Bail if elemcount is used in exit blocks, i.e. if it is live-in.
845   if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) {
846     LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n");
847     return;
848   }
849 
850   // Bail if there are uses after this Def in the block.
851   SmallVector<MachineInstr*, 4> Uses;
852   RDA->getReachingLocalUses(Def, ElemCount, Uses);
853   if (Uses.size()) {
854     LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n");
855     return;
856   }
857 
858   Uses.clear();
859   RDA->getAllInstWithUseBefore(Def, ElemCount, Uses);
860 
861   // Remove Def if there are no uses, or if the only use is the VCTP
862   // instruction.
863   if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) {
864     LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";
865                Def->dump());
866     Def->eraseFromParent();
867   }
868 
869   LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n";
870              for (auto U : Uses) U->dump());
871 }
872 
873 void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
874   auto RemovePredicate = [](MachineInstr *MI) {
875     LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
876     unsigned OpNum = MI->getNumOperands() - 1;
877     assert(MI->getOperand(OpNum-1).getImm() == ARMVCC::Then &&
878            "Expected Then predicate!");
879     MI->getOperand(OpNum-1).setImm(ARMVCC::None);
880     MI->getOperand(OpNum).setReg(0);
881   };
882 
883   // There are a few scenarios which we have to fix up:
884   // 1) A VPT block with is only predicated by the vctp and has no internal vpr
885   //    defs.
886   // 2) A VPT block which is only predicated by the vctp but has an internal
887   //    vpr def.
888   // 3) A VPT block which is predicated upon the vctp as well as another vpr
889   //    def.
890   // 4) A VPT block which is not predicated upon a vctp, but contains it and
891   //    all instructions within the block are predicated upon in.
892 
893   for (auto &Block : LoLoop.getVPTBlocks()) {
894     SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
895     if (Block.HasNonUniformPredicate()) {
896       PredicatedMI *Divergent = Block.getDivergent();
897       if (isVCTP(Divergent->MI)) {
898         // The vctp will be removed, so the size of the vpt block needs to be
899         // modified.
900         uint64_t Size = getARMVPTBlockMask(Block.size() - 1);
901         Block.getVPST()->getOperand(0).setImm(Size);
902         LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n");
903       } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
904         // The VPT block has a non-uniform predicate but it's entry is guarded
905         // only by a vctp, which means we:
906         // - Need to remove the original vpst.
907         // - Then need to unpredicate any following instructions, until
908         //   we come across the divergent vpr def.
909         // - Insert a new vpst to predicate the instruction(s) that following
910         //   the divergent vpr def.
911         // TODO: We could be producing more VPT blocks than necessary and could
912         // fold the newly created one into a proceeding one.
913         for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()),
914              E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
915           RemovePredicate(&*I);
916 
917         unsigned Size = 0;
918         auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
919         auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
920         MachineInstr *InsertAt = nullptr;
921         while (I != E) {
922           InsertAt = &*I;
923           ++Size;
924           ++I;
925         }
926         MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
927                                           InsertAt->getDebugLoc(),
928                                           TII->get(ARM::MVE_VPST));
929         MIB.addImm(getARMVPTBlockMask(Size));
930         LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
931         LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
932         Block.getVPST()->eraseFromParent();
933       }
934     } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
935       // A vpt block which is only predicated upon vctp and has no internal vpr
936       // defs:
937       // - Remove vpst.
938       // - Unpredicate the remaining instructions.
939       LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
940       Block.getVPST()->eraseFromParent();
941       for (auto &PredMI : Insts)
942         RemovePredicate(PredMI.MI);
943     }
944   }
945 
946   LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
947   LoLoop.VCTP->eraseFromParent();
948 }
949 
950 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
951 
952   // Combine the LoopDec and LoopEnd instructions into LE(TP).
953   auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
954     MachineInstr *End = LoLoop.End;
955     MachineBasicBlock *MBB = End->getParent();
956     unsigned Opc = LoLoop.IsTailPredicationLegal() ?
957       ARM::MVE_LETP : ARM::t2LEUpdate;
958     MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
959                                       TII->get(Opc));
960     MIB.addDef(ARM::LR);
961     MIB.add(End->getOperand(0));
962     MIB.add(End->getOperand(1));
963     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
964 
965     LoLoop.End->eraseFromParent();
966     LoLoop.Dec->eraseFromParent();
967     return &*MIB;
968   };
969 
970   // TODO: We should be able to automatically remove these branches before we
971   // get here - probably by teaching analyzeBranch about the pseudo
972   // instructions.
973   // If there is an unconditional branch, after I, that just branches to the
974   // next block, remove it.
975   auto RemoveDeadBranch = [](MachineInstr *I) {
976     MachineBasicBlock *BB = I->getParent();
977     MachineInstr *Terminator = &BB->instr_back();
978     if (Terminator->isUnconditionalBranch() && I != Terminator) {
979       MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
980       if (BB->isLayoutSuccessor(Succ)) {
981         LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
982         Terminator->eraseFromParent();
983       }
984     }
985   };
986 
987   if (LoLoop.Revert) {
988     if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
989       RevertWhile(LoLoop.Start);
990     else
991       LoLoop.Start->eraseFromParent();
992     bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true);
993     RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
994   } else {
995     LoLoop.Start = ExpandLoopStart(LoLoop);
996     RemoveDeadBranch(LoLoop.Start);
997     LoLoop.End = ExpandLoopEnd(LoLoop);
998     RemoveDeadBranch(LoLoop.End);
999     if (LoLoop.IsTailPredicationLegal()) {
1000       RemoveLoopUpdate(LoLoop);
1001       ConvertVPTBlocks(LoLoop);
1002     }
1003   }
1004 }
1005 
1006 bool ARMLowOverheadLoops::RevertNonLoops() {
1007   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
1008   bool Changed = false;
1009 
1010   for (auto &MBB : *MF) {
1011     SmallVector<MachineInstr*, 4> Starts;
1012     SmallVector<MachineInstr*, 4> Decs;
1013     SmallVector<MachineInstr*, 4> Ends;
1014 
1015     for (auto &I : MBB) {
1016       if (isLoopStart(I))
1017         Starts.push_back(&I);
1018       else if (I.getOpcode() == ARM::t2LoopDec)
1019         Decs.push_back(&I);
1020       else if (I.getOpcode() == ARM::t2LoopEnd)
1021         Ends.push_back(&I);
1022     }
1023 
1024     if (Starts.empty() && Decs.empty() && Ends.empty())
1025       continue;
1026 
1027     Changed = true;
1028 
1029     for (auto *Start : Starts) {
1030       if (Start->getOpcode() == ARM::t2WhileLoopStart)
1031         RevertWhile(Start);
1032       else
1033         Start->eraseFromParent();
1034     }
1035     for (auto *Dec : Decs)
1036       RevertLoopDec(Dec);
1037 
1038     for (auto *End : Ends)
1039       RevertLoopEnd(End);
1040   }
1041   return Changed;
1042 }
1043 
1044 FunctionPass *llvm::createARMLowOverheadLoopsPass() {
1045   return new ARMLowOverheadLoops();
1046 }
1047