xref: /llvm-project/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp (revision 9571cc2b28d74c20f1abb3280adaa42d6e5b88dc)
1 //===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass does a few optimisations related to Tail predicated loops
10 /// and MVE VPT blocks before register allocation is performed. For VPT blocks
11 /// the goal is to maximize the sizes of the blocks that will be created by the
12 /// MVE VPT Block Insertion pass (which runs after register allocation). For
13 /// tail predicated loops we transform the loop into something that will
14 /// hopefully make the backend ARMLowOverheadLoops pass's job easier.
15 ///
16 //===----------------------------------------------------------------------===//
17 
18 #include "ARM.h"
19 #include "ARMSubtarget.h"
20 #include "MVETailPredUtils.h"
21 #include "Thumb2InstrInfo.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/CodeGen/MachineBasicBlock.h"
24 #include "llvm/CodeGen/MachineDominators.h"
25 #include "llvm/CodeGen/MachineFunction.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineInstr.h"
28 #include "llvm/CodeGen/MachineLoopInfo.h"
29 #include "llvm/InitializePasses.h"
30 #include "llvm/Support/Debug.h"
31 #include <cassert>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "arm-mve-vpt-opts"
36 
37 static cl::opt<bool>
38 MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden,
39     cl::desc("Enable merging Loop End and Dec instructions."),
40     cl::init(true));
41 
42 static cl::opt<bool>
43 SetLRPredicate("arm-set-lr-predicate", cl::Hidden,
44     cl::desc("Enable setting lr as a predicate in tail predication regions."),
45     cl::init(true));
46 
47 namespace {
48 class MVETPAndVPTOptimisations : public MachineFunctionPass {
49 public:
50   static char ID;
51   const Thumb2InstrInfo *TII;
52   MachineRegisterInfo *MRI;
53 
54   MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {}
55 
56   bool runOnMachineFunction(MachineFunction &Fn) override;
57 
58   void getAnalysisUsage(AnalysisUsage &AU) const override {
59     AU.addRequired<MachineLoopInfoWrapperPass>();
60     AU.addPreserved<MachineLoopInfoWrapperPass>();
61     AU.addRequired<MachineDominatorTreeWrapperPass>();
62     AU.addPreserved<MachineDominatorTreeWrapperPass>();
63     MachineFunctionPass::getAnalysisUsage(AU);
64   }
65 
66   StringRef getPassName() const override {
67     return "ARM MVE TailPred and VPT Optimisation Pass";
68   }
69 
70 private:
71   bool LowerWhileLoopStart(MachineLoop *ML);
72   bool MergeLoopEnd(MachineLoop *ML);
73   bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
74   MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
75                                             MachineInstr &Instr,
76                                             MachineOperand &User,
77                                             Register Target);
78   bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
79   bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
80   bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
81   bool ConvertVPSEL(MachineBasicBlock &MBB);
82   bool HintDoLoopStartReg(MachineBasicBlock &MBB);
83   MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *PreHeader,
84                                             MachineInstr *LoopStart);
85 };
86 
87 char MVETPAndVPTOptimisations::ID = 0;
88 
89 } // end anonymous namespace
90 
91 INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
92                       "ARM MVE TailPred and VPT Optimisations pass", false,
93                       false)
94 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
95 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
96 INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
97                     "ARM MVE TailPred and VPT Optimisations pass", false, false)
98 
99 static MachineInstr *LookThroughCOPY(MachineInstr *MI,
100                                      MachineRegisterInfo *MRI) {
101   while (MI && MI->getOpcode() == TargetOpcode::COPY &&
102          MI->getOperand(1).getReg().isVirtual())
103     MI = MRI->getVRegDef(MI->getOperand(1).getReg());
104   return MI;
105 }
106 
107 // Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
108 // corresponding PHI that make up a low overhead loop. Only handles 'do' loops
109 // at the moment, returning a t2DoLoopStart in LoopStart.
110 static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
111                                MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
112                                MachineInstr *&LoopDec, MachineInstr *&LoopEnd) {
113   MachineBasicBlock *Header = ML->getHeader();
114   MachineBasicBlock *Latch = ML->getLoopLatch();
115   if (!Header || !Latch) {
116     LLVM_DEBUG(dbgs() << "  no Loop Latch or Header\n");
117     return false;
118   }
119 
120   // Find the loop end from the terminators.
121   LoopEnd = nullptr;
122   for (auto &T : Latch->terminators()) {
123     if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) {
124       LoopEnd = &T;
125       break;
126     }
127     if (T.getOpcode() == ARM::t2LoopEndDec &&
128         T.getOperand(2).getMBB() == Header) {
129       LoopEnd = &T;
130       break;
131     }
132   }
133   if (!LoopEnd) {
134     LLVM_DEBUG(dbgs() << "  no LoopEnd\n");
135     return false;
136   }
137   LLVM_DEBUG(dbgs() << "  found loop end: " << *LoopEnd);
138 
139   // Find the dec from the use of the end. There may be copies between
140   // instructions. We expect the loop to loop like:
141   //   $vs = t2DoLoopStart ...
142   // loop:
143   //   $vp = phi [ $vs ], [ $vd ]
144   //   ...
145   //   $vd = t2LoopDec $vp
146   //   ...
147   //   t2LoopEnd $vd, loop
148   if (LoopEnd->getOpcode() == ARM::t2LoopEndDec)
149     LoopDec = LoopEnd;
150   else {
151     LoopDec =
152         LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI);
153     if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) {
154       LLVM_DEBUG(dbgs() << "  didn't find LoopDec where we expected!\n");
155       return false;
156     }
157   }
158   LLVM_DEBUG(dbgs() << "  found loop dec: " << *LoopDec);
159 
160   LoopPhi =
161       LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI);
162   if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI ||
163       LoopPhi->getNumOperands() != 5 ||
164       (LoopPhi->getOperand(2).getMBB() != Latch &&
165        LoopPhi->getOperand(4).getMBB() != Latch)) {
166     LLVM_DEBUG(dbgs() << "  didn't find PHI where we expected!\n");
167     return false;
168   }
169   LLVM_DEBUG(dbgs() << "  found loop phi: " << *LoopPhi);
170 
171   Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch
172                           ? LoopPhi->getOperand(3).getReg()
173                           : LoopPhi->getOperand(1).getReg();
174   LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
175   if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
176                      LoopStart->getOpcode() != ARM::t2WhileLoopSetup &&
177                      LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) {
178     LLVM_DEBUG(dbgs() << "  didn't find Start where we expected!\n");
179     return false;
180   }
181   LLVM_DEBUG(dbgs() << "  found loop start: " << *LoopStart);
182 
183   return true;
184 }
185 
186 static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) {
187   MachineBasicBlock *MBB = MI->getParent();
188   assert(MI->getOpcode() == ARM::t2WhileLoopSetup &&
189          "Only expected a t2WhileLoopSetup in RevertWhileLoopStart!");
190 
191   // Subs
192   MachineInstrBuilder MIB =
193       BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
194   MIB.add(MI->getOperand(0));
195   MIB.add(MI->getOperand(1));
196   MIB.addImm(0);
197   MIB.addImm(ARMCC::AL);
198   MIB.addReg(ARM::NoRegister);
199   MIB.addReg(ARM::CPSR, RegState::Define);
200 
201   // Attempt to find a t2WhileLoopStart and revert to a t2Bcc.
202   for (MachineInstr &I : MBB->terminators()) {
203     if (I.getOpcode() == ARM::t2WhileLoopStart) {
204       MachineInstrBuilder MIB =
205           BuildMI(*MBB, &I, I.getDebugLoc(), TII->get(ARM::t2Bcc));
206       MIB.add(MI->getOperand(1)); // branch target
207       MIB.addImm(ARMCC::EQ);
208       MIB.addReg(ARM::CPSR);
209       I.eraseFromParent();
210       break;
211     }
212   }
213 
214   MI->eraseFromParent();
215 }
216 
217 // The Hardware Loop insertion and ISel Lowering produce the pseudos for the
218 // start of a while loop:
219 //   %a:gprlr = t2WhileLoopSetup %Cnt
220 //   t2WhileLoopStart %a, %BB
221 // We want to convert those to a single instruction which, like t2LoopEndDec and
222 // t2DoLoopStartTP is both a terminator and produces a value:
223 //   %a:grplr: t2WhileLoopStartLR %Cnt, %BB
224 //
225 // Otherwise if we can't, we revert the loop. t2WhileLoopSetup and
226 // t2WhileLoopStart are not valid past regalloc.
227 bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) {
228   LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop "
229                     << ML->getHeader()->getName() << "\n");
230 
231   MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
232   if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
233     return false;
234 
235   if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup)
236     return false;
237 
238   Register LR = LoopStart->getOperand(0).getReg();
239   auto WLSIt = find_if(MRI->use_nodbg_instructions(LR), [](auto &MI) {
240     return MI.getOpcode() == ARM::t2WhileLoopStart;
241   });
242   if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) {
243     RevertWhileLoopSetup(LoopStart, TII);
244     RevertLoopDec(LoopStart, TII);
245     RevertLoopEnd(LoopStart, TII);
246     return true;
247   }
248 
249   MachineInstrBuilder MI =
250       BuildMI(*WLSIt->getParent(), *WLSIt, WLSIt->getDebugLoc(),
251               TII->get(ARM::t2WhileLoopStartLR), LR)
252           .add(LoopStart->getOperand(1))
253           .add(WLSIt->getOperand(1));
254   (void)MI;
255   LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr());
256 
257   WLSIt->eraseFromParent();
258   LoopStart->eraseFromParent();
259   return true;
260 }
261 
262 // Return true if this instruction is invalid in a low overhead loop, usually
263 // because it clobbers LR.
264 static bool IsInvalidTPInstruction(MachineInstr &MI) {
265   return MI.isCall() || isLoopStart(MI);
266 }
267 
268 // Starting from PreHeader, search for invalid instructions back until the
269 // LoopStart block is reached. If invalid instructions are found, the loop start
270 // is reverted from a WhileLoopStart to a DoLoopStart on the same loop. Will
271 // return the new DLS LoopStart if updated.
272 MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
273     MachineBasicBlock *PreHeader, MachineInstr *LoopStart) {
274   SmallVector<MachineBasicBlock *> Worklist;
275   SmallPtrSet<MachineBasicBlock *, 4> Visited;
276   Worklist.push_back(PreHeader);
277   Visited.insert(LoopStart->getParent());
278 
279   while (!Worklist.empty()) {
280     MachineBasicBlock *MBB = Worklist.pop_back_val();
281     if (Visited.count(MBB))
282       continue;
283 
284     for (MachineInstr &MI : *MBB) {
285       if (!IsInvalidTPInstruction(MI))
286         continue;
287 
288       LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI);
289 
290       // Create a t2DoLoopStart at the end of the preheader.
291       MachineInstrBuilder MIB =
292           BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
293                   LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
294       MIB.add(LoopStart->getOperand(0));
295       MIB.add(LoopStart->getOperand(1));
296 
297       // Make sure to remove the kill flags, to prevent them from being invalid.
298       LoopStart->getOperand(1).setIsKill(false);
299 
300       // Revert the t2WhileLoopStartLR to a CMP and Br.
301       RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
302       return MIB;
303     }
304 
305     Visited.insert(MBB);
306     for (auto *Pred : MBB->predecessors())
307       Worklist.push_back(Pred);
308   }
309   return LoopStart;
310 }
311 
312 // This function converts loops with t2LoopEnd and t2LoopEnd instructions into
313 // a single t2LoopEndDec instruction. To do that it needs to make sure that LR
314 // will be valid to be used for the low overhead loop, which means nothing else
315 // is using LR (especially calls) and there are no superfluous copies in the
316 // loop. The t2LoopEndDec is a branching terminator that produces a value (the
317 // decrement) around the loop edge, which means we need to be careful that they
318 // will be valid to allocate without any spilling.
319 bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
320   if (!MergeEndDec)
321     return false;
322 
323   LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
324                     << "\n");
325 
326   MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
327   if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
328     return false;
329 
330   // Check if there is an illegal instruction (a call) in the low overhead loop
331   // and if so revert it now before we get any further. While loops also need to
332   // check the preheaders, but can be reverted to a DLS loop if needed.
333   auto *PreHeader = ML->getLoopPreheader();
334   if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader)
335     LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart);
336 
337   for (MachineBasicBlock *MBB : ML->blocks()) {
338     for (MachineInstr &MI : *MBB) {
339       if (IsInvalidTPInstruction(MI)) {
340         LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
341         if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
342           RevertDoLoopStart(LoopStart, TII);
343         else
344           RevertWhileLoopStartLR(LoopStart, TII);
345         RevertLoopDec(LoopDec, TII);
346         RevertLoopEnd(LoopEnd, TII);
347         return true;
348       }
349     }
350   }
351 
352   // Remove any copies from the loop, to ensure the phi that remains is both
353   // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
354   // that cannot spill, we need to be careful what remains in the loop.
355   Register PhiReg = LoopPhi->getOperand(0).getReg();
356   Register DecReg = LoopDec->getOperand(0).getReg();
357   Register StartReg = LoopStart->getOperand(0).getReg();
358   // Ensure the uses are expected, and collect any copies we want to remove.
359   SmallVector<MachineInstr *, 4> Copies;
360   auto CheckUsers = [&Copies](Register BaseReg,
361                               ArrayRef<MachineInstr *> ExpectedUsers,
362                               MachineRegisterInfo *MRI) {
363     SmallVector<Register, 4> Worklist;
364     Worklist.push_back(BaseReg);
365     while (!Worklist.empty()) {
366       Register Reg = Worklist.pop_back_val();
367       for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
368         if (llvm::is_contained(ExpectedUsers, &MI))
369           continue;
370         if (MI.getOpcode() != TargetOpcode::COPY ||
371             !MI.getOperand(0).getReg().isVirtual()) {
372           LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
373           return false;
374         }
375         Worklist.push_back(MI.getOperand(0).getReg());
376         Copies.push_back(&MI);
377       }
378     }
379     return true;
380   };
381   if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
382       !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
383       !CheckUsers(StartReg, {LoopPhi}, MRI)) {
384     // Don't leave a t2WhileLoopStartLR without the LoopDecEnd.
385     if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) {
386       RevertWhileLoopStartLR(LoopStart, TII);
387       RevertLoopDec(LoopDec, TII);
388       RevertLoopEnd(LoopEnd, TII);
389       return true;
390     }
391     return false;
392   }
393 
394   MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
395   MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
396   MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);
397 
398   if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
399     LoopPhi->getOperand(3).setReg(StartReg);
400     LoopPhi->getOperand(1).setReg(DecReg);
401   } else {
402     LoopPhi->getOperand(1).setReg(StartReg);
403     LoopPhi->getOperand(3).setReg(DecReg);
404   }
405 
406   SmallVector<MachineOperand, 4> Cond;              // For analyzeBranch.
407   MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
408   if (!TII->analyzeBranch(*LoopEnd->getParent(), TBB, FBB, Cond) && !FBB) {
409     // If the LoopEnd falls through, need to insert a t2B to the fall-through
410     // block so that the non-analyzable t2LoopEndDec doesn't fall through.
411     MachineFunction::iterator MBBI = ++LoopEnd->getParent()->getIterator();
412     BuildMI(LoopEnd->getParent(), DebugLoc(), TII->get(ARM::t2B))
413         .addMBB(&*MBBI)
414         .add(predOps(ARMCC::AL));
415   }
416 
417   // Replace the loop dec and loop end as a single instruction.
418   MachineInstrBuilder MI =
419       BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
420               TII->get(ARM::t2LoopEndDec), DecReg)
421           .addReg(PhiReg)
422           .add(LoopEnd->getOperand(1));
423   (void)MI;
424   LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr());
425 
426   LoopDec->eraseFromParent();
427   LoopEnd->eraseFromParent();
428   for (auto *MI : Copies)
429     MI->eraseFromParent();
430   return true;
431 }
432 
433 // Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
434 // instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
435 // instruction, making the backend ARMLowOverheadLoops passes job of finding the
436 // VCTP operand much simpler.
437 bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
438                                               MachineDominatorTree *DT) {
439   LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
440                     << ML->getHeader()->getName() << "\n");
441 
442   // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's
443   // in the loop.
444   MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
445   if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
446     return false;
447   if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
448                              LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
449     return false;
450 
451   SmallVector<MachineInstr *, 4> VCTPs;
452   SmallVector<MachineInstr *, 4> MVEInstrs;
453   for (MachineBasicBlock *BB : ML->blocks()) {
454     for (MachineInstr &MI : *BB)
455       if (isVCTP(&MI))
456         VCTPs.push_back(&MI);
457       else if (findFirstVPTPredOperandIdx(MI) != -1)
458         MVEInstrs.push_back(&MI);
459   }
460 
461   if (VCTPs.empty()) {
462     LLVM_DEBUG(dbgs() << "  no VCTPs\n");
463     return false;
464   }
465 
466   // Check all VCTPs are the same.
467   MachineInstr *FirstVCTP = *VCTPs.begin();
468   for (MachineInstr *VCTP : VCTPs) {
469     LLVM_DEBUG(dbgs() << "  with VCTP " << *VCTP);
470     if (VCTP->getOpcode() != FirstVCTP->getOpcode() ||
471         VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) {
472       LLVM_DEBUG(dbgs() << "  VCTP's are not identical\n");
473       return false;
474     }
475   }
476 
477   // Check for the register being used can be setup before the loop. We expect
478   // this to be:
479   //   $vx = ...
480   // loop:
481   //   $vp = PHI [ $vx ], [ $vd ]
482   //   ..
483   //   $vpr = VCTP $vp
484   //   ..
485   //   $vd = t2SUBri $vp, #n
486   //   ..
487   Register CountReg = FirstVCTP->getOperand(1).getReg();
488   if (!CountReg.isVirtual()) {
489     LLVM_DEBUG(dbgs() << "  cannot determine VCTP PHI\n");
490     return false;
491   }
492   MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI);
493   if (!Phi || Phi->getOpcode() != TargetOpcode::PHI ||
494       Phi->getNumOperands() != 5 ||
495       (Phi->getOperand(2).getMBB() != ML->getLoopLatch() &&
496        Phi->getOperand(4).getMBB() != ML->getLoopLatch())) {
497     LLVM_DEBUG(dbgs() << "  cannot determine VCTP Count\n");
498     return false;
499   }
500   CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch()
501                  ? Phi->getOperand(3).getReg()
502                  : Phi->getOperand(1).getReg();
503 
504   // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
505   // the preheader and add the new CountReg to it. We attempt to place it late
506   // in the preheader, but may need to move that earlier based on uses.
507   MachineBasicBlock *MBB = LoopStart->getParent();
508   MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
509   for (MachineInstr &Use :
510        MRI->use_instructions(LoopStart->getOperand(0).getReg()))
511     if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
512         !DT->dominates(ML->getHeader(), Use.getParent())) {
513       LLVM_DEBUG(dbgs() << "  InsertPt could not be a terminator!\n");
514       return false;
515     }
516 
517   unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
518                         ? ARM::t2DoLoopStartTP
519                         : ARM::t2WhileLoopStartTP;
520   MachineInstrBuilder MI =
521       BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
522           .add(LoopStart->getOperand(0))
523           .add(LoopStart->getOperand(1))
524           .addReg(CountReg);
525   if (NewOpc == ARM::t2WhileLoopStartTP)
526     MI.add(LoopStart->getOperand(2));
527   LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << "  with "
528                     << *MI.getInstr());
529   MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
530   LoopStart->eraseFromParent();
531 
532   if (SetLRPredicate) {
533     // Each instruction in the loop needs to be using LR as the predicate from
534     // the Phi as the predicate.
535     Register LR = LoopPhi->getOperand(0).getReg();
536     for (MachineInstr *MI : MVEInstrs) {
537       int Idx = findFirstVPTPredOperandIdx(*MI);
538       MI->getOperand(Idx + 2).setReg(LR);
539     }
540   }
541 
542   return true;
543 }
544 
545 // Returns true if Opcode is any VCMP Opcode.
546 static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }
547 
548 // Returns true if a VCMP with this Opcode can have its operands swapped.
549 // There is 2 kind of VCMP that can't have their operands swapped: Float VCMPs,
550 // and VCMPr instructions (since the r is always on the right).
551 static bool CanHaveSwappedOperands(unsigned Opcode) {
552   switch (Opcode) {
553   default:
554     return true;
555   case ARM::MVE_VCMPf32:
556   case ARM::MVE_VCMPf16:
557   case ARM::MVE_VCMPf32r:
558   case ARM::MVE_VCMPf16r:
559   case ARM::MVE_VCMPi8r:
560   case ARM::MVE_VCMPi16r:
561   case ARM::MVE_VCMPi32r:
562   case ARM::MVE_VCMPu8r:
563   case ARM::MVE_VCMPu16r:
564   case ARM::MVE_VCMPu32r:
565   case ARM::MVE_VCMPs8r:
566   case ARM::MVE_VCMPs16r:
567   case ARM::MVE_VCMPs32r:
568     return false;
569   }
570 }
571 
572 // Returns the CondCode of a VCMP Instruction.
573 static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
574   assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
575   return ARMCC::CondCodes(Instr.getOperand(3).getImm());
576 }
577 
578 // Returns true if Cond is equivalent to a VPNOT instruction on the result of
579 // Prev. Cond and Prev must be VCMPs.
580 static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
581   assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));
582 
583   // Opcodes must match.
584   if (Cond.getOpcode() != Prev.getOpcode())
585     return false;
586 
587   MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
588   MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);
589 
590   // If the VCMP has the opposite condition with the same operands, we can
591   // replace it with a VPNOT
592   ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
593   ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
594   if (ExpectedCode == GetCondCode(Prev))
595     if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
596       return true;
597   // Check again with operands swapped if possible
598   if (!CanHaveSwappedOperands(Cond.getOpcode()))
599     return false;
600   ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
601   return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) &&
602          CondOP2.isIdenticalTo(PrevOP1);
603 }
604 
605 // Returns true if Instr writes to VCCR.
606 static bool IsWritingToVCCR(MachineInstr &Instr) {
607   if (Instr.getNumOperands() == 0)
608     return false;
609   MachineOperand &Dst = Instr.getOperand(0);
610   if (!Dst.isReg())
611     return false;
612   Register DstReg = Dst.getReg();
613   if (!DstReg.isVirtual())
614     return false;
615   MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
616   const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
617   return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
618 }
619 
620 // Transforms
621 //    <Instr that uses %A ('User' Operand)>
622 // Into
623 //    %K = VPNOT %Target
624 //    <Instr that uses %K ('User' Operand)>
625 // And returns the newly inserted VPNOT.
626 // This optimization is done in the hopes of preventing spills/reloads of VPR by
627 // reducing the number of VCCR values with overlapping lifetimes.
628 MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT(
629     MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
630     Register Target) {
631   Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
632 
633   MachineInstrBuilder MIBuilder =
634       BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
635           .addDef(NewResult)
636           .addReg(Target);
637   addUnpredicatedMveVpredNOp(MIBuilder);
638 
639   // Make the user use NewResult instead, and clear its kill flag.
640   User.setReg(NewResult);
641   User.setIsKill(false);
642 
643   LLVM_DEBUG(dbgs() << "  Inserting VPNOT (for spill prevention): ";
644              MIBuilder.getInstr()->dump());
645 
646   return *MIBuilder.getInstr();
647 }
648 
649 // Moves a VPNOT before its first user if an instruction that uses Reg is found
650 // in-between the VPNOT and its user.
651 // Returns true if there is at least one user of the VPNOT in the block.
652 static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
653                                      MachineBasicBlock::iterator Iter,
654                                      Register Reg) {
655   assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
656   assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
657          "The VPNOT cannot be predicated");
658 
659   MachineInstr &VPNOT = *Iter;
660   Register VPNOTResult = VPNOT.getOperand(0).getReg();
661   Register VPNOTOperand = VPNOT.getOperand(1).getReg();
662 
663   // Whether the VPNOT will need to be moved, and whether we found a user of the
664   // VPNOT.
665   bool MustMove = false, HasUser = false;
666   MachineOperand *VPNOTOperandKiller = nullptr;
667   for (; Iter != MBB.end(); ++Iter) {
668     if (MachineOperand *MO =
669             Iter->findRegisterUseOperand(VPNOTOperand, /*TRI=*/nullptr,
670                                          /*isKill*/ true)) {
671       // If we find the operand that kills the VPNOTOperand's result, save it.
672       VPNOTOperandKiller = MO;
673     }
674 
675     if (Iter->findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != -1) {
676       MustMove = true;
677       continue;
678     }
679 
680     if (Iter->findRegisterUseOperandIdx(VPNOTResult, /*TRI=*/nullptr) == -1)
681       continue;
682 
683     HasUser = true;
684     if (!MustMove)
685       break;
686 
687     // Move the VPNOT right before Iter
688     LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << "  Before: ";
689                Iter->dump());
690     MBB.splice(Iter, &MBB, VPNOT.getIterator());
691     // If we move the instr, and its operand was killed earlier, remove the kill
692     // flag.
693     if (VPNOTOperandKiller)
694       VPNOTOperandKiller->setIsKill(false);
695 
696     break;
697   }
698   return HasUser;
699 }
700 
701 // This optimisation attempts to reduce the number of overlapping lifetimes of
702 // VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
703 // this replaces
704 //    %A:vccr = (something)
705 //    %B:vccr = VPNOT %A
706 //    %Foo = (some op that uses %B)
707 //    %Bar = (some op that uses %A)
708 // With
709 //    %A:vccr = (something)
710 //    %B:vccr = VPNOT %A
711 //    %Foo = (some op that uses %B)
712 //    %TMP2:vccr = VPNOT %B
//    %Bar = (some op that uses %TMP2)
714 bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
715   MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
716   SmallVector<MachineInstr *, 4> DeadInstructions;
717   bool Modified = false;
718 
719   while (Iter != End) {
720     Register VCCRValue, OppositeVCCRValue;
721     // The first loop looks for 2 unpredicated instructions:
722     //    %A:vccr = (instr)     ; A is stored in VCCRValue
723     //    %B:vccr = VPNOT %A    ; B is stored in OppositeVCCRValue
724     for (; Iter != End; ++Iter) {
725       // We're only interested in unpredicated instructions that write to VCCR.
726       if (!IsWritingToVCCR(*Iter) ||
727           getVPTInstrPredicate(*Iter) != ARMVCC::None)
728         continue;
729       Register Dst = Iter->getOperand(0).getReg();
730 
731       // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
732       // found what we were looking for.
733       if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
734           Iter->findRegisterUseOperandIdx(VCCRValue, /*TRI=*/nullptr) != -1) {
735         // Move the VPNOT closer to its first user if needed, and ignore if it
736         // has no users.
737         if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
738           continue;
739 
740         OppositeVCCRValue = Dst;
741         ++Iter;
742         break;
743       }
744 
745       // Else, just set VCCRValue.
746       VCCRValue = Dst;
747     }
748 
749     // If the first inner loop didn't find anything, stop here.
750     if (Iter == End)
751       break;
752 
753     assert(VCCRValue && OppositeVCCRValue &&
754            "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
755            "stopped before the end of the block!");
756     assert(VCCRValue != OppositeVCCRValue &&
757            "VCCRValue should not be equal to OppositeVCCRValue!");
758 
759     // LastVPNOTResult always contains the same value as OppositeVCCRValue.
760     Register LastVPNOTResult = OppositeVCCRValue;
761 
762     // This second loop tries to optimize the remaining instructions.
763     for (; Iter != End; ++Iter) {
764       bool IsInteresting = false;
765 
766       if (MachineOperand *MO =
767               Iter->findRegisterUseOperand(VCCRValue, /*TRI=*/nullptr)) {
768         IsInteresting = true;
769 
770         // - If the instruction is a VPNOT, it can be removed, and we can just
771         //   replace its uses with LastVPNOTResult.
772         // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
773         if (Iter->getOpcode() == ARM::MVE_VPNOT) {
774           Register Result = Iter->getOperand(0).getReg();
775 
776           MRI->replaceRegWith(Result, LastVPNOTResult);
777           DeadInstructions.push_back(&*Iter);
778           Modified = true;
779 
780           LLVM_DEBUG(dbgs()
781                      << "Replacing all uses of '" << printReg(Result)
782                      << "' with '" << printReg(LastVPNOTResult) << "'\n");
783         } else {
784           MachineInstr &VPNOT =
785               ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
786           Modified = true;
787 
788           LastVPNOTResult = VPNOT.getOperand(0).getReg();
789           std::swap(VCCRValue, OppositeVCCRValue);
790 
791           LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
792                             << "' with '" << printReg(LastVPNOTResult)
793                             << "' in instr: " << *Iter);
794         }
795       } else {
796         // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
797         // instead as they contain the same value.
798         if (MachineOperand *MO = Iter->findRegisterUseOperand(
799                 OppositeVCCRValue, /*TRI=*/nullptr)) {
800           IsInteresting = true;
801 
802           // This is pointless if LastVPNOTResult == OppositeVCCRValue.
803           if (LastVPNOTResult != OppositeVCCRValue) {
804             LLVM_DEBUG(dbgs() << "Replacing usage of '"
805                               << printReg(OppositeVCCRValue) << "' with '"
806                               << printReg(LastVPNOTResult) << " for instr: ";
807                        Iter->dump());
808             MO->setReg(LastVPNOTResult);
809             Modified = true;
810           }
811 
812           MO->setIsKill(false);
813         }
814 
815         // If this is an unpredicated VPNOT on
816         // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
817         if (Iter->getOpcode() == ARM::MVE_VPNOT &&
818             getVPTInstrPredicate(*Iter) == ARMVCC::None) {
819           Register VPNOTOperand = Iter->getOperand(1).getReg();
820           if (VPNOTOperand == LastVPNOTResult ||
821               VPNOTOperand == OppositeVCCRValue) {
822             IsInteresting = true;
823 
824             std::swap(VCCRValue, OppositeVCCRValue);
825             LastVPNOTResult = Iter->getOperand(0).getReg();
826           }
827         }
828       }
829 
830       // If this instruction was not interesting, and it writes to VCCR, stop.
831       if (!IsInteresting && IsWritingToVCCR(*Iter))
832         break;
833     }
834   }
835 
836   for (MachineInstr *DeadInstruction : DeadInstructions)
837     DeadInstruction->eraseFromParent();
838 
839   return Modified;
840 }
841 
842 // This optimisation replaces VCMPs with VPNOTs when they are equivalent.
843 bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
844   SmallVector<MachineInstr *, 4> DeadInstructions;
845 
846   // The last VCMP that we have seen and that couldn't be replaced.
847   // This is reset when an instruction that writes to VCCR/VPR is found, or when
848   // a VCMP is replaced with a VPNOT.
849   // We'll only replace VCMPs with VPNOTs when this is not null, and when the
850   // current VCMP is the opposite of PrevVCMP.
851   MachineInstr *PrevVCMP = nullptr;
852   // If we find an instruction that kills the result of PrevVCMP, we save the
853   // operand here to remove the kill flag in case we need to use PrevVCMP's
854   // result.
855   MachineOperand *PrevVCMPResultKiller = nullptr;
856 
857   for (MachineInstr &Instr : MBB.instrs()) {
858     if (PrevVCMP) {
859       if (MachineOperand *MO =
860               Instr.findRegisterUseOperand(PrevVCMP->getOperand(0).getReg(),
861                                            /*TRI=*/nullptr, /*isKill*/ true)) {
862         // If we come accross the instr that kills PrevVCMP's result, record it
863         // so we can remove the kill flag later if we need to.
864         PrevVCMPResultKiller = MO;
865       }
866     }
867 
868     // Ignore predicated instructions.
869     if (getVPTInstrPredicate(Instr) != ARMVCC::None)
870       continue;
871 
872     // Only look at VCMPs
873     if (!IsVCMP(Instr.getOpcode())) {
874       // If the instruction writes to VCCR, forget the previous VCMP.
875       if (IsWritingToVCCR(Instr))
876         PrevVCMP = nullptr;
877       continue;
878     }
879 
880     if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) {
881       PrevVCMP = &Instr;
882       continue;
883     }
884 
885     // The register containing the result of the VCMP that we're going to
886     // replace.
887     Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg();
888 
889     // Build a VPNOT to replace the VCMP, reusing its operands.
890     MachineInstrBuilder MIBuilder =
891         BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
892             .add(Instr.getOperand(0))
893             .addReg(PrevVCMPResultReg);
894     addUnpredicatedMveVpredNOp(MIBuilder);
895     LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
896                MIBuilder.getInstr()->dump(); dbgs() << "  Removed VCMP: ";
897                Instr.dump());
898 
899     // If we found an instruction that uses, and kills PrevVCMP's result,
900     // remove the kill flag.
901     if (PrevVCMPResultKiller)
902       PrevVCMPResultKiller->setIsKill(false);
903 
904     // Finally, mark the old VCMP for removal and reset
905     // PrevVCMP/PrevVCMPResultKiller.
906     DeadInstructions.push_back(&Instr);
907     PrevVCMP = nullptr;
908     PrevVCMPResultKiller = nullptr;
909   }
910 
911   for (MachineInstr *DeadInstruction : DeadInstructions)
912     DeadInstruction->eraseFromParent();
913 
914   return !DeadInstructions.empty();
915 }
916 
917 bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
918                                                     MachineDominatorTree *DT) {
919   // Scan through the block, looking for instructions that use constants moves
920   // into VPR that are the negative of one another. These are expected to be
921   // COPY's to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant
922   // mask is kept it or and VPNOT's of it are added or reused as we scan through
923   // the function.
924   unsigned LastVPTImm = 0;
925   Register LastVPTReg = 0;
926   SmallSet<MachineInstr *, 4> DeadInstructions;
927 
928   for (MachineInstr &Instr : MBB.instrs()) {
929     // Look for predicated MVE instructions.
930     int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
931     if (PIdx == -1)
932       continue;
933     Register VPR = Instr.getOperand(PIdx + 1).getReg();
934     if (!VPR.isVirtual())
935       continue;
936 
937     // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
938     MachineInstr *Copy = MRI->getVRegDef(VPR);
939     if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
940         !Copy->getOperand(1).getReg().isVirtual() ||
941         MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
942       LastVPTReg = 0;
943       continue;
944     }
945     Register GPR = Copy->getOperand(1).getReg();
946 
947     // Find the Immediate used by the copy.
948     auto getImm = [&](Register GPR) -> unsigned {
949       MachineInstr *Def = MRI->getVRegDef(GPR);
950       if (Def && (Def->getOpcode() == ARM::t2MOVi ||
951                   Def->getOpcode() == ARM::t2MOVi16))
952         return Def->getOperand(1).getImm();
953       return -1U;
954     };
955     unsigned Imm = getImm(GPR);
956     if (Imm == -1U) {
957       LastVPTReg = 0;
958       continue;
959     }
960 
961     unsigned NotImm = ~Imm & 0xffff;
962     if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
963       MRI->clearKillFlags(LastVPTReg);
964       Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
965       if (MRI->use_empty(VPR)) {
966         DeadInstructions.insert(Copy);
967         if (MRI->hasOneUse(GPR))
968           DeadInstructions.insert(MRI->getVRegDef(GPR));
969       }
970       LLVM_DEBUG(dbgs() << "Reusing predicate: in  " << Instr);
971       VPR = LastVPTReg;
972     } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
973       // We have found the not of a previous constant. Create a VPNot of the
974       // earlier predicate reg and use it instead of the copy.
975       Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
976       auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
977                            TII->get(ARM::MVE_VPNOT), NewVPR)
978                        .addReg(LastVPTReg);
979       addUnpredicatedMveVpredNOp(VPNot);
980 
981       // Use the new register and check if the def is now dead.
982       Instr.getOperand(PIdx + 1).setReg(NewVPR);
983       if (MRI->use_empty(VPR)) {
984         DeadInstructions.insert(Copy);
985         if (MRI->hasOneUse(GPR))
986           DeadInstructions.insert(MRI->getVRegDef(GPR));
987       }
988       LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << "  to replace use at "
989                         << Instr);
990       VPR = NewVPR;
991     }
992 
993     LastVPTImm = Imm;
994     LastVPTReg = VPR;
995   }
996 
997   for (MachineInstr *DI : DeadInstructions)
998     DI->eraseFromParent();
999 
1000   return !DeadInstructions.empty();
1001 }
1002 
1003 // Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
1004 // somewhat blunt approximation to allow tail predicated with vpsel
1005 // instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
1006 // different semantics under tail predication. Until that is modelled we just
1007 // convert to a VMOVT (via a predicated VORR) instead.
1008 bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
1009   bool HasVCTP = false;
1010   SmallVector<MachineInstr *, 4> DeadInstructions;
1011 
1012   for (MachineInstr &MI : MBB.instrs()) {
1013     if (isVCTP(&MI)) {
1014       HasVCTP = true;
1015       continue;
1016     }
1017 
1018     if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL)
1019       continue;
1020 
1021     MachineInstrBuilder MIBuilder =
1022         BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR))
1023             .add(MI.getOperand(0))
1024             .add(MI.getOperand(1))
1025             .add(MI.getOperand(1))
1026             .addImm(ARMVCC::Then)
1027             .add(MI.getOperand(4))
1028             .add(MI.getOperand(5))
1029             .add(MI.getOperand(2));
1030     // Silence unused variable warning in release builds.
1031     (void)MIBuilder;
1032     LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump();
1033                dbgs() << "     with VMOVT: "; MIBuilder.getInstr()->dump());
1034     DeadInstructions.push_back(&MI);
1035   }
1036 
1037   for (MachineInstr *DeadInstruction : DeadInstructions)
1038     DeadInstruction->eraseFromParent();
1039 
1040   return !DeadInstructions.empty();
1041 }
1042 
1043 // Add a registry allocation hint for t2DoLoopStart to hint it towards LR, as
1044 // the instruction may be removable as a noop.
1045 bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
1046   bool Changed = false;
1047   for (MachineInstr &MI : MBB.instrs()) {
1048     if (MI.getOpcode() != ARM::t2DoLoopStart)
1049       continue;
1050     Register R = MI.getOperand(1).getReg();
1051     MachineFunction *MF = MI.getParent()->getParent();
1052     MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
1053     Changed = true;
1054   }
1055   return Changed;
1056 }
1057 
1058 bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
1059   const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
1060 
1061   if (!STI.isThumb2() || !STI.hasLOB())
1062     return false;
1063 
1064   TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
1065   MRI = &Fn.getRegInfo();
1066   MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
1067   MachineDominatorTree *DT =
1068       &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
1069 
1070   LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
1071                     << "********** Function: " << Fn.getName() << '\n');
1072 
1073   bool Modified = false;
1074   for (MachineLoop *ML : MLI->getLoopsInPreorder()) {
1075     Modified |= LowerWhileLoopStart(ML);
1076     Modified |= MergeLoopEnd(ML);
1077     Modified |= ConvertTailPredLoop(ML, DT);
1078   }
1079 
1080   for (MachineBasicBlock &MBB : Fn) {
1081     Modified |= HintDoLoopStartReg(MBB);
1082     Modified |= ReplaceConstByVPNOTs(MBB, DT);
1083     Modified |= ReplaceVCMPsByVPNOTs(MBB);
1084     Modified |= ReduceOldVCCRValueUses(MBB);
1085     Modified |= ConvertVPSEL(MBB);
1086   }
1087 
1088   LLVM_DEBUG(dbgs() << "**************************************\n");
1089   return Modified;
1090 }
1091 
1092 /// createMVETPAndVPTOptimisationsPass
1093 FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() {
1094   return new MVETPAndVPTOptimisations();
1095 }
1096