1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo 10 /// instructions into machine operations. 11 /// The expectation is that the loop contains three pseudo instructions: 12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop 13 /// form should be in the preheader, whereas the while form should be in the 14 /// preheaders only predecessor. 15 /// - t2LoopDec - placed within in the loop body. 16 /// - t2LoopEnd - the loop latch terminator. 17 /// 18 /// In addition to this, we also look for the presence of the VCTP instruction, 19 /// which determines whether we can generated the tail-predicated low-overhead 20 /// loop form. 21 /// 22 /// Assumptions and Dependencies: 23 /// Low-overhead loops are constructed and executed using a setup instruction: 24 /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. 25 /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range 26 /// but fixed polarity: WLS can only branch forwards and LE can only branch 27 /// backwards. These restrictions mean that this pass is dependent upon block 28 /// layout and block sizes, which is why it's the last pass to run. The same is 29 /// true for ConstantIslands, but this pass does not increase the size of the 30 /// basic blocks, nor does it change the CFG. Instructions are mainly removed 31 /// during the transform and pseudo instructions are replaced by real ones. In 32 /// some cases, when we have to revert to a 'normal' loop, we have to introduce 33 /// multiple instructions for a single pseudo (see RevertWhile and 34 /// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd 35 /// are defined to be as large as this maximum sequence of replacement 36 /// instructions. 37 /// 38 /// A note on VPR.P0 (the lane mask): 39 /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a 40 /// "VPT Active" context (which includes low-overhead loops and vpt blocks). 41 /// They will simply "and" the result of their calculation with the current 42 /// value of VPR.P0. You can think of it like this: 43 /// \verbatim 44 /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs 45 /// VPR.P0 &= Value 46 /// else 47 /// VPR.P0 = Value 48 /// \endverbatim 49 /// When we're inside the low-overhead loop (between DLSTP and LETP), we always 50 /// fall in the "VPT active" case, so we can consider that all VPR writes by 51 /// one of those instruction is actually a "and". 52 //===----------------------------------------------------------------------===// 53 54 #include "ARM.h" 55 #include "ARMBaseInstrInfo.h" 56 #include "ARMBaseRegisterInfo.h" 57 #include "ARMBasicBlockInfo.h" 58 #include "ARMSubtarget.h" 59 #include "Thumb2InstrInfo.h" 60 #include "llvm/ADT/SetOperations.h" 61 #include "llvm/ADT/SmallSet.h" 62 #include "llvm/CodeGen/LivePhysRegs.h" 63 #include "llvm/CodeGen/MachineFunctionPass.h" 64 #include "llvm/CodeGen/MachineLoopInfo.h" 65 #include "llvm/CodeGen/MachineLoopUtils.h" 66 #include "llvm/CodeGen/MachineRegisterInfo.h" 67 #include "llvm/CodeGen/Passes.h" 68 #include "llvm/CodeGen/ReachingDefAnalysis.h" 69 #include "llvm/MC/MCInstrDesc.h" 70 71 using namespace llvm; 72 73 #define DEBUG_TYPE "arm-low-overhead-loops" 74 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" 75 76 namespace { 77 78 using InstSet = SmallPtrSetImpl<MachineInstr *>; 79 80 class PostOrderLoopTraversal { 81 MachineLoop &ML; 82 MachineLoopInfo &MLI; 83 SmallPtrSet<MachineBasicBlock*, 4> Visited; 84 SmallVector<MachineBasicBlock*, 4> Order; 85 86 public: 87 PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) 88 : ML(ML), MLI(MLI) { } 89 90 const SmallVectorImpl<MachineBasicBlock*> &getOrder() const { 91 return Order; 92 } 93 94 // Visit all the blocks within the loop, as well as exit blocks and any 95 // blocks properly dominating the header. 96 void ProcessLoop() { 97 std::function<void(MachineBasicBlock*)> Search = [this, &Search] 98 (MachineBasicBlock *MBB) -> void { 99 if (Visited.count(MBB)) 100 return; 101 102 Visited.insert(MBB); 103 for (auto *Succ : MBB->successors()) { 104 if (!ML.contains(Succ)) 105 continue; 106 Search(Succ); 107 } 108 Order.push_back(MBB); 109 }; 110 111 // Insert exit blocks. 112 SmallVector<MachineBasicBlock*, 2> ExitBlocks; 113 ML.getExitBlocks(ExitBlocks); 114 for (auto *MBB : ExitBlocks) 115 Order.push_back(MBB); 116 117 // Then add the loop body. 118 Search(ML.getHeader()); 119 120 // Then try the preheader and its predecessors. 121 std::function<void(MachineBasicBlock*)> GetPredecessor = 122 [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { 123 Order.push_back(MBB); 124 if (MBB->pred_size() == 1) 125 GetPredecessor(*MBB->pred_begin()); 126 }; 127 128 if (auto *Preheader = ML.getLoopPreheader()) 129 GetPredecessor(Preheader); 130 else if (auto *Preheader = MLI.findLoopPreheader(&ML, true)) 131 GetPredecessor(Preheader); 132 } 133 }; 134 135 struct PredicatedMI { 136 MachineInstr *MI = nullptr; 137 SetVector<MachineInstr*> Predicates; 138 139 public: 140 PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) { 141 assert(I && "Instruction must not be null!"); 142 Predicates.insert(Preds.begin(), Preds.end()); 143 } 144 }; 145 146 // Represent a VPT block, a list of instructions that begins with a VPT/VPST 147 // and has a maximum of four proceeding instructions. All instructions within 148 // the block are predicated upon the vpr and we allow instructions to define 149 // the vpr within in the block too. 150 class VPTBlock { 151 // The predicate then instruction, which is either a VPT, or a VPST 152 // instruction. 153 std::unique_ptr<PredicatedMI> PredicateThen; 154 PredicatedMI *Divergent = nullptr; 155 SmallVector<PredicatedMI, 4> Insts; 156 157 public: 158 VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) { 159 PredicateThen = std::make_unique<PredicatedMI>(MI, Preds); 160 } 161 162 void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) { 163 LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI); 164 if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) { 165 Divergent = &Insts.back(); 166 LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI); 167 } 168 Insts.emplace_back(MI, Preds); 169 assert(Insts.size() <= 4 && "Too many instructions in VPT block!"); 170 } 171 172 // Have we found an instruction within the block which defines the vpr? If 173 // so, not all the instructions in the block will have the same predicate. 174 bool HasNonUniformPredicate() const { 175 return Divergent != nullptr; 176 } 177 178 // Is the given instruction part of the predicate set controlling the entry 179 // to the block. 180 bool IsPredicatedOn(MachineInstr *MI) const { 181 return PredicateThen->Predicates.count(MI); 182 } 183 184 // Returns true if this is a VPT instruction. 185 bool isVPT() const { return !isVPST(); } 186 187 // Returns true if this is a VPST instruction. 188 bool isVPST() const { 189 return PredicateThen->MI->getOpcode() == ARM::MVE_VPST; 190 } 191 192 // Is the given instruction the only predicate which controls the entry to 193 // the block. 194 bool IsOnlyPredicatedOn(MachineInstr *MI) const { 195 return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1; 196 } 197 198 unsigned size() const { return Insts.size(); } 199 SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; } 200 MachineInstr *getPredicateThen() const { return PredicateThen->MI; } 201 PredicatedMI *getDivergent() const { return Divergent; } 202 }; 203 204 struct Reduction { 205 MachineInstr *Init; 206 MachineInstr &Copy; 207 MachineInstr &Reduce; 208 MachineInstr &VPSEL; 209 210 Reduction(MachineInstr *Init, MachineInstr *Mov, MachineInstr *Add, 211 MachineInstr *Sel) 212 : Init(Init), Copy(*Mov), Reduce(*Add), VPSEL(*Sel) { } 213 }; 214 215 struct LowOverheadLoop { 216 217 MachineLoop &ML; 218 MachineBasicBlock *Preheader = nullptr; 219 MachineLoopInfo &MLI; 220 ReachingDefAnalysis &RDA; 221 const TargetRegisterInfo &TRI; 222 const ARMBaseInstrInfo &TII; 223 MachineFunction *MF = nullptr; 224 MachineInstr *InsertPt = nullptr; 225 MachineInstr *Start = nullptr; 226 MachineInstr *Dec = nullptr; 227 MachineInstr *End = nullptr; 228 MachineInstr *VCTP = nullptr; 229 MachineOperand TPNumElements; 230 SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs; 231 VPTBlock *CurrentBlock = nullptr; 232 SetVector<MachineInstr*> CurrentPredicate; 233 SmallVector<VPTBlock, 4> VPTBlocks; 234 SmallPtrSet<MachineInstr*, 4> ToRemove; 235 SmallVector<std::unique_ptr<Reduction>, 1> Reductions; 236 SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute; 237 bool Revert = false; 238 bool CannotTailPredicate = false; 239 240 LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, 241 ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI, 242 const ARMBaseInstrInfo &TII) 243 : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII), 244 TPNumElements(MachineOperand::CreateImm(0)) { 245 MF = ML.getHeader()->getParent(); 246 if (auto *MBB = ML.getLoopPreheader()) 247 Preheader = MBB; 248 else if (auto *MBB = MLI.findLoopPreheader(&ML, true)) 249 Preheader = MBB; 250 } 251 252 // If this is an MVE instruction, check that we know how to use tail 253 // predication with it. Record VPT blocks and return whether the 254 // instruction is valid for tail predication. 255 bool ValidateMVEInst(MachineInstr *MI); 256 257 void AnalyseMVEInst(MachineInstr *MI) { 258 CannotTailPredicate = !ValidateMVEInst(MI); 259 } 260 261 bool IsTailPredicationLegal() const { 262 // For now, let's keep things really simple and only support a single 263 // block for tail predication. 264 return !Revert && FoundAllComponents() && VCTP && 265 !CannotTailPredicate && ML.getNumBlocks() == 1; 266 } 267 268 // Check that the predication in the loop will be equivalent once we 269 // perform the conversion. Also ensure that we can provide the number 270 // of elements to the loop start instruction. 271 bool ValidateTailPredicate(MachineInstr *StartInsertPt); 272 273 // See whether the live-out instructions are a reduction that we can fixup 274 // later. 275 bool FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers); 276 277 // Check that any values available outside of the loop will be the same 278 // after tail predication conversion. 279 bool ValidateLiveOuts(); 280 281 // Is it safe to define LR with DLS/WLS? 282 // LR can be defined if it is the operand to start, because it's the same 283 // value, or if it's going to be equivalent to the operand to Start. 284 MachineInstr *isSafeToDefineLR(); 285 286 // Check the branch targets are within range and we satisfy our 287 // restrictions. 288 void CheckLegality(ARMBasicBlockUtils *BBUtils); 289 290 bool FoundAllComponents() const { 291 return Start && Dec && End; 292 } 293 294 SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; } 295 296 // Return the operand for the loop start instruction. This will be the loop 297 // iteration count, or the number of elements if we're tail predicating. 298 MachineOperand &getLoopStartOperand() { 299 return IsTailPredicationLegal() ? TPNumElements : Start->getOperand(0); 300 } 301 302 unsigned getStartOpcode() const { 303 bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; 304 if (!IsTailPredicationLegal()) 305 return IsDo ? ARM::t2DLS : ARM::t2WLS; 306 307 return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo); 308 } 309 310 void dump() const { 311 if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; 312 if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; 313 if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; 314 if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP; 315 if (!FoundAllComponents()) 316 dbgs() << "ARM Loops: Not a low-overhead loop.\n"; 317 else if (!(Start && Dec && End)) 318 dbgs() << "ARM Loops: Failed to find all loop components.\n"; 319 } 320 }; 321 322 class ARMLowOverheadLoops : public MachineFunctionPass { 323 MachineFunction *MF = nullptr; 324 MachineLoopInfo *MLI = nullptr; 325 ReachingDefAnalysis *RDA = nullptr; 326 const ARMBaseInstrInfo *TII = nullptr; 327 MachineRegisterInfo *MRI = nullptr; 328 const TargetRegisterInfo *TRI = nullptr; 329 std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; 330 331 public: 332 static char ID; 333 334 ARMLowOverheadLoops() : MachineFunctionPass(ID) { } 335 336 void getAnalysisUsage(AnalysisUsage &AU) const override { 337 AU.setPreservesCFG(); 338 AU.addRequired<MachineLoopInfo>(); 339 AU.addRequired<ReachingDefAnalysis>(); 340 MachineFunctionPass::getAnalysisUsage(AU); 341 } 342 343 bool runOnMachineFunction(MachineFunction &MF) override; 344 345 MachineFunctionProperties getRequiredProperties() const override { 346 return MachineFunctionProperties().set( 347 MachineFunctionProperties::Property::NoVRegs).set( 348 MachineFunctionProperties::Property::TracksLiveness); 349 } 350 351 StringRef getPassName() const override { 352 return ARM_LOW_OVERHEAD_LOOPS_NAME; 353 } 354 355 private: 356 bool ProcessLoop(MachineLoop *ML); 357 358 bool RevertNonLoops(); 359 360 void RevertWhile(MachineInstr *MI) const; 361 362 bool RevertLoopDec(MachineInstr *MI) const; 363 364 void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; 365 366 void ConvertVPTBlocks(LowOverheadLoop &LoLoop); 367 368 void FixupReductions(LowOverheadLoop &LoLoop) const; 369 370 MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); 371 372 void Expand(LowOverheadLoop &LoLoop); 373 374 void IterationCountDCE(LowOverheadLoop &LoLoop); 375 }; 376 } 377 378 char ARMLowOverheadLoops::ID = 0; 379 380 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, 381 false, false) 382 383 MachineInstr *LowOverheadLoop::isSafeToDefineLR() { 384 // We can define LR because LR already contains the same value. 385 if (Start->getOperand(0).getReg() == ARM::LR) 386 return Start; 387 388 unsigned CountReg = Start->getOperand(0).getReg(); 389 auto IsMoveLR = [&CountReg](MachineInstr *MI) { 390 return MI->getOpcode() == ARM::tMOVr && 391 MI->getOperand(0).getReg() == ARM::LR && 392 MI->getOperand(1).getReg() == CountReg && 393 MI->getOperand(2).getImm() == ARMCC::AL; 394 }; 395 396 MachineBasicBlock *MBB = Start->getParent(); 397 398 // Find an insertion point: 399 // - Is there a (mov lr, Count) before Start? If so, and nothing else writes 400 // to Count before Start, we can insert at that mov. 401 if (auto *LRDef = RDA.getUniqueReachingMIDef(Start, ARM::LR)) 402 if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) 403 return LRDef; 404 405 // - Is there a (mov lr, Count) after Start? If so, and nothing else writes 406 // to Count after Start, we can insert at that mov. 407 if (auto *LRDef = RDA.getLocalLiveOutMIDef(MBB, ARM::LR)) 408 if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) 409 return LRDef; 410 411 // We've found no suitable LR def and Start doesn't use LR directly. Can we 412 // just define LR anyway? 413 return RDA.isSafeToDefRegAt(Start, ARM::LR) ? Start : nullptr; 414 } 415 416 bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { 417 assert(VCTP && "VCTP instruction expected but is not set"); 418 // All predication within the loop should be based on vctp. If the block 419 // isn't predicated on entry, check whether the vctp is within the block 420 // and that all other instructions are then predicated on it. 421 for (auto &Block : VPTBlocks) { 422 if (Block.IsPredicatedOn(VCTP)) 423 continue; 424 if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) { 425 LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: " 426 << *Block.getDivergent()->MI); 427 return false; 428 } 429 SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts(); 430 for (auto &PredMI : Insts) { 431 // Check the instructions in the block and only allow: 432 // - VCTPs 433 // - Instructions predicated on the main VCTP 434 // - Any VCMP 435 // - VCMPs just "and" their result with VPR.P0. Whether they are 436 // located before/after the VCTP is irrelevant - the end result will 437 // be the same in both cases, so there's no point in requiring them 438 // to be located after the VCTP! 439 if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) || 440 VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0) 441 continue; 442 LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI 443 << " - which is predicated on:\n"; 444 for (auto *MI : PredMI.Predicates) 445 dbgs() << " - " << *MI); 446 return false; 447 } 448 } 449 450 if (!ValidateLiveOuts()) 451 return false; 452 453 // For tail predication, we need to provide the number of elements, instead 454 // of the iteration count, to the loop start instruction. The number of 455 // elements is provided to the vctp instruction, so we need to check that 456 // we can use this register at InsertPt. 457 TPNumElements = VCTP->getOperand(1); 458 Register NumElements = TPNumElements.getReg(); 459 460 // If the register is defined within loop, then we can't perform TP. 461 // TODO: Check whether this is just a mov of a register that would be 462 // available. 463 if (RDA.hasLocalDefBefore(VCTP, NumElements)) { 464 LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n"); 465 return false; 466 } 467 468 // The element count register maybe defined after InsertPt, in which case we 469 // need to try to move either InsertPt or the def so that the [w|d]lstp can 470 // use the value. 471 MachineBasicBlock *InsertBB = StartInsertPt->getParent(); 472 473 if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) { 474 if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) { 475 if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) { 476 ElemDef->removeFromParent(); 477 InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef); 478 LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " 479 << *ElemDef); 480 } else if (RDA.isSafeToMoveBackwards(StartInsertPt, ElemDef)) { 481 StartInsertPt->removeFromParent(); 482 InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), 483 StartInsertPt); 484 LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); 485 } else { 486 // If we fail to move an instruction and the element count is provided 487 // by a mov, use the mov operand if it will have the same value at the 488 // insertion point 489 MachineOperand Operand = ElemDef->getOperand(1); 490 if (isMovRegOpcode(ElemDef->getOpcode()) && 491 RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg()) == 492 RDA.getUniqueReachingMIDef(StartInsertPt, Operand.getReg())) { 493 TPNumElements = Operand; 494 NumElements = TPNumElements.getReg(); 495 } else { 496 LLVM_DEBUG(dbgs() 497 << "ARM Loops: Unable to move element count to loop " 498 << "start instruction.\n"); 499 return false; 500 } 501 } 502 } 503 } 504 505 // Especially in the case of while loops, InsertBB may not be the 506 // preheader, so we need to check that the register isn't redefined 507 // before entering the loop. 508 auto CannotProvideElements = [this](MachineBasicBlock *MBB, 509 Register NumElements) { 510 // NumElements is redefined in this block. 511 if (RDA.hasLocalDefBefore(&MBB->back(), NumElements)) 512 return true; 513 514 // Don't continue searching up through multiple predecessors. 515 if (MBB->pred_size() > 1) 516 return true; 517 518 return false; 519 }; 520 521 // First, find the block that looks like the preheader. 522 MachineBasicBlock *MBB = Preheader; 523 if (!MBB) { 524 LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n"); 525 return false; 526 } 527 528 // Then search backwards for a def, until we get to InsertBB. 529 while (MBB != InsertBB) { 530 if (CannotProvideElements(MBB, NumElements)) { 531 LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n"); 532 return false; 533 } 534 MBB = *MBB->pred_begin(); 535 } 536 537 // Check that the value change of the element count is what we expect and 538 // that the predication will be equivalent. For this we need: 539 // NumElements = NumElements - VectorWidth. The sub will be a sub immediate 540 // and we can also allow register copies within the chain too. 541 auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) { 542 return -getAddSubImmediate(*MI) == ExpectedVecWidth; 543 }; 544 545 MBB = VCTP->getParent(); 546 if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(), NumElements)) { 547 SmallPtrSet<MachineInstr*, 2> ElementChain; 548 SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP }; 549 unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode()); 550 551 Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end()); 552 553 if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) { 554 bool FoundSub = false; 555 556 for (auto *MI : ElementChain) { 557 if (isMovRegOpcode(MI->getOpcode())) 558 continue; 559 560 if (isSubImmOpcode(MI->getOpcode())) { 561 if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) 562 return false; 563 FoundSub = true; 564 } else 565 return false; 566 } 567 568 LLVM_DEBUG(dbgs() << "ARM Loops: Will remove element count chain:\n"; 569 for (auto *MI : ElementChain) 570 dbgs() << " - " << *MI); 571 ToRemove.insert(ElementChain.begin(), ElementChain.end()); 572 } 573 } 574 return true; 575 } 576 577 static bool isVectorPredicated(MachineInstr *MI) { 578 int PIdx = llvm::findFirstVPTPredOperandIdx(*MI); 579 return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR; 580 } 581 582 static bool isRegInClass(const MachineOperand &MO, 583 const TargetRegisterClass *Class) { 584 return MO.isReg() && MO.getReg() && Class->contains(MO.getReg()); 585 } 586 587 // MVE 'narrowing' operate on half a lane, reading from half and writing 588 // to half, which are referred to has the top and bottom half. The other 589 // half retains its previous value. 590 static bool retainsPreviousHalfElement(const MachineInstr &MI) { 591 const MCInstrDesc &MCID = MI.getDesc(); 592 uint64_t Flags = MCID.TSFlags; 593 return (Flags & ARMII::RetainsPreviousHalfElement) != 0; 594 } 595 596 // Some MVE instructions read from the top/bottom halves of their operand(s) 597 // and generate a vector result with result elements that are double the 598 // width of the input. 599 static bool producesDoubleWidthResult(const MachineInstr &MI) { 600 const MCInstrDesc &MCID = MI.getDesc(); 601 uint64_t Flags = MCID.TSFlags; 602 return (Flags & ARMII::DoubleWidthResult) != 0; 603 } 604 605 static bool isHorizontalReduction(const MachineInstr &MI) { 606 const MCInstrDesc &MCID = MI.getDesc(); 607 uint64_t Flags = MCID.TSFlags; 608 return (Flags & ARMII::HorizontalReduction) != 0; 609 } 610 611 // Can this instruction generate a non-zero result when given only zeroed 612 // operands? This allows us to know that, given operands with false bytes 613 // zeroed by masked loads, that the result will also contain zeros in those 614 // bytes. 615 static bool canGenerateNonZeros(const MachineInstr &MI) { 616 617 // Check for instructions which can write into a larger element size, 618 // possibly writing into a previous zero'd lane. 619 if (producesDoubleWidthResult(MI)) 620 return true; 621 622 switch (MI.getOpcode()) { 623 default: 624 break; 625 // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow 626 // fp16 -> fp32 vector conversions. 627 // Instructions that perform a NOT will generate 1s from 0s. 628 case ARM::MVE_VMVN: 629 case ARM::MVE_VORN: 630 // Count leading zeros will do just that! 631 case ARM::MVE_VCLZs8: 632 case ARM::MVE_VCLZs16: 633 case ARM::MVE_VCLZs32: 634 return true; 635 } 636 return false; 637 } 638 639 640 // Look at its register uses to see if it only can only receive zeros 641 // into its false lanes which would then produce zeros. Also check that 642 // the output register is also defined by an FalseLanesZero instruction 643 // so that if tail-predication happens, the lanes that aren't updated will 644 // still be zeros. 645 static bool producesFalseLanesZero(MachineInstr &MI, 646 const TargetRegisterClass *QPRs, 647 const ReachingDefAnalysis &RDA, 648 InstSet &FalseLanesZero) { 649 if (canGenerateNonZeros(MI)) 650 return false; 651 652 bool AllowScalars = isHorizontalReduction(MI); 653 for (auto &MO : MI.operands()) { 654 if (!MO.isReg() || !MO.getReg()) 655 continue; 656 if (!isRegInClass(MO, QPRs) && AllowScalars) 657 continue; 658 if (auto *OpDef = RDA.getMIOperand(&MI, MO)) 659 if (FalseLanesZero.count(OpDef)) 660 continue; 661 return false; 662 } 663 LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); 664 return true; 665 } 666 667 bool 668 LowOverheadLoop::FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers) { 669 // Also check for reductions where the operation needs to be merging values 670 // from the last and previous loop iterations. This means an instruction 671 // producing a value and a vmov storing the value calculated in the previous 672 // iteration. So we can have two live-out regs, one produced by a vmov and 673 // both being consumed by a vpsel. 674 LLVM_DEBUG(dbgs() << "ARM Loops: Looking for reduction live-outs:\n"; 675 for (auto *MI : LiveMIs) 676 dbgs() << " - " << *MI); 677 678 if (!Preheader) 679 return false; 680 681 // Expect a vmov, a vadd and a single vpsel user. 682 // TODO: This means we can't currently support multiple reductions in the 683 // loop. 684 if (LiveMIs.size() != 2 || LiveOutUsers.size() != 1) 685 return false; 686 687 MachineInstr *VPSEL = *LiveOutUsers.begin(); 688 if (VPSEL->getOpcode() != ARM::MVE_VPSEL) 689 return false; 690 691 unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*VPSEL) + 1; 692 MachineInstr *Pred = RDA.getMIOperand(VPSEL, VPRIdx); 693 if (!Pred || Pred != VCTP) { 694 LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n"); 695 return false; 696 } 697 698 MachineInstr *Reduce = RDA.getMIOperand(VPSEL, 1); 699 if (!Reduce) 700 return false; 701 702 assert(LiveMIs.count(Reduce) && "Expected MI to be live-out"); 703 704 // TODO: Support more operations than VADD. 705 switch (VCTP->getOpcode()) { 706 default: 707 return false; 708 case ARM::MVE_VCTP8: 709 if (Reduce->getOpcode() != ARM::MVE_VADDi8) 710 return false; 711 break; 712 case ARM::MVE_VCTP16: 713 if (Reduce->getOpcode() != ARM::MVE_VADDi16) 714 return false; 715 break; 716 case ARM::MVE_VCTP32: 717 if (Reduce->getOpcode() != ARM::MVE_VADDi32) 718 return false; 719 break; 720 } 721 722 // Test that the reduce op is overwriting ones of its operands. 723 if (Reduce->getOperand(0).getReg() != Reduce->getOperand(1).getReg() && 724 Reduce->getOperand(0).getReg() != Reduce->getOperand(2).getReg()) { 725 LLVM_DEBUG(dbgs() << "ARM Loops: Reducing op isn't overwriting itself.\n"); 726 return false; 727 } 728 729 // Check that the VORR is actually a VMOV. 730 MachineInstr *Copy = RDA.getMIOperand(VPSEL, 2); 731 if (!Copy || Copy->getOpcode() != ARM::MVE_VORR || 732 !Copy->getOperand(1).isReg() || !Copy->getOperand(2).isReg() || 733 Copy->getOperand(1).getReg() != Copy->getOperand(2).getReg()) 734 return false; 735 736 assert(LiveMIs.count(Copy) && "Expected MI to be live-out"); 737 738 // Check that the vadd and vmov are only used by each other and the vpsel. 739 SmallPtrSet<MachineInstr*, 2> CopyUsers; 740 RDA.getGlobalUses(Copy, Copy->getOperand(0).getReg(), CopyUsers); 741 if (CopyUsers.size() > 2 || !CopyUsers.count(Reduce)) { 742 LLVM_DEBUG(dbgs() << "ARM Loops: Copy users unsupported.\n"); 743 return false; 744 } 745 746 SmallPtrSet<MachineInstr*, 2> ReduceUsers; 747 RDA.getGlobalUses(Reduce, Reduce->getOperand(0).getReg(), ReduceUsers); 748 if (ReduceUsers.size() > 2 || !ReduceUsers.count(Copy)) { 749 LLVM_DEBUG(dbgs() << "ARM Loops: Reduce users unsupported.\n"); 750 return false; 751 } 752 753 // Then find whether there's an instruction initialising the register that 754 // is storing the reduction. 755 SmallPtrSet<MachineInstr*, 2> Incoming; 756 RDA.getLiveOuts(Preheader, Copy->getOperand(1).getReg(), Incoming); 757 if (Incoming.size() > 1) 758 return false; 759 760 MachineInstr *Init = Incoming.empty() ? nullptr : *Incoming.begin(); 761 LLVM_DEBUG(dbgs() << "ARM Loops: Found a reduction:\n" 762 << " - " << *Copy 763 << " - " << *Reduce 764 << " - " << *VPSEL); 765 Reductions.push_back(std::make_unique<Reduction>(Init, Copy, Reduce, VPSEL)); 766 return true; 767 } 768 769 bool LowOverheadLoop::ValidateLiveOuts() { 770 // We want to find out if the tail-predicated version of this loop will 771 // produce the same values as the loop in its original form. For this to 772 // be true, the newly inserted implicit predication must not change the 773 // the (observable) results. 774 // We're doing this because many instructions in the loop will not be 775 // predicated and so the conversion from VPT predication to tail-predication 776 // can result in different values being produced; due to the tail-predication 777 // preventing many instructions from updating their falsely predicated 778 // lanes. This analysis assumes that all the instructions perform lane-wise 779 // operations and don't perform any exchanges. 780 // A masked load, whether through VPT or tail predication, will write zeros 781 // to any of the falsely predicated bytes. So, from the loads, we know that 782 // the false lanes are zeroed and here we're trying to track that those false 783 // lanes remain zero, or where they change, the differences are masked away 784 // by their user(s). 785 // All MVE stores have to be predicated, so we know that any predicate load 786 // operands, or stored results are equivalent already. Other explicitly 787 // predicated instructions will perform the same operation in the original 788 // loop and the tail-predicated form too. Because of this, we can insert 789 // loads, stores and other predicated instructions into our Predicated 790 // set and build from there. 791 const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID); 792 SetVector<MachineInstr *> FalseLanesUnknown; 793 SmallPtrSet<MachineInstr *, 4> FalseLanesZero; 794 SmallPtrSet<MachineInstr *, 4> Predicated; 795 MachineBasicBlock *Header = ML.getHeader(); 796 797 for (auto &MI : *Header) { 798 const MCInstrDesc &MCID = MI.getDesc(); 799 uint64_t Flags = MCID.TSFlags; 800 if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) 801 continue; 802 803 if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode())) 804 continue; 805 806 // Predicated loads will write zeros to the falsely predicated bytes of the 807 // destination register. 808 if (isVectorPredicated(&MI)) { 809 if (MI.mayLoad()) 810 FalseLanesZero.insert(&MI); 811 Predicated.insert(&MI); 812 continue; 813 } 814 815 if (MI.getNumDefs() == 0) 816 continue; 817 818 if (!producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) { 819 // We require retaining and horizontal operations to operate upon zero'd 820 // false lanes to ensure the conversion doesn't change the output. 821 if (retainsPreviousHalfElement(MI) || isHorizontalReduction(MI)) 822 return false; 823 // Otherwise we need to evaluate this instruction later to see whether 824 // unknown false lanes will get masked away by their user(s). 825 FalseLanesUnknown.insert(&MI); 826 } else if (!isHorizontalReduction(MI)) 827 FalseLanesZero.insert(&MI); 828 } 829 830 auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, 831 SmallPtrSetImpl<MachineInstr *> &Predicated) { 832 SmallPtrSet<MachineInstr *, 2> Uses; 833 RDA.getGlobalUses(MI, MO.getReg(), Uses); 834 for (auto *Use : Uses) { 835 if (Use != MI && !Predicated.count(Use)) 836 return false; 837 } 838 return true; 839 }; 840 841 // Visit the unknowns in reverse so that we can start at the values being 842 // stored and then we can work towards the leaves, hopefully adding more 843 // instructions to Predicated. Successfully terminating the loop means that 844 // all the unknown values have to found to be masked by predicated user(s). 845 // For any unpredicated values, we store them in NonPredicated so that we 846 // can later check whether these form a reduction. 847 SmallPtrSet<MachineInstr*, 2> NonPredicated; 848 for (auto *MI : reverse(FalseLanesUnknown)) { 849 for (auto &MO : MI->operands()) { 850 if (!isRegInClass(MO, QPRs) || !MO.isDef()) 851 continue; 852 if (!HasPredicatedUsers(MI, MO, Predicated)) { 853 LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : " 854 << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); 855 NonPredicated.insert(MI); 856 continue; 857 } 858 } 859 // Any unknown false lanes have been masked away by the user(s). 860 Predicated.insert(MI); 861 } 862 863 SmallPtrSet<MachineInstr *, 2> LiveOutMIs; 864 SmallPtrSet<MachineInstr*, 2> LiveOutUsers; 865 SmallVector<MachineBasicBlock *, 2> ExitBlocks; 866 ML.getExitBlocks(ExitBlocks); 867 assert(ML.getNumBlocks() == 1 && "Expected single block loop!"); 868 assert(ExitBlocks.size() == 1 && "Expected a single exit block"); 869 MachineBasicBlock *ExitBB = ExitBlocks.front(); 870 for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { 871 // Check Q-regs that are live in the exit blocks. We don't collect scalars 872 // because they won't be affected by lane predication. 873 if (QPRs->contains(RegMask.PhysReg)) { 874 if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg)) 875 LiveOutMIs.insert(MI); 876 RDA.getLiveInUses(ExitBB, RegMask.PhysReg, LiveOutUsers); 877 } 878 } 879 880 // If we have any non-predicated live-outs, they need to be part of a 881 // reduction that we can fixup later. The reduction that the form of an 882 // operation that uses its previous values through a vmov and then a vpsel 883 // resides in the exit blocks to select the final bytes from n and n-1 884 // iterations. 885 if (!NonPredicated.empty() && 886 !FindValidReduction(NonPredicated, LiveOutUsers)) 887 return false; 888 889 // We've already validated that any VPT predication within the loop will be 890 // equivalent when we perform the predication transformation; so we know that 891 // any VPT predicated instruction is predicated upon VCTP. Any live-out 892 // instruction needs to be predicated, so check this here. The instructions 893 // in NonPredicated have been found to be a reduction that we can ensure its 894 // legality. 895 for (auto *MI : LiveOutMIs) 896 if (!isVectorPredicated(MI) && !NonPredicated.count(MI)) 897 return false; 898 899 return true; 900 } 901 902 void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) { 903 if (Revert) 904 return; 905 906 if (!End->getOperand(1).isMBB()) 907 report_fatal_error("Expected LoopEnd to target basic block"); 908 909 // TODO Maybe there's cases where the target doesn't have to be the header, 910 // but for now be safe and revert. 911 if (End->getOperand(1).getMBB() != ML.getHeader()) { 912 LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); 913 Revert = true; 914 return; 915 } 916 917 // The WLS and LE instructions have 12-bits for the label offset. WLS 918 // requires a positive offset, while LE uses negative. 919 if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) || 920 !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) { 921 LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); 922 Revert = true; 923 return; 924 } 925 926 if (Start->getOpcode() == ARM::t2WhileLoopStart && 927 (BBUtils->getOffsetOf(Start) > 928 BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || 929 !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { 930 LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); 931 Revert = true; 932 return; 933 } 934 935 InsertPt = Revert ? nullptr : isSafeToDefineLR(); 936 if (!InsertPt) { 937 LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); 938 Revert = true; 939 return; 940 } else 941 LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); 942 943 if (!IsTailPredicationLegal()) { 944 LLVM_DEBUG(if (!VCTP) 945 dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n"; 946 dbgs() << "ARM Loops: Tail-predication is not valid.\n"); 947 return; 948 } 949 950 assert(ML.getBlocks().size() == 1 && 951 "Shouldn't be processing a loop with more than one block"); 952 CannotTailPredicate = !ValidateTailPredicate(InsertPt); 953 LLVM_DEBUG(if (CannotTailPredicate) 954 dbgs() << "ARM Loops: Couldn't validate tail predicate.\n"); 955 } 956 957 bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { 958 if (CannotTailPredicate) 959 return false; 960 961 if (isVCTP(MI)) { 962 // If we find another VCTP, check whether it uses the same value as the main VCTP. 963 // If it does, store it in the SecondaryVCTPs set, else refuse it. 964 if (VCTP) { 965 if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) || 966 !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) { 967 LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " 968 "definition from the main VCTP"); 969 return false; 970 } 971 LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI); 972 SecondaryVCTPs.insert(MI); 973 } else { 974 LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI); 975 VCTP = MI; 976 } 977 } else if (isVPTOpcode(MI->getOpcode())) { 978 if (MI->getOpcode() != ARM::MVE_VPST) { 979 assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 && 980 "VPT does not implicitly define VPR?!"); 981 CurrentPredicate.insert(MI); 982 } 983 984 VPTBlocks.emplace_back(MI, CurrentPredicate); 985 CurrentBlock = &VPTBlocks.back(); 986 return true; 987 } else if (MI->getOpcode() == ARM::MVE_VPSEL || 988 MI->getOpcode() == ARM::MVE_VPNOT) { 989 // TODO: Allow VPSEL and VPNOT, we currently cannot because: 990 // 1) It will use the VPR as a predicate operand, but doesn't have to be 991 // instead a VPT block, which means we can assert while building up 992 // the VPT block because we don't find another VPT or VPST to being a new 993 // one. 994 // 2) VPSEL still requires a VPR operand even after tail predicating, 995 // which means we can't remove it unless there is another 996 // instruction, such as vcmp, that can provide the VPR def. 997 return false; 998 } 999 1000 bool IsUse = false; 1001 bool IsDef = false; 1002 const MCInstrDesc &MCID = MI->getDesc(); 1003 for (int i = MI->getNumOperands() - 1; i >= 0; --i) { 1004 const MachineOperand &MO = MI->getOperand(i); 1005 if (!MO.isReg() || MO.getReg() != ARM::VPR) 1006 continue; 1007 1008 if (MO.isDef()) { 1009 CurrentPredicate.insert(MI); 1010 IsDef = true; 1011 } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) { 1012 CurrentBlock->addInst(MI, CurrentPredicate); 1013 IsUse = true; 1014 } else { 1015 LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI); 1016 return false; 1017 } 1018 } 1019 1020 // If we find a vpr def that is not already predicated on the vctp, we've 1021 // got disjoint predicates that may not be equivalent when we do the 1022 // conversion. 1023 if (IsDef && !IsUse && VCTP && !isVCTP(MI)) { 1024 LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI); 1025 return false; 1026 } 1027 1028 uint64_t Flags = MCID.TSFlags; 1029 if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) 1030 return true; 1031 1032 // If we find an instruction that has been marked as not valid for tail 1033 // predication, only allow the instruction if it's contained within a valid 1034 // VPT block. 1035 if ((Flags & ARMII::ValidForTailPredication) == 0 && !IsUse) { 1036 LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI); 1037 return false; 1038 } 1039 1040 // If the instruction is already explicitly predicated, then the conversion 1041 // will be fine, but ensure that all store operations are predicated. 1042 return !IsUse && MI->mayStore() ? false : true; 1043 } 1044 1045 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { 1046 const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); 1047 if (!ST.hasLOB()) 1048 return false; 1049 1050 MF = &mf; 1051 LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); 1052 1053 MLI = &getAnalysis<MachineLoopInfo>(); 1054 RDA = &getAnalysis<ReachingDefAnalysis>(); 1055 MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); 1056 MRI = &MF->getRegInfo(); 1057 TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); 1058 TRI = ST.getRegisterInfo(); 1059 BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); 1060 BBUtils->computeAllBlockSizes(); 1061 BBUtils->adjustBBOffsetsAfter(&MF->front()); 1062 1063 bool Changed = false; 1064 for (auto ML : *MLI) { 1065 if (!ML->getParentLoop()) 1066 Changed |= ProcessLoop(ML); 1067 } 1068 Changed |= RevertNonLoops(); 1069 return Changed; 1070 } 1071 1072 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { 1073 1074 bool Changed = false; 1075 1076 // Process inner loops first. 1077 for (auto I = ML->begin(), E = ML->end(); I != E; ++I) 1078 Changed |= ProcessLoop(*I); 1079 1080 LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n"; 1081 if (auto *Preheader = ML->getLoopPreheader()) 1082 dbgs() << " - " << Preheader->getName() << "\n"; 1083 else if (auto *Preheader = MLI->findLoopPreheader(ML)) 1084 dbgs() << " - " << Preheader->getName() << "\n"; 1085 else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) 1086 dbgs() << " - " << Preheader->getName() << "\n"; 1087 for (auto *MBB : ML->getBlocks()) 1088 dbgs() << " - " << MBB->getName() << "\n"; 1089 ); 1090 1091 // Search the given block for a loop start instruction. If one isn't found, 1092 // and there's only one predecessor block, search that one too. 1093 std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = 1094 [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { 1095 for (auto &MI : *MBB) { 1096 if (isLoopStart(MI)) 1097 return &MI; 1098 } 1099 if (MBB->pred_size() == 1) 1100 return SearchForStart(*MBB->pred_begin()); 1101 return nullptr; 1102 }; 1103 1104 LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII); 1105 // Search the preheader for the start intrinsic. 1106 // FIXME: I don't see why we shouldn't be supporting multiple predecessors 1107 // with potentially multiple set.loop.iterations, so we need to enable this. 1108 if (LoLoop.Preheader) 1109 LoLoop.Start = SearchForStart(LoLoop.Preheader); 1110 else 1111 return false; 1112 1113 // Find the low-overhead loop components and decide whether or not to fall 1114 // back to a normal loop. Also look for a vctp instructions and decide 1115 // whether we can convert that predicate using tail predication. 1116 for (auto *MBB : reverse(ML->getBlocks())) { 1117 for (auto &MI : *MBB) { 1118 if (MI.isDebugValue()) 1119 continue; 1120 else if (MI.getOpcode() == ARM::t2LoopDec) 1121 LoLoop.Dec = &MI; 1122 else if (MI.getOpcode() == ARM::t2LoopEnd) 1123 LoLoop.End = &MI; 1124 else if (isLoopStart(MI)) 1125 LoLoop.Start = &MI; 1126 else if (MI.getDesc().isCall()) { 1127 // TODO: Though the call will require LE to execute again, does this 1128 // mean we should revert? Always executing LE hopefully should be 1129 // faster than performing a sub,cmp,br or even subs,br. 1130 LoLoop.Revert = true; 1131 LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n"); 1132 } else { 1133 // Record VPR defs and build up their corresponding vpt blocks. 1134 // Check we know how to tail predicate any mve instructions. 1135 LoLoop.AnalyseMVEInst(&MI); 1136 } 1137 } 1138 } 1139 1140 LLVM_DEBUG(LoLoop.dump()); 1141 if (!LoLoop.FoundAllComponents()) { 1142 LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n"); 1143 return false; 1144 } 1145 1146 // Check that the only instruction using LoopDec is LoopEnd. 1147 // TODO: Check for copy chains that really have no effect. 1148 SmallPtrSet<MachineInstr*, 2> Uses; 1149 RDA->getReachingLocalUses(LoLoop.Dec, ARM::LR, Uses); 1150 if (Uses.size() > 1 || !Uses.count(LoLoop.End)) { 1151 LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n"); 1152 LoLoop.Revert = true; 1153 } 1154 LoLoop.CheckLegality(BBUtils.get()); 1155 Expand(LoLoop); 1156 return true; 1157 } 1158 1159 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a 1160 // beq that branches to the exit branch. 1161 // TODO: We could also try to generate a cbz if the value in LR is also in 1162 // another low register. 1163 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { 1164 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); 1165 MachineBasicBlock *MBB = MI->getParent(); 1166 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 1167 TII->get(ARM::t2CMPri)); 1168 MIB.add(MI->getOperand(0)); 1169 MIB.addImm(0); 1170 MIB.addImm(ARMCC::AL); 1171 MIB.addReg(ARM::NoRegister); 1172 1173 MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); 1174 unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? 1175 ARM::tBcc : ARM::t2Bcc; 1176 1177 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); 1178 MIB.add(MI->getOperand(1)); // branch target 1179 MIB.addImm(ARMCC::EQ); // condition code 1180 MIB.addReg(ARM::CPSR); 1181 MI->eraseFromParent(); 1182 } 1183 1184 bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { 1185 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); 1186 MachineBasicBlock *MBB = MI->getParent(); 1187 SmallPtrSet<MachineInstr*, 1> Ignore; 1188 for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) { 1189 if (I->getOpcode() == ARM::t2LoopEnd) { 1190 Ignore.insert(&*I); 1191 break; 1192 } 1193 } 1194 1195 // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. 1196 bool SetFlags = RDA->isSafeToDefRegAt(MI, ARM::CPSR, Ignore); 1197 1198 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 1199 TII->get(ARM::t2SUBri)); 1200 MIB.addDef(ARM::LR); 1201 MIB.add(MI->getOperand(1)); 1202 MIB.add(MI->getOperand(2)); 1203 MIB.addImm(ARMCC::AL); 1204 MIB.addReg(0); 1205 1206 if (SetFlags) { 1207 MIB.addReg(ARM::CPSR); 1208 MIB->getOperand(5).setIsDef(true); 1209 } else 1210 MIB.addReg(0); 1211 1212 MI->eraseFromParent(); 1213 return SetFlags; 1214 } 1215 1216 // Generate a subs, or sub and cmp, and a branch instead of an LE. 1217 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { 1218 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); 1219 1220 MachineBasicBlock *MBB = MI->getParent(); 1221 // Create cmp 1222 if (!SkipCmp) { 1223 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 1224 TII->get(ARM::t2CMPri)); 1225 MIB.addReg(ARM::LR); 1226 MIB.addImm(0); 1227 MIB.addImm(ARMCC::AL); 1228 MIB.addReg(ARM::NoRegister); 1229 } 1230 1231 MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); 1232 unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? 1233 ARM::tBcc : ARM::t2Bcc; 1234 1235 // Create bne 1236 MachineInstrBuilder MIB = 1237 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); 1238 MIB.add(MI->getOperand(1)); // branch target 1239 MIB.addImm(ARMCC::NE); // condition code 1240 MIB.addReg(ARM::CPSR); 1241 MI->eraseFromParent(); 1242 } 1243 1244 // Perform dead code elimation on the loop iteration count setup expression. 1245 // If we are tail-predicating, the number of elements to be processed is the 1246 // operand of the VCTP instruction in the vector body, see getCount(), which is 1247 // register $r3 in this example: 1248 // 1249 // $lr = big-itercount-expression 1250 // .. 1251 // t2DoLoopStart renamable $lr 1252 // vector.body: 1253 // .. 1254 // $vpr = MVE_VCTP32 renamable $r3 1255 // renamable $lr = t2LoopDec killed renamable $lr, 1 1256 // t2LoopEnd renamable $lr, %vector.body 1257 // tB %end 1258 // 1259 // What we would like achieve here is to replace the do-loop start pseudo 1260 // instruction t2DoLoopStart with: 1261 // 1262 // $lr = MVE_DLSTP_32 killed renamable $r3 1263 // 1264 // Thus, $r3 which defines the number of elements, is written to $lr, 1265 // and then we want to delete the whole chain that used to define $lr, 1266 // see the comment below how this chain could look like. 1267 // 1268 void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { 1269 if (!LoLoop.IsTailPredicationLegal()) 1270 return; 1271 1272 LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n"); 1273 1274 MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 0); 1275 if (!Def) { 1276 LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n"); 1277 return; 1278 } 1279 1280 // Collect and remove the users of iteration count. 1281 SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec, 1282 LoLoop.End, LoLoop.InsertPt }; 1283 SmallPtrSet<MachineInstr*, 2> Remove; 1284 if (RDA->isSafeToRemove(Def, Remove, Killed)) 1285 LoLoop.ToRemove.insert(Remove.begin(), Remove.end()); 1286 else { 1287 LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n"); 1288 return; 1289 } 1290 1291 // Collect the dead code and the MBBs in which they reside. 1292 RDA->collectKilledOperands(Def, Killed); 1293 SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks; 1294 for (auto *MI : Killed) 1295 BasicBlocks.insert(MI->getParent()); 1296 1297 // Collect IT blocks in all affected basic blocks. 1298 std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks; 1299 for (auto *MBB : BasicBlocks) { 1300 for (auto &MI : *MBB) { 1301 if (MI.getOpcode() != ARM::t2IT) 1302 continue; 1303 RDA->getReachingLocalUses(&MI, ARM::ITSTATE, ITBlocks[&MI]); 1304 } 1305 } 1306 1307 // If we're removing all of the instructions within an IT block, then 1308 // also remove the IT instruction. 1309 SmallPtrSet<MachineInstr*, 2> ModifiedITs; 1310 for (auto *MI : Killed) { 1311 if (MachineOperand *MO = MI->findRegisterUseOperand(ARM::ITSTATE)) { 1312 MachineInstr *IT = RDA->getMIOperand(MI, *MO); 1313 auto &CurrentBlock = ITBlocks[IT]; 1314 CurrentBlock.erase(MI); 1315 if (CurrentBlock.empty()) 1316 ModifiedITs.erase(IT); 1317 else 1318 ModifiedITs.insert(IT); 1319 } 1320 } 1321 1322 // Delete the killed instructions only if we don't have any IT blocks that 1323 // need to be modified because we need to fixup the mask. 1324 // TODO: Handle cases where IT blocks are modified. 1325 if (ModifiedITs.empty()) { 1326 LLVM_DEBUG(dbgs() << "ARM Loops: Will remove iteration count:\n"; 1327 for (auto *MI : Killed) 1328 dbgs() << " - " << *MI); 1329 LoLoop.ToRemove.insert(Killed.begin(), Killed.end()); 1330 } else 1331 LLVM_DEBUG(dbgs() << "ARM Loops: Would need to modify IT block(s).\n"); 1332 } 1333 1334 MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { 1335 LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n"); 1336 // When using tail-predication, try to delete the dead code that was used to 1337 // calculate the number of loop iterations. 1338 IterationCountDCE(LoLoop); 1339 1340 MachineInstr *InsertPt = LoLoop.InsertPt; 1341 MachineInstr *Start = LoLoop.Start; 1342 MachineBasicBlock *MBB = InsertPt->getParent(); 1343 bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; 1344 unsigned Opc = LoLoop.getStartOpcode(); 1345 MachineOperand &Count = LoLoop.getCount(); 1346 1347 MachineInstrBuilder MIB = 1348 BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); 1349 1350 MIB.addDef(ARM::LR); 1351 MIB.add(Count); 1352 if (!IsDo) 1353 MIB.add(Start->getOperand(1)); 1354 1355 // If we're inserting at a mov lr, then remove it as it's redundant. 1356 if (InsertPt != Start) 1357 LoLoop.ToRemove.insert(InsertPt); 1358 LoLoop.ToRemove.insert(Start); 1359 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); 1360 return &*MIB; 1361 } 1362 1363 void ARMLowOverheadLoops::FixupReductions(LowOverheadLoop &LoLoop) const { 1364 LLVM_DEBUG(dbgs() << "ARM Loops: Fixing up reduction(s).\n"); 1365 auto BuildMov = [this](MachineInstr &InsertPt, Register To, Register From) { 1366 MachineBasicBlock *MBB = InsertPt.getParent(); 1367 MachineInstrBuilder MIB = 1368 BuildMI(*MBB, &InsertPt, InsertPt.getDebugLoc(), TII->get(ARM::MVE_VORR)); 1369 MIB.addDef(To); 1370 MIB.addReg(From); 1371 MIB.addReg(From); 1372 MIB.addImm(0); 1373 MIB.addReg(0); 1374 MIB.addReg(To); 1375 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted VMOV: " << *MIB); 1376 }; 1377 1378 for (auto &Reduction : LoLoop.Reductions) { 1379 MachineInstr &Copy = Reduction->Copy; 1380 MachineInstr &Reduce = Reduction->Reduce; 1381 Register DestReg = Copy.getOperand(0).getReg(); 1382 1383 // Change the initialiser if present 1384 if (Reduction->Init) { 1385 MachineInstr *Init = Reduction->Init; 1386 1387 for (unsigned i = 0; i < Init->getNumOperands(); ++i) { 1388 MachineOperand &MO = Init->getOperand(i); 1389 if (MO.isReg() && MO.isUse() && MO.isTied() && 1390 Init->findTiedOperandIdx(i) == 0) 1391 Init->getOperand(i).setReg(DestReg); 1392 } 1393 Init->getOperand(0).setReg(DestReg); 1394 LLVM_DEBUG(dbgs() << "ARM Loops: Changed init regs: " << *Init); 1395 } else 1396 BuildMov(LoLoop.Preheader->instr_back(), DestReg, Copy.getOperand(1).getReg()); 1397 1398 // Change the reducing op to write to the register that is used to copy 1399 // its value on the next iteration. Also update the tied-def operand. 1400 Reduce.getOperand(0).setReg(DestReg); 1401 Reduce.getOperand(5).setReg(DestReg); 1402 LLVM_DEBUG(dbgs() << "ARM Loops: Changed reduction regs: " << Reduce); 1403 1404 // Instead of a vpsel, just copy the register into the necessary one. 1405 MachineInstr &VPSEL = Reduction->VPSEL; 1406 if (VPSEL.getOperand(0).getReg() != DestReg) 1407 BuildMov(VPSEL, VPSEL.getOperand(0).getReg(), DestReg); 1408 1409 // Remove the unnecessary instructions. 1410 LLVM_DEBUG(dbgs() << "ARM Loops: Removing:\n" 1411 << " - " << Copy 1412 << " - " << VPSEL << "\n"); 1413 Copy.eraseFromParent(); 1414 VPSEL.eraseFromParent(); 1415 } 1416 } 1417 1418 void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { 1419 auto RemovePredicate = [](MachineInstr *MI) { 1420 LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); 1421 if (int PIdx = llvm::findFirstVPTPredOperandIdx(*MI)) { 1422 assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then && 1423 "Expected Then predicate!"); 1424 MI->getOperand(PIdx).setImm(ARMVCC::None); 1425 MI->getOperand(PIdx+1).setReg(0); 1426 } else 1427 llvm_unreachable("trying to unpredicate a non-predicated instruction"); 1428 }; 1429 1430 // There are a few scenarios which we have to fix up: 1431 // 1. VPT Blocks with non-uniform predicates: 1432 // - a. When the divergent instruction is a vctp 1433 // - b. When the block uses a vpst, and is only predicated on the vctp 1434 // - c. When the block uses a vpt and (optionally) contains one or more 1435 // vctp. 1436 // 2. VPT Blocks with uniform predicates: 1437 // - a. The block uses a vpst, and is only predicated on the vctp 1438 for (auto &Block : LoLoop.getVPTBlocks()) { 1439 SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts(); 1440 if (Block.HasNonUniformPredicate()) { 1441 PredicatedMI *Divergent = Block.getDivergent(); 1442 if (isVCTP(Divergent->MI)) { 1443 // The vctp will be removed, so the block mask of the vp(s)t will need 1444 // to be recomputed. 1445 LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen()); 1446 } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { 1447 // The VPT block has a non-uniform predicate but it uses a vpst and its 1448 // entry is guarded only by a vctp, which means we: 1449 // - Need to remove the original vpst. 1450 // - Then need to unpredicate any following instructions, until 1451 // we come across the divergent vpr def. 1452 // - Insert a new vpst to predicate the instruction(s) that following 1453 // the divergent vpr def. 1454 // TODO: We could be producing more VPT blocks than necessary and could 1455 // fold the newly created one into a proceeding one. 1456 for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()), 1457 E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I) 1458 RemovePredicate(&*I); 1459 1460 unsigned Size = 0; 1461 auto E = MachineBasicBlock::reverse_iterator(Divergent->MI); 1462 auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI); 1463 MachineInstr *InsertAt = nullptr; 1464 while (I != E) { 1465 InsertAt = &*I; 1466 ++Size; 1467 ++I; 1468 } 1469 // Create a VPST (with a null mask for now, we'll recompute it later). 1470 MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt, 1471 InsertAt->getDebugLoc(), 1472 TII->get(ARM::MVE_VPST)); 1473 MIB.addImm(0); 1474 LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen()); 1475 LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); 1476 LoLoop.ToRemove.insert(Block.getPredicateThen()); 1477 LoLoop.BlockMasksToRecompute.insert(MIB.getInstr()); 1478 } 1479 // Else, if the block uses a vpt, iterate over the block, removing the 1480 // extra VCTPs it may contain. 1481 else if (Block.isVPT()) { 1482 bool RemovedVCTP = false; 1483 for (PredicatedMI &Elt : Block.getInsts()) { 1484 MachineInstr *MI = Elt.MI; 1485 if (isVCTP(MI)) { 1486 LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI); 1487 LoLoop.ToRemove.insert(MI); 1488 RemovedVCTP = true; 1489 continue; 1490 } 1491 } 1492 if (RemovedVCTP) 1493 LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen()); 1494 } 1495 } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) { 1496 // A vpt block starting with VPST, is only predicated upon vctp and has no 1497 // internal vpr defs: 1498 // - Remove vpst. 1499 // - Unpredicate the remaining instructions. 1500 LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen()); 1501 LoLoop.ToRemove.insert(Block.getPredicateThen()); 1502 for (auto &PredMI : Insts) 1503 RemovePredicate(PredMI.MI); 1504 } 1505 } 1506 LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n"); 1507 // Remove the "main" VCTP 1508 LoLoop.ToRemove.insert(LoLoop.VCTP); 1509 LLVM_DEBUG(dbgs() << " " << *LoLoop.VCTP); 1510 // Remove remaining secondary VCTPs 1511 for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) { 1512 // All VCTPs that aren't marked for removal yet should be unpredicated ones. 1513 // The predicated ones should have already been marked for removal when 1514 // visiting the VPT blocks. 1515 if (LoLoop.ToRemove.insert(VCTP).second) { 1516 assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None && 1517 "Removing Predicated VCTP without updating the block mask!"); 1518 LLVM_DEBUG(dbgs() << " " << *VCTP); 1519 } 1520 } 1521 } 1522 1523 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { 1524 1525 // Combine the LoopDec and LoopEnd instructions into LE(TP). 1526 auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) { 1527 MachineInstr *End = LoLoop.End; 1528 MachineBasicBlock *MBB = End->getParent(); 1529 unsigned Opc = LoLoop.IsTailPredicationLegal() ? 1530 ARM::MVE_LETP : ARM::t2LEUpdate; 1531 MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), 1532 TII->get(Opc)); 1533 MIB.addDef(ARM::LR); 1534 MIB.add(End->getOperand(0)); 1535 MIB.add(End->getOperand(1)); 1536 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); 1537 LoLoop.ToRemove.insert(LoLoop.Dec); 1538 LoLoop.ToRemove.insert(End); 1539 return &*MIB; 1540 }; 1541 1542 // TODO: We should be able to automatically remove these branches before we 1543 // get here - probably by teaching analyzeBranch about the pseudo 1544 // instructions. 1545 // If there is an unconditional branch, after I, that just branches to the 1546 // next block, remove it. 1547 auto RemoveDeadBranch = [](MachineInstr *I) { 1548 MachineBasicBlock *BB = I->getParent(); 1549 MachineInstr *Terminator = &BB->instr_back(); 1550 if (Terminator->isUnconditionalBranch() && I != Terminator) { 1551 MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB(); 1552 if (BB->isLayoutSuccessor(Succ)) { 1553 LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); 1554 Terminator->eraseFromParent(); 1555 } 1556 } 1557 }; 1558 1559 if (LoLoop.Revert) { 1560 if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart) 1561 RevertWhile(LoLoop.Start); 1562 else 1563 LoLoop.Start->eraseFromParent(); 1564 bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec); 1565 RevertLoopEnd(LoLoop.End, FlagsAlreadySet); 1566 } else { 1567 LoLoop.Start = ExpandLoopStart(LoLoop); 1568 RemoveDeadBranch(LoLoop.Start); 1569 LoLoop.End = ExpandLoopEnd(LoLoop); 1570 RemoveDeadBranch(LoLoop.End); 1571 if (LoLoop.IsTailPredicationLegal()) { 1572 ConvertVPTBlocks(LoLoop); 1573 FixupReductions(LoLoop); 1574 } 1575 for (auto *I : LoLoop.ToRemove) { 1576 LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); 1577 I->eraseFromParent(); 1578 } 1579 for (auto *I : LoLoop.BlockMasksToRecompute) { 1580 LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I); 1581 recomputeVPTBlockMask(*I); 1582 LLVM_DEBUG(dbgs() << " ... done: " << *I); 1583 } 1584 } 1585 1586 PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); 1587 DFS.ProcessLoop(); 1588 const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder(); 1589 for (auto *MBB : PostOrder) { 1590 recomputeLiveIns(*MBB); 1591 // FIXME: For some reason, the live-in print order is non-deterministic for 1592 // our tests and I can't out why... So just sort them. 1593 MBB->sortUniqueLiveIns(); 1594 } 1595 1596 for (auto *MBB : reverse(PostOrder)) 1597 recomputeLivenessFlags(*MBB); 1598 1599 // We've moved, removed and inserted new instructions, so update RDA. 1600 RDA->reset(); 1601 } 1602 1603 bool ARMLowOverheadLoops::RevertNonLoops() { 1604 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n"); 1605 bool Changed = false; 1606 1607 for (auto &MBB : *MF) { 1608 SmallVector<MachineInstr*, 4> Starts; 1609 SmallVector<MachineInstr*, 4> Decs; 1610 SmallVector<MachineInstr*, 4> Ends; 1611 1612 for (auto &I : MBB) { 1613 if (isLoopStart(I)) 1614 Starts.push_back(&I); 1615 else if (I.getOpcode() == ARM::t2LoopDec) 1616 Decs.push_back(&I); 1617 else if (I.getOpcode() == ARM::t2LoopEnd) 1618 Ends.push_back(&I); 1619 } 1620 1621 if (Starts.empty() && Decs.empty() && Ends.empty()) 1622 continue; 1623 1624 Changed = true; 1625 1626 for (auto *Start : Starts) { 1627 if (Start->getOpcode() == ARM::t2WhileLoopStart) 1628 RevertWhile(Start); 1629 else 1630 Start->eraseFromParent(); 1631 } 1632 for (auto *Dec : Decs) 1633 RevertLoopDec(Dec); 1634 1635 for (auto *End : Ends) 1636 RevertLoopEnd(End); 1637 } 1638 return Changed; 1639 } 1640 1641 FunctionPass *llvm::createARMLowOverheadLoopsPass() { 1642 return new ARMLowOverheadLoops(); 1643 } 1644