1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo 10 /// instructions into machine operations. 11 /// The expectation is that the loop contains three pseudo instructions: 12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop 13 /// form should be in the preheader, whereas the while form should be in the 14 /// preheaders only predecessor. TODO: Could DoLoopStart get moved into the 15 /// pre-preheader? 16 /// - t2LoopDec - placed within in the loop body. 17 /// - t2LoopEnd - the loop latch terminator. 18 /// 19 //===----------------------------------------------------------------------===// 20 21 #include "ARM.h" 22 #include "ARMBaseInstrInfo.h" 23 #include "ARMBaseRegisterInfo.h" 24 #include "ARMBasicBlockInfo.h" 25 #include "ARMSubtarget.h" 26 #include "llvm/CodeGen/MachineFunctionPass.h" 27 #include "llvm/CodeGen/MachineLoopInfo.h" 28 #include "llvm/CodeGen/MachineRegisterInfo.h" 29 30 using namespace llvm; 31 32 #define DEBUG_TYPE "arm-low-overhead-loops" 33 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" 34 35 namespace { 36 37 class ARMLowOverheadLoops : public MachineFunctionPass { 38 const ARMBaseInstrInfo *TII = nullptr; 39 MachineRegisterInfo *MRI = nullptr; 40 std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; 41 42 public: 43 static char ID; 44 45 ARMLowOverheadLoops() : MachineFunctionPass(ID) { } 46 47 void getAnalysisUsage(AnalysisUsage &AU) const override { 48 AU.setPreservesCFG(); 49 AU.addRequired<MachineLoopInfo>(); 50 MachineFunctionPass::getAnalysisUsage(AU); 51 } 52 53 bool runOnMachineFunction(MachineFunction &MF) override; 54 55 bool ProcessLoop(MachineLoop *ML); 56 57 bool RevertNonLoops(MachineFunction &MF); 58 59 void RevertWhile(MachineInstr *MI) const; 60 61 void RevertLoopDec(MachineInstr *MI) const; 62 63 void RevertLoopEnd(MachineInstr *MI) const; 64 65 void Expand(MachineLoop *ML, MachineInstr *Start, 66 MachineInstr *Dec, MachineInstr *End, bool Revert); 67 68 MachineFunctionProperties getRequiredProperties() const override { 69 return MachineFunctionProperties().set( 70 MachineFunctionProperties::Property::NoVRegs); 71 } 72 73 StringRef getPassName() const override { 74 return ARM_LOW_OVERHEAD_LOOPS_NAME; 75 } 76 }; 77 } 78 79 char ARMLowOverheadLoops::ID = 0; 80 81 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, 82 false, false) 83 84 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) { 85 if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB()) 86 return false; 87 88 LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n"); 89 90 auto &MLI = getAnalysis<MachineLoopInfo>(); 91 MRI = &MF.getRegInfo(); 92 TII = static_cast<const ARMBaseInstrInfo*>( 93 MF.getSubtarget().getInstrInfo()); 94 BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF)); 95 BBUtils->computeAllBlockSizes(); 96 BBUtils->adjustBBOffsetsAfter(&MF.front()); 97 98 bool Changed = false; 99 for (auto ML : MLI) { 100 if (!ML->getParentLoop()) 101 Changed |= ProcessLoop(ML); 102 } 103 Changed |= RevertNonLoops(MF); 104 return Changed; 105 } 106 107 static bool IsLoopStart(MachineInstr &MI) { 108 return MI.getOpcode() == ARM::t2DoLoopStart || 109 MI.getOpcode() == ARM::t2WhileLoopStart; 110 } 111 112 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { 113 114 bool Changed = false; 115 116 // Process inner loops first. 117 for (auto I = ML->begin(), E = ML->end(); I != E; ++I) 118 Changed |= ProcessLoop(*I); 119 120 LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); 121 122 // Search the given block for a loop start instruction. If one isn't found, 123 // and there's only one predecessor block, search that one too. 124 std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = 125 [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { 126 for (auto &MI : *MBB) { 127 if (IsLoopStart(MI)) 128 return &MI; 129 } 130 if (MBB->pred_size() == 1) 131 return SearchForStart(*MBB->pred_begin()); 132 return nullptr; 133 }; 134 135 MachineInstr *Start = nullptr; 136 MachineInstr *Dec = nullptr; 137 MachineInstr *End = nullptr; 138 bool Revert = false; 139 140 // Search the preheader for the start intrinsic, or look through the 141 // predecessors of the header to find exactly one set.iterations intrinsic. 142 // FIXME: I don't see why we shouldn't be supporting multiple predecessors 143 // with potentially multiple set.loop.iterations, so we need to enable this. 144 if (auto *Preheader = ML->getLoopPreheader()) { 145 Start = SearchForStart(Preheader); 146 } else { 147 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" 148 << " - Performing manual predecessor search.\n"); 149 MachineBasicBlock *Pred = nullptr; 150 for (auto *MBB : ML->getHeader()->predecessors()) { 151 if (!ML->contains(MBB)) { 152 if (Pred) { 153 LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); 154 Start = nullptr; 155 break; 156 } 157 Pred = MBB; 158 Start = SearchForStart(MBB); 159 } 160 } 161 } 162 163 // Find the low-overhead loop components and decide whether or not to fall 164 // back to a normal loop. 165 for (auto *MBB : reverse(ML->getBlocks())) { 166 for (auto &MI : *MBB) { 167 if (MI.getOpcode() == ARM::t2LoopDec) 168 Dec = &MI; 169 else if (MI.getOpcode() == ARM::t2LoopEnd) 170 End = &MI; 171 else if (IsLoopStart(MI)) 172 Start = &MI; 173 else if (MI.getDesc().isCall()) 174 // TODO: Though the call will require LE to execute again, does this 175 // mean we should revert? Always executing LE hopefully should be 176 // faster than performing a sub,cmp,br or even subs,br. 177 Revert = true; 178 179 if (!Dec) 180 continue; 181 182 // If we find that we load/store LR between LoopDec and LoopEnd, expect 183 // that the decremented value has been spilled to the stack. Because 184 // this value isn't actually going to be produced until the latch, by LE, 185 // we would need to generate a real sub. The value is also likely to be 186 // reloaded for use of LoopEnd - in which in case we'd need to perform 187 // an add because it gets negated again by LE! The other option is to 188 // then generate the other form of LE which doesn't perform the sub. 189 if (MI.mayLoad() || MI.mayStore()) 190 Revert = 191 MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR; 192 } 193 194 if (Dec && End && Revert) 195 break; 196 } 197 198 LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; 199 if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; 200 if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;); 201 202 if (!Start && !Dec && !End) { 203 LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); 204 return Changed; 205 } else if (!(Start && Dec && End)) { 206 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n"); 207 return false; 208 } 209 210 if (!End->getOperand(1).isMBB()) 211 report_fatal_error("Expected LoopEnd to target basic block"); 212 213 // TODO Maybe there's cases where the target doesn't have to be the header, 214 // but for now be safe and revert. 215 if (End->getOperand(1).getMBB() != ML->getHeader()) { 216 LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); 217 Revert = true; 218 } 219 220 // The WLS and LE instructions have 12-bits for the label offset. WLS 221 // requires a positive offset, while LE uses negative. 222 if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || 223 !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { 224 LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); 225 Revert = true; 226 } 227 if (Start->getOpcode() == ARM::t2WhileLoopStart && 228 (BBUtils->getOffsetOf(Start) > 229 BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || 230 !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { 231 LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); 232 Revert = true; 233 } 234 235 Expand(ML, Start, Dec, End, Revert); 236 return true; 237 } 238 239 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a 240 // beq that branches to the exit branch. 241 // FIXME: Need to check that we're not trashing the CPSR when generating the 242 // cmp. We could also try to generate a cbz if the value in LR is also in 243 // another low register. 244 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { 245 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); 246 MachineBasicBlock *MBB = MI->getParent(); 247 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 248 TII->get(ARM::t2CMPri)); 249 MIB.addReg(ARM::LR); 250 MIB.addImm(0); 251 MIB.addImm(ARMCC::AL); 252 MIB.addReg(ARM::CPSR); 253 254 // TODO: Try to use tBcc instead 255 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); 256 MIB.add(MI->getOperand(1)); // branch target 257 MIB.addImm(ARMCC::EQ); // condition code 258 MIB.addReg(ARM::CPSR); 259 MI->eraseFromParent(); 260 } 261 262 // TODO: Check flags so that we can possibly generate a tSubs or tSub. 263 void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { 264 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); 265 MachineBasicBlock *MBB = MI->getParent(); 266 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 267 TII->get(ARM::t2SUBri)); 268 MIB.addDef(ARM::LR); 269 MIB.add(MI->getOperand(1)); 270 MIB.add(MI->getOperand(2)); 271 MIB.addImm(ARMCC::AL); 272 MIB.addReg(0); 273 MIB.addReg(0); 274 MI->eraseFromParent(); 275 } 276 277 // Generate a subs, or sub and cmp, and a branch instead of an LE. 278 // FIXME: Need to check that we're not trashing the CPSR when generating 279 // the cmp. 280 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const { 281 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); 282 283 // Create cmp 284 MachineBasicBlock *MBB = MI->getParent(); 285 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 286 TII->get(ARM::t2CMPri)); 287 MIB.addReg(ARM::LR); 288 MIB.addImm(0); 289 MIB.addImm(ARMCC::AL); 290 MIB.addReg(ARM::CPSR); 291 292 // TODO Try to use tBcc instead. 293 // Create bne 294 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); 295 MIB.add(MI->getOperand(1)); // branch target 296 MIB.addImm(ARMCC::NE); // condition code 297 MIB.addReg(ARM::CPSR); 298 MI->eraseFromParent(); 299 } 300 301 void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, 302 MachineInstr *Dec, MachineInstr *End, 303 bool Revert) { 304 305 auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) { 306 // The trip count should already been held in LR since the instructions 307 // within the loop can only read and write to LR. So, there should be a 308 // mov to setup the count. WLS/DLS perform this move, so find the original 309 // and delete it - inserting WLS/DLS in its place. 310 MachineBasicBlock *MBB = Start->getParent(); 311 MachineInstr *InsertPt = Start; 312 for (auto &I : MRI->def_instructions(ARM::LR)) { 313 if (I.getParent() != MBB) 314 continue; 315 316 // Always execute. 317 if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL) 318 continue; 319 320 // Only handle move reg, if the trip count it will need moving into a reg 321 // before the setup instruction anyway. 322 if (!I.getDesc().isMoveReg() || 323 !I.getOperand(1).isIdenticalTo(Start->getOperand(0))) 324 continue; 325 InsertPt = &I; 326 break; 327 } 328 329 unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? 330 ARM::t2DLS : ARM::t2WLS; 331 MachineInstrBuilder MIB = 332 BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); 333 334 MIB.addDef(ARM::LR); 335 MIB.add(Start->getOperand(0)); 336 if (Opc == ARM::t2WLS) 337 MIB.add(Start->getOperand(1)); 338 339 if (InsertPt != Start) 340 InsertPt->eraseFromParent(); 341 Start->eraseFromParent(); 342 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); 343 return &*MIB; 344 }; 345 346 // Combine the LoopDec and LoopEnd instructions into LE(TP). 347 auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec, 348 MachineInstr *End) { 349 MachineBasicBlock *MBB = End->getParent(); 350 MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), 351 TII->get(ARM::t2LEUpdate)); 352 MIB.addDef(ARM::LR); 353 MIB.add(End->getOperand(0)); 354 MIB.add(End->getOperand(1)); 355 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); 356 357 End->eraseFromParent(); 358 Dec->eraseFromParent(); 359 return &*MIB; 360 }; 361 362 // TODO: We should be able to automatically remove these branches before we 363 // get here - probably by teaching analyzeBranch about the pseudo 364 // instructions. 365 // If there is an unconditional branch, after I, that just branches to the 366 // next block, remove it. 367 auto RemoveDeadBranch = [](MachineInstr *I) { 368 MachineBasicBlock *BB = I->getParent(); 369 MachineInstr *Terminator = &BB->instr_back(); 370 if (Terminator->isUnconditionalBranch() && I != Terminator) { 371 MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB(); 372 if (BB->isLayoutSuccessor(Succ)) { 373 LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); 374 Terminator->eraseFromParent(); 375 } 376 } 377 }; 378 379 if (Revert) { 380 if (Start->getOpcode() == ARM::t2WhileLoopStart) 381 RevertWhile(Start); 382 else 383 Start->eraseFromParent(); 384 RevertLoopDec(Dec); 385 RevertLoopEnd(End); 386 } else { 387 Start = ExpandLoopStart(ML, Start); 388 RemoveDeadBranch(Start); 389 End = ExpandLoopEnd(ML, Dec, End); 390 RemoveDeadBranch(End); 391 } 392 } 393 394 bool ARMLowOverheadLoops::RevertNonLoops(MachineFunction &MF) { 395 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n"); 396 bool Changed = false; 397 398 for (auto &MBB : MF) { 399 SmallVector<MachineInstr*, 4> Starts; 400 SmallVector<MachineInstr*, 4> Decs; 401 SmallVector<MachineInstr*, 4> Ends; 402 403 for (auto &I : MBB) { 404 if (IsLoopStart(I)) 405 Starts.push_back(&I); 406 else if (I.getOpcode() == ARM::t2LoopDec) 407 Decs.push_back(&I); 408 else if (I.getOpcode() == ARM::t2LoopEnd) 409 Ends.push_back(&I); 410 } 411 412 if (Starts.empty() && Decs.empty() && Ends.empty()) 413 continue; 414 415 Changed = true; 416 417 for (auto *Start : Starts) { 418 if (Start->getOpcode() == ARM::t2WhileLoopStart) 419 RevertWhile(Start); 420 else 421 Start->eraseFromParent(); 422 } 423 for (auto *Dec : Decs) 424 RevertLoopDec(Dec); 425 426 for (auto *End : Ends) 427 RevertLoopEnd(End); 428 } 429 return Changed; 430 } 431 432 FunctionPass *llvm::createARMLowOverheadLoopsPass() { 433 return new ARMLowOverheadLoops(); 434 } 435