xref: /netbsd-src/external/apache2/llvm/dist/llvm/lib/CodeGen/HardwareLoops.cpp (revision 82d56013d7b633d116a93943de88e08335357a7c)
17330f729Sjoerg //===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
27330f729Sjoerg //
37330f729Sjoerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
47330f729Sjoerg // See https://llvm.org/LICENSE.txt for license information.
57330f729Sjoerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
67330f729Sjoerg //
77330f729Sjoerg //===----------------------------------------------------------------------===//
87330f729Sjoerg /// \file
97330f729Sjoerg /// Insert hardware loop intrinsics into loops which are deemed profitable by
107330f729Sjoerg /// the target, by querying TargetTransformInfo. A hardware loop comprises of
117330f729Sjoerg /// two intrinsics: one, outside the loop, to set the loop iteration count and
127330f729Sjoerg /// another, in the exit block, to decrement the counter. The decremented value
137330f729Sjoerg /// can either be carried through the loop via a phi or handled in some opaque
147330f729Sjoerg /// way by the target.
157330f729Sjoerg ///
167330f729Sjoerg //===----------------------------------------------------------------------===//
177330f729Sjoerg 
187330f729Sjoerg #include "llvm/ADT/Statistic.h"
197330f729Sjoerg #include "llvm/Analysis/AssumptionCache.h"
207330f729Sjoerg #include "llvm/Analysis/LoopInfo.h"
21*82d56013Sjoerg #include "llvm/Analysis/OptimizationRemarkEmitter.h"
227330f729Sjoerg #include "llvm/Analysis/ScalarEvolution.h"
23*82d56013Sjoerg #include "llvm/Analysis/TargetLibraryInfo.h"
247330f729Sjoerg #include "llvm/Analysis/TargetTransformInfo.h"
257330f729Sjoerg #include "llvm/CodeGen/Passes.h"
267330f729Sjoerg #include "llvm/CodeGen/TargetPassConfig.h"
277330f729Sjoerg #include "llvm/IR/BasicBlock.h"
28*82d56013Sjoerg #include "llvm/IR/Constants.h"
297330f729Sjoerg #include "llvm/IR/DataLayout.h"
307330f729Sjoerg #include "llvm/IR/Dominators.h"
317330f729Sjoerg #include "llvm/IR/IRBuilder.h"
327330f729Sjoerg #include "llvm/IR/Instructions.h"
337330f729Sjoerg #include "llvm/IR/IntrinsicInst.h"
347330f729Sjoerg #include "llvm/IR/Value.h"
35*82d56013Sjoerg #include "llvm/InitializePasses.h"
36*82d56013Sjoerg #include "llvm/Pass.h"
37*82d56013Sjoerg #include "llvm/PassRegistry.h"
38*82d56013Sjoerg #include "llvm/Support/CommandLine.h"
397330f729Sjoerg #include "llvm/Support/Debug.h"
407330f729Sjoerg #include "llvm/Transforms/Scalar.h"
417330f729Sjoerg #include "llvm/Transforms/Utils.h"
427330f729Sjoerg #include "llvm/Transforms/Utils/BasicBlockUtils.h"
437330f729Sjoerg #include "llvm/Transforms/Utils/Local.h"
447330f729Sjoerg #include "llvm/Transforms/Utils/LoopUtils.h"
45*82d56013Sjoerg #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
467330f729Sjoerg 
477330f729Sjoerg #define DEBUG_TYPE "hardware-loops"
487330f729Sjoerg 
497330f729Sjoerg #define HW_LOOPS_NAME "Hardware Loop Insertion"
507330f729Sjoerg 
517330f729Sjoerg using namespace llvm;
527330f729Sjoerg 
537330f729Sjoerg static cl::opt<bool>
547330f729Sjoerg ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
557330f729Sjoerg                    cl::desc("Force hardware loops intrinsics to be inserted"));
567330f729Sjoerg 
577330f729Sjoerg static cl::opt<bool>
587330f729Sjoerg ForceHardwareLoopPHI(
597330f729Sjoerg   "force-hardware-loop-phi", cl::Hidden, cl::init(false),
607330f729Sjoerg   cl::desc("Force hardware loop counter to be updated through a phi"));
617330f729Sjoerg 
627330f729Sjoerg static cl::opt<bool>
637330f729Sjoerg ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
647330f729Sjoerg                 cl::desc("Force allowance of nested hardware loops"));
657330f729Sjoerg 
667330f729Sjoerg static cl::opt<unsigned>
677330f729Sjoerg LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
687330f729Sjoerg             cl::desc("Set the loop decrement value"));
697330f729Sjoerg 
707330f729Sjoerg static cl::opt<unsigned>
717330f729Sjoerg CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
727330f729Sjoerg                 cl::desc("Set the loop counter bitwidth"));
737330f729Sjoerg 
747330f729Sjoerg static cl::opt<bool>
757330f729Sjoerg ForceGuardLoopEntry(
767330f729Sjoerg   "force-hardware-loop-guard", cl::Hidden, cl::init(false),
777330f729Sjoerg   cl::desc("Force generation of loop guard intrinsic"));
787330f729Sjoerg 
797330f729Sjoerg STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
807330f729Sjoerg 
81*82d56013Sjoerg #ifndef NDEBUG
debugHWLoopFailure(const StringRef DebugMsg,Instruction * I)82*82d56013Sjoerg static void debugHWLoopFailure(const StringRef DebugMsg,
83*82d56013Sjoerg     Instruction *I) {
84*82d56013Sjoerg   dbgs() << "HWLoops: " << DebugMsg;
85*82d56013Sjoerg   if (I)
86*82d56013Sjoerg     dbgs() << ' ' << *I;
87*82d56013Sjoerg   else
88*82d56013Sjoerg     dbgs() << '.';
89*82d56013Sjoerg   dbgs() << '\n';
90*82d56013Sjoerg }
91*82d56013Sjoerg #endif
92*82d56013Sjoerg 
93*82d56013Sjoerg static OptimizationRemarkAnalysis
createHWLoopAnalysis(StringRef RemarkName,Loop * L,Instruction * I)94*82d56013Sjoerg createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I) {
95*82d56013Sjoerg   Value *CodeRegion = L->getHeader();
96*82d56013Sjoerg   DebugLoc DL = L->getStartLoc();
97*82d56013Sjoerg 
98*82d56013Sjoerg   if (I) {
99*82d56013Sjoerg     CodeRegion = I->getParent();
100*82d56013Sjoerg     // If there is no debug location attached to the instruction, revert back to
101*82d56013Sjoerg     // using the loop's.
102*82d56013Sjoerg     if (I->getDebugLoc())
103*82d56013Sjoerg       DL = I->getDebugLoc();
104*82d56013Sjoerg   }
105*82d56013Sjoerg 
106*82d56013Sjoerg   OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion);
107*82d56013Sjoerg   R << "hardware-loop not created: ";
108*82d56013Sjoerg   return R;
109*82d56013Sjoerg }
110*82d56013Sjoerg 
1117330f729Sjoerg namespace {
1127330f729Sjoerg 
reportHWLoopFailure(const StringRef Msg,const StringRef ORETag,OptimizationRemarkEmitter * ORE,Loop * TheLoop,Instruction * I=nullptr)113*82d56013Sjoerg   void reportHWLoopFailure(const StringRef Msg, const StringRef ORETag,
114*82d56013Sjoerg       OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) {
115*82d56013Sjoerg     LLVM_DEBUG(debugHWLoopFailure(Msg, I));
116*82d56013Sjoerg     ORE->emit(createHWLoopAnalysis(ORETag, TheLoop, I) << Msg);
117*82d56013Sjoerg   }
118*82d56013Sjoerg 
1197330f729Sjoerg   using TTI = TargetTransformInfo;
1207330f729Sjoerg 
1217330f729Sjoerg   class HardwareLoops : public FunctionPass {
1227330f729Sjoerg   public:
1237330f729Sjoerg     static char ID;
1247330f729Sjoerg 
HardwareLoops()1257330f729Sjoerg     HardwareLoops() : FunctionPass(ID) {
1267330f729Sjoerg       initializeHardwareLoopsPass(*PassRegistry::getPassRegistry());
1277330f729Sjoerg     }
1287330f729Sjoerg 
1297330f729Sjoerg     bool runOnFunction(Function &F) override;
1307330f729Sjoerg 
getAnalysisUsage(AnalysisUsage & AU) const1317330f729Sjoerg     void getAnalysisUsage(AnalysisUsage &AU) const override {
1327330f729Sjoerg       AU.addRequired<LoopInfoWrapperPass>();
1337330f729Sjoerg       AU.addPreserved<LoopInfoWrapperPass>();
1347330f729Sjoerg       AU.addRequired<DominatorTreeWrapperPass>();
1357330f729Sjoerg       AU.addPreserved<DominatorTreeWrapperPass>();
1367330f729Sjoerg       AU.addRequired<ScalarEvolutionWrapperPass>();
1377330f729Sjoerg       AU.addRequired<AssumptionCacheTracker>();
1387330f729Sjoerg       AU.addRequired<TargetTransformInfoWrapperPass>();
139*82d56013Sjoerg       AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1407330f729Sjoerg     }
1417330f729Sjoerg 
1427330f729Sjoerg     // Try to convert the given Loop into a hardware loop.
1437330f729Sjoerg     bool TryConvertLoop(Loop *L);
1447330f729Sjoerg 
1457330f729Sjoerg     // Given that the target believes the loop to be profitable, try to
1467330f729Sjoerg     // convert it.
1477330f729Sjoerg     bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo);
1487330f729Sjoerg 
1497330f729Sjoerg   private:
1507330f729Sjoerg     ScalarEvolution *SE = nullptr;
1517330f729Sjoerg     LoopInfo *LI = nullptr;
1527330f729Sjoerg     const DataLayout *DL = nullptr;
153*82d56013Sjoerg     OptimizationRemarkEmitter *ORE = nullptr;
1547330f729Sjoerg     const TargetTransformInfo *TTI = nullptr;
1557330f729Sjoerg     DominatorTree *DT = nullptr;
1567330f729Sjoerg     bool PreserveLCSSA = false;
1577330f729Sjoerg     AssumptionCache *AC = nullptr;
1587330f729Sjoerg     TargetLibraryInfo *LibInfo = nullptr;
1597330f729Sjoerg     Module *M = nullptr;
1607330f729Sjoerg     bool MadeChange = false;
1617330f729Sjoerg   };
1627330f729Sjoerg 
1637330f729Sjoerg   class HardwareLoop {
1647330f729Sjoerg     // Expand the trip count scev into a value that we can use.
1657330f729Sjoerg     Value *InitLoopCount();
1667330f729Sjoerg 
1677330f729Sjoerg     // Insert the set_loop_iteration intrinsic.
168*82d56013Sjoerg     Value *InsertIterationSetup(Value *LoopCountInit);
1697330f729Sjoerg 
1707330f729Sjoerg     // Insert the loop_decrement intrinsic.
1717330f729Sjoerg     void InsertLoopDec();
1727330f729Sjoerg 
1737330f729Sjoerg     // Insert the loop_decrement_reg intrinsic.
1747330f729Sjoerg     Instruction *InsertLoopRegDec(Value *EltsRem);
1757330f729Sjoerg 
1767330f729Sjoerg     // If the target requires the counter value to be updated in the loop,
1777330f729Sjoerg     // insert a phi to hold the value. The intended purpose is for use by
1787330f729Sjoerg     // loop_decrement_reg.
1797330f729Sjoerg     PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
1807330f729Sjoerg 
1817330f729Sjoerg     // Create a new cmp, that checks the returned value of loop_decrement*,
1827330f729Sjoerg     // and update the exit branch to use it.
1837330f729Sjoerg     void UpdateBranch(Value *EltsRem);
1847330f729Sjoerg 
1857330f729Sjoerg   public:
HardwareLoop(HardwareLoopInfo & Info,ScalarEvolution & SE,const DataLayout & DL,OptimizationRemarkEmitter * ORE)1867330f729Sjoerg     HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE,
187*82d56013Sjoerg                  const DataLayout &DL,
188*82d56013Sjoerg                  OptimizationRemarkEmitter *ORE) :
189*82d56013Sjoerg       SE(SE), DL(DL), ORE(ORE), L(Info.L), M(L->getHeader()->getModule()),
190*82d56013Sjoerg       TripCount(Info.TripCount),
1917330f729Sjoerg       CountType(Info.CountType),
1927330f729Sjoerg       ExitBranch(Info.ExitBranch),
1937330f729Sjoerg       LoopDecrement(Info.LoopDecrement),
1947330f729Sjoerg       UsePHICounter(Info.CounterInReg),
1957330f729Sjoerg       UseLoopGuard(Info.PerformEntryTest) { }
1967330f729Sjoerg 
1977330f729Sjoerg     void Create();
1987330f729Sjoerg 
1997330f729Sjoerg   private:
2007330f729Sjoerg     ScalarEvolution &SE;
2017330f729Sjoerg     const DataLayout &DL;
202*82d56013Sjoerg     OptimizationRemarkEmitter *ORE = nullptr;
2037330f729Sjoerg     Loop *L                 = nullptr;
2047330f729Sjoerg     Module *M               = nullptr;
205*82d56013Sjoerg     const SCEV *TripCount   = nullptr;
2067330f729Sjoerg     Type *CountType         = nullptr;
2077330f729Sjoerg     BranchInst *ExitBranch  = nullptr;
2087330f729Sjoerg     Value *LoopDecrement    = nullptr;
2097330f729Sjoerg     bool UsePHICounter      = false;
2107330f729Sjoerg     bool UseLoopGuard       = false;
2117330f729Sjoerg     BasicBlock *BeginBB     = nullptr;
2127330f729Sjoerg   };
2137330f729Sjoerg }
2147330f729Sjoerg 
2157330f729Sjoerg char HardwareLoops::ID = 0;
2167330f729Sjoerg 
runOnFunction(Function & F)2177330f729Sjoerg bool HardwareLoops::runOnFunction(Function &F) {
2187330f729Sjoerg   if (skipFunction(F))
2197330f729Sjoerg     return false;
2207330f729Sjoerg 
2217330f729Sjoerg   LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
2227330f729Sjoerg 
2237330f729Sjoerg   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2247330f729Sjoerg   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2257330f729Sjoerg   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2267330f729Sjoerg   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2277330f729Sjoerg   DL = &F.getParent()->getDataLayout();
228*82d56013Sjoerg   ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2297330f729Sjoerg   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2307330f729Sjoerg   LibInfo = TLIP ? &TLIP->getTLI(F) : nullptr;
2317330f729Sjoerg   PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
2327330f729Sjoerg   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2337330f729Sjoerg   M = F.getParent();
2347330f729Sjoerg 
235*82d56013Sjoerg   for (Loop *L : *LI)
236*82d56013Sjoerg     if (L->isOutermost())
2377330f729Sjoerg       TryConvertLoop(L);
2387330f729Sjoerg 
2397330f729Sjoerg   return MadeChange;
2407330f729Sjoerg }
2417330f729Sjoerg 
2427330f729Sjoerg // Return true if the search should stop, which will be when an inner loop is
2437330f729Sjoerg // converted and the parent loop doesn't support containing a hardware loop.
TryConvertLoop(Loop * L)2447330f729Sjoerg bool HardwareLoops::TryConvertLoop(Loop *L) {
2457330f729Sjoerg   // Process nested loops first.
246*82d56013Sjoerg   bool AnyChanged = false;
247*82d56013Sjoerg   for (Loop *SL : *L)
248*82d56013Sjoerg     AnyChanged |= TryConvertLoop(SL);
249*82d56013Sjoerg   if (AnyChanged) {
250*82d56013Sjoerg     reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
251*82d56013Sjoerg                         ORE, L);
2527330f729Sjoerg     return true; // Stop search.
253*82d56013Sjoerg   }
254*82d56013Sjoerg 
255*82d56013Sjoerg   LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n");
2567330f729Sjoerg 
2577330f729Sjoerg   HardwareLoopInfo HWLoopInfo(L);
258*82d56013Sjoerg   if (!HWLoopInfo.canAnalyze(*LI)) {
259*82d56013Sjoerg     reportHWLoopFailure("cannot analyze loop, irreducible control flow",
260*82d56013Sjoerg                         "HWLoopCannotAnalyze", ORE, L);
2617330f729Sjoerg     return false;
262*82d56013Sjoerg   }
2637330f729Sjoerg 
264*82d56013Sjoerg   if (!ForceHardwareLoops &&
265*82d56013Sjoerg       !TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo)) {
266*82d56013Sjoerg     reportHWLoopFailure("it's not profitable to create a hardware-loop",
267*82d56013Sjoerg                         "HWLoopNotProfitable", ORE, L);
268*82d56013Sjoerg     return false;
269*82d56013Sjoerg   }
2707330f729Sjoerg 
2717330f729Sjoerg   // Allow overriding of the counter width and loop decrement value.
2727330f729Sjoerg   if (CounterBitWidth.getNumOccurrences())
2737330f729Sjoerg     HWLoopInfo.CountType =
2747330f729Sjoerg       IntegerType::get(M->getContext(), CounterBitWidth);
2757330f729Sjoerg 
2767330f729Sjoerg   if (LoopDecrement.getNumOccurrences())
2777330f729Sjoerg     HWLoopInfo.LoopDecrement =
2787330f729Sjoerg       ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);
2797330f729Sjoerg 
2807330f729Sjoerg   MadeChange |= TryConvertLoop(HWLoopInfo);
2817330f729Sjoerg   return MadeChange && (!HWLoopInfo.IsNestingLegal && !ForceNestedLoop);
2827330f729Sjoerg }
2837330f729Sjoerg 
TryConvertLoop(HardwareLoopInfo & HWLoopInfo)2847330f729Sjoerg bool HardwareLoops::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) {
2857330f729Sjoerg 
2867330f729Sjoerg   Loop *L = HWLoopInfo.L;
2877330f729Sjoerg   LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
2887330f729Sjoerg 
2897330f729Sjoerg   if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT, ForceNestedLoop,
290*82d56013Sjoerg                                           ForceHardwareLoopPHI)) {
291*82d56013Sjoerg     // TODO: there can be many reasons a loop is not considered a
292*82d56013Sjoerg     // candidate, so we should let isHardwareLoopCandidate fill in the
293*82d56013Sjoerg     // reason and then report a better message here.
294*82d56013Sjoerg     reportHWLoopFailure("loop is not a candidate", "HWLoopNoCandidate", ORE, L);
2957330f729Sjoerg     return false;
296*82d56013Sjoerg   }
2977330f729Sjoerg 
2987330f729Sjoerg   assert(
299*82d56013Sjoerg       (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.TripCount) &&
3007330f729Sjoerg       "Hardware Loop must have set exit info.");
3017330f729Sjoerg 
3027330f729Sjoerg   BasicBlock *Preheader = L->getLoopPreheader();
3037330f729Sjoerg 
3047330f729Sjoerg   // If we don't have a preheader, then insert one.
3057330f729Sjoerg   if (!Preheader)
3067330f729Sjoerg     Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
3077330f729Sjoerg   if (!Preheader)
3087330f729Sjoerg     return false;
3097330f729Sjoerg 
310*82d56013Sjoerg   HardwareLoop HWLoop(HWLoopInfo, *SE, *DL, ORE);
3117330f729Sjoerg   HWLoop.Create();
3127330f729Sjoerg   ++NumHWLoops;
3137330f729Sjoerg   return true;
3147330f729Sjoerg }
3157330f729Sjoerg 
Create()3167330f729Sjoerg void HardwareLoop::Create() {
3177330f729Sjoerg   LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
3187330f729Sjoerg 
3197330f729Sjoerg   Value *LoopCountInit = InitLoopCount();
320*82d56013Sjoerg   if (!LoopCountInit) {
321*82d56013Sjoerg     reportHWLoopFailure("could not safely create a loop count expression",
322*82d56013Sjoerg                         "HWLoopNotSafe", ORE, L);
3237330f729Sjoerg     return;
324*82d56013Sjoerg   }
3257330f729Sjoerg 
326*82d56013Sjoerg   Value *Setup = InsertIterationSetup(LoopCountInit);
3277330f729Sjoerg 
3287330f729Sjoerg   if (UsePHICounter || ForceHardwareLoopPHI) {
3297330f729Sjoerg     Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
330*82d56013Sjoerg     Value *EltsRem = InsertPHICounter(Setup, LoopDec);
3317330f729Sjoerg     LoopDec->setOperand(0, EltsRem);
3327330f729Sjoerg     UpdateBranch(LoopDec);
3337330f729Sjoerg   } else
3347330f729Sjoerg     InsertLoopDec();
3357330f729Sjoerg 
3367330f729Sjoerg   // Run through the basic blocks of the loop and see if any of them have dead
3377330f729Sjoerg   // PHIs that can be removed.
3387330f729Sjoerg   for (auto I : L->blocks())
3397330f729Sjoerg     DeleteDeadPHIs(I);
3407330f729Sjoerg }
3417330f729Sjoerg 
CanGenerateTest(Loop * L,Value * Count)3427330f729Sjoerg static bool CanGenerateTest(Loop *L, Value *Count) {
3437330f729Sjoerg   BasicBlock *Preheader = L->getLoopPreheader();
3447330f729Sjoerg   if (!Preheader->getSinglePredecessor())
3457330f729Sjoerg     return false;
3467330f729Sjoerg 
3477330f729Sjoerg   BasicBlock *Pred = Preheader->getSinglePredecessor();
3487330f729Sjoerg   if (!isa<BranchInst>(Pred->getTerminator()))
3497330f729Sjoerg     return false;
3507330f729Sjoerg 
3517330f729Sjoerg   auto *BI = cast<BranchInst>(Pred->getTerminator());
3527330f729Sjoerg   if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
3537330f729Sjoerg     return false;
3547330f729Sjoerg 
3557330f729Sjoerg   // Check that the icmp is checking for equality of Count and zero and that
3567330f729Sjoerg   // a non-zero value results in entering the loop.
3577330f729Sjoerg   auto ICmp = cast<ICmpInst>(BI->getCondition());
3587330f729Sjoerg   LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n");
3597330f729Sjoerg   if (!ICmp->isEquality())
3607330f729Sjoerg     return false;
3617330f729Sjoerg 
3627330f729Sjoerg   auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) {
3637330f729Sjoerg     if (auto *Const = dyn_cast<ConstantInt>(ICmp->getOperand(OpIdx)))
3647330f729Sjoerg       return Const->isZero() && ICmp->getOperand(OpIdx ^ 1) == Count;
3657330f729Sjoerg     return false;
3667330f729Sjoerg   };
3677330f729Sjoerg 
3687330f729Sjoerg   if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1))
3697330f729Sjoerg     return false;
3707330f729Sjoerg 
3717330f729Sjoerg   unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1;
3727330f729Sjoerg   if (BI->getSuccessor(SuccIdx) != Preheader)
3737330f729Sjoerg     return false;
3747330f729Sjoerg 
3757330f729Sjoerg   return true;
3767330f729Sjoerg }
3777330f729Sjoerg 
InitLoopCount()3787330f729Sjoerg Value *HardwareLoop::InitLoopCount() {
3797330f729Sjoerg   LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n");
3807330f729Sjoerg   // Can we replace a conditional branch with an intrinsic that sets the
3817330f729Sjoerg   // loop counter and tests that is not zero?
3827330f729Sjoerg 
3837330f729Sjoerg   SCEVExpander SCEVE(SE, DL, "loopcnt");
3847330f729Sjoerg 
3857330f729Sjoerg   // If we're trying to use the 'test and set' form of the intrinsic, we need
3867330f729Sjoerg   // to replace a conditional branch that is controlling entry to the loop. It
3877330f729Sjoerg   // is likely (guaranteed?) that the preheader has an unconditional branch to
3887330f729Sjoerg   // the loop header, so also check if it has a single predecessor.
389*82d56013Sjoerg   if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, TripCount,
390*82d56013Sjoerg                                   SE.getZero(TripCount->getType()))) {
3917330f729Sjoerg     LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n");
3927330f729Sjoerg     UseLoopGuard |= ForceGuardLoopEntry;
3937330f729Sjoerg   } else
3947330f729Sjoerg     UseLoopGuard = false;
3957330f729Sjoerg 
3967330f729Sjoerg   BasicBlock *BB = L->getLoopPreheader();
3977330f729Sjoerg   if (UseLoopGuard && BB->getSinglePredecessor() &&
398*82d56013Sjoerg       cast<BranchInst>(BB->getTerminator())->isUnconditional()) {
399*82d56013Sjoerg     BasicBlock *Predecessor = BB->getSinglePredecessor();
400*82d56013Sjoerg     // If it's not safe to create a while loop then don't force it and create a
401*82d56013Sjoerg     // do-while loop instead
402*82d56013Sjoerg     if (!isSafeToExpandAt(TripCount, Predecessor->getTerminator(), SE))
403*82d56013Sjoerg         UseLoopGuard = false;
404*82d56013Sjoerg     else
405*82d56013Sjoerg         BB = Predecessor;
406*82d56013Sjoerg   }
4077330f729Sjoerg 
408*82d56013Sjoerg   if (!isSafeToExpandAt(TripCount, BB->getTerminator(), SE)) {
409*82d56013Sjoerg     LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand TripCount "
410*82d56013Sjoerg                << *TripCount << "\n");
4117330f729Sjoerg     return nullptr;
4127330f729Sjoerg   }
4137330f729Sjoerg 
414*82d56013Sjoerg   Value *Count = SCEVE.expandCodeFor(TripCount, CountType,
4157330f729Sjoerg                                      BB->getTerminator());
4167330f729Sjoerg 
4177330f729Sjoerg   // FIXME: We've expanded Count where we hope to insert the counter setting
4187330f729Sjoerg   // intrinsic. But, in the case of the 'test and set' form, we may fallback to
4197330f729Sjoerg   // the just 'set' form and in which case the insertion block is most likely
4207330f729Sjoerg   // different. It means there will be instruction(s) in a block that possibly
4217330f729Sjoerg   // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue,
4227330f729Sjoerg   // but it's doesn't appear to work in all cases.
4237330f729Sjoerg 
4247330f729Sjoerg   UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count);
4257330f729Sjoerg   BeginBB = UseLoopGuard ? BB : L->getLoopPreheader();
4267330f729Sjoerg   LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n"
4277330f729Sjoerg              << " - Expanded Count in " << BB->getName() << "\n"
4287330f729Sjoerg              << " - Will insert set counter intrinsic into: "
4297330f729Sjoerg              << BeginBB->getName() << "\n");
4307330f729Sjoerg   return Count;
4317330f729Sjoerg }
4327330f729Sjoerg 
InsertIterationSetup(Value * LoopCountInit)433*82d56013Sjoerg Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
4347330f729Sjoerg   IRBuilder<> Builder(BeginBB->getTerminator());
4357330f729Sjoerg   Type *Ty = LoopCountInit->getType();
436*82d56013Sjoerg   bool UsePhi = UsePHICounter || ForceHardwareLoopPHI;
437*82d56013Sjoerg   Intrinsic::ID ID = UseLoopGuard
438*82d56013Sjoerg                          ? (UsePhi ? Intrinsic::test_start_loop_iterations
439*82d56013Sjoerg                                    : Intrinsic::test_set_loop_iterations)
440*82d56013Sjoerg                          : (UsePhi ? Intrinsic::start_loop_iterations
441*82d56013Sjoerg                                    : Intrinsic::set_loop_iterations);
4427330f729Sjoerg   Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
443*82d56013Sjoerg   Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit);
4447330f729Sjoerg 
4457330f729Sjoerg   // Use the return value of the intrinsic to control the entry of the loop.
4467330f729Sjoerg   if (UseLoopGuard) {
4477330f729Sjoerg     assert((isa<BranchInst>(BeginBB->getTerminator()) &&
4487330f729Sjoerg             cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
4497330f729Sjoerg            "Expected conditional branch");
450*82d56013Sjoerg 
451*82d56013Sjoerg     Value *SetCount =
452*82d56013Sjoerg         UsePhi ? Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
4537330f729Sjoerg     auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
4547330f729Sjoerg     LoopGuard->setCondition(SetCount);
4557330f729Sjoerg     if (LoopGuard->getSuccessor(0) != L->getLoopPreheader())
4567330f729Sjoerg       LoopGuard->swapSuccessors();
4577330f729Sjoerg   }
458*82d56013Sjoerg   LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *LoopSetup
459*82d56013Sjoerg                     << "\n");
460*82d56013Sjoerg   if (UsePhi && UseLoopGuard)
461*82d56013Sjoerg     LoopSetup = Builder.CreateExtractValue(LoopSetup, 0);
462*82d56013Sjoerg   return !UsePhi ? LoopCountInit : LoopSetup;
4637330f729Sjoerg }
4647330f729Sjoerg 
InsertLoopDec()4657330f729Sjoerg void HardwareLoop::InsertLoopDec() {
4667330f729Sjoerg   IRBuilder<> CondBuilder(ExitBranch);
4677330f729Sjoerg 
4687330f729Sjoerg   Function *DecFunc =
4697330f729Sjoerg     Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
4707330f729Sjoerg                               LoopDecrement->getType());
4717330f729Sjoerg   Value *Ops[] = { LoopDecrement };
4727330f729Sjoerg   Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
4737330f729Sjoerg   Value *OldCond = ExitBranch->getCondition();
4747330f729Sjoerg   ExitBranch->setCondition(NewCond);
4757330f729Sjoerg 
4767330f729Sjoerg   // The false branch must exit the loop.
4777330f729Sjoerg   if (!L->contains(ExitBranch->getSuccessor(0)))
4787330f729Sjoerg     ExitBranch->swapSuccessors();
4797330f729Sjoerg 
4807330f729Sjoerg   // The old condition may be dead now, and may have even created a dead PHI
4817330f729Sjoerg   // (the original induction variable).
4827330f729Sjoerg   RecursivelyDeleteTriviallyDeadInstructions(OldCond);
4837330f729Sjoerg 
4847330f729Sjoerg   LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
4857330f729Sjoerg }
4867330f729Sjoerg 
InsertLoopRegDec(Value * EltsRem)4877330f729Sjoerg Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
4887330f729Sjoerg   IRBuilder<> CondBuilder(ExitBranch);
4897330f729Sjoerg 
4907330f729Sjoerg   Function *DecFunc =
4917330f729Sjoerg       Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
492*82d56013Sjoerg                                 { EltsRem->getType() });
4937330f729Sjoerg   Value *Ops[] = { EltsRem, LoopDecrement };
4947330f729Sjoerg   Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
4957330f729Sjoerg 
4967330f729Sjoerg   LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
4977330f729Sjoerg   return cast<Instruction>(Call);
4987330f729Sjoerg }
4997330f729Sjoerg 
InsertPHICounter(Value * NumElts,Value * EltsRem)5007330f729Sjoerg PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
5017330f729Sjoerg   BasicBlock *Preheader = L->getLoopPreheader();
5027330f729Sjoerg   BasicBlock *Header = L->getHeader();
5037330f729Sjoerg   BasicBlock *Latch = ExitBranch->getParent();
5047330f729Sjoerg   IRBuilder<> Builder(Header->getFirstNonPHI());
5057330f729Sjoerg   PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
5067330f729Sjoerg   Index->addIncoming(NumElts, Preheader);
5077330f729Sjoerg   Index->addIncoming(EltsRem, Latch);
5087330f729Sjoerg   LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
5097330f729Sjoerg   return Index;
5107330f729Sjoerg }
5117330f729Sjoerg 
UpdateBranch(Value * EltsRem)5127330f729Sjoerg void HardwareLoop::UpdateBranch(Value *EltsRem) {
5137330f729Sjoerg   IRBuilder<> CondBuilder(ExitBranch);
5147330f729Sjoerg   Value *NewCond =
5157330f729Sjoerg     CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
5167330f729Sjoerg   Value *OldCond = ExitBranch->getCondition();
5177330f729Sjoerg   ExitBranch->setCondition(NewCond);
5187330f729Sjoerg 
5197330f729Sjoerg   // The false branch must exit the loop.
5207330f729Sjoerg   if (!L->contains(ExitBranch->getSuccessor(0)))
5217330f729Sjoerg     ExitBranch->swapSuccessors();
5227330f729Sjoerg 
5237330f729Sjoerg   // The old condition may be dead now, and may have even created a dead PHI
5247330f729Sjoerg   // (the original induction variable).
5257330f729Sjoerg   RecursivelyDeleteTriviallyDeadInstructions(OldCond);
5267330f729Sjoerg }
5277330f729Sjoerg 
INITIALIZE_PASS_BEGIN(HardwareLoops,DEBUG_TYPE,HW_LOOPS_NAME,false,false)5287330f729Sjoerg INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
5297330f729Sjoerg INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
5307330f729Sjoerg INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
5317330f729Sjoerg INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
532*82d56013Sjoerg INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
5337330f729Sjoerg INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
5347330f729Sjoerg 
5357330f729Sjoerg FunctionPass *llvm::createHardwareLoopsPass() { return new HardwareLoops(); }
536