xref: /llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp (revision f31eba649422082783a19bb6b373f31f10f55de9)
1 //===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass identifies loops where we can generate the PPC branch instructions
10 // that decrement and test the count register (CTR) (bdnz and friends).
11 //
12 // The pattern that defines the induction variable can change depending on
13 // prior optimizations.  For example, the IndVarSimplify phase run by 'opt'
14 // normalizes induction variables, and the Loop Strength Reduction pass
15 // run by 'llc' may also make changes to the induction variable.
16 //
17 // Criteria for CTR loops:
18 //  - Countable loops (w/ ind. var for a trip count)
19 //  - Try inner-most loops first
20 //  - No nested CTR loops.
21 //  - No function calls in loops.
22 //
23 //===----------------------------------------------------------------------===//
24 
25 #include "PPC.h"
26 #include "PPCSubtarget.h"
27 #include "PPCTargetMachine.h"
28 #include "PPCTargetTransformInfo.h"
29 #include "llvm/ADT/STLExtras.h"
30 #include "llvm/ADT/Statistic.h"
31 #include "llvm/Analysis/AssumptionCache.h"
32 #include "llvm/Analysis/CFG.h"
33 #include "llvm/Analysis/CodeMetrics.h"
34 #include "llvm/Analysis/LoopInfo.h"
35 #include "llvm/Analysis/LoopIterator.h"
36 #include "llvm/Analysis/ScalarEvolutionExpander.h"
37 #include "llvm/Analysis/TargetLibraryInfo.h"
38 #include "llvm/Analysis/TargetTransformInfo.h"
39 #include "llvm/Transforms/Utils/Local.h"
40 #include "llvm/CodeGen/TargetPassConfig.h"
41 #include "llvm/CodeGen/TargetSchedule.h"
42 #include "llvm/IR/Constants.h"
43 #include "llvm/IR/DerivedTypes.h"
44 #include "llvm/IR/Dominators.h"
45 #include "llvm/IR/InlineAsm.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/IR/Module.h"
49 #include "llvm/IR/ValueHandle.h"
50 #include "llvm/PassSupport.h"
51 #include "llvm/Support/CommandLine.h"
52 #include "llvm/Support/Debug.h"
53 #include "llvm/Support/raw_ostream.h"
54 #include "llvm/Transforms/Scalar.h"
55 #include "llvm/Transforms/Utils.h"
56 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
57 #include "llvm/Transforms/Utils/LoopUtils.h"
58 
59 #ifndef NDEBUG
60 #include "llvm/CodeGen/MachineDominators.h"
61 #include "llvm/CodeGen/MachineFunction.h"
62 #include "llvm/CodeGen/MachineFunctionPass.h"
63 #include "llvm/CodeGen/MachineRegisterInfo.h"
64 #endif
65 
66 using namespace llvm;
67 
68 #define DEBUG_TYPE "ctrloops"
69 
#ifndef NDEBUG
// Debug-only cap on the number of loops this pass will convert (-1 means
// unlimited); compared against the static Counter in convertToCTRLoop().
static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
#endif

// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
                      cl::desc("Loops with a constant trip count smaller than "
                               "this value will not use the count register."));

// -stats counter bumped once per successful conversion in convertToCTRLoop().
STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
82 
namespace {
  // IR-level function pass that rewrites eligible countable loops to use the
  // PPC count register (CTR): it plants the ppc_mtctr intrinsic in the loop
  // preheader and replaces the counted exit branch's condition with a call to
  // ppc_is_decremented_ctr_nonzero (see convertToCTRLoop()).
  struct PPCCTRLoops : public FunctionPass {

#ifndef NDEBUG
    // Number of loops converted so far; checked against -ppc-max-ctrloop.
    static int Counter;
#endif

  public:
    static char ID;

    PPCCTRLoops() : FunctionPass(ID) {
      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<LoopInfoWrapperPass>();
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addRequired<AssumptionCacheTracker>();
      AU.addRequired<TargetTransformInfoWrapperPass>();
    }

  private:
    // Returns true if BB contains anything (a real call, a libcall-expanded
    // operation, an indirect branch, ...) that might itself use or clobber
    // the CTR register, disqualifying the enclosing loop.
    bool mightUseCTR(BasicBlock *BB);
    // Attempts the conversion for L (nested loops first); true on change.
    bool convertToCTRLoop(Loop *L);

  private:
    // Per-function state; all of these are (re)initialized at the top of
    // runOnFunction() before any conversion is attempted.
    const PPCTargetMachine *TM;
    const PPCSubtarget *STI;
    const PPCTargetLowering *TLI;
    const DataLayout *DL;
    const TargetLibraryInfo *LibInfo; // May be null if the TLI pass is absent.
    const TargetTransformInfo *TTI;
    LoopInfo *LI;
    ScalarEvolution *SE;
    DominatorTree *DT;
    bool PreserveLCSSA;
    TargetSchedModel SchedModel;
  };

  char PPCCTRLoops::ID = 0;
#ifndef NDEBUG
  int PPCCTRLoops::Counter = 0;
#endif

#ifndef NDEBUG
  // Debug-build-only machine pass: checks that every CTR branch (bdnz/bdz)
  // is preceded on all paths by a MTCTRloop/MTCTR8loop with no intervening
  // instruction that clobbers the CTR (see verifyCTRBranch()).
  struct PPCCTRLoopsVerify : public MachineFunctionPass {
  public:
    static char ID;

    PPCCTRLoopsVerify() : MachineFunctionPass(ID) {
      initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry());
    }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<MachineDominatorTree>();
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    bool runOnMachineFunction(MachineFunction &MF) override;

  private:
    MachineDominatorTree *MDT;
  };

  char PPCCTRLoopsVerify::ID = 0;
#endif // NDEBUG
} // end anonymous namespace
155 
// Register the pass ("ppc-ctr-loops") and its analysis dependencies with the
// legacy pass registry.
INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
                    false, false)

// Factory entry point; callers construct the pass through this function.
FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); }

#ifndef NDEBUG
// Registration for the debug-only verifier pass ("ppc-ctr-loops-verify").
INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
                      "PowerPC CTR Loops Verify", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
                    "PowerPC CTR Loops Verify", false, false)

// Factory entry point for the debug-only verifier pass.
FunctionPass *llvm::createPPCCTRLoopsVerify() {
  return new PPCCTRLoopsVerify();
}
#endif // NDEBUG
177 
178 bool PPCCTRLoops::runOnFunction(Function &F) {
179   if (skipFunction(F))
180     return false;
181 
182   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
183   if (!TPC)
184     return false;
185 
186   TM = &TPC->getTM<PPCTargetMachine>();
187   STI = TM->getSubtargetImpl(F);
188   TLI = STI->getTargetLowering();
189 
190   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
191   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
192   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
193   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
194   DL = &F.getParent()->getDataLayout();
195   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
196   LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
197   PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
198   SchedModel.init(STI);
199 
200   bool MadeChange = false;
201 
202   for (LoopInfo::iterator I = LI->begin(), E = LI->end();
203        I != E; ++I) {
204     Loop *L = *I;
205     if (!L->getParentLoop())
206       MadeChange |= convertToCTRLoop(L);
207   }
208 
209   return MadeChange;
210 }
211 
212 static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
213   if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
214     return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
215 
216   return false;
217 }
218 
219 // Determining the address of a TLS variable results in a function call in
220 // certain TLS models.
221 static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) {
222   const auto *GV = dyn_cast<GlobalValue>(MemAddr);
223   if (!GV) {
224     // Recurse to check for constants that refer to TLS global variables.
225     if (const auto *CV = dyn_cast<Constant>(MemAddr))
226       for (const auto &CO : CV->operands())
227         if (memAddrUsesCTR(TM, CO))
228           return true;
229 
230     return false;
231   }
232 
233   if (!GV->isThreadLocal())
234     return false;
235   TLSModel::Model Model = TM.getTLSModel(GV);
236   return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
237 }
238 
239 // Loop through the inline asm constraints and look for something that clobbers
240 // ctr.
241 static bool asmClobbersCTR(InlineAsm *IA) {
242   InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
243   for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
244     InlineAsm::ConstraintInfo &C = CIV[i];
245     if (C.Type != InlineAsm::isInput)
246       for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
247         if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
248           return true;
249   }
250   return false;
251 }
252 
// Returns true if BB contains any instruction that may end up using or
// clobbering the CTR register once lowered: real function calls, operations
// that expand to library calls (ppc_fp128 arithmetic, large integer
// division/shifts, soft-float ops, FRem), indirect branches, big jump
// tables, or addresses of dynamic-TLS globals.  Any such block makes the
// enclosing loop ineligible for CTR conversion.
bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
       J != JE; ++J) {
    if (CallInst *CI = dyn_cast<CallInst>(J)) {
      // Inline ASM is okay, unless it clobbers the ctr register.
      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
        if (asmClobbersCTR(IA))
          return true;
        continue;
      }

      if (Function *F = CI->getCalledFunction()) {
        // Most intrinsics don't become function calls, but some might.
        // sin, cos, exp and log are always calls.
        // Opcode stays 0 unless a case below maps the call to an ISD node
        // whose legality must be checked further down.
        unsigned Opcode = 0;
        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
          switch (F->getIntrinsicID()) {
          default: continue;
          // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
          // we're definitely using CTR.
          case Intrinsic::ppc_is_decremented_ctr_nonzero:
          case Intrinsic::ppc_mtctr:
            return true;

// VisualStudio defines setjmp as _setjmp
#if defined(_MSC_VER) && defined(setjmp) && \
                       !defined(setjmp_undefined_for_msvc)
#  pragma push_macro("setjmp")
#  undef setjmp
#  define setjmp_undefined_for_msvc
#endif

          case Intrinsic::setjmp:

#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
 // let's return it to _setjmp state
#  pragma pop_macro("setjmp")
#  undef setjmp_undefined_for_msvc
#endif

          case Intrinsic::longjmp:

          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
          // because, although it does clobber the counter register, the
          // control can't then return to inside the loop unless there is also
          // an eh_sjlj_setjmp.
          case Intrinsic::eh_sjlj_setjmp:

          // The intrinsics below always lower to real calls on PPC.
          case Intrinsic::memcpy:
          case Intrinsic::memmove:
          case Intrinsic::memset:
          case Intrinsic::powi:
          case Intrinsic::log:
          case Intrinsic::log2:
          case Intrinsic::log10:
          case Intrinsic::exp:
          case Intrinsic::exp2:
          case Intrinsic::pow:
          case Intrinsic::sin:
          case Intrinsic::cos:
            return true;
          case Intrinsic::copysign:
            if (CI->getArgOperand(0)->getType()->getScalarType()->
                isPPC_FP128Ty())
              return true;
            else
              continue; // ISD::FCOPYSIGN is never a library call.
          // These map to ISD opcodes; whether they become calls depends on
          // operation legality for the argument type (checked below).
          case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
          case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
          case Intrinsic::ceil:               Opcode = ISD::FCEIL;      break;
          case Intrinsic::trunc:              Opcode = ISD::FTRUNC;     break;
          case Intrinsic::rint:               Opcode = ISD::FRINT;      break;
          case Intrinsic::nearbyint:          Opcode = ISD::FNEARBYINT; break;
          case Intrinsic::round:              Opcode = ISD::FROUND;     break;
          case Intrinsic::minnum:             Opcode = ISD::FMINNUM;    break;
          case Intrinsic::maxnum:             Opcode = ISD::FMAXNUM;    break;
          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO;      break;
          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO;      break;
          }
        }

        // PowerPC does not use [US]DIVREM or other library calls for
        // operations on regular types which are not otherwise library calls
        // (i.e. soft float or atomics). If adapting for targets that do,
        // additional care is required here.

        LibFunc Func;
        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
            LibInfo->getLibFunc(F->getName(), Func) &&
            LibInfo->hasOptimizedCodeGen(Func)) {
          // Non-read-only functions are never treated as intrinsics.
          if (!CI->onlyReadsMemory())
            return true;

          // Conversion happens only for FP calls.
          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
            return true;

          switch (Func) {
          default: return true;
          case LibFunc_copysign:
          case LibFunc_copysignf:
            continue; // ISD::FCOPYSIGN is never a library call.
          case LibFunc_copysignl:
            return true;
          case LibFunc_fabs:
          case LibFunc_fabsf:
          case LibFunc_fabsl:
            continue; // ISD::FABS is never a library call.
          // As with the intrinsics above, these map to ISD opcodes whose
          // legality for the argument type is checked below.
          case LibFunc_sqrt:
          case LibFunc_sqrtf:
          case LibFunc_sqrtl:
            Opcode = ISD::FSQRT; break;
          case LibFunc_floor:
          case LibFunc_floorf:
          case LibFunc_floorl:
            Opcode = ISD::FFLOOR; break;
          case LibFunc_nearbyint:
          case LibFunc_nearbyintf:
          case LibFunc_nearbyintl:
            Opcode = ISD::FNEARBYINT; break;
          case LibFunc_ceil:
          case LibFunc_ceilf:
          case LibFunc_ceill:
            Opcode = ISD::FCEIL; break;
          case LibFunc_rint:
          case LibFunc_rintf:
          case LibFunc_rintl:
            Opcode = ISD::FRINT; break;
          case LibFunc_round:
          case LibFunc_roundf:
          case LibFunc_roundl:
            Opcode = ISD::FROUND; break;
          case LibFunc_trunc:
          case LibFunc_truncf:
          case LibFunc_truncl:
            Opcode = ISD::FTRUNC; break;
          case LibFunc_fmin:
          case LibFunc_fminf:
          case LibFunc_fminl:
            Opcode = ISD::FMINNUM; break;
          case LibFunc_fmax:
          case LibFunc_fmaxf:
          case LibFunc_fmaxl:
            Opcode = ISD::FMAXNUM; break;
          }
        }

        // A mapped ISD opcode only avoids a call if the operation is legal
        // or custom-lowered for the argument's type (or its scalar type for
        // vectors).
        if (Opcode) {
          EVT EVTy =
              TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true);

          if (EVTy == MVT::Other)
            return true;

          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
            continue;
          else if (EVTy.isVector() &&
                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
            continue;

          return true;
        }
      }

      // Indirect calls (no known callee) are assumed to use the CTR.
      return true;
    } else if (isa<BinaryOperator>(J) &&
               J->getType()->getScalarType()->isPPC_FP128Ty()) {
      // Most operations on ppc_f128 values become calls.
      return true;
    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
      CastInst *CI = cast<CastInst>(J);
      // int<->fp conversions involving ppc_fp128 or over-wide integers may
      // be lowered as library calls.
      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
          isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) ||
          isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType()))
        return true;
    } else if (isLargeIntegerTy(!TM->isPPC64(),
                                J->getType()->getScalarType()) &&
               (J->getOpcode() == Instruction::UDiv ||
                J->getOpcode() == Instruction::SDiv ||
                J->getOpcode() == Instruction::URem ||
                J->getOpcode() == Instruction::SRem)) {
      // Division/remainder on integers wider than a register is a call.
      return true;
    } else if (!TM->isPPC64() &&
               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
               (J->getOpcode() == Instruction::Shl ||
                J->getOpcode() == Instruction::AShr ||
                J->getOpcode() == Instruction::LShr)) {
      // Only on PPC32, for 128-bit integers (specifically not 64-bit
      // integers), these might be runtime calls.
      return true;
    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
      // On PowerPC, indirect jumps use the counter register.
      return true;
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
      // A switch big enough to become a jump table uses an indirect jump,
      // which in turn uses the counter register.
      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
        return true;
    }

    // FREM is always a call.
    if (J->getOpcode() == Instruction::FRem)
      return true;

    // With soft-float, every FP operation below becomes a library call.
    if (STI->useSoftFloat()) {
      switch(J->getOpcode()) {
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::FDiv:
      case Instruction::FPTrunc:
      case Instruction::FPExt:
      case Instruction::FPToUI:
      case Instruction::FPToSI:
      case Instruction::UIToFP:
      case Instruction::SIToFP:
      case Instruction::FCmp:
        return true;
      }
    }

    // Dynamic-TLS addresses require a call to materialize (see
    // memAddrUsesCTR above).
    for (Value *Operand : J->operands())
      if (memAddrUsesCTR(*TM, Operand))
        return true;
  }

  return false;
}
// Attempts to convert loop L into a CTR loop.  Nested loops are tried first
// (only one level can own the CTR).  The loop is rejected if it is small, has
// irreducible control flow, contains anything that might use/clobber the CTR
// (mightUseCTR), has a profile-hot exit, or lacks a suitable counted exit.
// On success: emits ppc_mtctr with the iteration count in the preheader and
// rewrites the counted exit branch to test ppc_is_decremented_ctr_nonzero.
// Returns true if this loop or any nested loop was changed.
bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
  bool MadeChange = false;

  // Do not convert small short loops to CTR loop.
  unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
    // Count only real instructions: ephemeral values (those feeding only
    // llvm.assume etc.) are excluded from the size metric.
    SmallPtrSet<const Value *, 32> EphValues;
    auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
        *L->getHeader()->getParent());
    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
    CodeMetrics Metrics;
    for (BasicBlock *BB : L->blocks())
      Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
    // 6 is an approximate latency for the mtctr instruction.
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // Process nested loops first.
  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
    MadeChange |= convertToCTRLoop(*I);
    LLVM_DEBUG(dbgs() << "Nested loop converted\n");
  }

  // If a nested loop has been converted, then we can't convert this loop.
  if (MadeChange)
    return MadeChange;

  // Bail out if the loop has irreducible control flow.
  LoopBlocksRPO RPOT(L);
  RPOT.perform(LI);
  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
    return false;

#ifndef NDEBUG
  // Stop trying after reaching the limit (if any).
  int Limit = CTRLoopLimit;
  if (Limit >= 0) {
    if (Counter >= CTRLoopLimit)
      return false;
    Counter++;
  }
#endif

  // We don't want to spill/restore the counter register, and so we don't
  // want to use the counter register if the loop contains calls.
  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
       I != IE; ++I)
    if (mightUseCTR(*I))
      return MadeChange;

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI) continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !BI->extractProfMetadata(TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if (( TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return MadeChange;
    }
  }

  // Search for an exiting block whose exit count SCEV can drive the CTR:
  // computable, loop-invariant, non-zero, narrow enough for the count
  // register, executed on every iteration, and ending in a conditional
  // branch.  The first qualifying block wins.
  BasicBlock *CountedExitBlock = nullptr;
  const SCEV *ExitCount = nullptr;
  BranchInst *CountedExitBranch = nullptr;
  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
       IE = ExitingBlocks.end(); I != IE; ++I) {
    const SCEV *EC = SE->getExitCount(L, *I);
    LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block "
                      << (*I)->getName() << ": " << *EC << "\n");
    if (isa<SCEVCouldNotCompute>(EC))
      continue;
    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
      if (ConstEC->getValue()->isZero())
        continue;
    } else if (!SE->isLoopInvariant(EC, L))
      continue;

    // The count must fit in the CTR (64 bits on PPC64, else 32).
    if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
      continue;

    // If this exiting block is contained in a nested loop, it is not eligible
    // for insertion of the branch-and-decrement since the inner loop would
    // end up messing up the value in the CTR.
    if (LI->getLoopFor(*I) != L)
      continue;

    // We now have a loop-invariant count of loop iterations (which is not the
    // constant zero) for which we know that this loop will not exit via this
    // existing block.

    // We need to make sure that this block will run on every loop iteration.
    // For this to be true, we must dominate all blocks with backedges. Such
    // blocks are in-loop predecessors to the header block.
    bool NotAlways = false;
    for (pred_iterator PI = pred_begin(L->getHeader()),
         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
      if (!L->contains(*PI))
        continue;

      if (!DT->dominates(*I, *PI)) {
        NotAlways = true;
        break;
      }
    }

    if (NotAlways)
      continue;

    // Make sure this blocks ends with a conditional branch.
    Instruction *TI = (*I)->getTerminator();
    if (!TI)
      continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (!BI->isConditional())
        continue;

      CountedExitBranch = BI;
    } else
      continue;

    // Note that this block may not be the loop latch block, even if the loop
    // has a latch block.
    CountedExitBlock = *I;
    ExitCount = EC;
    break;
  }

  if (!CountedExitBlock)
    return MadeChange;

  BasicBlock *Preheader = L->getLoopPreheader();

  // If we don't have a preheader, then insert one. If we already have a
  // preheader, then we can use it (except if the preheader contains a use of
  // the CTR register because some such uses might be reordered by the
  // selection DAG after the mtctr instruction).
  if (!Preheader || mightUseCTR(Preheader))
    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
  if (!Preheader)
    return MadeChange;

  LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
                    << "\n");

  // Insert the count into the preheader and replace the condition used by the
  // selected branch.
  MadeChange = true;

  SCEVExpander SCEVE(*SE, *DL, "loopcnt");
  LLVMContext &C = SE->getContext();
  Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
  if (!ExitCount->getType()->isPointerTy() &&
      ExitCount->getType() != CountType)
    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
  // mtctr is loaded with the exit count plus one (the iteration count).
  ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
  Value *ECValue =
      SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());

  IRBuilder<> CountBuilder(Preheader->getTerminator());
  Module *M = Preheader->getParent()->getParent();
  Function *MTCTRFunc =
      Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, CountType);
  CountBuilder.CreateCall(MTCTRFunc, ECValue);

  IRBuilder<> CondBuilder(CountedExitBranch);
  Function *DecFunc =
      Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
  Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
  Value *OldCond = CountedExitBranch->getCondition();
  CountedExitBranch->setCondition(NewCond);

  // The false branch must exit the loop.
  if (!L->contains(CountedExitBranch->getSuccessor(0)))
    CountedExitBranch->swapSuccessors();

  // The old condition may be dead now, and may have even created a dead PHI
  // (the original induction variable).
  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
  // Run through the basic blocks of the loop and see if any of them have dead
  // PHIs that can be removed.
  for (auto I : L->blocks())
    DeleteDeadPHIs(I);

  ++NumCTRLoops;
  return MadeChange;
}
683 
684 #ifndef NDEBUG
685 static bool clobbersCTR(const MachineInstr &MI) {
686   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
687     const MachineOperand &MO = MI.getOperand(i);
688     if (MO.isReg()) {
689       if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8))
690         return true;
691     } else if (MO.isRegMask()) {
692       if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8))
693         return true;
694     }
695   }
696 
697   return false;
698 }
699 
// Verifies that the CTR branch at I in MBB is fed by a MTCTRloop/MTCTR8loop
// on every path: walks backwards through the block (and, via the worklist,
// through all predecessors) looking for the mtctr, failing if any
// intervening instruction clobbers the CTR or if a path reaches the function
// entry without finding one.  Returns true if the branch is valid.
static bool verifyCTRBranch(MachineBasicBlock *MBB,
                            MachineBasicBlock::iterator I) {
  MachineBasicBlock::iterator BI = I;  // The branch being verified.
  SmallSet<MachineBasicBlock *, 16>   Visited;  // Guards against CFG cycles.
  SmallVector<MachineBasicBlock *, 8> Preds;    // Worklist of blocks to scan.
  bool CheckPreds;

  if (I == MBB->begin()) {
    Visited.insert(MBB);
    goto queue_preds;
  } else
    --I;

check_block:
  // Scan MBB backwards starting at I for an mtctr or a CTR clobber.
  Visited.insert(MBB);
  if (I == MBB->end())
    goto queue_preds;

  CheckPreds = true;
  for (MachineBasicBlock::iterator IE = MBB->begin();; --I) {
    unsigned Opc = I->getOpcode();
    if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) {
      // Found the defining mtctr on this path; no need to look further back.
      CheckPreds = false;
      break;
    }

    // Anything between the mtctr and the branch that clobbers CTR (other
    // than the branch itself) invalidates the loop.
    if (I != BI && clobbersCTR(*I)) {
      LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName()
                        << ") instruction " << *I
                        << " clobbers CTR, invalidating "
                        << printMBBReference(*BI->getParent()) << " ("
                        << BI->getParent()->getFullName() << ") instruction "
                        << *BI << "\n");
      return false;
    }

    if (I == IE)
      break;
  }

  if (!CheckPreds && Preds.empty())
    return true;

  if (CheckPreds) {
queue_preds:
    // Reaching the entry block without an mtctr means some path lacks one.
    if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) {
      LLVM_DEBUG(dbgs() << "Unable to find a MTCTR instruction for "
                        << printMBBReference(*BI->getParent()) << " ("
                        << BI->getParent()->getFullName() << ") instruction "
                        << *BI << "\n");
      return false;
    }

    for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
         PIE = MBB->pred_end(); PI != PIE; ++PI)
      Preds.push_back(*PI);
  }

  // Continue the backward walk in each unvisited predecessor.
  do {
    MBB = Preds.pop_back_val();
    if (!Visited.count(MBB)) {
      I = MBB->getLastNonDebugInstr();
      goto check_block;
    }
  } while (!Preds.empty());

  return true;
}
768 
769 bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
770   MDT = &getAnalysis<MachineDominatorTree>();
771 
772   // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before
773   // any other instructions that might clobber the ctr register.
774   for (MachineFunction::iterator I = MF.begin(), IE = MF.end();
775        I != IE; ++I) {
776     MachineBasicBlock *MBB = &*I;
777     if (!MDT->isReachableFromEntry(MBB))
778       continue;
779 
780     for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(),
781       MIIE = MBB->end(); MII != MIIE; ++MII) {
782       unsigned Opc = MII->getOpcode();
783       if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ ||
784           Opc == PPC::BDZ8  || Opc == PPC::BDZ)
785         if (!verifyCTRBranch(MBB, MII))
786           llvm_unreachable("Invalid PPC CTR loop!");
787     }
788   }
789 
790   return false;
791 }
792 #endif // NDEBUG
793