xref: /llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp (revision d7fb9eb818d22085c7dae0ce9a8be7ade963d7e5)
1 //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass converts selects to conditional jumps when profitable.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/CodeGen/SelectOptimize.h"
14 #include "llvm/ADT/SmallVector.h"
15 #include "llvm/ADT/Statistic.h"
16 #include "llvm/Analysis/BlockFrequencyInfo.h"
17 #include "llvm/Analysis/BranchProbabilityInfo.h"
18 #include "llvm/Analysis/LoopInfo.h"
19 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
20 #include "llvm/Analysis/ProfileSummaryInfo.h"
21 #include "llvm/Analysis/TargetTransformInfo.h"
22 #include "llvm/CodeGen/Passes.h"
23 #include "llvm/CodeGen/TargetLowering.h"
24 #include "llvm/CodeGen/TargetPassConfig.h"
25 #include "llvm/CodeGen/TargetSchedule.h"
26 #include "llvm/CodeGen/TargetSubtargetInfo.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/PatternMatch.h"
33 #include "llvm/IR/ProfDataUtils.h"
34 #include "llvm/InitializePasses.h"
35 #include "llvm/Pass.h"
36 #include "llvm/Support/ScaledNumber.h"
37 #include "llvm/Target/TargetMachine.h"
38 #include "llvm/Transforms/Utils/SizeOpts.h"
39 #include <algorithm>
40 #include <memory>
41 #include <queue>
42 #include <stack>
43 
44 using namespace llvm;
45 
46 #define DEBUG_TYPE "select-optimize"
47 
// Pass statistics, reported with -stats.
STATISTIC(NumSelectOptAnalyzed,
          "Number of select groups considered for conversion to branch");
STATISTIC(NumSelectConvertedExpColdOperand,
          "Number of select groups converted due to expensive cold operand");
STATISTIC(NumSelectConvertedHighPred,
          "Number of select groups converted due to high-predictability");
STATISTIC(NumSelectUnPred,
          "Number of select groups not converted due to unpredictability");
STATISTIC(NumSelectColdBB,
          "Number of select groups not converted due to cold basic block");
STATISTIC(NumSelectConvertedLoop,
          "Number of select groups converted due to loop-level analysis");
STATISTIC(NumSelectsConverted, "Number of selects converted");

// Command-line knobs tuning the profitability heuristics below.
static cl::opt<unsigned> ColdOperandThreshold(
    "cold-operand-threshold",
    cl::desc("Maximum frequency of path for an operand to be considered cold."),
    cl::init(20), cl::Hidden);

static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
    "cold-operand-max-cost-multiplier",
    cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
             "slice of a cold operand to be considered inexpensive."),
    cl::init(1), cl::Hidden);

static cl::opt<unsigned>
    GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
                          cl::desc("Gradient gain threshold (%)."),
                          cl::init(25), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<unsigned> GainRelativeThreshold(
    "select-opti-loop-relative-gain-threshold",
    cl::desc(
        "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
    cl::init(8), cl::Hidden);

static cl::opt<unsigned> MispredictDefaultRate(
    "mispredict-default-rate", cl::Hidden, cl::init(25),
    cl::desc("Default mispredict rate (initialized to 25%)."));

static cl::opt<bool>
    DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
                               cl::init(false),
                               cl::desc("Disable loop-level heuristics."));
98 namespace {
99 
100 class SelectOptimizeImpl {
101   const TargetMachine *TM = nullptr;
102   const TargetSubtargetInfo *TSI = nullptr;
103   const TargetLowering *TLI = nullptr;
104   const TargetTransformInfo *TTI = nullptr;
105   const LoopInfo *LI = nullptr;
106   BlockFrequencyInfo *BFI;
107   ProfileSummaryInfo *PSI = nullptr;
108   OptimizationRemarkEmitter *ORE = nullptr;
109   TargetSchedModel TSchedModel;
110 
111 public:
112   SelectOptimizeImpl() = default;
113   SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){};
114   PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
115   bool runOnFunction(Function &F, Pass &P);
116 
117 private:
118   // Select groups consist of consecutive select instructions with the same
119   // condition.
120   using SelectGroup = SmallVector<SelectInst *, 2>;
121   using SelectGroups = SmallVector<SelectGroup, 2>;
122 
123   using Scaled64 = ScaledNumber<uint64_t>;
124 
125   struct CostInfo {
126     /// Predicated cost (with selects as conditional moves).
127     Scaled64 PredCost;
128     /// Non-predicated cost (with selects converted to branches).
129     Scaled64 NonPredCost;
130   };
131 
132   // Converts select instructions of a function to conditional jumps when deemed
133   // profitable. Returns true if at least one select was converted.
134   bool optimizeSelects(Function &F);
135 
136   // Heuristics for determining which select instructions can be profitably
137   // conveted to branches. Separate heuristics for selects in inner-most loops
138   // and the rest of code regions (base heuristics for non-inner-most loop
139   // regions).
140   void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
141   void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
142 
143   // Converts to branches the select groups that were deemed
144   // profitable-to-convert.
145   void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
146 
147   // Splits selects of a given basic block into select groups.
148   void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
149 
150   // Determines for which select groups it is profitable converting to branches
151   // (base and inner-most-loop heuristics).
152   void findProfitableSIGroupsBase(SelectGroups &SIGroups,
153                                   SelectGroups &ProfSIGroups);
154   void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
155                                         SelectGroups &ProfSIGroups);
156 
157   // Determines if a select group should be converted to a branch (base
158   // heuristics).
159   bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
160 
161   // Returns true if there are expensive instructions in the cold value
162   // operand's (if any) dependence slice of any of the selects of the given
163   // group.
164   bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
165 
166   // For a given source instruction, collect its backwards dependence slice
167   // consisting of instructions exclusively computed for producing the operands
168   // of the source instruction.
169   void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
170                              Instruction *SI, bool ForSinking = false);
171 
172   // Returns true if the condition of the select is highly predictable.
173   bool isSelectHighlyPredictable(const SelectInst *SI);
174 
175   // Loop-level checks to determine if a non-predicated version (with branches)
176   // of the given loop is more profitable than its predicated version.
177   bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
178 
179   // Computes instruction and loop-critical-path costs for both the predicated
180   // and non-predicated version of the given loop.
181   bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
182                         DenseMap<const Instruction *, CostInfo> &InstCostMap,
183                         CostInfo *LoopCost);
184 
185   // Returns a set of all the select instructions in the given select groups.
186   SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
187 
188   // Returns the latency cost of a given instruction.
189   std::optional<uint64_t> computeInstCost(const Instruction *I);
190 
191   // Returns the misprediction cost of a given select when converted to branch.
192   Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost);
193 
194   // Returns the cost of a branch when the prediction is correct.
195   Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
196                                 const SelectInst *SI);
197 
198   // Returns true if the target architecture supports lowering a given select.
199   bool isSelectKindSupported(SelectInst *SI);
200 };
201 
/// Legacy-pass-manager wrapper: forwards to SelectOptimizeImpl and declares
/// the analyses the implementation consumes.
class SelectOptimize : public FunctionPass {
  SelectOptimizeImpl Impl;

public:
  static char ID;

  SelectOptimize() : FunctionPass(ID) {
    initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    return Impl.runOnFunction(F, *this);
  }

  // All analyses are required; none are declared preserved, so everything is
  // invalidated when the pass reports a change.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  }
};
225 
226 } // namespace
227 
228 PreservedAnalyses SelectOptimizePass::run(Function &F,
229                                           FunctionAnalysisManager &FAM) {
230   SelectOptimizeImpl Impl(TM);
231   return Impl.run(F, FAM);
232 }
233 
char SelectOptimize::ID = 0;

// Register the legacy pass with the PassRegistry, declaring the analysis
// passes it depends on so the legacy pass manager schedules them first.
INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                    false)

// Factory used by the pass pipeline builder.
FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
248 
/// New-PM driver: acquire target info and analyses for \p F, apply the early
/// bail-outs, then run the optimization. Returns none() iff the IR changed.
PreservedAnalyses SelectOptimizeImpl::run(Function &F,
                                          FunctionAnalysisManager &FAM) {
  // TM was supplied at construction (see SelectOptimizePass::run).
  TSI = TM->getSubtargetImpl(F);
  TLI = TSI->getTargetLowering();

  // If none of the select types are supported then skip this pass.
  // This is an optimization pass. Legality issues will be handled by
  // instruction selection.
  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
    return PreservedAnalyses::all();

  TTI = &FAM.getResult<TargetIRAnalysis>(F);
  if (!TTI->enableSelectOptimize())
    return PreservedAnalyses::all();

  // ProfileSummaryAnalysis is a module analysis; a function pass can only
  // read a cached result, hence getCachedResult + the assert.
  PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
            .getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  assert(PSI && "This pass requires module analysis pass `profile-summary`!");
  BFI = &FAM.getResult<BlockFrequencyAnalysis>(F);

  // When optimizing for size, selects are preferable over branches.
  if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
    return PreservedAnalyses::all();

  LI = &FAM.getResult<LoopAnalysis>(F);
  ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  TSchedModel.init(TSI);

  bool Changed = optimizeSelects(F);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
282 
/// Legacy-PM driver: mirrors run() above, but fetches TM and the analyses
/// through the wrapper pass \p P. Returns true iff the IR changed.
bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
  TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  TSI = TM->getSubtargetImpl(F);
  TLI = TSI->getTargetLowering();

  // If none of the select types are supported then skip this pass.
  // This is an optimization pass. Legality issues will be handled by
  // instruction selection.
  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
    return false;

  TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  if (!TTI->enableSelectOptimize())
    return false;

  LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
  PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  TSchedModel.init(TSI);

  // When optimizing for size, selects are preferable over branches.
  if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
    return false;

  return optimizeSelects(F);
}
313 
314 bool SelectOptimizeImpl::optimizeSelects(Function &F) {
315   // Determine for which select groups it is profitable converting to branches.
316   SelectGroups ProfSIGroups;
317   // Base heuristics apply only to non-loops and outer loops.
318   optimizeSelectsBase(F, ProfSIGroups);
319   // Separate heuristics for inner-most loops.
320   optimizeSelectsInnerLoops(F, ProfSIGroups);
321 
322   // Convert to branches the select groups that were deemed
323   // profitable-to-convert.
324   convertProfitableSIGroups(ProfSIGroups);
325 
326   // Code modified if at least one select group was converted.
327   return !ProfSIGroups.empty();
328 }
329 
330 void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
331                                              SelectGroups &ProfSIGroups) {
332   // Collect all the select groups.
333   SelectGroups SIGroups;
334   for (BasicBlock &BB : F) {
335     // Base heuristics apply only to non-loops and outer loops.
336     Loop *L = LI->getLoopFor(&BB);
337     if (L && L->isInnermost())
338       continue;
339     collectSelectGroups(BB, SIGroups);
340   }
341 
342   // Determine for which select groups it is profitable converting to branches.
343   findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
344 }
345 
346 void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
347                                                    SelectGroups &ProfSIGroups) {
348   SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
349   // Need to check size on each iteration as we accumulate child loops.
350   for (unsigned long i = 0; i < Loops.size(); ++i)
351     for (Loop *ChildL : Loops[i]->getSubLoops())
352       Loops.push_back(ChildL);
353 
354   for (Loop *L : Loops) {
355     if (!L->isInnermost())
356       continue;
357 
358     SelectGroups SIGroups;
359     for (BasicBlock *BB : L->getBlocks())
360       collectSelectGroups(*BB, SIGroups);
361 
362     findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
363   }
364 }
365 
366 /// If \p isTrue is true, return the true value of \p SI, otherwise return
367 /// false value of \p SI. If the true/false value of \p SI is defined by any
368 /// select instructions in \p Selects, look through the defining select
369 /// instruction until the true/false value is not defined in \p Selects.
370 static Value *
371 getTrueOrFalseValue(SelectInst *SI, bool isTrue,
372                     const SmallPtrSet<const Instruction *, 2> &Selects) {
373   Value *V = nullptr;
374   for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
375        DefSI = dyn_cast<SelectInst>(V)) {
376     assert(DefSI->getCondition() == SI->getCondition() &&
377            "The condition of DefSI does not match with SI");
378     V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
379   }
380   assert(V && "Failed to get select true/false value");
381   return V;
382 }
383 
/// Lower each select group in \p ProfSIGroups to an explicit conditional
/// branch diamond, sinking exclusively-used operand slices into the new
/// true/false blocks and replacing each select with a PHI in the join block.
void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
  for (SelectGroup &ASI : ProfSIGroups) {
    // The code transformation here is a modified version of the sinking
    // transformation in CodeGenPrepare::optimizeSelectInst with a more
    // aggressive strategy of which instructions to sink.
    //
    // TODO: eliminate the redundancy of logic transforming selects to branches
    // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
    // selects for all cases (with and without profile information).

    // Transform a sequence like this:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %sel = select i1 %cmp, i32 %c, i32 %d
    //
    // Into:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %cmp.frozen = freeze %cmp
    //       br i1 %cmp.frozen, label %select.true, label %select.false
    //    select.true:
    //       br label %select.end
    //    select.false:
    //       br label %select.end
    //    select.end:
    //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
    //
    // %cmp should be frozen, otherwise it may introduce undefined behavior.
    // In addition, we may sink instructions that produce %c or %d into the
    // destination(s) of the new branch.
    // If the true or false blocks do not contain a sunken instruction, that
    // block and its branch may be optimized away. In that case, one side of the
    // first branch will point directly to select.end, and the corresponding PHI
    // predecessor block will be the start block.

    // Find all the instructions that can be soundly sunk to the true/false
    // blocks. These are instructions that are computed solely for producing the
    // operands of the select instructions in the group and can be sunk without
    // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
    // with side effects).
    SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
    typedef std::stack<Instruction *>::size_type StackSizeType;
    StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
    for (SelectInst *SI : ASI) {
      // For each select, compute the sinkable dependence chains of the true and
      // false operands.
      if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
        std::stack<Instruction *> TrueSlice;
        getExclBackwardsSlice(TI, TrueSlice, SI, true);
        maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
        TrueSlices.push_back(TrueSlice);
      }
      if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
        std::stack<Instruction *> FalseSlice;
        getExclBackwardsSlice(FI, FalseSlice, SI, true);
        maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
        FalseSlices.push_back(FalseSlice);
      }
    }
    // In the case of multiple select instructions in the same group, the order
    // of non-dependent instructions (instructions of different dependence
    // slices) in the true/false blocks appears to affect performance.
    // Interleaving the slices seems to experimentally be the optimal approach.
    // This interleaving scheduling allows for more ILP (with a natural downside
    // of increasing a bit register pressure) compared to a simple ordering of
    // one whole chain after another. One would expect that this ordering would
    // not matter since the scheduling in the backend of the compiler  would
    // take care of it, but apparently the scheduler fails to deliver optimal
    // ILP with a naive ordering here.
    SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
    for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
      for (auto &S : TrueSlices) {
        if (!S.empty()) {
          TrueSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }
    for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
      for (auto &S : FalseSlices) {
        if (!S.empty()) {
          FalseSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }

    // We split the block containing the select(s) into two blocks.
    SelectInst *SI = ASI.front();
    SelectInst *LastSI = ASI.back();
    BasicBlock *StartBlock = SI->getParent();
    BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
    BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
    // Delete the unconditional branch that was just created by the split.
    StartBlock->getTerminator()->eraseFromParent();

    // Move any debug/pseudo instructions that were in-between the select
    // group to the newly-created end block.
    SmallVector<Instruction *, 2> DebugPseudoINS;
    auto DIt = SI->getIterator();
    while (&*DIt != LastSI) {
      if (DIt->isDebugOrPseudoInst())
        DebugPseudoINS.push_back(&*DIt);
      DIt++;
    }
    for (auto *DI : DebugPseudoINS) {
      DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
    }

    // Duplicate implementation for DPValues, the non-instruction debug-info
    // record. Helper lambda for moving DPValues to the end block.
    auto TransferDPValues = [&](Instruction &I) {
      for (auto &DPValue : llvm::make_early_inc_range(I.getDbgValueRange())) {
        DPValue.removeFromParent();
        EndBlock->insertDPValueBefore(&DPValue,
                                      EndBlock->getFirstInsertionPt());
      }
    };

    // Iterate over all instructions in between SI and LastSI, not including
    // SI itself. These are all the variable assignments that happen "in the
    // middle" of the select group.
    auto R = make_range(std::next(SI->getIterator()),
                        std::next(LastSI->getIterator()));
    llvm::for_each(R, TransferDPValues);

    // These are the new basic blocks for the conditional branch.
    // At least one will become an actual new basic block.
    BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
    BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
    if (!TrueSlicesInterleaved.empty()) {
      TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
                                     EndBlock->getParent(), EndBlock);
      TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
      TrueBranch->setDebugLoc(LastSI->getDebugLoc());
      for (Instruction *TrueInst : TrueSlicesInterleaved)
        TrueInst->moveBefore(TrueBranch);
    }
    if (!FalseSlicesInterleaved.empty()) {
      FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
                                      EndBlock->getParent(), EndBlock);
      FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(LastSI->getDebugLoc());
      for (Instruction *FalseInst : FalseSlicesInterleaved)
        FalseInst->moveBefore(FalseBranch);
    }
    // If there was nothing to sink, then arbitrarily choose the 'false' side
    // for a new input value to the PHI.
    if (TrueBlock == FalseBlock) {
      assert(TrueBlock == nullptr &&
             "Unexpected basic block transform while optimizing select");

      FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
                                      EndBlock->getParent(), EndBlock);
      auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(SI->getDebugLoc());
    }

    // Insert the real conditional branch based on the original condition.
    // If we did not create a new block for one of the 'true' or 'false' paths
    // of the condition, it means that side of the branch goes to the end block
    // directly and the path originates from the start block from the point of
    // view of the new PHI.
    BasicBlock *TT, *FT;
    if (TrueBlock == nullptr) {
      TT = EndBlock;
      FT = FalseBlock;
      TrueBlock = StartBlock;
    } else if (FalseBlock == nullptr) {
      TT = TrueBlock;
      FT = EndBlock;
      FalseBlock = StartBlock;
    } else {
      TT = TrueBlock;
      FT = FalseBlock;
    }
    IRBuilder<> IB(SI);
    // Freeze the condition to avoid branching on poison/undef (see the
    // transformation sketch above).
    auto *CondFr =
        IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
    IB.CreateCondBr(CondFr, TT, FT, SI);

    SmallPtrSet<const Instruction *, 2> INS;
    INS.insert(ASI.begin(), ASI.end());
    // Use reverse iterator because later select may use the value of the
    // earlier select, and we need to propagate value through earlier select
    // to get the PHI operand.
    for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
      SelectInst *SI = *It;
      // The select itself is replaced with a PHI Node.
      PHINode *PN = PHINode::Create(SI->getType(), 2, "");
      PN->insertBefore(EndBlock->begin());
      PN->takeName(SI);
      PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
      PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
      PN->setDebugLoc(SI->getDebugLoc());

      SI->replaceAllUsesWith(PN);
      SI->eraseFromParent();
      INS.erase(SI);
      ++NumSelectsConverted;
    }
  }
}
588 
589 static bool isSpecialSelect(SelectInst *SI) {
590   using namespace llvm::PatternMatch;
591 
592   // If the select is a logical-and/logical-or then it is better treated as a
593   // and/or by the backend.
594   if (match(SI, m_CombineOr(m_LogicalAnd(m_Value(), m_Value()),
595                             m_LogicalOr(m_Value(), m_Value()))))
596     return true;
597 
598   return false;
599 }
600 
/// Partition the selects of \p BB into groups of consecutive selects (modulo
/// interleaved debug/pseudo instructions) that share the same condition.
void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
                                             SelectGroups &SIGroups) {
  BasicBlock::iterator BBIt = BB.begin();
  while (BBIt != BB.end()) {
    // Note: BBIt is advanced here; the inner loop below advances it further
    // past the instructions absorbed into the current group.
    Instruction *I = &*BBIt++;
    if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
      if (isSpecialSelect(SI))
        continue;

      SelectGroup SIGroup;
      SIGroup.push_back(SI);
      // Extend the group with subsequent selects on the same condition,
      // stopping at the first non-select, non-debug instruction.
      while (BBIt != BB.end()) {
        Instruction *NI = &*BBIt;
        SelectInst *NSI = dyn_cast<SelectInst>(NI);
        if (NSI && SI->getCondition() == NSI->getCondition()) {
          SIGroup.push_back(NSI);
        } else if (!NI->isDebugOrPseudoInst()) {
          // Debug/pseudo instructions should be skipped and not prevent the
          // formation of a select group.
          break;
        }
        ++BBIt;
      }

      // If the select type is not supported, no point optimizing it.
      // Instruction selection will take care of it.
      if (!isSelectKindSupported(SI))
        continue;

      SIGroups.push_back(SIGroup);
    }
  }
}
634 
635 void SelectOptimizeImpl::findProfitableSIGroupsBase(
636     SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
637   for (SelectGroup &ASI : SIGroups) {
638     ++NumSelectOptAnalyzed;
639     if (isConvertToBranchProfitableBase(ASI))
640       ProfSIGroups.push_back(ASI);
641   }
642 }
643 
/// Emit the optimization remark \p Rem via \p ORE and, in debug builds, echo
/// its message to the debug stream.
static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
                               DiagnosticInfoOptimizationBase &Rem) {
  LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
  ORE->emit(Rem);
}
649 
/// Loop-level profitability: first check that converting all the loop's
/// select groups improves the loop's critical path, then keep only the groups
/// whose branch form is cheaper than their predicated form.
void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
  NumSelectOptAnalyzed += SIGroups.size();
  // For each select group in an inner-most loop,
  // a branch is more preferable than a select/conditional-move if:
  // i) conversion to branches for all the select groups of the loop satisfies
  //    loop-level heuristics including reducing the loop's critical path by
  //    some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
  // ii) the total cost of the select group is cheaper with a branch compared
  //     to its predicated version. The cost is in terms of latency and the cost
  //     of a select group is the cost of its most expensive select instruction
  //     (assuming infinite resources and thus fully leveraging available ILP).

  // LoopCost[0]/[1] are the costs at consecutive loop iterations (used by
  // checkLoopHeuristics to estimate the gain gradient).
  DenseMap<const Instruction *, CostInfo> InstCostMap;
  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
                          {Scaled64::getZero(), Scaled64::getZero()}};
  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
      !checkLoopHeuristics(L, LoopCost)) {
    return;
  }

  for (SelectGroup &ASI : SIGroups) {
    // Assuming infinite resources, the cost of a group of instructions is the
    // cost of the most expensive instruction of the group.
    Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
    for (SelectInst *SI : ASI) {
      SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
      BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
    }
    if (BranchCost < SelectCost) {
      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
      OR << "Profitable to convert to branch (loop analysis). BranchCost="
         << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
         << ". ";
      EmitAndPrintRemark(ORE, OR);
      ++NumSelectConvertedLoop;
      ProfSIGroups.push_back(ASI);
    } else {
      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
      ORmiss << "Select is more profitable (loop analysis). BranchCost="
             << BranchCost.toString()
             << ", SelectCost=" << SelectCost.toString() << ". ";
      EmitAndPrintRemark(ORE, ORmiss);
    }
  }
}
696 
/// Base (non-inner-loop) heuristic for a single select group: bail out on
/// cold blocks and unpredictable selects, convert on high predictability or
/// an expensive cold-operand dependence slice.
bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
    const SmallVector<SelectInst *, 2> &ASI) {
  // The group's first select stands in for the whole group in the remarks.
  SelectInst *SI = ASI.front();
  LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI << "\n");
  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI);
  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI);

  // Skip cold basic blocks. Better to optimize for size for cold blocks.
  if (PSI->isColdBlock(SI->getParent(), BFI)) {
    ++NumSelectColdBB;
    ORmiss << "Not converted to branch because of cold basic block. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If unpredictable, branch form is less profitable.
  if (SI->getMetadata(LLVMContext::MD_unpredictable)) {
    ++NumSelectUnPred;
    ORmiss << "Not converted to branch because of unpredictable branch. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If highly predictable, branch form is more profitable, unless a
  // predictable select is inexpensive in the target architecture.
  if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
    ++NumSelectConvertedHighPred;
    OR << "Converted to branch because of highly predictable branch. ";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  // Look for expensive instructions in the cold operand's (if any) dependence
  // slice of any of the selects in the group.
  if (hasExpensiveColdOperand(ASI)) {
    ++NumSelectConvertedExpColdOperand;
    OR << "Converted to branch because of expensive cold operand.";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  ORmiss << "Not profitable to convert to branch (base heuristic).";
  EmitAndPrintRemark(ORE, ORmiss);
  return false;
}
742 
743 static InstructionCost divideNearest(InstructionCost Numerator,
744                                      uint64_t Denominator) {
745   return (Numerator + (Denominator / 2)) / Denominator;
746 }
747 
// Returns true if some select in the group has a rarely-taken ("cold") value
// operand whose exclusive backwards dependence slice is expensive. In that
// case a branch is profitable: the select (cmov) form must compute the cold
// operand unconditionally on every execution.
bool SelectOptimizeImpl::hasExpensiveColdOperand(
    const SmallVector<SelectInst *, 2> &ASI) {
  bool ColdOperand = false;
  // NOTE: TrueWeight/FalseWeight/TotalWeight are written only when
  // extractBranchWeights succeeds; the early return below (when !ColdOperand)
  // guarantees they are never read uninitialized.
  uint64_t TrueWeight, FalseWeight, TotalWeight;
  if (extractBranchWeights(*ASI.front(), TrueWeight, FalseWeight)) {
    uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
    TotalWeight = TrueWeight + FalseWeight;
    // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
    ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
  } else if (PSI->hasProfileSummary()) {
    // Profile data exists but this select lacks branch-weights metadata;
    // surface the gap rather than silently skipping the heuristic.
    OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
    ORmiss << "Profile data available but missing branch-weights metadata for "
              "select instruction. ";
    EmitAndPrintRemark(ORE, ORmiss);
  }
  if (!ColdOperand)
    return false;
  // Check if the cold path's dependence slice is expensive for any of the
  // selects of the group.
  // NOTE(review): the weights extracted from the group's head select are
  // reused for every member — presumably all selects in a group share the
  // same condition/weights; confirm against the group-formation logic.
  for (SelectInst *SI : ASI) {
    // The cold operand is the value selected on the less likely path; the
    // hot weight is that of the opposite (frequent) path.
    Instruction *ColdI = nullptr;
    uint64_t HotWeight;
    if (TrueWeight < FalseWeight) {
      ColdI = dyn_cast<Instruction>(SI->getTrueValue());
      HotWeight = FalseWeight;
    } else {
      ColdI = dyn_cast<Instruction>(SI->getFalseValue());
      HotWeight = TrueWeight;
    }
    if (ColdI) {
      // Sum the latency of the cold operand's exclusive (one-use) backwards
      // dependence slice — the work a cmov would perform unconditionally.
      std::stack<Instruction *> ColdSlice;
      getExclBackwardsSlice(ColdI, ColdSlice, SI);
      InstructionCost SliceCost = 0;
      while (!ColdSlice.empty()) {
        SliceCost += TTI->getInstructionCost(ColdSlice.top(),
                                             TargetTransformInfo::TCK_Latency);
        ColdSlice.pop();
      }
      // The colder the cold value operand of the select is the more expensive
      // the cmov becomes for computing the cold value operand every time. Thus,
      // the colder the cold operand is the more its cost counts.
      // Get nearest integer cost adjusted for coldness.
      InstructionCost AdjSliceCost =
          divideNearest(SliceCost * HotWeight, TotalWeight);
      if (AdjSliceCost >=
          ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
        return true;
    }
  }
  return false;
}
799 
800 // Check if it is safe to move LoadI next to the SI.
801 // Conservatively assume it is safe only if there is no instruction
802 // modifying memory in-between the load and the select instruction.
803 static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
804   // Assume loads from different basic blocks are unsafe to move.
805   if (LoadI->getParent() != SI->getParent())
806     return false;
807   auto It = LoadI->getIterator();
808   while (&*It != SI) {
809     if (It->mayWriteToMemory())
810       return false;
811     It++;
812   }
813   return true;
814 }
815 
816 // For a given source instruction, collect its backwards dependence slice
817 // consisting of instructions exclusively computed for the purpose of producing
818 // the operands of the source instruction. As an approximation
819 // (sufficiently-accurate in practice), we populate this set with the
820 // instructions of the backwards dependence slice that only have one-use and
821 // form an one-use chain that leads to the source instruction.
822 void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
823                                                std::stack<Instruction *> &Slice,
824                                                Instruction *SI,
825                                                bool ForSinking) {
826   SmallPtrSet<Instruction *, 2> Visited;
827   std::queue<Instruction *> Worklist;
828   Worklist.push(I);
829   while (!Worklist.empty()) {
830     Instruction *II = Worklist.front();
831     Worklist.pop();
832 
833     // Avoid cycles.
834     if (!Visited.insert(II).second)
835       continue;
836 
837     if (!II->hasOneUse())
838       continue;
839 
840     // Cannot soundly sink instructions with side-effects.
841     // Terminator or phi instructions cannot be sunk.
842     // Avoid sinking other select instructions (should be handled separetely).
843     if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
844                        isa<SelectInst>(II) || isa<PHINode>(II)))
845       continue;
846 
847     // Avoid sinking loads in order not to skip state-modifying instructions,
848     // that may alias with the loaded address.
849     // Only allow sinking of loads within the same basic block that are
850     // conservatively proven to be safe.
851     if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
852       continue;
853 
854     // Avoid considering instructions with less frequency than the source
855     // instruction (i.e., avoid colder code regions of the dependence slice).
856     if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
857       continue;
858 
859     // Eligible one-use instruction added to the dependence slice.
860     Slice.push(II);
861 
862     // Explore all the operands of the current instruction to expand the slice.
863     for (unsigned k = 0; k < II->getNumOperands(); ++k)
864       if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
865         Worklist.push(OpI);
866   }
867 }
868 
869 bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectInst *SI) {
870   uint64_t TrueWeight, FalseWeight;
871   if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {
872     uint64_t Max = std::max(TrueWeight, FalseWeight);
873     uint64_t Sum = TrueWeight + FalseWeight;
874     if (Sum != 0) {
875       auto Probability = BranchProbability::getBranchProbability(Max, Sum);
876       if (Probability > TTI->getPredictableBranchThreshold())
877         return true;
878     }
879   }
880   return false;
881 }
882 
// Decide whether converting this loop's selects to branches is profitable.
// LoopCost[0] and LoopCost[1] hold the critical-path costs computed for the
// first and second analyzed loop iteration respectively (see
// computeLoopCosts). Returns true when the branch (non-predicated) form wins.
bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
                                             const CostInfo LoopCost[2]) {
  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the loop is more profitable than its predicated version.

  if (DisableLoopLevelHeuristics)
    return true;

  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
                                   L->getHeader()->getFirstNonPHI());

  // Require the branch form to shorten (or at least not lengthen) the
  // critical path in iteration 0 and strictly shorten it in iteration 1.
  if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
      LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
    ORmissL << "No select conversion in the loop due to no reduction of loop's "
               "critical path. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // Per-iteration savings of the branch form over the select form.
  Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};

  // Profitably converting to branches need to reduce the loop's critical path
  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
  // relative gain of 12.5%).
  if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
      Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
    Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
    ORmissL << "No select conversion in the loop due to small reduction of "
               "loop's critical path. Gain="
            << Gain[1].toString()
            << ", RelativeGain=" << RelativeGain.toString() << "%. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // If the loop's critical path involves loop-carried dependences, the gradient
  // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
  // This check ensures that the latency reduction for the loop's critical path
  // keeps decreasing with sufficient rate beyond the two analyzed loop
  // iterations.
  if (Gain[1] > Gain[0]) {
    // Growing gain: require the growth rate (relative to the growth of the
    // predicated cost) to clear the gradient threshold.
    Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
                            (LoopCost[1].PredCost - LoopCost[0].PredCost);
    if (GradientGain < Scaled64::get(GainGradientThreshold)) {
      ORmissL << "No select conversion in the loop due to small gradient gain. "
                 "GradientGain="
              << GradientGain.toString() << "%. ";
      EmitAndPrintRemark(ORE, ORmissL);
      return false;
    }
  }
  // If the gain decreases it is not profitable to convert.
  else if (Gain[1] < Gain[0]) {
    ORmissL
        << "No select conversion in the loop due to negative gradient gain. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }
  // (Gain[1] == Gain[0] falls through: a steady gain is acceptable.)

  // Non-predicated version of the loop is more profitable than its
  // predicated version.
  return true;
}
947 
// Computes instruction and loop-critical-path costs for both the predicated
// and non-predicated version of the given loop.
// Returns false if unable to compute these costs due to invalid cost of loop
// instruction(s).
//
// InstCostMap is filled with per-instruction {PredCost, NonPredCost}; LoopCost
// (an array of two CostInfo) receives the critical-path cost of each of the
// two analyzed iterations. Two iterations are modeled so that loop-carried
// dependences (operands already in InstCostMap from iteration 0) are captured
// in iteration 1.
bool SelectOptimizeImpl::computeLoopCosts(
    const Loop *L, const SelectGroups &SIGroups,
    DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
  LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
                    << L->getHeader()->getName() << "\n");
  // Set of all select instructions in the groups, for O(1) membership tests.
  const auto &SIset = getSIset(SIGroups);
  // Compute instruction and loop-critical-path costs across two iterations for
  // both predicated and non-predicated version.
  const unsigned Iterations = 2;
  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
    // Cost of the loop's critical path.
    CostInfo &MaxCost = LoopCost[Iter];
    for (BasicBlock *BB : L->getBlocks()) {
      for (const Instruction &I : *BB) {
        if (I.isDebugOrPseudoInst())
          continue;
        // Compute the predicated and non-predicated cost of the instruction.
        Scaled64 IPredCost = Scaled64::getZero(),
                 INonPredCost = Scaled64::getZero();

        // Assume infinite resources that allow to fully exploit the available
        // instruction-level parallelism.
        // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
        for (const Use &U : I.operands()) {
          auto UI = dyn_cast<Instruction>(U.get());
          if (!UI)
            continue;
          // Operands not yet in the map (e.g., defined outside the loop, or
          // later in iteration order on the first pass) contribute zero.
          if (InstCostMap.count(UI)) {
            IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
            INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
          }
        }
        auto ILatency = computeInstCost(&I);
        if (!ILatency) {
          // An invalid target cost invalidates the whole loop analysis.
          OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
          ORmissL << "Invalid instruction cost preventing analysis and "
                     "optimization of the inner-most loop containing this "
                     "instruction. ";
          EmitAndPrintRemark(ORE, ORmissL);
          return false;
        }
        IPredCost += Scaled64::get(*ILatency);
        INonPredCost += Scaled64::get(*ILatency);

        // For a select that can be converted to branch,
        // compute its cost as a branch (non-predicated cost).
        //
        // BranchCost = PredictedPathCost + MispredictCost
        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
        if (SIset.contains(&I)) {
          auto SI = cast<SelectInst>(&I);

          Scaled64 TrueOpCost = Scaled64::getZero(),
                   FalseOpCost = Scaled64::getZero();
          if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
            if (InstCostMap.count(TI))
              TrueOpCost = InstCostMap[TI].NonPredCost;
          if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
            if (InstCostMap.count(FI))
              FalseOpCost = InstCostMap[FI].NonPredCost;
          Scaled64 PredictedPathCost =
              getPredictedPathCost(TrueOpCost, FalseOpCost, SI);

          Scaled64 CondCost = Scaled64::getZero();
          if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
            if (InstCostMap.count(CI))
              CondCost = InstCostMap[CI].NonPredCost;
          Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);

          // The branch form replaces (not adds to) the accumulated cost.
          INonPredCost = PredictedPathCost + MispredictCost;
        }
        LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
                          << INonPredCost << " for " << I << "\n");

        // Second-iteration entries overwrite first-iteration ones, so
        // iteration 1 sees iteration 0's costs through loop-carried operands.
        InstCostMap[&I] = {IPredCost, INonPredCost};
        MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
        MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
      }
    }
    LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
                      << " MaxCost = " << MaxCost.PredCost << " "
                      << MaxCost.NonPredCost << "\n");
  }
  return true;
}
1038 
1039 SmallPtrSet<const Instruction *, 2>
1040 SelectOptimizeImpl::getSIset(const SelectGroups &SIGroups) {
1041   SmallPtrSet<const Instruction *, 2> SIset;
1042   for (const SelectGroup &ASI : SIGroups)
1043     for (const SelectInst *SI : ASI)
1044       SIset.insert(SI);
1045   return SIset;
1046 }
1047 
1048 std::optional<uint64_t>
1049 SelectOptimizeImpl::computeInstCost(const Instruction *I) {
1050   InstructionCost ICost =
1051       TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
1052   if (auto OC = ICost.getValue())
1053     return std::optional<uint64_t>(*OC);
1054   return std::nullopt;
1055 }
1056 
1057 ScaledNumber<uint64_t>
1058 SelectOptimizeImpl::getMispredictionCost(const SelectInst *SI,
1059                                          const Scaled64 CondCost) {
1060   uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
1061 
1062   // Account for the default misprediction rate when using a branch
1063   // (conservatively set to 25% by default).
1064   uint64_t MispredictRate = MispredictDefaultRate;
1065   // If the select condition is obviously predictable, then the misprediction
1066   // rate is zero.
1067   if (isSelectHighlyPredictable(SI))
1068     MispredictRate = 0;
1069 
1070   // CondCost is included to account for cases where the computation of the
1071   // condition is part of a long dependence chain (potentially loop-carried)
1072   // that would delay detection of a misprediction and increase its cost.
1073   Scaled64 MispredictCost =
1074       std::max(Scaled64::get(MispredictPenalty), CondCost) *
1075       Scaled64::get(MispredictRate);
1076   MispredictCost /= Scaled64::get(100);
1077 
1078   return MispredictCost;
1079 }
1080 
1081 // Returns the cost of a branch when the prediction is correct.
1082 // TrueCost * TrueProbability + FalseCost * FalseProbability.
1083 ScaledNumber<uint64_t>
1084 SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
1085                                          const SelectInst *SI) {
1086   Scaled64 PredPathCost;
1087   uint64_t TrueWeight, FalseWeight;
1088   if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {
1089     uint64_t SumWeight = TrueWeight + FalseWeight;
1090     if (SumWeight != 0) {
1091       PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
1092                      FalseCost * Scaled64::get(FalseWeight);
1093       PredPathCost /= Scaled64::get(SumWeight);
1094       return PredPathCost;
1095     }
1096   }
1097   // Without branch weight metadata, we assume 75% for the one path and 25% for
1098   // the other, and pick the result with the biggest cost.
1099   PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
1100                           FalseCost * Scaled64::get(3) + TrueCost);
1101   PredPathCost /= Scaled64::get(4);
1102   return PredPathCost;
1103 }
1104 
1105 bool SelectOptimizeImpl::isSelectKindSupported(SelectInst *SI) {
1106   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
1107   if (VectorCond)
1108     return false;
1109   TargetLowering::SelectSupportKind SelectKind;
1110   if (SI->getType()->isVectorTy())
1111     SelectKind = TargetLowering::ScalarCondVectorVal;
1112   else
1113     SelectKind = TargetLowering::ScalarValSelect;
1114   return TLI->isSelectSupported(SelectKind);
1115 }
1116