//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass converts selects to conditional jumps when profitable.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/SelectOptimize.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/ScaledNumber.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <queue>
#include <stack>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "select-optimize"

STATISTIC(NumSelectOptAnalyzed,
          "Number of select groups considered for conversion to branch");
STATISTIC(NumSelectConvertedExpColdOperand,
          "Number of select groups converted due to expensive cold operand");
STATISTIC(NumSelectConvertedHighPred,
          "Number of select groups converted due to high-predictability");
STATISTIC(NumSelectUnPred,
          "Number of select groups not converted due to unpredictability");
STATISTIC(NumSelectColdBB,
          "Number of select groups not converted due to cold basic block");
STATISTIC(NumSelectConvertedLoop,
          "Number of select groups converted due to loop-level analysis");
STATISTIC(NumSelectsConverted, "Number of selects converted");

static cl::opt<unsigned> ColdOperandThreshold(
    "cold-operand-threshold",
    cl::desc("Maximum frequency of path for an operand to be considered cold."),
    cl::init(20), cl::Hidden);

static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
    "cold-operand-max-cost-multiplier",
    cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
             "slice of a cold operand to be considered inexpensive."),
    cl::init(1), cl::Hidden);

static cl::opt<unsigned>
    GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
                          cl::desc("Gradient gain threshold (%)."),
                          cl::init(25), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<unsigned> GainRelativeThreshold(
    "select-opti-loop-relative-gain-threshold",
    cl::desc(
        "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
    cl::init(8), cl::Hidden);
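// Note: the relative-gain check divides by this value, so the default of 8
// corresponds to a minimum relative gain of 1/8 = 12.5% of the loop's
// predicated critical-path cost (see checkLoopHeuristics below).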

static cl::opt<unsigned> MispredictDefaultRate(
    "mispredict-default-rate", cl::Hidden, cl::init(25),
    cl::desc("Default mispredict rate (initialized to 25%)."));

static cl::opt<bool>
    DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
                               cl::init(false),
                               cl::desc("Disable loop-level heuristics."));

namespace {

class SelectOptimizeImpl {
  const TargetMachine *TM = nullptr;
  const TargetSubtargetInfo *TSI = nullptr;
  const TargetLowering *TLI = nullptr;
  const TargetTransformInfo *TTI = nullptr;
  const LoopInfo *LI = nullptr;
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI = nullptr;
  OptimizationRemarkEmitter *ORE = nullptr;
  TargetSchedModel TSchedModel;

public:
  SelectOptimizeImpl() = default;
  SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){};
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
  bool runOnFunction(Function &F, Pass &P);

  using Scaled64 = ScaledNumber<uint64_t>;

  struct CostInfo {
    /// Predicated cost (with selects as conditional moves).
    Scaled64 PredCost;
    /// Non-predicated cost (with selects converted to branches).
    Scaled64 NonPredCost;
  };

  /// SelectLike is an abstraction over SelectInst and other operations that can
  /// act like selects. For example Or(Zext(icmp), X) can be treated like
  /// select(icmp, X|1, X).
  class SelectLike {
    /// The select (/or) instruction.
    Instruction *I;
    /// Whether this select is inverted, "not(cond), FalseVal, TrueVal", as
    /// opposed to the original condition.
    bool Inverted = false;

    /// The index of the operand that depends on the condition. Only for
    /// select-like instructions such as Or/Add.
    unsigned CondIdx;

  public:
    SelectLike(Instruction *I, bool Inverted = false, unsigned CondIdx = 0)
        : I(I), Inverted(Inverted), CondIdx(CondIdx) {}

    Instruction *getI() { return I; }
    const Instruction *getI() const { return I; }

    Type *getType() const { return I->getType(); }

    unsigned getConditionOpIndex() { return CondIdx; };

    /// Return the true value for the SelectLike instruction. Note this may not
    /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
    /// the true value would be `or(x,1)`. As this value does not exist, nullptr
    /// is returned.
    Value *getTrueValue(bool HonorInverts = true) const {
      if (Inverted && HonorInverts)
        return getFalseValue(/*HonorInverts=*/false);
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->getTrueValue();
      // Or(zext) case - The true value is Or(X), so return nullptr as the value
      // does not yet exist.
      if (isa<BinaryOperator>(I))
        return nullptr;

      llvm_unreachable("Unhandled case in getTrueValue");
    }
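
    // For illustration: given `%n = xor i1 %c, true` and
    // `%s = select i1 %n, i32 %a, i32 %b`, the SelectLike for %s is recorded
    // with the group condition %c and Inverted == true, so getTrueValue()
    // returns %b and getFalseValue() returns %a (i.e. values relative to %c).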

    /// Return the false value for the SelectLike instruction. For example the
    /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
    /// `select(c, x|1, x)`).
    Value *getFalseValue(bool HonorInverts = true) const {
      if (Inverted && HonorInverts)
        return getTrueValue(/*HonorInverts=*/false);
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->getFalseValue();
      // We are on the branch where the condition is zero, which means BinOp
      // does not perform any computation, and we can simply return the operand
      // that is not related to the condition.
      if (auto *BO = dyn_cast<BinaryOperator>(I))
        return BO->getOperand(1 - CondIdx);

      llvm_unreachable("Unhandled case in getFalseValue");
    }

    /// Return the non-predicated cost of the op on the \p IsTrue branch, given
    /// the costs in \p InstCostMap. This may need to be generated for
    /// select-like instructions.
    Scaled64 getOpCostOnBranch(
        bool IsTrue, const DenseMap<const Instruction *, CostInfo> &InstCostMap,
        const TargetTransformInfo *TTI) {
      auto *V = IsTrue ? getTrueValue() : getFalseValue();
      if (V) {
        if (auto *IV = dyn_cast<Instruction>(V)) {
          auto It = InstCostMap.find(IV);
          return It != InstCostMap.end() ? It->second.NonPredCost
                                         : Scaled64::getZero();
        }
        return Scaled64::getZero();
      }
      // If getTrue(False)Value() returns nullptr, it means we are dealing with
      // a select-like instruction on the branch where the actual computation
      // happens. In that case the cost is the cost of the computation plus the
      // cost of the operand that does not depend on the condition.
      InstructionCost Cost = TTI->getArithmeticInstrCost(
          getI()->getOpcode(), I->getType(), TargetTransformInfo::TCK_Latency,
          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
          {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
      auto TotalCost = Scaled64::get(*Cost.getValue());
      if (auto *OpI = dyn_cast<Instruction>(I->getOperand(1 - CondIdx))) {
        auto It = InstCostMap.find(OpI);
        if (It != InstCostMap.end())
          TotalCost += It->second.NonPredCost;
      }
      return TotalCost;
    }
  };

private:
  // Select groups consist of consecutive select-like instructions with the
  // same condition. Between the select-likes there can be any number of
  // auxiliary instructions related to the condition, such as not or zext.
  struct SelectGroup {
    Value *Condition;
    SmallVector<SelectLike, 2> Selects;
  };
  using SelectGroups = SmallVector<SelectGroup, 2>;

  // Converts select instructions of a function to conditional jumps when
  // deemed profitable. Returns true if at least one select was converted.
  bool optimizeSelects(Function &F);

  // Heuristics for determining which select instructions can be profitably
  // converted to branches. Separate heuristics for selects in inner-most loops
  // and the rest of code regions (base heuristics for non-inner-most loop
  // regions).
  void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
  void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);

  // Converts to branches the select groups that were deemed
  // profitable-to-convert.
  void convertProfitableSIGroups(SelectGroups &ProfSIGroups);

  // Splits selects of a given basic block into select groups.
  void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
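
  // For illustration, consecutive selects on the same condition, e.g.
  //   %s1 = select i1 %c, i32 %a, i32 %b
  //   %s2 = select i1 %c, i32 %x, i32 %y
  // form a single select group with Condition == %c.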

  // Determines which select groups are profitable to convert to branches
  // (base and inner-most-loop heuristics).
  void findProfitableSIGroupsBase(SelectGroups &SIGroups,
                                  SelectGroups &ProfSIGroups);
  void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
                                        SelectGroups &ProfSIGroups);

  // Determines if a select group should be converted to a branch (base
  // heuristics).
  bool isConvertToBranchProfitableBase(const SelectGroup &ASI);

  // Returns true if there are expensive instructions in the cold value
  // operand's (if any) dependence slice of any of the selects of the given
  // group.
  bool hasExpensiveColdOperand(const SelectGroup &ASI);

  // For a given source instruction, collect its backwards dependence slice
  // consisting of instructions exclusively computed for producing the operands
  // of the source instruction.
  void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
                             Instruction *SI, bool ForSinking = false);

  // Returns true if the condition of the select is highly predictable.
  bool isSelectHighlyPredictable(const SelectLike SI);

  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the given loop is more profitable than its predicated version.
  bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);

  // Computes instruction and loop-critical-path costs for both the predicated
  // and non-predicated version of the given loop.
  bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
                        DenseMap<const Instruction *, CostInfo> &InstCostMap,
                        CostInfo *LoopCost);

  // Returns a map of all the select-like instructions in the given select
  // groups.
  SmallDenseMap<const Instruction *, SelectLike, 2>
  getSImap(const SelectGroups &SIGroups);

  // Returns a map from select-like instructions to the corresponding select
  // group.
  SmallDenseMap<const Instruction *, const SelectGroup *, 2>
  getSGmap(const SelectGroups &SIGroups);

  // Returns the latency cost of a given instruction.
  std::optional<uint64_t> computeInstCost(const Instruction *I);

  // Returns the misprediction cost of a given select when converted to branch.
  Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost);

  // Returns the cost of a branch when the prediction is correct.
  Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
                                const SelectLike SI);

  // Returns true if the target architecture supports lowering a given select.
  bool isSelectKindSupported(const SelectLike SI);
};

class SelectOptimize : public FunctionPass {
  SelectOptimizeImpl Impl;

public:
  static char ID;

  SelectOptimize() : FunctionPass(ID) {
    initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    return Impl.runOnFunction(F, *this);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  }
};

} // namespace

PreservedAnalyses SelectOptimizePass::run(Function &F,
                                          FunctionAnalysisManager &FAM) {
  SelectOptimizeImpl Impl(TM);
  return Impl.run(F, FAM);
}

char SelectOptimize::ID = 0;

INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                    false)

FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }

PreservedAnalyses SelectOptimizeImpl::run(Function &F,
                                          FunctionAnalysisManager &FAM) {
  TSI = TM->getSubtargetImpl(F);
  TLI = TSI->getTargetLowering();

  // If none of the select types are supported then skip this pass.
  // This is an optimization pass. Legality issues will be handled by
  // instruction selection.
  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
    return PreservedAnalyses::all();

  TTI = &FAM.getResult<TargetIRAnalysis>(F);
  if (!TTI->enableSelectOptimize())
    return PreservedAnalyses::all();

  PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
            .getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  assert(PSI && "This pass requires module analysis pass `profile-summary`!");
  BFI = &FAM.getResult<BlockFrequencyAnalysis>(F);

  // When optimizing for size, selects are preferable over branches.
  if (llvm::shouldOptimizeForSize(&F, PSI, BFI))
    return PreservedAnalyses::all();

  LI = &FAM.getResult<LoopAnalysis>(F);
  ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  TSchedModel.init(TSI);

  bool Changed = optimizeSelects(F);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
  TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  TSI = TM->getSubtargetImpl(F);
  TLI = TSI->getTargetLowering();

  // If none of the select types are supported then skip this pass.
  // This is an optimization pass. Legality issues will be handled by
  // instruction selection.
  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
    return false;

  TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  if (!TTI->enableSelectOptimize())
    return false;

  LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
  PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  TSchedModel.init(TSI);

  // When optimizing for size, selects are preferable over branches.
  if (llvm::shouldOptimizeForSize(&F, PSI, BFI))
    return false;

  return optimizeSelects(F);
}

bool SelectOptimizeImpl::optimizeSelects(Function &F) {
  // Determine which select groups are profitable to convert to branches.
  SelectGroups ProfSIGroups;
  // Base heuristics apply only to non-loops and outer loops.
  optimizeSelectsBase(F, ProfSIGroups);
  // Separate heuristics for inner-most loops.
  optimizeSelectsInnerLoops(F, ProfSIGroups);

  // Convert to branches the select groups that were deemed
  // profitable-to-convert.
  convertProfitableSIGroups(ProfSIGroups);

  // Code modified if at least one select group was converted.
  return !ProfSIGroups.empty();
}

void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
                                             SelectGroups &ProfSIGroups) {
  // Collect all the select groups.
  SelectGroups SIGroups;
  for (BasicBlock &BB : F) {
    // Base heuristics apply only to non-loops and outer loops.
    Loop *L = LI->getLoopFor(&BB);
    if (L && L->isInnermost())
      continue;
    collectSelectGroups(BB, SIGroups);
  }

  // Determine which select groups are profitable to convert to branches.
  findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
}

void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
                                                   SelectGroups &ProfSIGroups) {
  SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
  // Need to check size on each iteration as we accumulate child loops.
  for (unsigned long i = 0; i < Loops.size(); ++i)
    for (Loop *ChildL : Loops[i]->getSubLoops())
      Loops.push_back(ChildL);

  for (Loop *L : Loops) {
    if (!L->isInnermost())
      continue;

    SelectGroups SIGroups;
    for (BasicBlock *BB : L->getBlocks())
      collectSelectGroups(*BB, SIGroups);

    findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
  }
}
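
// A sketch of the two cases handled by getTrueOrFalseValue below: for a plain
// `select i1 %c, i32 %a, i32 %b`, the true/false values %a and %b already
// exist. For a select-like `%r = or i32 %x, %z` with `%z = zext i1 %c to i32`,
// only the false-side value (%x) exists; the true-side value is materialised
// by cloning the `or` into the true block and replacing the condition operand
// with the constant 1.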

/// Returns the optimised value on the \p isTrue branch. For a SelectInst that
/// would be either the True or the False value. For (BinaryOperator)
/// instructions, where the condition may be skipped, the operation will use a
/// non-conditional operand. For example, for `or(V,zext(cond))` this function
/// would return V. However, if the conditional operand on the \p isTrue branch
/// matters, we create a clone of the instruction at the end of that branch
/// \p B and replace the condition operand with a constant.
///
/// Also \p OptSelects contains previously optimised select-like instructions.
/// If the current value uses one of the optimised values, we can optimise it
/// further by replacing it with the corresponding value on the given branch.
static Value *getTrueOrFalseValue(
    SelectOptimizeImpl::SelectLike &SI, bool isTrue,
    SmallDenseMap<Instruction *, std::pair<Value *, Value *>, 2> &OptSelects,
    BasicBlock *B) {
  Value *V = isTrue ? SI.getTrueValue() : SI.getFalseValue();
  if (V) {
    auto *IV = dyn_cast<Instruction>(V);
    if (IV && OptSelects.count(IV))
      return isTrue ? OptSelects[IV].first : OptSelects[IV].second;
    return V;
  }

  auto *BO = cast<BinaryOperator>(SI.getI());
  assert((BO->getOpcode() == Instruction::Add ||
          BO->getOpcode() == Instruction::Or ||
          BO->getOpcode() == Instruction::Sub) &&
         "Only currently handling Add, Or and Sub binary operators.");

  auto *CBO = BO->clone();
  auto CondIdx = SI.getConditionOpIndex();
  CBO->setOperand(CondIdx, ConstantInt::get(CBO->getType(), 1));

  unsigned OtherIdx = 1 - CondIdx;
  if (auto *IV = dyn_cast<Instruction>(CBO->getOperand(OtherIdx))) {
    if (OptSelects.count(IV))
      CBO->setOperand(OtherIdx,
                      isTrue ? OptSelects[IV].first : OptSelects[IV].second);
  }
  CBO->insertBefore(B->getTerminator());
  return CBO;
}

void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
  for (SelectGroup &ASI : ProfSIGroups) {
    // The code transformation here is a modified version of the sinking
    // transformation in CodeGenPrepare::optimizeSelectInst with a more
    // aggressive strategy of which instructions to sink.
    //
    // TODO: eliminate the redundancy of logic transforming selects to branches
    // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
    // selects for all cases (with and without profile information).

    // Transform a sequence like this:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %sel = select i1 %cmp, i32 %c, i32 %d
    //
    // Into:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %cmp.frozen = freeze %cmp
    //       br i1 %cmp.frozen, label %select.true, label %select.false
    //    select.true:
    //       br label %select.end
    //    select.false:
    //       br label %select.end
    //    select.end:
    //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
    //
    // %cmp should be frozen, otherwise it may introduce undefined behavior.
    // In addition, we may sink instructions that produce %c or %d into the
    // destination(s) of the new branch.
    // If the true or false blocks do not contain a sunken instruction, that
    // block and its branch may be optimized away. In that case, one side of the
    // first branch will point directly to select.end, and the corresponding PHI
    // predecessor block will be the start block.
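
    // For illustration, if %c in the example above were produced by
    // `%c = add i32 %t, 42` and used only by the select, that `add` would be
    // collected by getExclBackwardsSlice and sunk into select.true, so it only
    // executes when the condition is true.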

    // Find all the instructions that can be soundly sunk to the true/false
    // blocks. These are instructions that are computed solely for producing
    // the operands of the select instructions in the group and can be sunk
    // without breaking the semantics of the LLVM IR (e.g., cannot sink
    // instructions with side effects).
    SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
    typedef std::stack<Instruction *>::size_type StackSizeType;
    StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
    for (SelectLike &SI : ASI.Selects) {
      if (!isa<SelectInst>(SI.getI()))
        continue;
      // For each select, compute the sinkable dependence chains of the true
      // and false operands.
      if (auto *TI = dyn_cast_or_null<Instruction>(SI.getTrueValue())) {
        std::stack<Instruction *> TrueSlice;
        getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true);
        maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
        TrueSlices.push_back(TrueSlice);
      }
      if (auto *FI = dyn_cast_or_null<Instruction>(SI.getFalseValue())) {
        if (isa<SelectInst>(SI.getI()) || !FI->hasOneUse()) {
          std::stack<Instruction *> FalseSlice;
          getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true);
          maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
          FalseSlices.push_back(FalseSlice);
        }
      }
    }
    // In the case of multiple select instructions in the same group, the order
    // of non-dependent instructions (instructions of different dependence
    // slices) in the true/false blocks appears to affect performance.
    // Interleaving the slices seems to experimentally be the optimal approach.
    // This interleaving scheduling allows for more ILP (with a natural downside
    // of slightly increasing register pressure) compared to a simple ordering
    // of one whole chain after another. One would expect that this ordering
    // would not matter since the scheduling in the backend of the compiler
    // would take care of it, but apparently the scheduler fails to deliver
    // optimal ILP with a naive ordering here.
    SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
    for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
      for (auto &S : TrueSlices) {
        if (!S.empty()) {
          TrueSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }
    for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
      for (auto &S : FalseSlices) {
        if (!S.empty()) {
          FalseSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }

    // We split the block containing the select(s) into two blocks.
    SelectLike &SI = ASI.Selects.front();
    SelectLike &LastSI = ASI.Selects.back();
    BasicBlock *StartBlock = SI.getI()->getParent();
    BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI()));
    // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With
    // RemoveDIs turned on, SplitPt would instead point to the next
    // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs,
    // tell splitBasicBlock that we want to include any DbgVariableRecords
    // attached to SplitPt in the splice.
    SplitPt.setHeadBit(true);
    BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
    // Delete the unconditional branch that was just created by the split.
    StartBlock->getTerminator()->eraseFromParent();
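    // Note: StartBlock is intentionally left without a terminator here; the
    // conditional branch on the frozen condition is created further below,
    // once the true/false sink blocks (if any) have been created.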

    // Move any debug/pseudo and auxiliary instructions that were in-between
    // the select group to the newly-created end block.
    SmallVector<Instruction *, 2> SinkInstrs;
    auto DIt = SI.getI()->getIterator();
    auto NIt = ASI.Selects.begin();
    while (&*DIt != LastSI.getI()) {
      if (NIt != ASI.Selects.end() && &*DIt == NIt->getI())
        ++NIt;
      else
        SinkInstrs.push_back(&*DIt);
      DIt++;
    }
    auto InsertionPoint = EndBlock->getFirstInsertionPt();
    for (auto *DI : SinkInstrs)
      DI->moveBeforePreserving(&*InsertionPoint);

    // Duplicate implementation for DbgRecords, the non-instruction debug-info
    // format. Helper lambda for moving DbgRecords to the end block.
    auto TransferDbgRecords = [&](Instruction &I) {
      for (auto &DbgRecord :
           llvm::make_early_inc_range(I.getDbgRecordRange())) {
        DbgRecord.removeFromParent();
        EndBlock->insertDbgRecordBefore(&DbgRecord,
                                        EndBlock->getFirstInsertionPt());
      }
    };

    // Iterate over all instructions in between SI and LastSI, not including
    // SI itself. These are all the variable assignments that happen "in the
    // middle" of the select group.
    auto R = make_range(std::next(SI.getI()->getIterator()),
                        std::next(LastSI.getI()->getIterator()));
    llvm::for_each(R, TransferDbgRecords);

    // These are the new basic blocks for the conditional branch.
    // At least one will become an actual new basic block.
    BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
    BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
    // Checks if a select-like instruction would materialise on the given
    // branch.
    auto HasSelectLike = [](SelectGroup &SG, bool IsTrue) {
      for (auto &SL : SG.Selects) {
        if ((IsTrue ? SL.getTrueValue() : SL.getFalseValue()) == nullptr)
          return true;
      }
      return false;
    };
    if (!TrueSlicesInterleaved.empty() || HasSelectLike(ASI, true)) {
      TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink",
                                     EndBlock->getParent(), EndBlock);
      TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
      TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
      for (Instruction *TrueInst : TrueSlicesInterleaved)
        TrueInst->moveBefore(TrueBranch);
    }
    if (!FalseSlicesInterleaved.empty() || HasSelectLike(ASI, false)) {
      FalseBlock =
          BasicBlock::Create(EndBlock->getContext(), "select.false.sink",
                             EndBlock->getParent(), EndBlock);
      FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
      for (Instruction *FalseInst : FalseSlicesInterleaved)
        FalseInst->moveBefore(FalseBranch);
    }
    // If there was nothing to sink, then arbitrarily choose the 'false' side
    // for a new input value to the PHI.
    if (TrueBlock == FalseBlock) {
      assert(TrueBlock == nullptr &&
             "Unexpected basic block transform while optimizing select");

      FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false",
                                      EndBlock->getParent(), EndBlock);
      auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(SI.getI()->getDebugLoc());
    }
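
    // For illustration (assuming the condition is named %c): when only a false
    // sink block was created, the branch emitted below is
    //   br i1 %c.frozen, label %select.end, label %select.false.sink
    // and the PHI's incoming block for the true value is the original start
    // block.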
    // Insert the real conditional branch based on the original condition.
    // If we did not create a new block for one of the 'true' or 'false' paths
    // of the condition, it means that side of the branch goes to the end block
    // directly and the path originates from the start block from the point of
    // view of the new PHI.
    BasicBlock *TT, *FT;
    if (TrueBlock == nullptr) {
      TT = EndBlock;
      FT = FalseBlock;
      TrueBlock = StartBlock;
    } else if (FalseBlock == nullptr) {
      TT = TrueBlock;
      FT = EndBlock;
      FalseBlock = StartBlock;
    } else {
      TT = TrueBlock;
      FT = FalseBlock;
    }
    IRBuilder<> IB(SI.getI());
    auto *CondFr =
        IB.CreateFreeze(ASI.Condition, ASI.Condition->getName() + ".frozen");

    SmallDenseMap<Instruction *, std::pair<Value *, Value *>, 2> INS;

    // A later select in the group may use the value of an earlier select, so
    // the INS map propagates the per-branch values of already-converted
    // selects to get the PHI operands.
    InsertionPoint = EndBlock->begin();
    for (SelectLike &SI : ASI.Selects) {
      // The select itself is replaced with a PHI Node.
      PHINode *PN = PHINode::Create(SI.getType(), 2, "");
      PN->insertBefore(InsertionPoint);
      PN->takeName(SI.getI());
      // The current instruction might be the condition of some other group, so
      // we need to replace it there to avoid a dangling pointer.
      if (PN->getType()->isIntegerTy(1)) {
        for (auto &SG : ProfSIGroups) {
          if (SG.Condition == SI.getI())
            SG.Condition = PN;
        }
      }
      SI.getI()->replaceAllUsesWith(PN);
      auto *TV = getTrueOrFalseValue(SI, true, INS, TrueBlock);
      auto *FV = getTrueOrFalseValue(SI, false, INS, FalseBlock);
      INS[PN] = {TV, FV};
      PN->addIncoming(TV, TrueBlock);
      PN->addIncoming(FV, FalseBlock);
      PN->setDebugLoc(SI.getI()->getDebugLoc());
      ++NumSelectsConverted;
    }
    IB.CreateCondBr(CondFr, TT, FT, SI.getI());

    // Remove the old select instructions, now that they are no longer used.
    for (SelectLike &SI : ASI.Selects)
      SI.getI()->eraseFromParent();
  }
}

void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
                                             SelectGroups &SIGroups) {
  // Represents something that can be considered as a select instruction.
  // Auxiliary instructions are instructions that depend on a condition and
  // have zero or some constant value on the True/False branch, such as:
  // * ZExt(1bit)
  // * Not(1bit)
  struct SelectLikeInfo {
    Value *Cond;
    bool IsAuxiliary;
    bool IsInverted;
    unsigned ConditionIdx;
  };

  DenseMap<Value *, SelectLikeInfo> SelectInfo;

  // Check if the instruction is SelectLike or might be part of a SelectLike
  // expression, put information into SelectInfo and return the iterator to the
  // inserted position.
  auto ProcessSelectInfo = [&SelectInfo](Instruction *I) {
    Value *Cond;
    if (match(I, m_OneUse(m_ZExt(m_Value(Cond)))) &&
        Cond->getType()->isIntegerTy(1)) {
      bool Inverted = match(Cond, m_Not(m_Value(Cond)));
      return SelectInfo.insert({I, {Cond, true, Inverted, 0}}).first;
    }

    if (match(I, m_Not(m_Value(Cond)))) {
      return SelectInfo.insert({I, {Cond, true, true, 0}}).first;
    }

    // Select instructions are what we are usually looking for.
    if (match(I, m_Select(m_Value(Cond), m_Value(), m_Value()))) {
      bool Inverted = match(Cond, m_Not(m_Value(Cond)));
      return SelectInfo.insert({I, {Cond, false, Inverted, 0}}).first;
    }
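
    // For illustration of the auxiliary entries recorded above: in
    //   %z = zext i1 %c to i32
    //   %r = add i32 %x, %z
    // %z is recorded as auxiliary for condition %c, and %r below then becomes
    // select-like, equivalent to select(%c, %x + 1, %x), with ConditionIdx
    // pointing at the %z operand.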

    // An Or(zext(i1 X), Y) can also be treated like a select, with condition X
    // and values Y|1 and Y.
    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
      switch (I->getOpcode()) {
      case Instruction::Add:
      case Instruction::Sub: {
        Value *X;
        if (!((PatternMatch::match(I->getOperand(0),
                                   m_OneUse(m_ZExt(m_Value(X)))) ||
               PatternMatch::match(I->getOperand(1),
                                   m_OneUse(m_ZExt(m_Value(X))))) &&
              X->getType()->isIntegerTy(1)))
          return SelectInfo.end();
        break;
      }
      case Instruction::Or:
        if (BO->getType()->isIntegerTy(1) ||
            BO->getOpcode() != Instruction::Or)
          return SelectInfo.end();
        break;
      }

      for (unsigned Idx = 0; Idx < 2; Idx++) {
        auto *Op = BO->getOperand(Idx);
        auto It = SelectInfo.find(Op);
        if (It != SelectInfo.end() && It->second.IsAuxiliary) {
          Cond = It->second.Cond;
          bool Inverted = It->second.IsInverted;
          return SelectInfo.insert({I, {Cond, false, Inverted, Idx}}).first;
        }
      }
    }
    return SelectInfo.end();
  };

  bool AlreadyProcessed = false;
  BasicBlock::iterator BBIt = BB.begin();
  DenseMap<Value *, SelectLikeInfo>::iterator It;
  while (BBIt != BB.end()) {
    Instruction *I = &*BBIt++;
    if (I->isDebugOrPseudoInst())
      continue;

    if (!AlreadyProcessed)
      It = ProcessSelectInfo(I);
    else
      AlreadyProcessed = false;

    if (It == SelectInfo.end() || It->second.IsAuxiliary)
      continue;

    if (!TTI->shouldTreatInstructionLikeSelect(I))
      continue;

    Value *Cond = It->second.Cond;
    // Vector conditions are not supported.
    if (!Cond->getType()->isIntegerTy(1))
      continue;

    SelectGroup SIGroup = {Cond, {}};
    SIGroup.Selects.emplace_back(I, It->second.IsInverted,
                                 It->second.ConditionIdx);

    // If the select type is not supported, no point optimizing it.
    // Instruction selection will take care of it.
    if (!isSelectKindSupported(SIGroup.Selects.front()))
      continue;

    while (BBIt != BB.end()) {
      Instruction *NI = &*BBIt;
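      // NI is the next candidate for the current group: the scan below stops
      // at the first non-debug instruction that is neither select-like with
      // the same condition nor an auxiliary instruction for that condition.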
      // Debug/pseudo instructions should be skipped and not prevent the
      // formation of a select group.
      if (NI->isDebugOrPseudoInst()) {
        ++BBIt;
        continue;
      }

      It = ProcessSelectInfo(NI);
      if (It == SelectInfo.end()) {
        AlreadyProcessed = true;
        break;
      }

      // Auxiliary with same condition
      auto [CurrCond, IsAux, IsRev, CondIdx] = It->second;
      if (Cond != CurrCond) {
        AlreadyProcessed = true;
        break;
      }

      if (!IsAux)
        SIGroup.Selects.emplace_back(NI, IsRev, CondIdx);
      ++BBIt;
    }
    LLVM_DEBUG({
      dbgs() << "New Select group (" << SIGroup.Selects.size() << ") with\n";
      for (auto &SI : SIGroup.Selects)
        dbgs() << "  " << *SI.getI() << "\n";
    });

    SIGroups.push_back(SIGroup);
  }
}

void SelectOptimizeImpl::findProfitableSIGroupsBase(
    SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
  for (SelectGroup &ASI : SIGroups) {
    ++NumSelectOptAnalyzed;
    if (isConvertToBranchProfitableBase(ASI))
      ProfSIGroups.push_back(ASI);
  }
}

static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
                               DiagnosticInfoOptimizationBase &Rem) {
  LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
  ORE->emit(Rem);
}

void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
  NumSelectOptAnalyzed += SIGroups.size();
  // For each select group in an inner-most loop,
  // a branch is preferable to a select/conditional-move if:
  // i) conversion to branches for all the select groups of the loop satisfies
  //    loop-level heuristics including reducing the loop's critical path by
  //    some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
  // ii) the total cost of the select group is cheaper with a branch compared
  //     to its predicated version. The cost is in terms of latency and the cost
  //     of a select group is the cost of its most expensive select instruction
  //     (assuming infinite resources and thus fully leveraging available ILP).

  DenseMap<const Instruction *, CostInfo> InstCostMap;
  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
                          {Scaled64::getZero(), Scaled64::getZero()}};
  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
      !checkLoopHeuristics(L, LoopCost)) {
    return;
  }

  for (SelectGroup &ASI : SIGroups) {
    // Assuming infinite resources, the cost of a group of instructions is the
    // cost of the most expensive instruction of the group.
    Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
    for (SelectLike &SI : ASI.Selects) {
      SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost);
      BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost);
    }
    if (BranchCost < SelectCost) {
      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti",
                            ASI.Selects.front().getI());
      OR << "Profitable to convert to branch (loop analysis). BranchCost="
         << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
         << ". ";
      EmitAndPrintRemark(ORE, OR);
      ++NumSelectConvertedLoop;
      ProfSIGroups.push_back(ASI);
    } else {
      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
                                      ASI.Selects.front().getI());
      ORmiss << "Select is more profitable (loop analysis). BranchCost="
             << BranchCost.toString()
             << ", SelectCost=" << SelectCost.toString() << ". ";
      EmitAndPrintRemark(ORE, ORmiss);
    }
  }
}

bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
    const SelectGroup &ASI) {
  const SelectLike &SI = ASI.Selects.front();
  LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI()
                    << "\n");
  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI());
  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI());

  // Skip cold basic blocks. Better to optimize for size for cold blocks.
  if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) {
    ++NumSelectColdBB;
    ORmiss << "Not converted to branch because of cold basic block. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If unpredictable, branch form is less profitable.
  if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) {
    ++NumSelectUnPred;
    ORmiss << "Not converted to branch because of unpredictable branch. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If highly predictable, branch form is more profitable, unless a
  // predictable select is inexpensive in the target architecture.
  if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
    ++NumSelectConvertedHighPred;
    OR << "Converted to branch because of highly predictable branch. ";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  // Look for expensive instructions in the cold operand's (if any) dependence
  // slice of any of the selects in the group.
  if (hasExpensiveColdOperand(ASI)) {
    ++NumSelectConvertedExpColdOperand;
    OR << "Converted to branch because of expensive cold operand.";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  ORmiss << "Not profitable to convert to branch (base heuristic).";
  EmitAndPrintRemark(ORE, ORmiss);
  return false;
}

static InstructionCost divideNearest(InstructionCost Numerator,
                                     uint64_t Denominator) {
  return (Numerator + (Denominator / 2)) / Denominator;
}

static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
                                 uint64_t &TrueVal, uint64_t &FalseVal) {
  if (isa<SelectInst>(SI.getI()))
    return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
  return false;
}

bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
  bool ColdOperand = false;
  uint64_t TrueWeight, FalseWeight, TotalWeight;
  if (extractBranchWeights(ASI.Selects.front(), TrueWeight, FalseWeight)) {
    uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
    TotalWeight = TrueWeight + FalseWeight;
    // Is there a path with frequency < ColdOperandThreshold% (default: 20%)?
    ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
  } else if (PSI->hasProfileSummary()) {
    OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
                                    ASI.Selects.front().getI());
    ORmiss << "Profile data available but missing branch-weights metadata for "
              "select instruction. ";
    EmitAndPrintRemark(ORE, ORmiss);
  }
  if (!ColdOperand)
    return false;
  // Check if the cold path's dependence slice is expensive for any of the
  // selects of the group.
  for (SelectLike SI : ASI.Selects) {
    Instruction *ColdI = nullptr;
    uint64_t HotWeight;
    if (TrueWeight < FalseWeight) {
      ColdI = dyn_cast_or_null<Instruction>(SI.getTrueValue());
      HotWeight = FalseWeight;
    } else {
      ColdI = dyn_cast_or_null<Instruction>(SI.getFalseValue());
      HotWeight = TrueWeight;
    }
    if (ColdI) {
      std::stack<Instruction *> ColdSlice;
      getExclBackwardsSlice(ColdI, ColdSlice, SI.getI());
      InstructionCost SliceCost = 0;
      while (!ColdSlice.empty()) {
        SliceCost += TTI->getInstructionCost(ColdSlice.top(),
                                             TargetTransformInfo::TCK_Latency);
        ColdSlice.pop();
      }
      // The colder the cold value operand of the select is, the more expensive
      // the cmov becomes for computing that value operand every time. Thus,
      // the colder the cold operand is, the more its cost counts.
      // Get the nearest integer cost adjusted for coldness.
      InstructionCost AdjSliceCost =
          divideNearest(SliceCost * HotWeight, TotalWeight);
      if (AdjSliceCost >=
          ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
        return true;
    }
  }
  return false;
}
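
// For illustration of hasExpensiveColdOperand above (defaults:
// ColdOperandThreshold=20, ColdOperandMaxCostMultiplier=1): with branch
// weights 1 and 9, the 10% path is cold since 10 * 20 > 100 * 1. A cold slice
// of latency 5 is then adjusted to divideNearest(5 * 9, 10) = 5, which is at
// least 1 * TCC_Expensive (= 4), so the group is converted.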

// Check if it is safe to move LoadI next to the SI.
// Conservatively assume it is safe only if there is no instruction
// modifying memory in-between the load and the select instruction.
static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
  // Assume loads from different basic blocks are unsafe to move.
  if (LoadI->getParent() != SI->getParent())
    return false;
  auto It = LoadI->getIterator();
  while (&*It != SI) {
    if (It->mayWriteToMemory())
      return false;
    It++;
  }
  return true;
}

// For a given source instruction, collect its backwards dependence slice
// consisting of instructions exclusively computed for the purpose of producing
// the operands of the source instruction. As an approximation
// (sufficiently-accurate in practice), we populate this set with the
// instructions of the backwards dependence slice that only have one use and
// form a one-use chain that leads to the source instruction.
void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
                                               std::stack<Instruction *> &Slice,
                                               Instruction *SI,
                                               bool ForSinking) {
  SmallPtrSet<Instruction *, 2> Visited;
  std::queue<Instruction *> Worklist;
  Worklist.push(I);
  while (!Worklist.empty()) {
    Instruction *II = Worklist.front();
    Worklist.pop();

    // Avoid cycles.
    if (!Visited.insert(II).second)
      continue;

    if (!II->hasOneUse())
      continue;

    // Cannot soundly sink instructions with side effects.
    // Terminator or phi instructions cannot be sunk.
    // Avoid sinking other select instructions (should be handled separately).
    if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
                       isa<SelectInst>(II) || isa<PHINode>(II)))
      continue;

    // Avoid sinking loads, in order not to skip state-modifying instructions
    // that may alias with the loaded address.
    // Only allow sinking of loads within the same basic block that are
    // conservatively proven to be safe.
    if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
      continue;

    // Avoid considering instructions with less frequency than the source
    // instruction (i.e., avoid colder code regions of the dependence slice).
    if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
      continue;

    // Eligible one-use instruction added to the dependence slice.
    Slice.push(II);

    // Explore all the operands of the current instruction to expand the slice.
    for (Value *Op : II->operand_values())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push(OpI);
  }
}

bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) {
  uint64_t TrueWeight, FalseWeight;
  if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
    uint64_t Max = std::max(TrueWeight, FalseWeight);
    uint64_t Sum = TrueWeight + FalseWeight;
    if (Sum != 0) {
      auto Probability = BranchProbability::getBranchProbability(Max, Sum);
      if (Probability > TTI->getPredictableBranchThreshold())
        return true;
    }
  }
  return false;
}

bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
                                             const CostInfo LoopCost[2]) {
  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the loop is more profitable than its predicated version.

  if (DisableLoopLevelHeuristics)
    return true;

  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
                                   L->getHeader()->getFirstNonPHI());

  if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
      LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
    ORmissL << "No select conversion in the loop due to no reduction of loop's "
               "critical path. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};

  // Profitably converting to branches needs to reduce the loop's critical path
  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
  // relative gain of 12.5%).
  if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
      Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
    Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
    ORmissL << "No select conversion in the loop due to small reduction of "
               "loop's critical path. Gain="
            << Gain[1].toString()
            << ", RelativeGain=" << RelativeGain.toString() << "%. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // If the loop's critical path involves loop-carried dependences, the gradient
  // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
  // This check ensures that the latency reduction for the loop's critical path
  // keeps decreasing with sufficient rate beyond the two analyzed loop
  // iterations.
  if (Gain[1] > Gain[0]) {
    Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
                            (LoopCost[1].PredCost - LoopCost[0].PredCost);
    if (GradientGain < Scaled64::get(GainGradientThreshold)) {
      ORmissL << "No select conversion in the loop due to small gradient gain. "
                 "GradientGain="
              << GradientGain.toString() << "%. ";
      EmitAndPrintRemark(ORE, ORmissL);
      return false;
    }
  }
  // If the gain decreases it is not profitable to convert.
  else if (Gain[1] < Gain[0]) {
    ORmissL
        << "No select conversion in the loop due to negative gradient gain. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // Non-predicated version of the loop is more profitable than its
  // predicated version.
  return true;
}

// Computes instruction and loop-critical-path costs for both the predicated
// and non-predicated version of the given loop.
// Returns false if unable to compute these costs due to invalid cost of loop
// instruction(s).
bool SelectOptimizeImpl::computeLoopCosts(
    const Loop *L, const SelectGroups &SIGroups,
    DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
  LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
                    << L->getHeader()->getName() << "\n");
  const auto SImap = getSImap(SIGroups);
  const auto SGmap = getSGmap(SIGroups);
  // Compute instruction and loop-critical-path costs across two iterations for
  // both predicated and non-predicated version.
  const unsigned Iterations = 2;
  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
    // Cost of the loop's critical path.
    CostInfo &MaxCost = LoopCost[Iter];
    for (BasicBlock *BB : L->getBlocks()) {
      for (const Instruction &I : *BB) {
        if (I.isDebugOrPseudoInst())
          continue;
        // Compute the predicated and non-predicated cost of the instruction.
        Scaled64 IPredCost = Scaled64::getZero(),
                 INonPredCost = Scaled64::getZero();

        // Assume infinite resources that allow us to fully exploit the
        // available instruction-level parallelism.
        // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
        for (const Use &U : I.operands()) {
          auto UI = dyn_cast<Instruction>(U.get());
          if (!UI)
            continue;
          if (InstCostMap.count(UI)) {
            IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
            INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
          }
        }
        auto ILatency = computeInstCost(&I);
        if (!ILatency) {
          OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
          ORmissL << "Invalid instruction cost preventing analysis and "
                     "optimization of the inner-most loop containing this "
                     "instruction. ";
          EmitAndPrintRemark(ORE, ORmissL);
          return false;
        }
        IPredCost += Scaled64::get(*ILatency);
        INonPredCost += Scaled64::get(*ILatency);

        // For a select that can be converted to a branch,
        // compute its cost as a branch (non-predicated cost).
        //
        // BranchCost = PredictedPathCost + MispredictCost
        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
        if (SImap.contains(&I)) {
          auto SI = SImap.at(&I);
          const auto *SG = SGmap.at(&I);
          Scaled64 TrueOpCost = SI.getOpCostOnBranch(true, InstCostMap, TTI);
          Scaled64 FalseOpCost = SI.getOpCostOnBranch(false, InstCostMap, TTI);
          Scaled64 PredictedPathCost =
              getPredictedPathCost(TrueOpCost, FalseOpCost, SI);

          Scaled64 CondCost = Scaled64::getZero();
          if (auto *CI = dyn_cast<Instruction>(SG->Condition))
            if (InstCostMap.count(CI))
              CondCost = InstCostMap[CI].NonPredCost;
          Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);

          INonPredCost = PredictedPathCost + MispredictCost;
        }
        LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
                          << INonPredCost << " for " << I << "\n");

        InstCostMap[&I] = {IPredCost, INonPredCost};
        MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
        MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
      }
    }
    LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
                      << " MaxCost = " << MaxCost.PredCost << " "
                      << MaxCost.NonPredCost << "\n");
  }
  return true;
}

SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2>
SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) {
  SmallDenseMap<const Instruction *, SelectLike, 2> SImap;
  for (const SelectGroup &ASI : SIGroups)
    for (const SelectLike &SI : ASI.Selects)
      SImap.try_emplace(SI.getI(), SI);
  return SImap;
}

SmallDenseMap<const Instruction *, const SelectOptimizeImpl::SelectGroup *, 2>
SelectOptimizeImpl::getSGmap(const SelectGroups &SIGroups) {
  SmallDenseMap<const Instruction *, const SelectGroup *, 2> SGmap;
  for (const SelectGroup &ASI : SIGroups)
    for (const SelectLike &SI : ASI.Selects)
      SGmap.try_emplace(SI.getI(), &ASI);
  return SGmap;
}

std::optional<uint64_t>
SelectOptimizeImpl::computeInstCost(const Instruction *I) {
  InstructionCost ICost =
      TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
  if (auto OC = ICost.getValue())
    return std::optional<uint64_t>(*OC);
  return std::nullopt;
}

ScaledNumber<uint64_t>
SelectOptimizeImpl::getMispredictionCost(const SelectLike SI,
                                         const Scaled64 CondCost) {
  uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;

  // Account for the default misprediction rate when using a branch
  // (conservatively set to 25% by default).
  uint64_t MispredictRate = MispredictDefaultRate;
  // If the select condition is obviously predictable, then the misprediction
  // rate is zero.
  if (isSelectHighlyPredictable(SI))
    MispredictRate = 0;

  // CondCost is included to account for cases where the computation of the
  // condition is part of a long dependence chain (potentially loop-carried)
  // that would delay detection of a misprediction and increase its cost.
  Scaled64 MispredictCost =
      std::max(Scaled64::get(MispredictPenalty), CondCost) *
      Scaled64::get(MispredictRate);
  MispredictCost /= Scaled64::get(100);

  return MispredictCost;
}
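
// For illustration of getMispredictionCost (default MispredictDefaultRate=25):
// with, e.g., a misprediction penalty of 14 cycles, a condition cost of 20 and
// a select that is not highly predictable, the estimated misprediction cost is
// max(14, 20) * 25 / 100 = 5 cycles.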

// Returns the cost of a branch when the prediction is correct.
// TrueCost * TrueProbability + FalseCost * FalseProbability.
ScaledNumber<uint64_t>
SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
                                         const SelectLike SI) {
  Scaled64 PredPathCost;
  uint64_t TrueWeight, FalseWeight;
  if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
    uint64_t SumWeight = TrueWeight + FalseWeight;
    if (SumWeight != 0) {
      PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
                     FalseCost * Scaled64::get(FalseWeight);
      PredPathCost /= Scaled64::get(SumWeight);
      return PredPathCost;
    }
  }
  // Without branch weight metadata, we assume 75% for one path and 25% for
  // the other, and pick the result with the biggest cost.
  PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
                          FalseCost * Scaled64::get(3) + TrueCost);
  PredPathCost /= Scaled64::get(4);
  return PredPathCost;
}

bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) {
  TargetLowering::SelectSupportKind SelectKind;
  if (SI.getType()->isVectorTy())
    SelectKind = TargetLowering::ScalarCondVectorVal;
  else
    SelectKind = TargetLowering::ScalarValSelect;
  return TLI->isSelectSupported(SelectKind);
}