1 //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass converts selects to conditional jumps when profitable. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/CodeGen/SelectOptimize.h" 14 #include "llvm/ADT/SetVector.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include "llvm/ADT/Statistic.h" 17 #include "llvm/Analysis/BlockFrequencyInfo.h" 18 #include "llvm/Analysis/BranchProbabilityInfo.h" 19 #include "llvm/Analysis/LoopInfo.h" 20 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 21 #include "llvm/Analysis/ProfileSummaryInfo.h" 22 #include "llvm/Analysis/TargetTransformInfo.h" 23 #include "llvm/CodeGen/Passes.h" 24 #include "llvm/CodeGen/TargetLowering.h" 25 #include "llvm/CodeGen/TargetPassConfig.h" 26 #include "llvm/CodeGen/TargetSchedule.h" 27 #include "llvm/CodeGen/TargetSubtargetInfo.h" 28 #include "llvm/IR/BasicBlock.h" 29 #include "llvm/IR/Dominators.h" 30 #include "llvm/IR/Function.h" 31 #include "llvm/IR/IRBuilder.h" 32 #include "llvm/IR/Instruction.h" 33 #include "llvm/IR/PatternMatch.h" 34 #include "llvm/IR/ProfDataUtils.h" 35 #include "llvm/InitializePasses.h" 36 #include "llvm/Pass.h" 37 #include "llvm/Support/ScaledNumber.h" 38 #include "llvm/Target/TargetMachine.h" 39 #include "llvm/Transforms/Utils/SizeOpts.h" 40 #include <algorithm> 41 #include <queue> 42 #include <stack> 43 44 using namespace llvm; 45 using namespace llvm::PatternMatch; 46 47 #define DEBUG_TYPE "select-optimize" 48 49 STATISTIC(NumSelectOptAnalyzed, 50 "Number of select groups considered for conversion to branch"); 51 STATISTIC(NumSelectConvertedExpColdOperand, 52 "Number of select groups converted due to expensive cold operand"); 53 STATISTIC(NumSelectConvertedHighPred, 54 "Number of select groups converted due to high-predictability"); 55 STATISTIC(NumSelectUnPred, 56 "Number of select groups not converted due to unpredictability"); 57 STATISTIC(NumSelectColdBB, 58 "Number of select groups not converted due to cold basic block"); 59 STATISTIC(NumSelectConvertedLoop, 60 "Number of select groups converted due to loop-level analysis"); 61 STATISTIC(NumSelectsConverted, "Number of selects converted"); 62 63 static cl::opt<unsigned> ColdOperandThreshold( 64 "cold-operand-threshold", 65 cl::desc("Maximum frequency of path for an operand to be considered cold."), 66 cl::init(20), cl::Hidden); 67 68 static cl::opt<unsigned> ColdOperandMaxCostMultiplier( 69 "cold-operand-max-cost-multiplier", 70 cl::desc("Maximum cost multiplier of TCC_expensive for the dependence " 71 "slice of a cold operand to be considered inexpensive."), 72 cl::init(1), cl::Hidden); 73 74 static cl::opt<unsigned> 75 GainGradientThreshold("select-opti-loop-gradient-gain-threshold", 76 cl::desc("Gradient gain threshold (%)."), 77 cl::init(25), cl::Hidden); 78 79 static cl::opt<unsigned> 80 GainCycleThreshold("select-opti-loop-cycle-gain-threshold", 81 cl::desc("Minimum gain per loop (in cycles) threshold."), 82 cl::init(4), cl::Hidden); 83 84 static cl::opt<unsigned> GainRelativeThreshold( 85 "select-opti-loop-relative-gain-threshold", 86 cl::desc( 87 "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"), 88 cl::init(8), cl::Hidden); 89 90 static cl::opt<unsigned> MispredictDefaultRate( 91 "mispredict-default-rate", cl::Hidden, cl::init(25), 92 cl::desc("Default mispredict rate (initialized to 25%).")); 93 94 static cl::opt<bool> 95 DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden, 96 cl::init(false), 97 cl::desc("Disable loop-level heuristics.")); 98 99 namespace { 100 101 class SelectOptimizeImpl { 102 const TargetMachine *TM = nullptr; 103 const TargetSubtargetInfo *TSI = nullptr; 104 const TargetLowering *TLI = nullptr; 105 const TargetTransformInfo *TTI = nullptr; 106 const LoopInfo *LI = nullptr; 107 BlockFrequencyInfo *BFI; 108 ProfileSummaryInfo *PSI = nullptr; 109 OptimizationRemarkEmitter *ORE = nullptr; 110 TargetSchedModel TSchedModel; 111 112 public: 113 SelectOptimizeImpl() = default; 114 SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){}; 115 PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); 116 bool runOnFunction(Function &F, Pass &P); 117 118 using Scaled64 = ScaledNumber<uint64_t>; 119 120 struct CostInfo { 121 /// Predicated cost (with selects as conditional moves). 122 Scaled64 PredCost; 123 /// Non-predicated cost (with selects converted to branches). 124 Scaled64 NonPredCost; 125 }; 126 127 /// SelectLike is an abstraction over SelectInst and other operations that can 128 /// act like selects. For example Or(Zext(icmp), X) can be treated like 129 /// select(icmp, X|1, X). 130 class SelectLike { 131 /// The select (/or) instruction. 132 Instruction *I; 133 /// Whether this select is inverted, "not(cond), FalseVal, TrueVal", as 134 /// opposed to the original condition. 135 bool Inverted = false; 136 137 /// The index of the operand that depends on condition. Only for select-like 138 /// instruction such as Or/Add. 139 unsigned CondIdx; 140 141 public: 142 SelectLike(Instruction *I, bool Inverted = false, unsigned CondIdx = 0) 143 : I(I), Inverted(Inverted), CondIdx(CondIdx) {} 144 145 Instruction *getI() { return I; } 146 const Instruction *getI() const { return I; } 147 148 Type *getType() const { return I->getType(); } 149 150 unsigned getConditionOpIndex() { return CondIdx; }; 151 152 /// Return the true value for the SelectLike instruction. Note this may not 153 /// exist for all SelectLike instructions. For example, for `or(zext(c), x)` 154 /// the true value would be `or(x,1)`. As this value does not exist, nullptr 155 /// is returned. 156 Value *getTrueValue(bool HonorInverts = true) const { 157 if (Inverted && HonorInverts) 158 return getFalseValue(/*HonorInverts=*/false); 159 if (auto *Sel = dyn_cast<SelectInst>(I)) 160 return Sel->getTrueValue(); 161 // Or(zext) case - The true value is Or(X), so return nullptr as the value 162 // does not yet exist. 163 if (isa<BinaryOperator>(I)) 164 return nullptr; 165 166 llvm_unreachable("Unhandled case in getTrueValue"); 167 } 168 169 /// Return the false value for the SelectLike instruction. For example the 170 /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is 171 /// `select(c, x|1, x)`) 172 Value *getFalseValue(bool HonorInverts = true) const { 173 if (Inverted && HonorInverts) 174 return getTrueValue(/*HonorInverts=*/false); 175 if (auto *Sel = dyn_cast<SelectInst>(I)) 176 return Sel->getFalseValue(); 177 // We are on the branch where the condition is zero, which means BinOp 178 // does not perform any computation, and we can simply return the operand 179 // that is not related to the condition 180 if (auto *BO = dyn_cast<BinaryOperator>(I)) 181 return BO->getOperand(1 - CondIdx); 182 183 llvm_unreachable("Unhandled case in getFalseValue"); 184 } 185 186 /// Return the NonPredCost cost of the op on \p isTrue branch, given the 187 /// costs in \p InstCostMap. This may need to be generated for select-like 188 /// instructions. 189 Scaled64 getOpCostOnBranch( 190 bool IsTrue, const DenseMap<const Instruction *, CostInfo> &InstCostMap, 191 const TargetTransformInfo *TTI) { 192 auto *V = IsTrue ? getTrueValue() : getFalseValue(); 193 if (V) { 194 if (auto *IV = dyn_cast<Instruction>(V)) { 195 auto It = InstCostMap.find(IV); 196 return It != InstCostMap.end() ? It->second.NonPredCost 197 : Scaled64::getZero(); 198 } 199 return Scaled64::getZero(); 200 } 201 // If getTrue(False)Value() return nullptr, it means we are dealing with 202 // select-like instructions on the branch where the actual computation is 203 // happening. In that case the cost is equal to the cost of computation + 204 // cost of non-dependant on condition operand 205 InstructionCost Cost = TTI->getArithmeticInstrCost( 206 getI()->getOpcode(), I->getType(), TargetTransformInfo::TCK_Latency, 207 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 208 {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2}); 209 auto TotalCost = Scaled64::get(*Cost.getValue()); 210 if (auto *OpI = dyn_cast<Instruction>(I->getOperand(1 - CondIdx))) { 211 auto It = InstCostMap.find(OpI); 212 if (It != InstCostMap.end()) 213 TotalCost += It->second.NonPredCost; 214 } 215 return TotalCost; 216 } 217 }; 218 219 private: 220 // Select groups consist of consecutive select-like instructions with the same 221 // condition. Between select-likes could be any number of auxiliary 222 // instructions related to the condition like not, zext, ashr/lshr 223 struct SelectGroup { 224 Value *Condition; 225 SmallVector<SelectLike, 2> Selects; 226 }; 227 using SelectGroups = SmallVector<SelectGroup, 2>; 228 229 // Converts select instructions of a function to conditional jumps when deemed 230 // profitable. Returns true if at least one select was converted. 231 bool optimizeSelects(Function &F); 232 233 // Heuristics for determining which select instructions can be profitably 234 // conveted to branches. Separate heuristics for selects in inner-most loops 235 // and the rest of code regions (base heuristics for non-inner-most loop 236 // regions). 237 void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups); 238 void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups); 239 240 // Converts to branches the select groups that were deemed 241 // profitable-to-convert. 242 void convertProfitableSIGroups(SelectGroups &ProfSIGroups); 243 244 // Splits selects of a given basic block into select groups. 245 void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups); 246 247 // Determines for which select groups it is profitable converting to branches 248 // (base and inner-most-loop heuristics). 249 void findProfitableSIGroupsBase(SelectGroups &SIGroups, 250 SelectGroups &ProfSIGroups); 251 void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups, 252 SelectGroups &ProfSIGroups); 253 254 // Determines if a select group should be converted to a branch (base 255 // heuristics). 256 bool isConvertToBranchProfitableBase(const SelectGroup &ASI); 257 258 // Returns true if there are expensive instructions in the cold value 259 // operand's (if any) dependence slice of any of the selects of the given 260 // group. 261 bool hasExpensiveColdOperand(const SelectGroup &ASI); 262 263 // For a given source instruction, collect its backwards dependence slice 264 // consisting of instructions exclusively computed for producing the operands 265 // of the source instruction. 266 void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice, 267 Instruction *SI, bool ForSinking = false); 268 269 // Returns true if the condition of the select is highly predictable. 270 bool isSelectHighlyPredictable(const SelectLike SI); 271 272 // Loop-level checks to determine if a non-predicated version (with branches) 273 // of the given loop is more profitable than its predicated version. 274 bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]); 275 276 // Computes instruction and loop-critical-path costs for both the predicated 277 // and non-predicated version of the given loop. 278 bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups, 279 DenseMap<const Instruction *, CostInfo> &InstCostMap, 280 CostInfo *LoopCost); 281 282 // Returns a set of all the select instructions in the given select groups. 283 SmallDenseMap<const Instruction *, SelectLike, 2> 284 getSImap(const SelectGroups &SIGroups); 285 286 // Returns a map from select-like instructions to the corresponding select 287 // group. 288 SmallDenseMap<const Instruction *, const SelectGroup *, 2> 289 getSGmap(const SelectGroups &SIGroups); 290 291 // Returns the latency cost of a given instruction. 292 std::optional<uint64_t> computeInstCost(const Instruction *I); 293 294 // Returns the misprediction cost of a given select when converted to branch. 295 Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost); 296 297 // Returns the cost of a branch when the prediction is correct. 298 Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, 299 const SelectLike SI); 300 301 // Returns true if the target architecture supports lowering a given select. 302 bool isSelectKindSupported(const SelectLike SI); 303 }; 304 305 class SelectOptimize : public FunctionPass { 306 SelectOptimizeImpl Impl; 307 308 public: 309 static char ID; 310 311 SelectOptimize() : FunctionPass(ID) { 312 initializeSelectOptimizePass(*PassRegistry::getPassRegistry()); 313 } 314 315 bool runOnFunction(Function &F) override { 316 return Impl.runOnFunction(F, *this); 317 } 318 319 void getAnalysisUsage(AnalysisUsage &AU) const override { 320 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 321 AU.addRequired<TargetPassConfig>(); 322 AU.addRequired<TargetTransformInfoWrapperPass>(); 323 AU.addRequired<LoopInfoWrapperPass>(); 324 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 325 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 326 } 327 }; 328 329 } // namespace 330 331 PreservedAnalyses SelectOptimizePass::run(Function &F, 332 FunctionAnalysisManager &FAM) { 333 SelectOptimizeImpl Impl(TM); 334 return Impl.run(F, FAM); 335 } 336 337 char SelectOptimize::ID = 0; 338 339 INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false, 340 false) 341 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 342 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 343 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 344 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 345 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 346 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 347 INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false, 348 false) 349 350 FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); } 351 352 PreservedAnalyses SelectOptimizeImpl::run(Function &F, 353 FunctionAnalysisManager &FAM) { 354 TSI = TM->getSubtargetImpl(F); 355 TLI = TSI->getTargetLowering(); 356 357 // If none of the select types are supported then skip this pass. 358 // This is an optimization pass. Legality issues will be handled by 359 // instruction selection. 360 if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && 361 !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && 362 !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) 363 return PreservedAnalyses::all(); 364 365 TTI = &FAM.getResult<TargetIRAnalysis>(F); 366 if (!TTI->enableSelectOptimize()) 367 return PreservedAnalyses::all(); 368 369 PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F) 370 .getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 371 assert(PSI && "This pass requires module analysis pass `profile-summary`!"); 372 BFI = &FAM.getResult<BlockFrequencyAnalysis>(F); 373 374 // When optimizing for size, selects are preferable over branches. 375 if (llvm::shouldOptimizeForSize(&F, PSI, BFI)) 376 return PreservedAnalyses::all(); 377 378 LI = &FAM.getResult<LoopAnalysis>(F); 379 ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 380 TSchedModel.init(TSI); 381 382 bool Changed = optimizeSelects(F); 383 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); 384 } 385 386 bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) { 387 TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); 388 TSI = TM->getSubtargetImpl(F); 389 TLI = TSI->getTargetLowering(); 390 391 // If none of the select types are supported then skip this pass. 392 // This is an optimization pass. Legality issues will be handled by 393 // instruction selection. 394 if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && 395 !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && 396 !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) 397 return false; 398 399 TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 400 401 if (!TTI->enableSelectOptimize()) 402 return false; 403 404 LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 405 BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 406 PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 407 ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 408 TSchedModel.init(TSI); 409 410 // When optimizing for size, selects are preferable over branches. 411 if (llvm::shouldOptimizeForSize(&F, PSI, BFI)) 412 return false; 413 414 return optimizeSelects(F); 415 } 416 417 bool SelectOptimizeImpl::optimizeSelects(Function &F) { 418 // Determine for which select groups it is profitable converting to branches. 419 SelectGroups ProfSIGroups; 420 // Base heuristics apply only to non-loops and outer loops. 421 optimizeSelectsBase(F, ProfSIGroups); 422 // Separate heuristics for inner-most loops. 423 optimizeSelectsInnerLoops(F, ProfSIGroups); 424 425 // Convert to branches the select groups that were deemed 426 // profitable-to-convert. 427 convertProfitableSIGroups(ProfSIGroups); 428 429 // Code modified if at least one select group was converted. 430 return !ProfSIGroups.empty(); 431 } 432 433 void SelectOptimizeImpl::optimizeSelectsBase(Function &F, 434 SelectGroups &ProfSIGroups) { 435 // Collect all the select groups. 436 SelectGroups SIGroups; 437 for (BasicBlock &BB : F) { 438 // Base heuristics apply only to non-loops and outer loops. 439 Loop *L = LI->getLoopFor(&BB); 440 if (L && L->isInnermost()) 441 continue; 442 collectSelectGroups(BB, SIGroups); 443 } 444 445 // Determine for which select groups it is profitable converting to branches. 446 findProfitableSIGroupsBase(SIGroups, ProfSIGroups); 447 } 448 449 void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F, 450 SelectGroups &ProfSIGroups) { 451 SmallVector<Loop *, 4> Loops(LI->begin(), LI->end()); 452 // Need to check size on each iteration as we accumulate child loops. 453 for (unsigned long i = 0; i < Loops.size(); ++i) 454 for (Loop *ChildL : Loops[i]->getSubLoops()) 455 Loops.push_back(ChildL); 456 457 for (Loop *L : Loops) { 458 if (!L->isInnermost()) 459 continue; 460 461 SelectGroups SIGroups; 462 for (BasicBlock *BB : L->getBlocks()) 463 collectSelectGroups(*BB, SIGroups); 464 465 findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups); 466 } 467 } 468 469 /// Returns optimised value on \p IsTrue branch. For SelectInst that would be 470 /// either True or False value. For (BinaryOperator) instructions, where the 471 /// condition may be skipped, the operation will use a non-conditional operand. 472 /// For example, for `or(V,zext(cond))` this function would return V. 473 /// However, if the conditional operand on \p IsTrue branch matters, we create a 474 /// clone of instruction at the end of that branch \p B and replace the 475 /// condition operand with a constant. 476 /// 477 /// Also /p OptSelects contains previously optimised select-like instructions. 478 /// If the current value uses one of the optimised values, we can optimise it 479 /// further by replacing it with the corresponding value on the given branch 480 static Value *getTrueOrFalseValue( 481 SelectOptimizeImpl::SelectLike &SI, bool isTrue, 482 SmallDenseMap<Instruction *, std::pair<Value *, Value *>, 2> &OptSelects, 483 BasicBlock *B) { 484 Value *V = isTrue ? SI.getTrueValue() : SI.getFalseValue(); 485 if (V) { 486 auto *IV = dyn_cast<Instruction>(V); 487 if (IV && OptSelects.count(IV)) 488 return isTrue ? OptSelects[IV].first : OptSelects[IV].second; 489 return V; 490 } 491 492 auto *BO = cast<BinaryOperator>(SI.getI()); 493 assert((BO->getOpcode() == Instruction::Add || 494 BO->getOpcode() == Instruction::Or || 495 BO->getOpcode() == Instruction::Sub) && 496 "Only currently handling Add, Or and Sub binary operators."); 497 498 auto *CBO = BO->clone(); 499 auto CondIdx = SI.getConditionOpIndex(); 500 auto *AuxI = cast<Instruction>(CBO->getOperand(CondIdx)); 501 if (isa<ZExtInst>(AuxI) || isa<LShrOperator>(AuxI)) { 502 CBO->setOperand(CondIdx, ConstantInt::get(CBO->getType(), 1)); 503 } else { 504 assert((isa<AShrOperator>(AuxI) || isa<SExtInst>(AuxI)) && 505 "Unexpected opcode"); 506 CBO->setOperand(CondIdx, ConstantInt::get(CBO->getType(), -1)); 507 } 508 509 unsigned OtherIdx = 1 - CondIdx; 510 if (auto *IV = dyn_cast<Instruction>(CBO->getOperand(OtherIdx))) { 511 if (OptSelects.count(IV)) 512 CBO->setOperand(OtherIdx, 513 isTrue ? OptSelects[IV].first : OptSelects[IV].second); 514 } 515 CBO->insertBefore(B->getTerminator()); 516 return CBO; 517 } 518 519 void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { 520 for (SelectGroup &ASI : ProfSIGroups) { 521 // The code transformation here is a modified version of the sinking 522 // transformation in CodeGenPrepare::optimizeSelectInst with a more 523 // aggressive strategy of which instructions to sink. 524 // 525 // TODO: eliminate the redundancy of logic transforming selects to branches 526 // by removing CodeGenPrepare::optimizeSelectInst and optimizing here 527 // selects for all cases (with and without profile information). 528 529 // Transform a sequence like this: 530 // start: 531 // %cmp = cmp uge i32 %a, %b 532 // %sel = select i1 %cmp, i32 %c, i32 %d 533 // 534 // Into: 535 // start: 536 // %cmp = cmp uge i32 %a, %b 537 // %cmp.frozen = freeze %cmp 538 // br i1 %cmp.frozen, label %select.true, label %select.false 539 // select.true: 540 // br label %select.end 541 // select.false: 542 // br label %select.end 543 // select.end: 544 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] 545 // 546 // %cmp should be frozen, otherwise it may introduce undefined behavior. 547 // In addition, we may sink instructions that produce %c or %d into the 548 // destination(s) of the new branch. 549 // If the true or false blocks do not contain a sunken instruction, that 550 // block and its branch may be optimized away. In that case, one side of the 551 // first branch will point directly to select.end, and the corresponding PHI 552 // predecessor block will be the start block. 553 554 // Find all the instructions that can be soundly sunk to the true/false 555 // blocks. These are instructions that are computed solely for producing the 556 // operands of the select instructions in the group and can be sunk without 557 // breaking the semantics of the LLVM IR (e.g., cannot sink instructions 558 // with side effects). 559 SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices; 560 typedef std::stack<Instruction *>::size_type StackSizeType; 561 StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0; 562 for (SelectLike &SI : ASI.Selects) { 563 if (!isa<SelectInst>(SI.getI())) 564 continue; 565 // For each select, compute the sinkable dependence chains of the true and 566 // false operands. 567 if (auto *TI = dyn_cast_or_null<Instruction>(SI.getTrueValue())) { 568 std::stack<Instruction *> TrueSlice; 569 getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true); 570 maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size()); 571 TrueSlices.push_back(TrueSlice); 572 } 573 if (auto *FI = dyn_cast_or_null<Instruction>(SI.getFalseValue())) { 574 if (isa<SelectInst>(SI.getI()) || !FI->hasOneUse()) { 575 std::stack<Instruction *> FalseSlice; 576 getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true); 577 maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size()); 578 FalseSlices.push_back(FalseSlice); 579 } 580 } 581 } 582 // In the case of multiple select instructions in the same group, the order 583 // of non-dependent instructions (instructions of different dependence 584 // slices) in the true/false blocks appears to affect performance. 585 // Interleaving the slices seems to experimentally be the optimal approach. 586 // This interleaving scheduling allows for more ILP (with a natural downside 587 // of increasing a bit register pressure) compared to a simple ordering of 588 // one whole chain after another. One would expect that this ordering would 589 // not matter since the scheduling in the backend of the compiler would 590 // take care of it, but apparently the scheduler fails to deliver optimal 591 // ILP with a naive ordering here. 592 SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved; 593 for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) { 594 for (auto &S : TrueSlices) { 595 if (!S.empty()) { 596 TrueSlicesInterleaved.push_back(S.top()); 597 S.pop(); 598 } 599 } 600 } 601 for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) { 602 for (auto &S : FalseSlices) { 603 if (!S.empty()) { 604 FalseSlicesInterleaved.push_back(S.top()); 605 S.pop(); 606 } 607 } 608 } 609 610 // We split the block containing the select(s) into two blocks. 611 SelectLike &SI = ASI.Selects.front(); 612 SelectLike &LastSI = ASI.Selects.back(); 613 BasicBlock *StartBlock = SI.getI()->getParent(); 614 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI())); 615 // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With 616 // RemoveDIs turned on, SplitPt would instead point to the next 617 // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs, 618 // tell splitBasicBlock that we want to include any DbgVariableRecords 619 // attached to SplitPt in the splice. 620 SplitPt.setHeadBit(true); 621 BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); 622 BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock)); 623 // Delete the unconditional branch that was just created by the split. 624 StartBlock->getTerminator()->eraseFromParent(); 625 626 // Move any debug/pseudo and auxiliary instructions that were in-between the 627 // select group to the newly-created end block. 628 SmallVector<Instruction *, 2> SinkInstrs; 629 auto DIt = SI.getI()->getIterator(); 630 auto NIt = ASI.Selects.begin(); 631 while (&*DIt != LastSI.getI()) { 632 if (NIt != ASI.Selects.end() && &*DIt == NIt->getI()) 633 ++NIt; 634 else 635 SinkInstrs.push_back(&*DIt); 636 DIt++; 637 } 638 auto InsertionPoint = EndBlock->getFirstInsertionPt(); 639 for (auto *DI : SinkInstrs) 640 DI->moveBeforePreserving(&*InsertionPoint); 641 642 // Duplicate implementation for DbgRecords, the non-instruction debug-info 643 // format. Helper lambda for moving DbgRecords to the end block. 644 auto TransferDbgRecords = [&](Instruction &I) { 645 for (auto &DbgRecord : 646 llvm::make_early_inc_range(I.getDbgRecordRange())) { 647 DbgRecord.removeFromParent(); 648 EndBlock->insertDbgRecordBefore(&DbgRecord, 649 EndBlock->getFirstInsertionPt()); 650 } 651 }; 652 653 // Iterate over all instructions in between SI and LastSI, not including 654 // SI itself. These are all the variable assignments that happen "in the 655 // middle" of the select group. 656 auto R = make_range(std::next(SI.getI()->getIterator()), 657 std::next(LastSI.getI()->getIterator())); 658 llvm::for_each(R, TransferDbgRecords); 659 660 // These are the new basic blocks for the conditional branch. 661 // At least one will become an actual new basic block. 662 BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; 663 BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; 664 // Checks if select-like instruction would materialise on the given branch 665 auto HasSelectLike = [](SelectGroup &SG, bool IsTrue) { 666 for (auto &SL : SG.Selects) { 667 if ((IsTrue ? SL.getTrueValue() : SL.getFalseValue()) == nullptr) 668 return true; 669 } 670 return false; 671 }; 672 if (!TrueSlicesInterleaved.empty() || HasSelectLike(ASI, true)) { 673 TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink", 674 EndBlock->getParent(), EndBlock); 675 TrueBranch = BranchInst::Create(EndBlock, TrueBlock); 676 TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); 677 for (Instruction *TrueInst : TrueSlicesInterleaved) 678 TrueInst->moveBefore(TrueBranch); 679 } 680 if (!FalseSlicesInterleaved.empty() || HasSelectLike(ASI, false)) { 681 FalseBlock = 682 BasicBlock::Create(EndBlock->getContext(), "select.false.sink", 683 EndBlock->getParent(), EndBlock); 684 FalseBranch = BranchInst::Create(EndBlock, FalseBlock); 685 FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); 686 for (Instruction *FalseInst : FalseSlicesInterleaved) 687 FalseInst->moveBefore(FalseBranch); 688 } 689 // If there was nothing to sink, then arbitrarily choose the 'false' side 690 // for a new input value to the PHI. 691 if (TrueBlock == FalseBlock) { 692 assert(TrueBlock == nullptr && 693 "Unexpected basic block transform while optimizing select"); 694 695 FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false", 696 EndBlock->getParent(), EndBlock); 697 auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); 698 FalseBranch->setDebugLoc(SI.getI()->getDebugLoc()); 699 } 700 701 // Insert the real conditional branch based on the original condition. 702 // If we did not create a new block for one of the 'true' or 'false' paths 703 // of the condition, it means that side of the branch goes to the end block 704 // directly and the path originates from the start block from the point of 705 // view of the new PHI. 706 BasicBlock *TT, *FT; 707 if (TrueBlock == nullptr) { 708 TT = EndBlock; 709 FT = FalseBlock; 710 TrueBlock = StartBlock; 711 } else if (FalseBlock == nullptr) { 712 TT = TrueBlock; 713 FT = EndBlock; 714 FalseBlock = StartBlock; 715 } else { 716 TT = TrueBlock; 717 FT = FalseBlock; 718 } 719 IRBuilder<> IB(SI.getI()); 720 auto *CondFr = 721 IB.CreateFreeze(ASI.Condition, ASI.Condition->getName() + ".frozen"); 722 723 SmallDenseMap<Instruction *, std::pair<Value *, Value *>, 2> INS; 724 725 // Use reverse iterator because later select may use the value of the 726 // earlier select, and we need to propagate value through earlier select 727 // to get the PHI operand. 728 InsertionPoint = EndBlock->begin(); 729 for (SelectLike &SI : ASI.Selects) { 730 // The select itself is replaced with a PHI Node. 731 PHINode *PN = PHINode::Create(SI.getType(), 2, ""); 732 PN->insertBefore(InsertionPoint); 733 PN->takeName(SI.getI()); 734 // Current instruction might be a condition of some other group, so we 735 // need to replace it there to avoid dangling pointer 736 if (PN->getType()->isIntegerTy(1)) { 737 for (auto &SG : ProfSIGroups) { 738 if (SG.Condition == SI.getI()) 739 SG.Condition = PN; 740 } 741 } 742 SI.getI()->replaceAllUsesWith(PN); 743 auto *TV = getTrueOrFalseValue(SI, true, INS, TrueBlock); 744 auto *FV = getTrueOrFalseValue(SI, false, INS, FalseBlock); 745 INS[PN] = {TV, FV}; 746 PN->addIncoming(TV, TrueBlock); 747 PN->addIncoming(FV, FalseBlock); 748 PN->setDebugLoc(SI.getI()->getDebugLoc()); 749 ++NumSelectsConverted; 750 } 751 IB.CreateCondBr(CondFr, TT, FT, SI.getI()); 752 753 // Remove the old select instructions, now that they are not longer used. 754 for (SelectLike &SI : ASI.Selects) 755 SI.getI()->eraseFromParent(); 756 } 757 } 758 759 void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB, 760 SelectGroups &SIGroups) { 761 // Represents something that can be considered as select instruction. 762 // Auxiliary instruction are instructions that depends on a condition and have 763 // zero or some constant value on True/False branch, such as: 764 // * ZExt(1bit) 765 // * SExt(1bit) 766 // * Not(1bit) 767 // * A(L)Shr(Val), ValBitSize - 1, where there is a condition like `Val <= 0` 768 // earlier in the BB. For conditions that check the sign of the Val compiler 769 // may generate shifts instead of ZExt/SExt. 770 struct SelectLikeInfo { 771 Value *Cond; 772 bool IsAuxiliary; 773 bool IsInverted; 774 unsigned ConditionIdx; 775 }; 776 777 DenseMap<Value *, SelectLikeInfo> SelectInfo; 778 // Keeps visited comparisons to help identify AShr/LShr variants of auxiliary 779 // instructions. 780 SmallSetVector<CmpInst *, 4> SeenCmp; 781 782 // Check if the instruction is SelectLike or might be part of SelectLike 783 // expression, put information into SelectInfo and return the iterator to the 784 // inserted position. 785 auto ProcessSelectInfo = [&SelectInfo, &SeenCmp](Instruction *I) { 786 if (auto *Cmp = dyn_cast<CmpInst>(I)) { 787 SeenCmp.insert(Cmp); 788 return SelectInfo.end(); 789 } 790 791 Value *Cond; 792 if (match(I, m_OneUse(m_ZExtOrSExt(m_Value(Cond)))) && 793 Cond->getType()->isIntegerTy(1)) { 794 bool Inverted = match(Cond, m_Not(m_Value(Cond))); 795 return SelectInfo.insert({I, {Cond, true, Inverted, 0}}).first; 796 } 797 798 if (match(I, m_Not(m_Value(Cond)))) { 799 return SelectInfo.insert({I, {Cond, true, true, 0}}).first; 800 } 801 802 // Select instruction are what we are usually looking for. 803 if (match(I, m_Select(m_Value(Cond), m_Value(), m_Value()))) { 804 bool Inverted = match(Cond, m_Not(m_Value(Cond))); 805 return SelectInfo.insert({I, {Cond, false, Inverted, 0}}).first; 806 } 807 Value *Val; 808 ConstantInt *Shift; 809 if (match(I, m_Shr(m_Value(Val), m_ConstantInt(Shift))) && 810 I->getType()->getIntegerBitWidth() == Shift->getZExtValue() + 1) { 811 for (auto *CmpI : SeenCmp) { 812 auto Pred = CmpI->getPredicate(); 813 if (Val != CmpI->getOperand(0)) 814 continue; 815 if ((Pred == CmpInst::ICMP_SGT && 816 match(CmpI->getOperand(1), m_ConstantInt<-1>())) || 817 (Pred == CmpInst::ICMP_SGE && 818 match(CmpI->getOperand(1), m_Zero())) || 819 (Pred == CmpInst::ICMP_SLT && 820 match(CmpI->getOperand(1), m_Zero())) || 821 (Pred == CmpInst::ICMP_SLE && 822 match(CmpI->getOperand(1), m_ConstantInt<-1>()))) { 823 bool Inverted = 824 Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE; 825 return SelectInfo.insert({I, {CmpI, true, Inverted, 0}}).first; 826 } 827 } 828 return SelectInfo.end(); 829 } 830 831 // An BinOp(Aux(X), Y) can also be treated like a select, with condition X 832 // and values Y|1 and Y. 833 // `Aux` can be either `ZExt(1bit)`, `SExt(1bit)` or `XShr(Val), ValBitSize 834 // - 1` `BinOp` can be Add, Sub, Or 835 Value *X; 836 auto MatchZExtOrSExtPattern = 837 m_c_BinOp(m_Value(), m_OneUse(m_ZExtOrSExt(m_Value(X)))); 838 auto MatchShiftPattern = 839 m_c_BinOp(m_Value(), m_OneUse(m_Shr(m_Value(X), m_ConstantInt(Shift)))); 840 841 // This check is unnecessary, but it prevents costly access to the 842 // SelectInfo map. 843 if ((match(I, MatchZExtOrSExtPattern) && X->getType()->isIntegerTy(1)) || 844 (match(I, MatchShiftPattern) && 845 X->getType()->getIntegerBitWidth() == Shift->getZExtValue() + 1)) { 846 if (I->getOpcode() != Instruction::Add && 847 I->getOpcode() != Instruction::Sub && 848 I->getOpcode() != Instruction::Or) 849 return SelectInfo.end(); 850 851 if (I->getOpcode() == Instruction::Or && I->getType()->isIntegerTy(1)) 852 return SelectInfo.end(); 853 854 // Iterate through operands and find dependant on recognised sign 855 // extending auxiliary select-like instructions. The operand index does 856 // not matter for Add and Or. However, for Sub, we can only safely 857 // transform when the operand is second. 858 unsigned Idx = I->getOpcode() == Instruction::Sub ? 1 : 0; 859 for (; Idx < 2; Idx++) { 860 auto *Op = I->getOperand(Idx); 861 auto It = SelectInfo.find(Op); 862 if (It != SelectInfo.end() && It->second.IsAuxiliary) { 863 Cond = It->second.Cond; 864 bool Inverted = It->second.IsInverted; 865 return SelectInfo.insert({I, {Cond, false, Inverted, Idx}}).first; 866 } 867 } 868 } 869 return SelectInfo.end(); 870 }; 871 872 bool AlreadyProcessed = false; 873 BasicBlock::iterator BBIt = BB.begin(); 874 DenseMap<Value *, SelectLikeInfo>::iterator It; 875 while (BBIt != BB.end()) { 876 Instruction *I = &*BBIt++; 877 if (I->isDebugOrPseudoInst()) 878 continue; 879 880 if (!AlreadyProcessed) 881 It = ProcessSelectInfo(I); 882 else 883 AlreadyProcessed = false; 884 885 if (It == SelectInfo.end() || It->second.IsAuxiliary) 886 continue; 887 888 if (!TTI->shouldTreatInstructionLikeSelect(I)) 889 continue; 890 891 Value *Cond = It->second.Cond; 892 // Vector conditions are not supported. 893 if (!Cond->getType()->isIntegerTy(1)) 894 continue; 895 896 SelectGroup SIGroup = {Cond, {}}; 897 SIGroup.Selects.emplace_back(I, It->second.IsInverted, 898 It->second.ConditionIdx); 899 900 // If the select type is not supported, no point optimizing it. 901 // Instruction selection will take care of it. 902 if (!isSelectKindSupported(SIGroup.Selects.front())) 903 continue; 904 905 while (BBIt != BB.end()) { 906 Instruction *NI = &*BBIt; 907 // Debug/pseudo instructions should be skipped and not prevent the 908 // formation of a select group. 909 if (NI->isDebugOrPseudoInst()) { 910 ++BBIt; 911 continue; 912 } 913 914 It = ProcessSelectInfo(NI); 915 if (It == SelectInfo.end()) { 916 AlreadyProcessed = true; 917 break; 918 } 919 920 // Auxiliary with same condition 921 auto [CurrCond, IsAux, IsRev, CondIdx] = It->second; 922 if (Cond != CurrCond) { 923 AlreadyProcessed = true; 924 break; 925 } 926 927 if (!IsAux) 928 SIGroup.Selects.emplace_back(NI, IsRev, CondIdx); 929 ++BBIt; 930 } 931 LLVM_DEBUG({ 932 dbgs() << "New Select group (" << SIGroup.Selects.size() << ") with\n"; 933 for (auto &SI : SIGroup.Selects) 934 dbgs() << " " << *SI.getI() << "\n"; 935 }); 936 937 SIGroups.push_back(SIGroup); 938 } 939 } 940 941 void SelectOptimizeImpl::findProfitableSIGroupsBase( 942 SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { 943 for (SelectGroup &ASI : SIGroups) { 944 ++NumSelectOptAnalyzed; 945 if (isConvertToBranchProfitableBase(ASI)) 946 ProfSIGroups.push_back(ASI); 947 } 948 } 949 950 static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE, 951 DiagnosticInfoOptimizationBase &Rem) { 952 LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n"); 953 ORE->emit(Rem); 954 } 955 956 void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops( 957 const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { 958 NumSelectOptAnalyzed += SIGroups.size(); 959 // For each select group in an inner-most loop, 960 // a branch is more preferable than a select/conditional-move if: 961 // i) conversion to branches for all the select groups of the loop satisfies 962 // loop-level heuristics including reducing the loop's critical path by 963 // some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and 964 // ii) the total cost of the select group is cheaper with a branch compared 965 // to its predicated version. The cost is in terms of latency and the cost 966 // of a select group is the cost of its most expensive select instruction 967 // (assuming infinite resources and thus fully leveraging available ILP). 968 969 DenseMap<const Instruction *, CostInfo> InstCostMap; 970 CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()}, 971 {Scaled64::getZero(), Scaled64::getZero()}}; 972 if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) || 973 !checkLoopHeuristics(L, LoopCost)) { 974 return; 975 } 976 977 for (SelectGroup &ASI : SIGroups) { 978 // Assuming infinite resources, the cost of a group of instructions is the 979 // cost of the most expensive instruction of the group. 980 Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero(); 981 for (SelectLike &SI : ASI.Selects) { 982 SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost); 983 BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost); 984 } 985 if (BranchCost < SelectCost) { 986 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", 987 ASI.Selects.front().getI()); 988 OR << "Profitable to convert to branch (loop analysis). BranchCost=" 989 << BranchCost.toString() << ", SelectCost=" << SelectCost.toString() 990 << ". "; 991 EmitAndPrintRemark(ORE, OR); 992 ++NumSelectConvertedLoop; 993 ProfSIGroups.push_back(ASI); 994 } else { 995 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", 996 ASI.Selects.front().getI()); 997 ORmiss << "Select is more profitable (loop analysis). BranchCost=" 998 << BranchCost.toString() 999 << ", SelectCost=" << SelectCost.toString() << ". "; 1000 EmitAndPrintRemark(ORE, ORmiss); 1001 } 1002 } 1003 } 1004 1005 bool SelectOptimizeImpl::isConvertToBranchProfitableBase( 1006 const SelectGroup &ASI) { 1007 const SelectLike &SI = ASI.Selects.front(); 1008 LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI() 1009 << "\n"); 1010 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI()); 1011 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI()); 1012 1013 // Skip cold basic blocks. Better to optimize for size for cold blocks. 1014 if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) { 1015 ++NumSelectColdBB; 1016 ORmiss << "Not converted to branch because of cold basic block. "; 1017 EmitAndPrintRemark(ORE, ORmiss); 1018 return false; 1019 } 1020 1021 // If unpredictable, branch form is less profitable. 1022 if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) { 1023 ++NumSelectUnPred; 1024 ORmiss << "Not converted to branch because of unpredictable branch. "; 1025 EmitAndPrintRemark(ORE, ORmiss); 1026 return false; 1027 } 1028 1029 // If highly predictable, branch form is more profitable, unless a 1030 // predictable select is inexpensive in the target architecture. 1031 if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) { 1032 ++NumSelectConvertedHighPred; 1033 OR << "Converted to branch because of highly predictable branch. "; 1034 EmitAndPrintRemark(ORE, OR); 1035 return true; 1036 } 1037 1038 // Look for expensive instructions in the cold operand's (if any) dependence 1039 // slice of any of the selects in the group. 1040 if (hasExpensiveColdOperand(ASI)) { 1041 ++NumSelectConvertedExpColdOperand; 1042 OR << "Converted to branch because of expensive cold operand."; 1043 EmitAndPrintRemark(ORE, OR); 1044 return true; 1045 } 1046 1047 // If latch has a select group with several elements, it is usually profitable 1048 // to convert it to branches. We let `optimizeSelectsInnerLoops` decide if 1049 // conversion is profitable for innermost loops. 1050 auto *BB = SI.getI()->getParent(); 1051 auto *L = LI->getLoopFor(BB); 1052 if (L && !L->isInnermost() && L->getLoopLatch() == BB && 1053 ASI.Selects.size() >= 3) { 1054 OR << "Converted to branch because select group in the latch block is big."; 1055 EmitAndPrintRemark(ORE, OR); 1056 return true; 1057 } 1058 1059 ORmiss << "Not profitable to convert to branch (base heuristic)."; 1060 EmitAndPrintRemark(ORE, ORmiss); 1061 return false; 1062 } 1063 1064 static InstructionCost divideNearest(InstructionCost Numerator, 1065 uint64_t Denominator) { 1066 return (Numerator + (Denominator / 2)) / Denominator; 1067 } 1068 1069 static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI, 1070 uint64_t &TrueVal, uint64_t &FalseVal) { 1071 if (isa<SelectInst>(SI.getI())) 1072 return extractBranchWeights(*SI.getI(), TrueVal, FalseVal); 1073 return false; 1074 } 1075 1076 bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) { 1077 bool ColdOperand = false; 1078 uint64_t TrueWeight, FalseWeight, TotalWeight; 1079 if (extractBranchWeights(ASI.Selects.front(), TrueWeight, FalseWeight)) { 1080 uint64_t MinWeight = std::min(TrueWeight, FalseWeight); 1081 TotalWeight = TrueWeight + FalseWeight; 1082 // Is there a path with frequency <ColdOperandThreshold% (default:20%) ? 1083 ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight; 1084 } else if (PSI->hasProfileSummary()) { 1085 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", 1086 ASI.Selects.front().getI()); 1087 ORmiss << "Profile data available but missing branch-weights metadata for " 1088 "select instruction. "; 1089 EmitAndPrintRemark(ORE, ORmiss); 1090 } 1091 if (!ColdOperand) 1092 return false; 1093 // Check if the cold path's dependence slice is expensive for any of the 1094 // selects of the group. 1095 for (SelectLike SI : ASI.Selects) { 1096 Instruction *ColdI = nullptr; 1097 uint64_t HotWeight; 1098 if (TrueWeight < FalseWeight) { 1099 ColdI = dyn_cast_or_null<Instruction>(SI.getTrueValue()); 1100 HotWeight = FalseWeight; 1101 } else { 1102 ColdI = dyn_cast_or_null<Instruction>(SI.getFalseValue()); 1103 HotWeight = TrueWeight; 1104 } 1105 if (ColdI) { 1106 std::stack<Instruction *> ColdSlice; 1107 getExclBackwardsSlice(ColdI, ColdSlice, SI.getI()); 1108 InstructionCost SliceCost = 0; 1109 while (!ColdSlice.empty()) { 1110 SliceCost += TTI->getInstructionCost(ColdSlice.top(), 1111 TargetTransformInfo::TCK_Latency); 1112 ColdSlice.pop(); 1113 } 1114 // The colder the cold value operand of the select is the more expensive 1115 // the cmov becomes for computing the cold value operand every time. Thus, 1116 // the colder the cold operand is the more its cost counts. 1117 // Get nearest integer cost adjusted for coldness. 1118 InstructionCost AdjSliceCost = 1119 divideNearest(SliceCost * HotWeight, TotalWeight); 1120 if (AdjSliceCost >= 1121 ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive) 1122 return true; 1123 } 1124 } 1125 return false; 1126 } 1127 1128 // Check if it is safe to move LoadI next to the SI. 1129 // Conservatively assume it is safe only if there is no instruction 1130 // modifying memory in-between the load and the select instruction. 1131 static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) { 1132 // Assume loads from different basic blocks are unsafe to move. 1133 if (LoadI->getParent() != SI->getParent()) 1134 return false; 1135 auto It = LoadI->getIterator(); 1136 while (&*It != SI) { 1137 if (It->mayWriteToMemory()) 1138 return false; 1139 It++; 1140 } 1141 return true; 1142 } 1143 1144 // For a given source instruction, collect its backwards dependence slice 1145 // consisting of instructions exclusively computed for the purpose of producing 1146 // the operands of the source instruction. As an approximation 1147 // (sufficiently-accurate in practice), we populate this set with the 1148 // instructions of the backwards dependence slice that only have one-use and 1149 // form an one-use chain that leads to the source instruction. 1150 void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I, 1151 std::stack<Instruction *> &Slice, 1152 Instruction *SI, 1153 bool ForSinking) { 1154 SmallPtrSet<Instruction *, 2> Visited; 1155 std::queue<Instruction *> Worklist; 1156 Worklist.push(I); 1157 while (!Worklist.empty()) { 1158 Instruction *II = Worklist.front(); 1159 Worklist.pop(); 1160 1161 // Avoid cycles. 1162 if (!Visited.insert(II).second) 1163 continue; 1164 1165 if (!II->hasOneUse()) 1166 continue; 1167 1168 // Cannot soundly sink instructions with side-effects. 1169 // Terminator or phi instructions cannot be sunk. 1170 // Avoid sinking other select instructions (should be handled separetely). 1171 if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() || 1172 isa<SelectInst>(II) || isa<PHINode>(II))) 1173 continue; 1174 1175 // Avoid sinking loads in order not to skip state-modifying instructions, 1176 // that may alias with the loaded address. 1177 // Only allow sinking of loads within the same basic block that are 1178 // conservatively proven to be safe. 1179 if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI)) 1180 continue; 1181 1182 // Avoid considering instructions with less frequency than the source 1183 // instruction (i.e., avoid colder code regions of the dependence slice). 1184 if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent())) 1185 continue; 1186 1187 // Eligible one-use instruction added to the dependence slice. 1188 Slice.push(II); 1189 1190 // Explore all the operands of the current instruction to expand the slice. 1191 for (Value *Op : II->operand_values()) 1192 if (auto *OpI = dyn_cast<Instruction>(Op)) 1193 Worklist.push(OpI); 1194 } 1195 } 1196 1197 bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) { 1198 uint64_t TrueWeight, FalseWeight; 1199 if (extractBranchWeights(SI, TrueWeight, FalseWeight)) { 1200 uint64_t Max = std::max(TrueWeight, FalseWeight); 1201 uint64_t Sum = TrueWeight + FalseWeight; 1202 if (Sum != 0) { 1203 auto Probability = BranchProbability::getBranchProbability(Max, Sum); 1204 if (Probability > TTI->getPredictableBranchThreshold()) 1205 return true; 1206 } 1207 } 1208 return false; 1209 } 1210 1211 bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L, 1212 const CostInfo LoopCost[2]) { 1213 // Loop-level checks to determine if a non-predicated version (with branches) 1214 // of the loop is more profitable than its predicated version. 1215 1216 if (DisableLoopLevelHeuristics) 1217 return true; 1218 1219 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", 1220 L->getHeader()->getFirstNonPHI()); 1221 1222 if (LoopCost[0].NonPredCost > LoopCost[0].PredCost || 1223 LoopCost[1].NonPredCost >= LoopCost[1].PredCost) { 1224 ORmissL << "No select conversion in the loop due to no reduction of loop's " 1225 "critical path. "; 1226 EmitAndPrintRemark(ORE, ORmissL); 1227 return false; 1228 } 1229 1230 Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost, 1231 LoopCost[1].PredCost - LoopCost[1].NonPredCost}; 1232 1233 // Profitably converting to branches need to reduce the loop's critical path 1234 // by at least some threshold (absolute gain of GainCycleThreshold cycles and 1235 // relative gain of 12.5%). 1236 if (Gain[1] < Scaled64::get(GainCycleThreshold) || 1237 Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) { 1238 Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost; 1239 ORmissL << "No select conversion in the loop due to small reduction of " 1240 "loop's critical path. Gain=" 1241 << Gain[1].toString() 1242 << ", RelativeGain=" << RelativeGain.toString() << "%. "; 1243 EmitAndPrintRemark(ORE, ORmissL); 1244 return false; 1245 } 1246 1247 // If the loop's critical path involves loop-carried dependences, the gradient 1248 // of the gain needs to be at least GainGradientThreshold% (defaults to 25%). 1249 // This check ensures that the latency reduction for the loop's critical path 1250 // keeps decreasing with sufficient rate beyond the two analyzed loop 1251 // iterations. 1252 if (Gain[1] > Gain[0]) { 1253 Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) / 1254 (LoopCost[1].PredCost - LoopCost[0].PredCost); 1255 if (GradientGain < Scaled64::get(GainGradientThreshold)) { 1256 ORmissL << "No select conversion in the loop due to small gradient gain. " 1257 "GradientGain=" 1258 << GradientGain.toString() << "%. "; 1259 EmitAndPrintRemark(ORE, ORmissL); 1260 return false; 1261 } 1262 } 1263 // If the gain decreases it is not profitable to convert. 1264 else if (Gain[1] < Gain[0]) { 1265 ORmissL 1266 << "No select conversion in the loop due to negative gradient gain. "; 1267 EmitAndPrintRemark(ORE, ORmissL); 1268 return false; 1269 } 1270 1271 // Non-predicated version of the loop is more profitable than its 1272 // predicated version. 1273 return true; 1274 } 1275 1276 // Computes instruction and loop-critical-path costs for both the predicated 1277 // and non-predicated version of the given loop. 1278 // Returns false if unable to compute these costs due to invalid cost of loop 1279 // instruction(s). 1280 bool SelectOptimizeImpl::computeLoopCosts( 1281 const Loop *L, const SelectGroups &SIGroups, 1282 DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) { 1283 LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop " 1284 << L->getHeader()->getName() << "\n"); 1285 const auto SImap = getSImap(SIGroups); 1286 const auto SGmap = getSGmap(SIGroups); 1287 // Compute instruction and loop-critical-path costs across two iterations for 1288 // both predicated and non-predicated version. 1289 const unsigned Iterations = 2; 1290 for (unsigned Iter = 0; Iter < Iterations; ++Iter) { 1291 // Cost of the loop's critical path. 1292 CostInfo &MaxCost = LoopCost[Iter]; 1293 for (BasicBlock *BB : L->getBlocks()) { 1294 for (const Instruction &I : *BB) { 1295 if (I.isDebugOrPseudoInst()) 1296 continue; 1297 // Compute the predicated and non-predicated cost of the instruction. 1298 Scaled64 IPredCost = Scaled64::getZero(), 1299 INonPredCost = Scaled64::getZero(); 1300 1301 // Assume infinite resources that allow to fully exploit the available 1302 // instruction-level parallelism. 1303 // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost) 1304 for (const Use &U : I.operands()) { 1305 auto UI = dyn_cast<Instruction>(U.get()); 1306 if (!UI) 1307 continue; 1308 if (InstCostMap.count(UI)) { 1309 IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost); 1310 INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost); 1311 } 1312 } 1313 auto ILatency = computeInstCost(&I); 1314 if (!ILatency) { 1315 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I); 1316 ORmissL << "Invalid instruction cost preventing analysis and " 1317 "optimization of the inner-most loop containing this " 1318 "instruction. "; 1319 EmitAndPrintRemark(ORE, ORmissL); 1320 return false; 1321 } 1322 IPredCost += Scaled64::get(*ILatency); 1323 INonPredCost += Scaled64::get(*ILatency); 1324 1325 // For a select that can be converted to branch, 1326 // compute its cost as a branch (non-predicated cost). 1327 // 1328 // BranchCost = PredictedPathCost + MispredictCost 1329 // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb 1330 // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate 1331 if (SImap.contains(&I)) { 1332 auto SI = SImap.at(&I); 1333 const auto *SG = SGmap.at(&I); 1334 Scaled64 TrueOpCost = SI.getOpCostOnBranch(true, InstCostMap, TTI); 1335 Scaled64 FalseOpCost = SI.getOpCostOnBranch(false, InstCostMap, TTI); 1336 Scaled64 PredictedPathCost = 1337 getPredictedPathCost(TrueOpCost, FalseOpCost, SI); 1338 1339 Scaled64 CondCost = Scaled64::getZero(); 1340 if (auto *CI = dyn_cast<Instruction>(SG->Condition)) 1341 if (InstCostMap.count(CI)) 1342 CondCost = InstCostMap[CI].NonPredCost; 1343 Scaled64 MispredictCost = getMispredictionCost(SI, CondCost); 1344 1345 INonPredCost = PredictedPathCost + MispredictCost; 1346 } 1347 LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/" 1348 << INonPredCost << " for " << I << "\n"); 1349 1350 InstCostMap[&I] = {IPredCost, INonPredCost}; 1351 MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost); 1352 MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost); 1353 } 1354 } 1355 LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1 1356 << " MaxCost = " << MaxCost.PredCost << " " 1357 << MaxCost.NonPredCost << "\n"); 1358 } 1359 return true; 1360 } 1361 1362 SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2> 1363 SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) { 1364 SmallDenseMap<const Instruction *, SelectLike, 2> SImap; 1365 for (const SelectGroup &ASI : SIGroups) 1366 for (const SelectLike &SI : ASI.Selects) 1367 SImap.try_emplace(SI.getI(), SI); 1368 return SImap; 1369 } 1370 1371 SmallDenseMap<const Instruction *, const SelectOptimizeImpl::SelectGroup *, 2> 1372 SelectOptimizeImpl::getSGmap(const SelectGroups &SIGroups) { 1373 SmallDenseMap<const Instruction *, const SelectGroup *, 2> SImap; 1374 for (const SelectGroup &ASI : SIGroups) 1375 for (const SelectLike &SI : ASI.Selects) 1376 SImap.try_emplace(SI.getI(), &ASI); 1377 return SImap; 1378 } 1379 1380 std::optional<uint64_t> 1381 SelectOptimizeImpl::computeInstCost(const Instruction *I) { 1382 InstructionCost ICost = 1383 TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency); 1384 if (auto OC = ICost.getValue()) 1385 return std::optional<uint64_t>(*OC); 1386 return std::nullopt; 1387 } 1388 1389 ScaledNumber<uint64_t> 1390 SelectOptimizeImpl::getMispredictionCost(const SelectLike SI, 1391 const Scaled64 CondCost) { 1392 uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; 1393 1394 // Account for the default misprediction rate when using a branch 1395 // (conservatively set to 25% by default). 1396 uint64_t MispredictRate = MispredictDefaultRate; 1397 // If the select condition is obviously predictable, then the misprediction 1398 // rate is zero. 1399 if (isSelectHighlyPredictable(SI)) 1400 MispredictRate = 0; 1401 1402 // CondCost is included to account for cases where the computation of the 1403 // condition is part of a long dependence chain (potentially loop-carried) 1404 // that would delay detection of a misprediction and increase its cost. 1405 Scaled64 MispredictCost = 1406 std::max(Scaled64::get(MispredictPenalty), CondCost) * 1407 Scaled64::get(MispredictRate); 1408 MispredictCost /= Scaled64::get(100); 1409 1410 return MispredictCost; 1411 } 1412 1413 // Returns the cost of a branch when the prediction is correct. 1414 // TrueCost * TrueProbability + FalseCost * FalseProbability. 1415 ScaledNumber<uint64_t> 1416 SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, 1417 const SelectLike SI) { 1418 Scaled64 PredPathCost; 1419 uint64_t TrueWeight, FalseWeight; 1420 if (extractBranchWeights(SI, TrueWeight, FalseWeight)) { 1421 uint64_t SumWeight = TrueWeight + FalseWeight; 1422 if (SumWeight != 0) { 1423 PredPathCost = TrueCost * Scaled64::get(TrueWeight) + 1424 FalseCost * Scaled64::get(FalseWeight); 1425 PredPathCost /= Scaled64::get(SumWeight); 1426 return PredPathCost; 1427 } 1428 } 1429 // Without branch weight metadata, we assume 75% for the one path and 25% for 1430 // the other, and pick the result with the biggest cost. 1431 PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost, 1432 FalseCost * Scaled64::get(3) + TrueCost); 1433 PredPathCost /= Scaled64::get(4); 1434 return PredPathCost; 1435 } 1436 1437 bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) { 1438 TargetLowering::SelectSupportKind SelectKind; 1439 if (SI.getType()->isVectorTy()) 1440 SelectKind = TargetLowering::ScalarCondVectorVal; 1441 else 1442 SelectKind = TargetLowering::ScalarValSelect; 1443 return TLI->isSelectSupported(SelectKind); 1444 } 1445