//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass converts selects to conditional jumps when profitable.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/SelectOptimize.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/ScaledNumber.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <queue>
#include <stack>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "select-optimize"

STATISTIC(NumSelectOptAnalyzed,
          "Number of select groups considered for conversion to branch");
STATISTIC(NumSelectConvertedExpColdOperand,
          "Number of select groups converted due to expensive cold operand");
STATISTIC(NumSelectConvertedHighPred,
          "Number of select groups converted due to high-predictability");
STATISTIC(NumSelectUnPred,
          "Number of select groups not converted due to unpredictability");
STATISTIC(NumSelectColdBB,
          "Number of select groups not converted due to cold basic block");
STATISTIC(NumSelectConvertedLoop,
          "Number of select groups converted due to loop-level analysis");
STATISTIC(NumSelectsConverted, "Number of selects converted");

// Path-frequency threshold (in %) below which a select operand's computation
// path is considered cold.
static cl::opt<unsigned> ColdOperandThreshold(
    "cold-operand-threshold",
    cl::desc("Maximum frequency of path for an operand to be considered cold."),
    cl::init(20), cl::Hidden);

static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
    "cold-operand-max-cost-multiplier",
    cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
             "slice of a cold operand to be considered inexpensive."),
    cl::init(1), cl::Hidden);

static cl::opt<unsigned>
    GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
                          cl::desc("Gradient gain threshold (%)."),
                          cl::init(25), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<unsigned> GainRelativeThreshold(
    "select-opti-loop-relative-gain-threshold",
    cl::desc(
        "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
    cl::init(8), cl::Hidden);

static cl::opt<unsigned> MispredictDefaultRate(
    "mispredict-default-rate", cl::Hidden, cl::init(25),
    cl::desc("Default mispredict rate (initialized to 25%)."));

static cl::opt<bool>
    DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
                               cl::init(false),
                               cl::desc("Disable loop-level heuristics."));

namespace {

class SelectOptimizeImpl {
  const TargetMachine *TM = nullptr;
  const TargetSubtargetInfo *TSI = nullptr;
  const TargetLowering *TLI = nullptr;
  const TargetTransformInfo *TTI = nullptr;
  const LoopInfo *LI = nullptr;
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI = nullptr;
  OptimizationRemarkEmitter *ORE = nullptr;
  TargetSchedModel TSchedModel;

public:
  SelectOptimizeImpl() = default;
  SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){};
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
  bool runOnFunction(Function &F, Pass &P);

  using Scaled64 = ScaledNumber<uint64_t>;

  struct CostInfo {
    /// Predicated cost (with selects as conditional moves).
    Scaled64 PredCost;
    /// Non-predicated cost (with selects converted to branches).
    Scaled64 NonPredCost;
  };

  /// SelectLike is an abstraction over SelectInst and other operations that can
  /// act like selects. For example Or(Zext(icmp), X) can be treated like
  /// select(icmp, X|1, X).
  class SelectLike {
    /// The select (/or) instruction.
    Instruction *I;
    /// Whether this select is inverted, "not(cond), FalseVal, TrueVal", as
    /// opposed to the original condition.
    bool Inverted = false;

    /// The index of the operand that depends on condition. Only for select-like
    /// instructions such as Or/Add.
    unsigned CondIdx;

  public:
    SelectLike(Instruction *I, bool Inverted = false, unsigned CondIdx = 0)
        : I(I), Inverted(Inverted), CondIdx(CondIdx) {}

    Instruction *getI() { return I; }
    const Instruction *getI() const { return I; }

    Type *getType() const { return I->getType(); }

    /// Return the index of the operand that depends on the condition (only
    /// meaningful for select-like binary operators).
    unsigned getConditionOpIndex() { return CondIdx; };

    /// Return the true value for the SelectLike instruction. Note this may not
    /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
    /// the true value would be `or(x,1)`. As this value does not exist, nullptr
    /// is returned.
    Value *getTrueValue(bool HonorInverts = true) const {
      if (Inverted && HonorInverts)
        return getFalseValue(/*HonorInverts=*/false);
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->getTrueValue();
      // Or(zext) case - The true value is Or(X), so return nullptr as the value
      // does not yet exist.
      if (isa<BinaryOperator>(I))
        return nullptr;

      llvm_unreachable("Unhandled case in getTrueValue");
    }

    /// Return the false value for the SelectLike instruction. For example the
    /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
    /// `select(c, x|1, x)`)
    Value *getFalseValue(bool HonorInverts = true) const {
      if (Inverted && HonorInverts)
        return getTrueValue(/*HonorInverts=*/false);
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->getFalseValue();
      // We are on the branch where the condition is zero, which means BinOp
      // does not perform any computation, and we can simply return the operand
      // that is not related to the condition
      if (auto *BO = dyn_cast<BinaryOperator>(I))
        return BO->getOperand(1 - CondIdx);

      llvm_unreachable("Unhandled case in getFalseValue");
    }

    /// Return the NonPredCost cost of the op on \p isTrue branch, given the
    /// costs in \p InstCostMap. This may need to be generated for select-like
    /// instructions.
    Scaled64 getOpCostOnBranch(
        bool IsTrue, const DenseMap<const Instruction *, CostInfo> &InstCostMap,
        const TargetTransformInfo *TTI) {
      auto *V = IsTrue ? getTrueValue() : getFalseValue();
      if (V) {
        if (auto *IV = dyn_cast<Instruction>(V)) {
          auto It = InstCostMap.find(IV);
          return It != InstCostMap.end() ? It->second.NonPredCost
                                         : Scaled64::getZero();
        }
        return Scaled64::getZero();
      }
      // If getTrue(False)Value() return nullptr, it means we are dealing with
      // select-like instructions on the branch where the actual computation is
      // happening. In that case the cost is equal to the cost of computation +
      // cost of the operand not dependent on the condition
      InstructionCost Cost = TTI->getArithmeticInstrCost(
          getI()->getOpcode(), I->getType(), TargetTransformInfo::TCK_Latency,
          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
          {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
      auto TotalCost = Scaled64::get(*Cost.getValue());
      if (auto *OpI = dyn_cast<Instruction>(I->getOperand(1 - CondIdx))) {
        auto It = InstCostMap.find(OpI);
        if (It != InstCostMap.end())
          TotalCost += It->second.NonPredCost;
      }
      return TotalCost;
    }
  };

private:
  // Select groups consist of consecutive select-like instructions with the same
  // condition. Between select-likes could be any number of auxiliary
  // instructions related to the condition like not, zext, ashr/lshr
  struct SelectGroup {
    Value *Condition;
    SmallVector<SelectLike, 2> Selects;
  };
  using SelectGroups = SmallVector<SelectGroup, 2>;

  // Converts select instructions of a function to conditional jumps when deemed
  // profitable. Returns true if at least one select was converted.
  bool optimizeSelects(Function &F);

  // Heuristics for determining which select instructions can be profitably
  // converted to branches. Separate heuristics for selects in inner-most loops
  // and the rest of code regions (base heuristics for non-inner-most loop
  // regions).
  void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
  void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);

  // Converts to branches the select groups that were deemed
  // profitable-to-convert.
  void convertProfitableSIGroups(SelectGroups &ProfSIGroups);

  // Splits selects of a given basic block into select groups.
  void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);

  // Determines for which select groups it is profitable converting to branches
  // (base and inner-most-loop heuristics).
  void findProfitableSIGroupsBase(SelectGroups &SIGroups,
                                  SelectGroups &ProfSIGroups);
  void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
                                        SelectGroups &ProfSIGroups);

  // Determines if a select group should be converted to a branch (base
  // heuristics).
  bool isConvertToBranchProfitableBase(const SelectGroup &ASI);

  // Returns true if there are expensive instructions in the cold value
  // operand's (if any) dependence slice of any of the selects of the given
  // group.
  bool hasExpensiveColdOperand(const SelectGroup &ASI);

  // For a given source instruction, collect its backwards dependence slice
  // consisting of instructions exclusively computed for producing the operands
  // of the source instruction.
  void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
                             Instruction *SI, bool ForSinking = false);

  // Returns true if the condition of the select is highly predictable.
  bool isSelectHighlyPredictable(const SelectLike SI);

  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the given loop is more profitable than its predicated version.
  bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);

  // Computes instruction and loop-critical-path costs for both the predicated
  // and non-predicated version of the given loop.
  bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
                        DenseMap<const Instruction *, CostInfo> &InstCostMap,
                        CostInfo *LoopCost);

  // Returns a set of all the select instructions in the given select groups.
  SmallDenseMap<const Instruction *, SelectLike, 2>
  getSImap(const SelectGroups &SIGroups);

  // Returns a map from select-like instructions to the corresponding select
  // group.
  SmallDenseMap<const Instruction *, const SelectGroup *, 2>
  getSGmap(const SelectGroups &SIGroups);

  // Returns the latency cost of a given instruction.
  std::optional<uint64_t> computeInstCost(const Instruction *I);

  // Returns the misprediction cost of a given select when converted to branch.
  Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost);

  // Returns the cost of a branch when the prediction is correct.
  Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
                                const SelectLike SI);

  // Returns true if the target architecture supports lowering a given select.
  bool isSelectKindSupported(const SelectLike SI);
};

// Legacy pass-manager wrapper around SelectOptimizeImpl.
class SelectOptimize : public FunctionPass {
  SelectOptimizeImpl Impl;

public:
  static char ID;

  SelectOptimize() : FunctionPass(ID) {
    initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    return Impl.runOnFunction(F, *this);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  }
};

} // namespace

PreservedAnalyses SelectOptimizePass::run(Function &F,
                                          FunctionAnalysisManager &FAM) {
  SelectOptimizeImpl Impl(TM);
  return Impl.run(F, FAM);
}

char SelectOptimize::ID = 0;

INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                    false)

FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }

PreservedAnalyses SelectOptimizeImpl::run(Function &F,
                                          FunctionAnalysisManager &FAM) {
  TSI = TM->getSubtargetImpl(F);
  TLI = TSI->getTargetLowering();

  // If none of the select types are supported then skip this pass.
  // This is an optimization pass. Legality issues will be handled by
  // instruction selection.
  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
    return PreservedAnalyses::all();

  TTI = &FAM.getResult<TargetIRAnalysis>(F);
  if (!TTI->enableSelectOptimize())
    return PreservedAnalyses::all();

  PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
            .getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  assert(PSI && "This pass requires module analysis pass `profile-summary`!");
  BFI = &FAM.getResult<BlockFrequencyAnalysis>(F);

  // When optimizing for size, selects are preferable over branches.
  if (llvm::shouldOptimizeForSize(&F, PSI, BFI))
    return PreservedAnalyses::all();

  LI = &FAM.getResult<LoopAnalysis>(F);
  ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  TSchedModel.init(TSI);

  bool Changed = optimizeSelects(F);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
  TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  TSI = TM->getSubtargetImpl(F);
  TLI = TSI->getTargetLowering();

  // If none of the select types are supported then skip this pass.
  // This is an optimization pass. Legality issues will be handled by
  // instruction selection.
  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
    return false;

  TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  if (!TTI->enableSelectOptimize())
    return false;

  LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
  PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  TSchedModel.init(TSI);

  // When optimizing for size, selects are preferable over branches.
  if (llvm::shouldOptimizeForSize(&F, PSI, BFI))
    return false;

  return optimizeSelects(F);
}

bool SelectOptimizeImpl::optimizeSelects(Function &F) {
  // Determine for which select groups it is profitable converting to branches.
  SelectGroups ProfSIGroups;
  // Base heuristics apply only to non-loops and outer loops.
  optimizeSelectsBase(F, ProfSIGroups);
  // Separate heuristics for inner-most loops.
  optimizeSelectsInnerLoops(F, ProfSIGroups);

  // Convert to branches the select groups that were deemed
  // profitable-to-convert.
  convertProfitableSIGroups(ProfSIGroups);

  // Code modified if at least one select group was converted.
  return !ProfSIGroups.empty();
}

void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
                                             SelectGroups &ProfSIGroups) {
  // Collect all the select groups.
  SelectGroups SIGroups;
  for (BasicBlock &BB : F) {
    // Base heuristics apply only to non-loops and outer loops.
    Loop *L = LI->getLoopFor(&BB);
    if (L && L->isInnermost())
      continue;
    collectSelectGroups(BB, SIGroups);
  }

  // Determine for which select groups it is profitable converting to branches.
  findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
}

void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
                                                   SelectGroups &ProfSIGroups) {
  SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
  // Need to check size on each iteration as we accumulate child loops.
  for (unsigned long i = 0; i < Loops.size(); ++i)
    for (Loop *ChildL : Loops[i]->getSubLoops())
      Loops.push_back(ChildL);

  for (Loop *L : Loops) {
    if (!L->isInnermost())
      continue;

    SelectGroups SIGroups;
    for (BasicBlock *BB : L->getBlocks())
      collectSelectGroups(*BB, SIGroups);

    findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
  }
}

/// Returns optimised value on \p IsTrue branch. For SelectInst that would be
/// either True or False value. For (BinaryOperator) instructions, where the
/// condition may be skipped, the operation will use a non-conditional operand.
/// For example, for `or(V,zext(cond))` this function would return V.
/// However, if the conditional operand on \p IsTrue branch matters, we create a
/// clone of instruction at the end of that branch \p B and replace the
/// condition operand with a constant.
///
/// Also \p OptSelects contains previously optimised select-like instructions.
/// If the current value uses one of the optimised values, we can optimise it
/// further by replacing it with the corresponding value on the given branch
static Value *getTrueOrFalseValue(
    SelectOptimizeImpl::SelectLike &SI, bool isTrue,
    SmallDenseMap<Instruction *, std::pair<Value *, Value *>, 2> &OptSelects,
    BasicBlock *B) {
  Value *V = isTrue ? SI.getTrueValue() : SI.getFalseValue();
  if (V) {
    if (auto *IV = dyn_cast<Instruction>(V))
      if (auto It = OptSelects.find(IV); It != OptSelects.end())
        return isTrue ? It->second.first : It->second.second;
    return V;
  }

  auto *BO = cast<BinaryOperator>(SI.getI());
  assert((BO->getOpcode() == Instruction::Add ||
          BO->getOpcode() == Instruction::Or ||
          BO->getOpcode() == Instruction::Sub) &&
         "Only currently handling Add, Or and Sub binary operators.");

  auto *CBO = BO->clone();
  auto CondIdx = SI.getConditionOpIndex();
  auto *AuxI = cast<Instruction>(CBO->getOperand(CondIdx));
  // On the taken branch a zext/lshr auxiliary contributes 1, a sext/ashr
  // auxiliary contributes -1 (all-ones); substitute the matching constant.
  if (isa<ZExtInst>(AuxI) || isa<LShrOperator>(AuxI)) {
    CBO->setOperand(CondIdx, ConstantInt::get(CBO->getType(), 1));
  } else {
    assert((isa<AShrOperator>(AuxI) || isa<SExtInst>(AuxI)) &&
           "Unexpected opcode");
    CBO->setOperand(CondIdx, ConstantInt::get(CBO->getType(), -1));
  }

  unsigned OtherIdx = 1 - CondIdx;
  if (auto *IV = dyn_cast<Instruction>(CBO->getOperand(OtherIdx))) {
    if (auto It = OptSelects.find(IV); It != OptSelects.end())
      CBO->setOperand(OtherIdx, isTrue ? It->second.first : It->second.second);
  }
  CBO->insertBefore(B->getTerminator()->getIterator());
  return CBO;
}

void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
  for (SelectGroup &ASI : ProfSIGroups) {
    // The code transformation here is a modified version of the sinking
    // transformation in CodeGenPrepare::optimizeSelectInst with a more
    // aggressive strategy of which instructions to sink.
    //
    // TODO: eliminate the redundancy of logic transforming selects to branches
    // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
    // selects for all cases (with and without profile information).

    // Transform a sequence like this:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %sel = select i1 %cmp, i32 %c, i32 %d
    //
    // Into:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %cmp.frozen = freeze %cmp
    //       br i1 %cmp.frozen, label %select.true, label %select.false
    //    select.true:
    //       br label %select.end
    //    select.false:
    //       br label %select.end
    //    select.end:
    //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
    //
    // %cmp should be frozen, otherwise it may introduce undefined behavior.
    // In addition, we may sink instructions that produce %c or %d into the
    // destination(s) of the new branch.
    // If the true or false blocks do not contain a sunken instruction, that
    // block and its branch may be optimized away. In that case, one side of the
    // first branch will point directly to select.end, and the corresponding PHI
    // predecessor block will be the start block.

    // Find all the instructions that can be soundly sunk to the true/false
    // blocks. These are instructions that are computed solely for producing the
    // operands of the select instructions in the group and can be sunk without
    // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
    // with side effects).
    SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
    typedef std::stack<Instruction *>::size_type StackSizeType;
    StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
    for (SelectLike &SI : ASI.Selects) {
      if (!isa<SelectInst>(SI.getI()))
        continue;
      // For each select, compute the sinkable dependence chains of the true and
      // false operands.
      if (auto *TI = dyn_cast_or_null<Instruction>(SI.getTrueValue())) {
        std::stack<Instruction *> TrueSlice;
        getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true);
        maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
        TrueSlices.push_back(TrueSlice);
      }
      if (auto *FI = dyn_cast_or_null<Instruction>(SI.getFalseValue())) {
        if (isa<SelectInst>(SI.getI()) || !FI->hasOneUse()) {
          std::stack<Instruction *> FalseSlice;
          getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true);
          maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
          FalseSlices.push_back(FalseSlice);
        }
      }
    }
    // In the case of multiple select instructions in the same group, the order
    // of non-dependent instructions (instructions of different dependence
    // slices) in the true/false blocks appears to affect performance.
    // Interleaving the slices seems to experimentally be the optimal approach.
    // This interleaving scheduling allows for more ILP (with a natural downside
    // of increasing a bit register pressure) compared to a simple ordering of
    // one whole chain after another. One would expect that this ordering would
    // not matter since the scheduling in the backend of the compiler would
    // take care of it, but apparently the scheduler fails to deliver optimal
    // ILP with a naive ordering here.
    SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
    for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
      for (auto &S : TrueSlices) {
        if (!S.empty()) {
          TrueSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }
    for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
      for (auto &S : FalseSlices) {
        if (!S.empty()) {
          FalseSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }

    // We split the block containing the select(s) into two blocks.
    SelectLike &SI = ASI.Selects.front();
    SelectLike &LastSI = ASI.Selects.back();
    BasicBlock *StartBlock = SI.getI()->getParent();
    BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI()));
    // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With
    // RemoveDIs turned on, SplitPt would instead point to the next
    // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs,
    // tell splitBasicBlock that we want to include any DbgVariableRecords
    // attached to SplitPt in the splice.
    SplitPt.setHeadBit(true);
    BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
    // Delete the unconditional branch that was just created by the split.
    StartBlock->getTerminator()->eraseFromParent();

    // Move any debug/pseudo and auxiliary instructions that were in-between the
    // select group to the newly-created end block.
    SmallVector<Instruction *, 2> SinkInstrs;
    auto DIt = SI.getI()->getIterator();
    auto NIt = ASI.Selects.begin();
    while (&*DIt != LastSI.getI()) {
      if (NIt != ASI.Selects.end() && &*DIt == NIt->getI())
        ++NIt;
      else
        SinkInstrs.push_back(&*DIt);
      DIt++;
    }
    auto InsertionPoint = EndBlock->getFirstInsertionPt();
    for (auto *DI : SinkInstrs)
      DI->moveBeforePreserving(InsertionPoint);

    // Duplicate implementation for DbgRecords, the non-instruction debug-info
    // format. Helper lambda for moving DbgRecords to the end block.
    auto TransferDbgRecords = [&](Instruction &I) {
      for (auto &DbgRecord :
           llvm::make_early_inc_range(I.getDbgRecordRange())) {
        DbgRecord.removeFromParent();
        EndBlock->insertDbgRecordBefore(&DbgRecord,
                                        EndBlock->getFirstInsertionPt());
      }
    };

    // Iterate over all instructions in between SI and LastSI, not including
    // SI itself. These are all the variable assignments that happen "in the
    // middle" of the select group.
    auto R = make_range(std::next(SI.getI()->getIterator()),
                        std::next(LastSI.getI()->getIterator()));
    llvm::for_each(R, TransferDbgRecords);

    // These are the new basic blocks for the conditional branch.
    // At least one will become an actual new basic block.
    BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
    BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
    // Checks if select-like instruction would materialise on the given branch
    auto HasSelectLike = [](SelectGroup &SG, bool IsTrue) {
      for (auto &SL : SG.Selects) {
        if ((IsTrue ? SL.getTrueValue() : SL.getFalseValue()) == nullptr)
          return true;
      }
      return false;
    };
    if (!TrueSlicesInterleaved.empty() || HasSelectLike(ASI, true)) {
      TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink",
                                     EndBlock->getParent(), EndBlock);
      TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
      TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
      for (Instruction *TrueInst : TrueSlicesInterleaved)
        TrueInst->moveBefore(TrueBranch->getIterator());
    }
    if (!FalseSlicesInterleaved.empty() || HasSelectLike(ASI, false)) {
      FalseBlock =
          BasicBlock::Create(EndBlock->getContext(), "select.false.sink",
                             EndBlock->getParent(), EndBlock);
      FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
      for (Instruction *FalseInst : FalseSlicesInterleaved)
        FalseInst->moveBefore(FalseBranch->getIterator());
    }
    // If there was nothing to sink, then arbitrarily choose the 'false' side
    // for a new input value to the PHI.
    if (TrueBlock == FalseBlock) {
      assert(TrueBlock == nullptr &&
             "Unexpected basic block transform while optimizing select");

      FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false",
                                      EndBlock->getParent(), EndBlock);
      auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(SI.getI()->getDebugLoc());
    }

    // Insert the real conditional branch based on the original condition.
    // If we did not create a new block for one of the 'true' or 'false' paths
    // of the condition, it means that side of the branch goes to the end block
    // directly and the path originates from the start block from the point of
    // view of the new PHI.
    BasicBlock *TT, *FT;
    if (TrueBlock == nullptr) {
      TT = EndBlock;
      FT = FalseBlock;
      TrueBlock = StartBlock;
    } else if (FalseBlock == nullptr) {
      TT = TrueBlock;
      FT = EndBlock;
      FalseBlock = StartBlock;
    } else {
      TT = TrueBlock;
      FT = FalseBlock;
    }
    IRBuilder<> IB(SI.getI());
    auto *CondFr =
        IB.CreateFreeze(ASI.Condition, ASI.Condition->getName() + ".frozen");

    SmallDenseMap<Instruction *, std::pair<Value *, Value *>, 2> INS;

    // Use reverse iterator because later select may use the value of the
    // earlier select, and we need to propagate value through earlier select
    // to get the PHI operand.
    InsertionPoint = EndBlock->begin();
    for (SelectLike &SI : ASI.Selects) {
      // The select itself is replaced with a PHI Node.
      PHINode *PN = PHINode::Create(SI.getType(), 2, "");
      PN->insertBefore(InsertionPoint);
      PN->takeName(SI.getI());
      // Current instruction might be a condition of some other group, so we
      // need to replace it there to avoid dangling pointer
      if (PN->getType()->isIntegerTy(1)) {
        for (auto &SG : ProfSIGroups) {
          if (SG.Condition == SI.getI())
            SG.Condition = PN;
        }
      }
      SI.getI()->replaceAllUsesWith(PN);
      auto *TV = getTrueOrFalseValue(SI, true, INS, TrueBlock);
      auto *FV = getTrueOrFalseValue(SI, false, INS, FalseBlock);
      INS[PN] = {TV, FV};
      PN->addIncoming(TV, TrueBlock);
      PN->addIncoming(FV, FalseBlock);
      PN->setDebugLoc(SI.getI()->getDebugLoc());
      ++NumSelectsConverted;
    }
    IB.CreateCondBr(CondFr, TT, FT, SI.getI());

    // Remove the old select instructions, now that they are no longer used.
    for (SelectLike &SI : ASI.Selects)
      SI.getI()->eraseFromParent();
  }
}

void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
                                             SelectGroups &SIGroups) {
  // Represents something that can be considered as select instruction.
  // Auxiliary instructions are instructions that depend on a condition and have
  // zero or some constant value on True/False branch, such as:
  // * ZExt(1bit)
  // * SExt(1bit)
  // * Not(1bit)
  // * A(L)Shr(Val), ValBitSize - 1, where there is a condition like `Val <= 0`
  //   earlier in the BB. For conditions that check the sign of the Val compiler
  //   may generate shifts instead of ZExt/SExt.
  struct SelectLikeInfo {
    Value *Cond;
    bool IsAuxiliary;
    bool IsInverted;
    unsigned ConditionIdx;
  };

  DenseMap<Value *, SelectLikeInfo> SelectInfo;
  // Keeps visited comparisons to help identify AShr/LShr variants of auxiliary
  // instructions.
  SmallSetVector<CmpInst *, 4> SeenCmp;

  // Check if the instruction is SelectLike or might be part of SelectLike
  // expression, put information into SelectInfo and return the iterator to the
  // inserted position.
  auto ProcessSelectInfo = [&SelectInfo, &SeenCmp](Instruction *I) {
    // Comparisons are only recorded; they may later validate a Shr variant.
    if (auto *Cmp = dyn_cast<CmpInst>(I)) {
      SeenCmp.insert(Cmp);
      return SelectInfo.end();
    }

    Value *Cond;
    // Auxiliary: single-use ZExt/SExt of an i1 condition.
    if (match(I, m_OneUse(m_ZExtOrSExt(m_Value(Cond)))) &&
        Cond->getType()->isIntegerTy(1)) {
      bool Inverted = match(Cond, m_Not(m_Value(Cond)));
      return SelectInfo.insert({I, {Cond, true, Inverted, 0}}).first;
    }

    // Auxiliary: logical not of a condition (always recorded as inverted).
    if (match(I, m_Not(m_Value(Cond)))) {
      return SelectInfo.insert({I, {Cond, true, true, 0}}).first;
    }

    // Select instructions are what we are usually looking for.
    if (match(I, m_Select(m_Value(Cond), m_Value(), m_Value()))) {
      bool Inverted = match(Cond, m_Not(m_Value(Cond)));
      return SelectInfo.insert({I, {Cond, false, Inverted, 0}}).first;
    }
    Value *Val;
    ConstantInt *Shift;
    // Auxiliary: a right shift by (bitwidth - 1), i.e. a sign-bit extraction,
    // but only if a previously seen comparison proves it tests Val's sign.
    if (match(I, m_Shr(m_Value(Val), m_ConstantInt(Shift))) &&
        I->getType()->getIntegerBitWidth() == Shift->getZExtValue() + 1) {
      for (auto *CmpI : SeenCmp) {
        auto Pred = CmpI->getPredicate();
        if (Val != CmpI->getOperand(0))
          continue;
        // Accept any of the four sign-test spellings: >-1, >=0, <0, <=-1.
        if ((Pred == CmpInst::ICMP_SGT &&
             match(CmpI->getOperand(1), m_ConstantInt<-1>())) ||
            (Pred == CmpInst::ICMP_SGE &&
             match(CmpI->getOperand(1), m_Zero())) ||
            (Pred == CmpInst::ICMP_SLT &&
             match(CmpI->getOperand(1), m_Zero())) ||
            (Pred == CmpInst::ICMP_SLE &&
             match(CmpI->getOperand(1), m_ConstantInt<-1>()))) {
          bool Inverted =
              Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE;
          return SelectInfo.insert({I, {CmpI, true, Inverted, 0}}).first;
        }
      }
      return SelectInfo.end();
    }

    // A BinOp(Aux(X), Y) can also be treated like a select, with condition X
    // and values Y|1 and Y.
    // `Aux` can be either `ZExt(1bit)`, `SExt(1bit)` or `XShr(Val), ValBitSize
    // - 1` `BinOp` can be Add, Sub, Or
    Value *X;
    auto MatchZExtOrSExtPattern =
        m_c_BinOp(m_Value(), m_OneUse(m_ZExtOrSExt(m_Value(X))));
    auto MatchShiftPattern =
        m_c_BinOp(m_Value(), m_OneUse(m_Shr(m_Value(X), m_ConstantInt(Shift))));

    // This check is unnecessary, but it prevents costly access to the
    // SelectInfo map.
    if ((match(I, MatchZExtOrSExtPattern) && X->getType()->isIntegerTy(1)) ||
        (match(I, MatchShiftPattern) &&
         X->getType()->getIntegerBitWidth() == Shift->getZExtValue() + 1)) {
      if (I->getOpcode() != Instruction::Add &&
          I->getOpcode() != Instruction::Sub &&
          I->getOpcode() != Instruction::Or)
        return SelectInfo.end();

      // An i1 `or` is a plain boolean combine, not a select-like value merge.
      if (I->getOpcode() == Instruction::Or && I->getType()->isIntegerTy(1))
        return SelectInfo.end();

      // Iterate through operands and find dependent on recognized sign
      // extending auxiliary select-like instructions. The operand index does
      // not matter for Add and Or. However, for Sub, we can only safely
      // transform when the operand is second.
      unsigned Idx = I->getOpcode() == Instruction::Sub ? 1 : 0;
      for (; Idx < 2; Idx++) {
        auto *Op = I->getOperand(Idx);
        auto It = SelectInfo.find(Op);
        if (It != SelectInfo.end() && It->second.IsAuxiliary) {
          Cond = It->second.Cond;
          bool Inverted = It->second.IsInverted;
          return SelectInfo.insert({I, {Cond, false, Inverted, Idx}}).first;
        }
      }
    }
    return SelectInfo.end();
  };

  // Scan the block, growing a group as long as consecutive (non-debug)
  // instructions are select-like on the same condition.
  bool AlreadyProcessed = false;
  BasicBlock::iterator BBIt = BB.begin();
  DenseMap<Value *, SelectLikeInfo>::iterator It;
  while (BBIt != BB.end()) {
    Instruction *I = &*BBIt++;
    if (I->isDebugOrPseudoInst())
      continue;

    // The inner loop below may already have classified this instruction when
    // it terminated a previous group; avoid re-processing it.
    if (!AlreadyProcessed)
      It = ProcessSelectInfo(I);
    else
      AlreadyProcessed = false;

    if (It == SelectInfo.end() || It->second.IsAuxiliary)
      continue;

    if (!TTI->shouldTreatInstructionLikeSelect(I))
      continue;

    Value *Cond = It->second.Cond;
    // Vector conditions are not supported.
    if (!Cond->getType()->isIntegerTy(1))
      continue;

    SelectGroup SIGroup = {Cond, {}};
    SIGroup.Selects.emplace_back(I, It->second.IsInverted,
                                 It->second.ConditionIdx);

    // If the select type is not supported, no point optimizing it.
    // Instruction selection will take care of it.
    if (!isSelectKindSupported(SIGroup.Selects.front()))
      continue;

    while (BBIt != BB.end()) {
      Instruction *NI = &*BBIt;
      // Debug/pseudo instructions should be skipped and not prevent the
      // formation of a select group.
      if (NI->isDebugOrPseudoInst()) {
        ++BBIt;
        continue;
      }

      It = ProcessSelectInfo(NI);
      if (It == SelectInfo.end()) {
        AlreadyProcessed = true;
        break;
      }

      // Auxiliary with same condition
      auto [CurrCond, IsAux, IsRev, CondIdx] = It->second;
      if (Cond != CurrCond) {
        AlreadyProcessed = true;
        break;
      }

      if (!IsAux)
        SIGroup.Selects.emplace_back(NI, IsRev, CondIdx);
      ++BBIt;
    }
    LLVM_DEBUG({
      dbgs() << "New Select group (" << SIGroup.Selects.size() << ") with\n";
      for (auto &SI : SIGroup.Selects)
        dbgs() << " " << *SI.getI() << "\n";
    });

    SIGroups.push_back(SIGroup);
  }
}

// Base (non-loop) profitability filter: forward each group that passes the
// per-group heuristic into ProfSIGroups.
void SelectOptimizeImpl::findProfitableSIGroupsBase(
    SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
  for (SelectGroup &ASI : SIGroups) {
    ++NumSelectOptAnalyzed;
    if (isConvertToBranchProfitableBase(ASI))
      ProfSIGroups.push_back(ASI);
  }
}

// Emit an optimization remark and mirror its message to the debug stream.
static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
                               DiagnosticInfoOptimizationBase &Rem) {
  LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
  ORE->emit(Rem);
}

void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
  NumSelectOptAnalyzed += SIGroups.size();
  // For each select group in an inner-most loop,
  // a branch is more preferable than a select/conditional-move if:
  // i) conversion to branches for all the select groups of the loop satisfies
  //    loop-level heuristics including reducing the loop's critical path by
  //    some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
  // ii) the total cost of the select group is cheaper with a branch compared
  //     to its predicated version. The cost is in terms of latency and the cost
  //     of a select group is the cost of its most expensive select instruction
  //     (assuming infinite resources and thus fully leveraging available ILP).

  DenseMap<const Instruction *, CostInfo> InstCostMap;
  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
                          {Scaled64::getZero(), Scaled64::getZero()}};
  // Bail out unless the whole loop clears the critical-path heuristics.
  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
      !checkLoopHeuristics(L, LoopCost)) {
    return;
  }

  for (SelectGroup &ASI : SIGroups) {
    // Assuming infinite resources, the cost of a group of instructions is the
    // cost of the most expensive instruction of the group.
    Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
    for (SelectLike &SI : ASI.Selects) {
      SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost);
      BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost);
    }
    if (BranchCost < SelectCost) {
      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti",
                            ASI.Selects.front().getI());
      OR << "Profitable to convert to branch (loop analysis). BranchCost="
         << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
         << ". ";
      EmitAndPrintRemark(ORE, OR);
      ++NumSelectConvertedLoop;
      ProfSIGroups.push_back(ASI);
    } else {
      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
                                      ASI.Selects.front().getI());
      ORmiss << "Select is more profitable (loop analysis). BranchCost="
             << BranchCost.toString()
             << ", SelectCost=" << SelectCost.toString() << ". ";
      EmitAndPrintRemark(ORE, ORmiss);
    }
  }
}

// Per-group, non-loop heuristic: decide from profile/metadata alone whether
// converting this select group to a branch is worthwhile.
bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
    const SelectGroup &ASI) {
  // The first select stands in for the whole group in remarks and checks.
  const SelectLike &SI = ASI.Selects.front();
  LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI()
                    << "\n");
  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI());
  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI());

  // Skip cold basic blocks. Better to optimize for size for cold blocks.
  if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) {
    ++NumSelectColdBB;
    ORmiss << "Not converted to branch because of cold basic block. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If unpredictable, branch form is less profitable.
  if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) {
    ++NumSelectUnPred;
    ORmiss << "Not converted to branch because of unpredictable branch. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If highly predictable, branch form is more profitable, unless a
  // predictable select is inexpensive in the target architecture.
  if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
    ++NumSelectConvertedHighPred;
    OR << "Converted to branch because of highly predictable branch. ";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  // Look for expensive instructions in the cold operand's (if any) dependence
  // slice of any of the selects in the group.
  if (hasExpensiveColdOperand(ASI)) {
    ++NumSelectConvertedExpColdOperand;
    OR << "Converted to branch because of expensive cold operand.";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  // If latch has a select group with several elements, it is usually profitable
  // to convert it to branches. We let `optimizeSelectsInnerLoops` decide if
  // conversion is profitable for innermost loops.
  auto *BB = SI.getI()->getParent();
  auto *L = LI->getLoopFor(BB);
  if (L && !L->isInnermost() && L->getLoopLatch() == BB &&
      ASI.Selects.size() >= 3) {
    OR << "Converted to branch because select group in the latch block is big.";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  ORmiss << "Not profitable to convert to branch (base heuristic).";
  EmitAndPrintRemark(ORE, ORmiss);
  return false;
}

// Integer division rounding to the nearest value rather than truncating.
static InstructionCost divideNearest(InstructionCost Numerator,
                                     uint64_t Denominator) {
  return (Numerator + (Denominator / 2)) / Denominator;
}

// Thin wrapper over llvm::extractBranchWeights: only real SelectInsts can
// carry branch-weight metadata; other select-like forms report no weights.
static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
                                 uint64_t &TrueVal, uint64_t &FalseVal) {
  if (isa<SelectInst>(SI.getI()))
    return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
  return false;
}

bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
  bool ColdOperand = false;
  // TrueWeight/FalseWeight/TotalWeight are only meaningful when branch
  // weights were extracted; the `!ColdOperand` early return below guards
  // their later uses.
  uint64_t TrueWeight, FalseWeight, TotalWeight;
  if (extractBranchWeights(ASI.Selects.front(), TrueWeight, FalseWeight)) {
    uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
    TotalWeight = TrueWeight + FalseWeight;
    // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
    ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
  } else if (PSI->hasProfileSummary()) {
    OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
                                    ASI.Selects.front().getI());
    ORmiss << "Profile data available but missing branch-weights metadata for "
              "select instruction. ";
    EmitAndPrintRemark(ORE, ORmiss);
  }
  if (!ColdOperand)
    return false;
  // Check if the cold path's dependence slice is expensive for any of the
  // selects of the group.
  for (SelectLike SI : ASI.Selects) {
    // The cold operand is the value on the less-frequent side of the select.
    Instruction *ColdI = nullptr;
    uint64_t HotWeight;
    if (TrueWeight < FalseWeight) {
      ColdI = dyn_cast_or_null<Instruction>(SI.getTrueValue());
      HotWeight = FalseWeight;
    } else {
      ColdI = dyn_cast_or_null<Instruction>(SI.getFalseValue());
      HotWeight = TrueWeight;
    }
    if (ColdI) {
      std::stack<Instruction *> ColdSlice;
      getExclBackwardsSlice(ColdI, ColdSlice, SI.getI());
      InstructionCost SliceCost = 0;
      while (!ColdSlice.empty()) {
        SliceCost += TTI->getInstructionCost(ColdSlice.top(),
                                             TargetTransformInfo::TCK_Latency);
        ColdSlice.pop();
      }
      // The colder the cold value operand of the select is the more expensive
      // the cmov becomes for computing the cold value operand every time. Thus,
      // the colder the cold operand is the more its cost counts.
      // Get nearest integer cost adjusted for coldness.
      InstructionCost AdjSliceCost =
          divideNearest(SliceCost * HotWeight, TotalWeight);
      if (AdjSliceCost >=
          ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
        return true;
    }
  }
  return false;
}

// Check if it is safe to move LoadI next to the SI.
// Conservatively assume it is safe only if there is no instruction
// modifying memory in-between the load and the select instruction.
static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
  // Assume loads from different basic blocks are unsafe to move.
  if (LoadI->getParent() != SI->getParent())
    return false;
  // Walk forward from the load; any intervening write makes sinking unsafe.
  auto It = LoadI->getIterator();
  while (&*It != SI) {
    if (It->mayWriteToMemory())
      return false;
    It++;
  }
  return true;
}

// For a given source instruction, collect its backwards dependence slice
// consisting of instructions exclusively computed for the purpose of producing
// the operands of the source instruction. As an approximation
// (sufficiently-accurate in practice), we populate this set with the
// instructions of the backwards dependence slice that only have one-use and
// form a one-use chain that leads to the source instruction.
void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
                                               std::stack<Instruction *> &Slice,
                                               Instruction *SI,
                                               bool ForSinking) {
  // Standard BFS over operands, restricted to one-use instructions.
  SmallPtrSet<Instruction *, 2> Visited;
  std::queue<Instruction *> Worklist;
  Worklist.push(I);
  while (!Worklist.empty()) {
    Instruction *II = Worklist.front();
    Worklist.pop();

    // Avoid cycles.
    if (!Visited.insert(II).second)
      continue;

    if (!II->hasOneUse())
      continue;

    // Cannot soundly sink instructions with side-effects.
    // Terminator or phi instructions cannot be sunk.
    // Avoid sinking other select instructions (should be handled separately).
    if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
                       isa<SelectInst>(II) || isa<PHINode>(II)))
      continue;

    // Avoid sinking loads in order not to skip state-modifying instructions,
    // that may alias with the loaded address.
    // Only allow sinking of loads within the same basic block that are
    // conservatively proven to be safe.
    if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
      continue;

    // Avoid considering instructions with less frequency than the source
    // instruction (i.e., avoid colder code regions of the dependence slice).
    if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
      continue;

    // Eligible one-use instruction added to the dependence slice.
    Slice.push(II);

    // Explore all the operands of the current instruction to expand the slice.
    for (Value *Op : II->operand_values())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push(OpI);
  }
}

// Returns true when profile/metadata weights say one side of the select is
// taken with probability above the target's predictable-branch threshold.
bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) {
  uint64_t TrueWeight, FalseWeight;
  if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
    uint64_t Max = std::max(TrueWeight, FalseWeight);
    uint64_t Sum = TrueWeight + FalseWeight;
    if (Sum != 0) {
      auto Probability = BranchProbability::getBranchProbability(Max, Sum);
      if (Probability > TTI->getPredictableBranchThreshold())
        return true;
    }
  }
  return false;
}

bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
                                             const CostInfo LoopCost[2]) {
  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the loop is more profitable than its predicated version.

  if (DisableLoopLevelHeuristics)
    return true;

  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
                                   &*L->getHeader()->getFirstNonPHIIt());

  // LoopCost[0]/[1] hold the critical-path costs of the first and second
  // analyzed iterations; both must show branches no worse than selects.
  if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
      LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
    ORmissL << "No select conversion in the loop due to no reduction of loop's "
               "critical path. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};

  // Profitably converting to branches need to reduce the loop's critical path
  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
  // relative gain of 12.5%).
  if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
      Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
    Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
    ORmissL << "No select conversion in the loop due to small reduction of "
               "loop's critical path. Gain="
            << Gain[1].toString()
            << ", RelativeGain=" << RelativeGain.toString() << "%. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // If the loop's critical path involves loop-carried dependences, the gradient
  // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
  // This check ensures that the latency reduction for the loop's critical path
  // keeps decreasing with sufficient rate beyond the two analyzed loop
  // iterations.
  if (Gain[1] > Gain[0]) {
    Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
                            (LoopCost[1].PredCost - LoopCost[0].PredCost);
    if (GradientGain < Scaled64::get(GainGradientThreshold)) {
      ORmissL << "No select conversion in the loop due to small gradient gain. "
                 "GradientGain="
              << GradientGain.toString() << "%. ";
      EmitAndPrintRemark(ORE, ORmissL);
      return false;
    }
  }
  // If the gain decreases it is not profitable to convert.
  else if (Gain[1] < Gain[0]) {
    ORmissL
        << "No select conversion in the loop due to negative gradient gain. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // Non-predicated version of the loop is more profitable than its
  // predicated version.
  return true;
}

// Computes instruction and loop-critical-path costs for both the predicated
// and non-predicated version of the given loop.
// Returns false if unable to compute these costs due to invalid cost of loop
// instruction(s).
bool SelectOptimizeImpl::computeLoopCosts(
    const Loop *L, const SelectGroups &SIGroups,
    DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
  LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
                    << L->getHeader()->getName() << "\n");
  const auto SImap = getSImap(SIGroups);
  const auto SGmap = getSGmap(SIGroups);
  // Compute instruction and loop-critical-path costs across two iterations for
  // both predicated and non-predicated version.
  const unsigned Iterations = 2;
  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
    // Cost of the loop's critical path.
    CostInfo &MaxCost = LoopCost[Iter];
    for (BasicBlock *BB : L->getBlocks()) {
      for (const Instruction &I : *BB) {
        if (I.isDebugOrPseudoInst())
          continue;
        // Compute the predicated and non-predicated cost of the instruction.
        Scaled64 IPredCost = Scaled64::getZero(),
                 INonPredCost = Scaled64::getZero();

        // Assume infinite resources that allow to fully exploit the available
        // instruction-level parallelism.
        // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
        // Note: InstCostMap carries over between the two iterations, so the
        // second pass sees loop-carried operand costs from the first.
        for (const Use &U : I.operands()) {
          auto UI = dyn_cast<Instruction>(U.get());
          if (!UI)
            continue;
          if (auto It = InstCostMap.find(UI); It != InstCostMap.end()) {
            IPredCost = std::max(IPredCost, It->second.PredCost);
            INonPredCost = std::max(INonPredCost, It->second.NonPredCost);
          }
        }
        auto ILatency = computeInstCost(&I);
        if (!ILatency) {
          OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
          ORmissL << "Invalid instruction cost preventing analysis and "
                     "optimization of the inner-most loop containing this "
                     "instruction. ";
          EmitAndPrintRemark(ORE, ORmissL);
          return false;
        }
        IPredCost += Scaled64::get(*ILatency);
        INonPredCost += Scaled64::get(*ILatency);

        // For a select that can be converted to branch,
        // compute its cost as a branch (non-predicated cost).
        //
        // BranchCost = PredictedPathCost + MispredictCost
        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
        if (SImap.contains(&I)) {
          auto SI = SImap.at(&I);
          const auto *SG = SGmap.at(&I);
          Scaled64 TrueOpCost = SI.getOpCostOnBranch(true, InstCostMap, TTI);
          Scaled64 FalseOpCost = SI.getOpCostOnBranch(false, InstCostMap, TTI);
          Scaled64 PredictedPathCost =
              getPredictedPathCost(TrueOpCost, FalseOpCost, SI);

          Scaled64 CondCost = Scaled64::getZero();
          if (auto *CI = dyn_cast<Instruction>(SG->Condition))
            if (auto It = InstCostMap.find(CI); It != InstCostMap.end())
              CondCost = It->second.NonPredCost;
          Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);

          INonPredCost = PredictedPathCost + MispredictCost;
        }
        LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
                          << INonPredCost << " for " << I << "\n");

        InstCostMap[&I] = {IPredCost, INonPredCost};
        MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
        MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
      }
    }
    LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
                      << " MaxCost = " << MaxCost.PredCost << " "
                      << MaxCost.NonPredCost << "\n");
  }
  return true;
}

// Builds a lookup from each select-like instruction to its SelectLike wrapper.
SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2>
SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) {
  SmallDenseMap<const Instruction *, SelectLike, 2> SImap;
  for (const SelectGroup &ASI : SIGroups)
    for (const SelectLike &SI : ASI.Selects)
      SImap.try_emplace(SI.getI(), SI);
  return SImap;
}

// Builds a lookup from each select-like instruction to its owning group.
SmallDenseMap<const Instruction *, const SelectOptimizeImpl::SelectGroup *, 2>
SelectOptimizeImpl::getSGmap(const SelectGroups &SIGroups) {
  SmallDenseMap<const Instruction *, const SelectGroup *, 2> SImap;
  for (const SelectGroup &ASI : SIGroups)
    for (const SelectLike &SI : ASI.Selects)
      SImap.try_emplace(SI.getI(), &ASI);
  return SImap;
}

// Latency cost of a single instruction, or nullopt when the target reports an
// invalid cost.
std::optional<uint64_t>
SelectOptimizeImpl::computeInstCost(const Instruction *I) {
  InstructionCost ICost =
      TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
  if (auto OC = ICost.getValue())
    return std::optional<uint64_t>(*OC);
  return std::nullopt;
}

// Expected cost of a mispredicted branch for this select:
// max(MispredictPenalty, CondCost) * MispredictRate / 100.
ScaledNumber<uint64_t>
SelectOptimizeImpl::getMispredictionCost(const SelectLike SI,
                                         const Scaled64 CondCost) {
  uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;

  // Account for the default misprediction rate when using a branch
  // (conservatively set to 25% by default).
  uint64_t MispredictRate = MispredictDefaultRate;
  // If the select condition is obviously predictable, then the misprediction
  // rate is zero.
  if (isSelectHighlyPredictable(SI))
    MispredictRate = 0;

  // CondCost is included to account for cases where the computation of the
  // condition is part of a long dependence chain (potentially loop-carried)
  // that would delay detection of a misprediction and increase its cost.
  Scaled64 MispredictCost =
      std::max(Scaled64::get(MispredictPenalty), CondCost) *
      Scaled64::get(MispredictRate);
  MispredictCost /= Scaled64::get(100);

  return MispredictCost;
}

// Returns the cost of a branch when the prediction is correct.
// TrueCost * TrueProbability + FalseCost * FalseProbability.
1414 ScaledNumber<uint64_t> 1415 SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, 1416 const SelectLike SI) { 1417 Scaled64 PredPathCost; 1418 uint64_t TrueWeight, FalseWeight; 1419 if (extractBranchWeights(SI, TrueWeight, FalseWeight)) { 1420 uint64_t SumWeight = TrueWeight + FalseWeight; 1421 if (SumWeight != 0) { 1422 PredPathCost = TrueCost * Scaled64::get(TrueWeight) + 1423 FalseCost * Scaled64::get(FalseWeight); 1424 PredPathCost /= Scaled64::get(SumWeight); 1425 return PredPathCost; 1426 } 1427 } 1428 // Without branch weight metadata, we assume 75% for the one path and 25% for 1429 // the other, and pick the result with the biggest cost. 1430 PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost, 1431 FalseCost * Scaled64::get(3) + TrueCost); 1432 PredPathCost /= Scaled64::get(4); 1433 return PredPathCost; 1434 } 1435 1436 bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) { 1437 TargetLowering::SelectSupportKind SelectKind; 1438 if (SI.getType()->isVectorTy()) 1439 SelectKind = TargetLowering::ScalarCondVectorVal; 1440 else 1441 SelectKind = TargetLowering::ScalarValSelect; 1442 return TLI->isSelectSupported(SelectKind); 1443 } 1444