1 //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass converts selects to conditional jumps when profitable. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/ADT/Optional.h" 14 #include "llvm/ADT/SmallVector.h" 15 #include "llvm/ADT/Statistic.h" 16 #include "llvm/Analysis/BlockFrequencyInfo.h" 17 #include "llvm/Analysis/BranchProbabilityInfo.h" 18 #include "llvm/Analysis/LoopInfo.h" 19 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 20 #include "llvm/Analysis/ProfileSummaryInfo.h" 21 #include "llvm/Analysis/TargetTransformInfo.h" 22 #include "llvm/CodeGen/Passes.h" 23 #include "llvm/CodeGen/TargetLowering.h" 24 #include "llvm/CodeGen/TargetPassConfig.h" 25 #include "llvm/CodeGen/TargetSchedule.h" 26 #include "llvm/CodeGen/TargetSubtargetInfo.h" 27 #include "llvm/IR/BasicBlock.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/Instruction.h" 32 #include "llvm/IR/ProfDataUtils.h" 33 #include "llvm/InitializePasses.h" 34 #include "llvm/Pass.h" 35 #include "llvm/Support/ScaledNumber.h" 36 #include "llvm/Target/TargetMachine.h" 37 #include "llvm/Transforms/Utils/SizeOpts.h" 38 #include <algorithm> 39 #include <memory> 40 #include <queue> 41 #include <stack> 42 #include <string> 43 44 using namespace llvm; 45 46 #define DEBUG_TYPE "select-optimize" 47 48 STATISTIC(NumSelectOptAnalyzed, 49 "Number of select groups considered for conversion to branch"); 50 STATISTIC(NumSelectConvertedExpColdOperand, 51 "Number of select groups converted due to expensive cold operand"); 52 STATISTIC(NumSelectConvertedHighPred, 53 "Number of select groups converted due to high-predictability"); 54 STATISTIC(NumSelectUnPred, 55 "Number of select groups not converted due to unpredictability"); 56 STATISTIC(NumSelectColdBB, 57 "Number of select groups not converted due to cold basic block"); 58 STATISTIC(NumSelectConvertedLoop, 59 "Number of select groups converted due to loop-level analysis"); 60 STATISTIC(NumSelectsConverted, "Number of selects converted"); 61 62 static cl::opt<unsigned> ColdOperandThreshold( 63 "cold-operand-threshold", 64 cl::desc("Maximum frequency of path for an operand to be considered cold."), 65 cl::init(20), cl::Hidden); 66 67 static cl::opt<unsigned> ColdOperandMaxCostMultiplier( 68 "cold-operand-max-cost-multiplier", 69 cl::desc("Maximum cost multiplier of TCC_expensive for the dependence " 70 "slice of a cold operand to be considered inexpensive."), 71 cl::init(1), cl::Hidden); 72 73 static cl::opt<unsigned> 74 GainGradientThreshold("select-opti-loop-gradient-gain-threshold", 75 cl::desc("Gradient gain threshold (%)."), 76 cl::init(25), cl::Hidden); 77 78 static cl::opt<unsigned> 79 GainCycleThreshold("select-opti-loop-cycle-gain-threshold", 80 cl::desc("Minimum gain per loop (in cycles) threshold."), 81 cl::init(4), cl::Hidden); 82 83 static cl::opt<unsigned> GainRelativeThreshold( 84 "select-opti-loop-relative-gain-threshold", 85 cl::desc( 86 "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"), 87 cl::init(8), cl::Hidden); 88 89 static cl::opt<unsigned> MispredictDefaultRate( 90 "mispredict-default-rate", cl::Hidden, cl::init(25), 91 cl::desc("Default mispredict rate (initialized to 25%).")); 92 93 static cl::opt<bool> 94 DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden, 95 cl::init(false), 96 cl::desc("Disable loop-level heuristics.")); 97 98 namespace { 99 100 class SelectOptimize : public FunctionPass { 101 const TargetMachine *TM = nullptr; 102 const TargetSubtargetInfo *TSI; 103 const TargetLowering *TLI = nullptr; 104 const TargetTransformInfo *TTI = nullptr; 105 const LoopInfo *LI; 106 DominatorTree *DT; 107 std::unique_ptr<BlockFrequencyInfo> BFI; 108 std::unique_ptr<BranchProbabilityInfo> BPI; 109 ProfileSummaryInfo *PSI; 110 OptimizationRemarkEmitter *ORE; 111 TargetSchedModel TSchedModel; 112 113 public: 114 static char ID; 115 116 SelectOptimize() : FunctionPass(ID) { 117 initializeSelectOptimizePass(*PassRegistry::getPassRegistry()); 118 } 119 120 bool runOnFunction(Function &F) override; 121 122 void getAnalysisUsage(AnalysisUsage &AU) const override { 123 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 124 AU.addRequired<TargetPassConfig>(); 125 AU.addRequired<TargetTransformInfoWrapperPass>(); 126 AU.addRequired<DominatorTreeWrapperPass>(); 127 AU.addRequired<LoopInfoWrapperPass>(); 128 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 129 } 130 131 private: 132 // Select groups consist of consecutive select instructions with the same 133 // condition. 134 using SelectGroup = SmallVector<SelectInst *, 2>; 135 using SelectGroups = SmallVector<SelectGroup, 2>; 136 137 using Scaled64 = ScaledNumber<uint64_t>; 138 139 struct CostInfo { 140 /// Predicated cost (with selects as conditional moves). 141 Scaled64 PredCost; 142 /// Non-predicated cost (with selects converted to branches). 143 Scaled64 NonPredCost; 144 }; 145 146 // Converts select instructions of a function to conditional jumps when deemed 147 // profitable. Returns true if at least one select was converted. 148 bool optimizeSelects(Function &F); 149 150 // Heuristics for determining which select instructions can be profitably 151 // conveted to branches. Separate heuristics for selects in inner-most loops 152 // and the rest of code regions (base heuristics for non-inner-most loop 153 // regions). 154 void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups); 155 void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups); 156 157 // Converts to branches the select groups that were deemed 158 // profitable-to-convert. 159 void convertProfitableSIGroups(SelectGroups &ProfSIGroups); 160 161 // Splits selects of a given basic block into select groups. 162 void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups); 163 164 // Determines for which select groups it is profitable converting to branches 165 // (base and inner-most-loop heuristics). 166 void findProfitableSIGroupsBase(SelectGroups &SIGroups, 167 SelectGroups &ProfSIGroups); 168 void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups, 169 SelectGroups &ProfSIGroups); 170 171 // Determines if a select group should be converted to a branch (base 172 // heuristics). 173 bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI); 174 175 // Returns true if there are expensive instructions in the cold value 176 // operand's (if any) dependence slice of any of the selects of the given 177 // group. 178 bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI); 179 180 // For a given source instruction, collect its backwards dependence slice 181 // consisting of instructions exclusively computed for producing the operands 182 // of the source instruction. 183 void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice, 184 Instruction *SI, bool ForSinking = false); 185 186 // Returns true if the condition of the select is highly predictable. 187 bool isSelectHighlyPredictable(const SelectInst *SI); 188 189 // Loop-level checks to determine if a non-predicated version (with branches) 190 // of the given loop is more profitable than its predicated version. 191 bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]); 192 193 // Computes instruction and loop-critical-path costs for both the predicated 194 // and non-predicated version of the given loop. 195 bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups, 196 DenseMap<const Instruction *, CostInfo> &InstCostMap, 197 CostInfo *LoopCost); 198 199 // Returns a set of all the select instructions in the given select groups. 200 SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups); 201 202 // Returns the latency cost of a given instruction. 203 Optional<uint64_t> computeInstCost(const Instruction *I); 204 205 // Returns the misprediction cost of a given select when converted to branch. 206 Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost); 207 208 // Returns the cost of a branch when the prediction is correct. 209 Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, 210 const SelectInst *SI); 211 212 // Returns true if the target architecture supports lowering a given select. 213 bool isSelectKindSupported(SelectInst *SI); 214 }; 215 } // namespace 216 217 char SelectOptimize::ID = 0; 218 219 INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false, 220 false) 221 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 222 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 223 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 224 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 225 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 226 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 227 INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false, 228 false) 229 230 FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); } 231 232 bool SelectOptimize::runOnFunction(Function &F) { 233 TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); 234 TSI = TM->getSubtargetImpl(F); 235 TLI = TSI->getTargetLowering(); 236 237 // If none of the select types is supported then skip this pass. 238 // This is an optimization pass. Legality issues will be handled by 239 // instruction selection. 240 if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && 241 !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && 242 !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) 243 return false; 244 245 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 246 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 247 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 248 BPI.reset(new BranchProbabilityInfo(F, *LI)); 249 BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); 250 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 251 ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 252 TSchedModel.init(TSI); 253 254 // When optimizing for size, selects are preferable over branches. 255 if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get())) 256 return false; 257 258 return optimizeSelects(F); 259 } 260 261 bool SelectOptimize::optimizeSelects(Function &F) { 262 // Determine for which select groups it is profitable converting to branches. 263 SelectGroups ProfSIGroups; 264 // Base heuristics apply only to non-loops and outer loops. 265 optimizeSelectsBase(F, ProfSIGroups); 266 // Separate heuristics for inner-most loops. 267 optimizeSelectsInnerLoops(F, ProfSIGroups); 268 269 // Convert to branches the select groups that were deemed 270 // profitable-to-convert. 271 convertProfitableSIGroups(ProfSIGroups); 272 273 // Code modified if at least one select group was converted. 274 return !ProfSIGroups.empty(); 275 } 276 277 void SelectOptimize::optimizeSelectsBase(Function &F, 278 SelectGroups &ProfSIGroups) { 279 // Collect all the select groups. 280 SelectGroups SIGroups; 281 for (BasicBlock &BB : F) { 282 // Base heuristics apply only to non-loops and outer loops. 283 Loop *L = LI->getLoopFor(&BB); 284 if (L && L->isInnermost()) 285 continue; 286 collectSelectGroups(BB, SIGroups); 287 } 288 289 // Determine for which select groups it is profitable converting to branches. 290 findProfitableSIGroupsBase(SIGroups, ProfSIGroups); 291 } 292 293 void SelectOptimize::optimizeSelectsInnerLoops(Function &F, 294 SelectGroups &ProfSIGroups) { 295 SmallVector<Loop *, 4> Loops(LI->begin(), LI->end()); 296 // Need to check size on each iteration as we accumulate child loops. 297 for (unsigned long i = 0; i < Loops.size(); ++i) 298 for (Loop *ChildL : Loops[i]->getSubLoops()) 299 Loops.push_back(ChildL); 300 301 for (Loop *L : Loops) { 302 if (!L->isInnermost()) 303 continue; 304 305 SelectGroups SIGroups; 306 for (BasicBlock *BB : L->getBlocks()) 307 collectSelectGroups(*BB, SIGroups); 308 309 findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups); 310 } 311 } 312 313 /// If \p isTrue is true, return the true value of \p SI, otherwise return 314 /// false value of \p SI. If the true/false value of \p SI is defined by any 315 /// select instructions in \p Selects, look through the defining select 316 /// instruction until the true/false value is not defined in \p Selects. 317 static Value * 318 getTrueOrFalseValue(SelectInst *SI, bool isTrue, 319 const SmallPtrSet<const Instruction *, 2> &Selects) { 320 Value *V = nullptr; 321 for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI); 322 DefSI = dyn_cast<SelectInst>(V)) { 323 assert(DefSI->getCondition() == SI->getCondition() && 324 "The condition of DefSI does not match with SI"); 325 V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue()); 326 } 327 assert(V && "Failed to get select true/false value"); 328 return V; 329 } 330 331 void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { 332 for (SelectGroup &ASI : ProfSIGroups) { 333 // The code transformation here is a modified version of the sinking 334 // transformation in CodeGenPrepare::optimizeSelectInst with a more 335 // aggressive strategy of which instructions to sink. 336 // 337 // TODO: eliminate the redundancy of logic transforming selects to branches 338 // by removing CodeGenPrepare::optimizeSelectInst and optimizing here 339 // selects for all cases (with and without profile information). 340 341 // Transform a sequence like this: 342 // start: 343 // %cmp = cmp uge i32 %a, %b 344 // %sel = select i1 %cmp, i32 %c, i32 %d 345 // 346 // Into: 347 // start: 348 // %cmp = cmp uge i32 %a, %b 349 // %cmp.frozen = freeze %cmp 350 // br i1 %cmp.frozen, label %select.true, label %select.false 351 // select.true: 352 // br label %select.end 353 // select.false: 354 // br label %select.end 355 // select.end: 356 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] 357 // 358 // %cmp should be frozen, otherwise it may introduce undefined behavior. 359 // In addition, we may sink instructions that produce %c or %d into the 360 // destination(s) of the new branch. 361 // If the true or false blocks do not contain a sunken instruction, that 362 // block and its branch may be optimized away. In that case, one side of the 363 // first branch will point directly to select.end, and the corresponding PHI 364 // predecessor block will be the start block. 365 366 // Find all the instructions that can be soundly sunk to the true/false 367 // blocks. These are instructions that are computed solely for producing the 368 // operands of the select instructions in the group and can be sunk without 369 // breaking the semantics of the LLVM IR (e.g., cannot sink instructions 370 // with side effects). 371 SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices; 372 typedef std::stack<Instruction *>::size_type StackSizeType; 373 StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0; 374 for (SelectInst *SI : ASI) { 375 // For each select, compute the sinkable dependence chains of the true and 376 // false operands. 377 if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) { 378 std::stack<Instruction *> TrueSlice; 379 getExclBackwardsSlice(TI, TrueSlice, SI, true); 380 maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size()); 381 TrueSlices.push_back(TrueSlice); 382 } 383 if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) { 384 std::stack<Instruction *> FalseSlice; 385 getExclBackwardsSlice(FI, FalseSlice, SI, true); 386 maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size()); 387 FalseSlices.push_back(FalseSlice); 388 } 389 } 390 // In the case of multiple select instructions in the same group, the order 391 // of non-dependent instructions (instructions of different dependence 392 // slices) in the true/false blocks appears to affect performance. 393 // Interleaving the slices seems to experimentally be the optimal approach. 394 // This interleaving scheduling allows for more ILP (with a natural downside 395 // of increasing a bit register pressure) compared to a simple ordering of 396 // one whole chain after another. One would expect that this ordering would 397 // not matter since the scheduling in the backend of the compiler would 398 // take care of it, but apparently the scheduler fails to deliver optimal 399 // ILP with a naive ordering here. 400 SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved; 401 for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) { 402 for (auto &S : TrueSlices) { 403 if (!S.empty()) { 404 TrueSlicesInterleaved.push_back(S.top()); 405 S.pop(); 406 } 407 } 408 } 409 for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) { 410 for (auto &S : FalseSlices) { 411 if (!S.empty()) { 412 FalseSlicesInterleaved.push_back(S.top()); 413 S.pop(); 414 } 415 } 416 } 417 418 // We split the block containing the select(s) into two blocks. 419 SelectInst *SI = ASI.front(); 420 SelectInst *LastSI = ASI.back(); 421 BasicBlock *StartBlock = SI->getParent(); 422 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); 423 BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); 424 BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency()); 425 // Delete the unconditional branch that was just created by the split. 426 StartBlock->getTerminator()->eraseFromParent(); 427 428 // Move any debug/pseudo instructions that were in-between the select 429 // group to the newly-created end block. 430 SmallVector<Instruction *, 2> DebugPseudoINS; 431 auto DIt = SI->getIterator(); 432 while (&*DIt != LastSI) { 433 if (DIt->isDebugOrPseudoInst()) 434 DebugPseudoINS.push_back(&*DIt); 435 DIt++; 436 } 437 for (auto *DI : DebugPseudoINS) { 438 DI->moveBefore(&*EndBlock->getFirstInsertionPt()); 439 } 440 441 // These are the new basic blocks for the conditional branch. 442 // At least one will become an actual new basic block. 443 BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; 444 BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; 445 if (!TrueSlicesInterleaved.empty()) { 446 TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink", 447 EndBlock->getParent(), EndBlock); 448 TrueBranch = BranchInst::Create(EndBlock, TrueBlock); 449 TrueBranch->setDebugLoc(LastSI->getDebugLoc()); 450 for (Instruction *TrueInst : TrueSlicesInterleaved) 451 TrueInst->moveBefore(TrueBranch); 452 } 453 if (!FalseSlicesInterleaved.empty()) { 454 FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink", 455 EndBlock->getParent(), EndBlock); 456 FalseBranch = BranchInst::Create(EndBlock, FalseBlock); 457 FalseBranch->setDebugLoc(LastSI->getDebugLoc()); 458 for (Instruction *FalseInst : FalseSlicesInterleaved) 459 FalseInst->moveBefore(FalseBranch); 460 } 461 // If there was nothing to sink, then arbitrarily choose the 'false' side 462 // for a new input value to the PHI. 463 if (TrueBlock == FalseBlock) { 464 assert(TrueBlock == nullptr && 465 "Unexpected basic block transform while optimizing select"); 466 467 FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", 468 EndBlock->getParent(), EndBlock); 469 auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); 470 FalseBranch->setDebugLoc(SI->getDebugLoc()); 471 } 472 473 // Insert the real conditional branch based on the original condition. 474 // If we did not create a new block for one of the 'true' or 'false' paths 475 // of the condition, it means that side of the branch goes to the end block 476 // directly and the path originates from the start block from the point of 477 // view of the new PHI. 478 BasicBlock *TT, *FT; 479 if (TrueBlock == nullptr) { 480 TT = EndBlock; 481 FT = FalseBlock; 482 TrueBlock = StartBlock; 483 } else if (FalseBlock == nullptr) { 484 TT = TrueBlock; 485 FT = EndBlock; 486 FalseBlock = StartBlock; 487 } else { 488 TT = TrueBlock; 489 FT = FalseBlock; 490 } 491 IRBuilder<> IB(SI); 492 auto *CondFr = 493 IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); 494 IB.CreateCondBr(CondFr, TT, FT, SI); 495 496 SmallPtrSet<const Instruction *, 2> INS; 497 INS.insert(ASI.begin(), ASI.end()); 498 // Use reverse iterator because later select may use the value of the 499 // earlier select, and we need to propagate value through earlier select 500 // to get the PHI operand. 501 for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) { 502 SelectInst *SI = *It; 503 // The select itself is replaced with a PHI Node. 504 PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); 505 PN->takeName(SI); 506 PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock); 507 PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock); 508 PN->setDebugLoc(SI->getDebugLoc()); 509 510 SI->replaceAllUsesWith(PN); 511 SI->eraseFromParent(); 512 INS.erase(SI); 513 ++NumSelectsConverted; 514 } 515 } 516 } 517 518 void SelectOptimize::collectSelectGroups(BasicBlock &BB, 519 SelectGroups &SIGroups) { 520 BasicBlock::iterator BBIt = BB.begin(); 521 while (BBIt != BB.end()) { 522 Instruction *I = &*BBIt++; 523 if (SelectInst *SI = dyn_cast<SelectInst>(I)) { 524 SelectGroup SIGroup; 525 SIGroup.push_back(SI); 526 while (BBIt != BB.end()) { 527 Instruction *NI = &*BBIt; 528 SelectInst *NSI = dyn_cast<SelectInst>(NI); 529 if (NSI && SI->getCondition() == NSI->getCondition()) { 530 SIGroup.push_back(NSI); 531 } else if (!NI->isDebugOrPseudoInst()) { 532 // Debug/pseudo instructions should be skipped and not prevent the 533 // formation of a select group. 534 break; 535 } 536 ++BBIt; 537 } 538 539 // If the select type is not supported, no point optimizing it. 540 // Instruction selection will take care of it. 541 if (!isSelectKindSupported(SI)) 542 continue; 543 544 SIGroups.push_back(SIGroup); 545 } 546 } 547 } 548 549 void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups, 550 SelectGroups &ProfSIGroups) { 551 for (SelectGroup &ASI : SIGroups) { 552 ++NumSelectOptAnalyzed; 553 if (isConvertToBranchProfitableBase(ASI)) 554 ProfSIGroups.push_back(ASI); 555 } 556 } 557 558 void SelectOptimize::findProfitableSIGroupsInnerLoops( 559 const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { 560 NumSelectOptAnalyzed += SIGroups.size(); 561 // For each select group in an inner-most loop, 562 // a branch is more preferable than a select/conditional-move if: 563 // i) conversion to branches for all the select groups of the loop satisfies 564 // loop-level heuristics including reducing the loop's critical path by 565 // some threshold (see SelectOptimize::checkLoopHeuristics); and 566 // ii) the total cost of the select group is cheaper with a branch compared 567 // to its predicated version. The cost is in terms of latency and the cost 568 // of a select group is the cost of its most expensive select instruction 569 // (assuming infinite resources and thus fully leveraging available ILP). 570 571 DenseMap<const Instruction *, CostInfo> InstCostMap; 572 CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()}, 573 {Scaled64::getZero(), Scaled64::getZero()}}; 574 if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) || 575 !checkLoopHeuristics(L, LoopCost)) { 576 return; 577 } 578 579 for (SelectGroup &ASI : SIGroups) { 580 // Assuming infinite resources, the cost of a group of instructions is the 581 // cost of the most expensive instruction of the group. 582 Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero(); 583 for (SelectInst *SI : ASI) { 584 SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost); 585 BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost); 586 } 587 if (BranchCost < SelectCost) { 588 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front()); 589 OR << "Profitable to convert to branch (loop analysis). BranchCost=" 590 << BranchCost.toString() << ", SelectCost=" << SelectCost.toString() 591 << ". "; 592 ORE->emit(OR); 593 ++NumSelectConvertedLoop; 594 ProfSIGroups.push_back(ASI); 595 } else { 596 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front()); 597 ORmiss << "Select is more profitable (loop analysis). BranchCost=" 598 << BranchCost.toString() 599 << ", SelectCost=" << SelectCost.toString() << ". "; 600 ORE->emit(ORmiss); 601 } 602 } 603 } 604 605 bool SelectOptimize::isConvertToBranchProfitableBase( 606 const SmallVector<SelectInst *, 2> &ASI) { 607 SelectInst *SI = ASI.front(); 608 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI); 609 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI); 610 611 // Skip cold basic blocks. Better to optimize for size for cold blocks. 612 if (PSI->isColdBlock(SI->getParent(), BFI.get())) { 613 ++NumSelectColdBB; 614 ORmiss << "Not converted to branch because of cold basic block. "; 615 ORE->emit(ORmiss); 616 return false; 617 } 618 619 // If unpredictable, branch form is less profitable. 620 if (SI->getMetadata(LLVMContext::MD_unpredictable)) { 621 ++NumSelectUnPred; 622 ORmiss << "Not converted to branch because of unpredictable branch. "; 623 ORE->emit(ORmiss); 624 return false; 625 } 626 627 // If highly predictable, branch form is more profitable, unless a 628 // predictable select is inexpensive in the target architecture. 629 if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) { 630 ++NumSelectConvertedHighPred; 631 OR << "Converted to branch because of highly predictable branch. "; 632 ORE->emit(OR); 633 return true; 634 } 635 636 // Look for expensive instructions in the cold operand's (if any) dependence 637 // slice of any of the selects in the group. 638 if (hasExpensiveColdOperand(ASI)) { 639 ++NumSelectConvertedExpColdOperand; 640 OR << "Converted to branch because of expensive cold operand."; 641 ORE->emit(OR); 642 return true; 643 } 644 645 ORmiss << "Not profitable to convert to branch (base heuristic)."; 646 ORE->emit(ORmiss); 647 return false; 648 } 649 650 static InstructionCost divideNearest(InstructionCost Numerator, 651 uint64_t Denominator) { 652 return (Numerator + (Denominator / 2)) / Denominator; 653 } 654 655 bool SelectOptimize::hasExpensiveColdOperand( 656 const SmallVector<SelectInst *, 2> &ASI) { 657 bool ColdOperand = false; 658 uint64_t TrueWeight, FalseWeight, TotalWeight; 659 if (extractBranchWeights(*ASI.front(), TrueWeight, FalseWeight)) { 660 uint64_t MinWeight = std::min(TrueWeight, FalseWeight); 661 TotalWeight = TrueWeight + FalseWeight; 662 // Is there a path with frequency <ColdOperandThreshold% (default:20%) ? 663 ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight; 664 } else if (PSI->hasProfileSummary()) { 665 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front()); 666 ORmiss << "Profile data available but missing branch-weights metadata for " 667 "select instruction. "; 668 ORE->emit(ORmiss); 669 } 670 if (!ColdOperand) 671 return false; 672 // Check if the cold path's dependence slice is expensive for any of the 673 // selects of the group. 674 for (SelectInst *SI : ASI) { 675 Instruction *ColdI = nullptr; 676 uint64_t HotWeight; 677 if (TrueWeight < FalseWeight) { 678 ColdI = dyn_cast<Instruction>(SI->getTrueValue()); 679 HotWeight = FalseWeight; 680 } else { 681 ColdI = dyn_cast<Instruction>(SI->getFalseValue()); 682 HotWeight = TrueWeight; 683 } 684 if (ColdI) { 685 std::stack<Instruction *> ColdSlice; 686 getExclBackwardsSlice(ColdI, ColdSlice, SI); 687 InstructionCost SliceCost = 0; 688 while (!ColdSlice.empty()) { 689 SliceCost += TTI->getInstructionCost(ColdSlice.top(), 690 TargetTransformInfo::TCK_Latency); 691 ColdSlice.pop(); 692 } 693 // The colder the cold value operand of the select is the more expensive 694 // the cmov becomes for computing the cold value operand every time. Thus, 695 // the colder the cold operand is the more its cost counts. 696 // Get nearest integer cost adjusted for coldness. 697 InstructionCost AdjSliceCost = 698 divideNearest(SliceCost * HotWeight, TotalWeight); 699 if (AdjSliceCost >= 700 ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive) 701 return true; 702 } 703 } 704 return false; 705 } 706 707 // Check if it is safe to move LoadI next to the SI. 708 // Conservatively assume it is safe only if there is no instruction 709 // modifying memory in-between the load and the select instruction. 710 static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) { 711 // Assume loads from different basic blocks are unsafe to move. 712 if (LoadI->getParent() != SI->getParent()) 713 return false; 714 auto It = LoadI->getIterator(); 715 while (&*It != SI) { 716 if (It->mayWriteToMemory()) 717 return false; 718 It++; 719 } 720 return true; 721 } 722 723 // For a given source instruction, collect its backwards dependence slice 724 // consisting of instructions exclusively computed for the purpose of producing 725 // the operands of the source instruction. As an approximation 726 // (sufficiently-accurate in practice), we populate this set with the 727 // instructions of the backwards dependence slice that only have one-use and 728 // form an one-use chain that leads to the source instruction. 729 void SelectOptimize::getExclBackwardsSlice(Instruction *I, 730 std::stack<Instruction *> &Slice, 731 Instruction *SI, bool ForSinking) { 732 SmallPtrSet<Instruction *, 2> Visited; 733 std::queue<Instruction *> Worklist; 734 Worklist.push(I); 735 while (!Worklist.empty()) { 736 Instruction *II = Worklist.front(); 737 Worklist.pop(); 738 739 // Avoid cycles. 740 if (!Visited.insert(II).second) 741 continue; 742 743 if (!II->hasOneUse()) 744 continue; 745 746 // Cannot soundly sink instructions with side-effects. 747 // Terminator or phi instructions cannot be sunk. 748 // Avoid sinking other select instructions (should be handled separetely). 749 if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() || 750 isa<SelectInst>(II) || isa<PHINode>(II))) 751 continue; 752 753 // Avoid sinking loads in order not to skip state-modifying instructions, 754 // that may alias with the loaded address. 755 // Only allow sinking of loads within the same basic block that are 756 // conservatively proven to be safe. 757 if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI)) 758 continue; 759 760 // Avoid considering instructions with less frequency than the source 761 // instruction (i.e., avoid colder code regions of the dependence slice). 762 if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent())) 763 continue; 764 765 // Eligible one-use instruction added to the dependence slice. 766 Slice.push(II); 767 768 // Explore all the operands of the current instruction to expand the slice. 769 for (unsigned k = 0; k < II->getNumOperands(); ++k) 770 if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k))) 771 Worklist.push(OpI); 772 } 773 } 774 775 bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) { 776 uint64_t TrueWeight, FalseWeight; 777 if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) { 778 uint64_t Max = std::max(TrueWeight, FalseWeight); 779 uint64_t Sum = TrueWeight + FalseWeight; 780 if (Sum != 0) { 781 auto Probability = BranchProbability::getBranchProbability(Max, Sum); 782 if (Probability > TTI->getPredictableBranchThreshold()) 783 return true; 784 } 785 } 786 return false; 787 } 788 789 bool SelectOptimize::checkLoopHeuristics(const Loop *L, 790 const CostInfo LoopCost[2]) { 791 // Loop-level checks to determine if a non-predicated version (with branches) 792 // of the loop is more profitable than its predicated version. 793 794 if (DisableLoopLevelHeuristics) 795 return true; 796 797 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", 798 L->getHeader()->getFirstNonPHI()); 799 800 if (LoopCost[0].NonPredCost > LoopCost[0].PredCost || 801 LoopCost[1].NonPredCost >= LoopCost[1].PredCost) { 802 ORmissL << "No select conversion in the loop due to no reduction of loop's " 803 "critical path. "; 804 ORE->emit(ORmissL); 805 return false; 806 } 807 808 Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost, 809 LoopCost[1].PredCost - LoopCost[1].NonPredCost}; 810 811 // Profitably converting to branches need to reduce the loop's critical path 812 // by at least some threshold (absolute gain of GainCycleThreshold cycles and 813 // relative gain of 12.5%). 814 if (Gain[1] < Scaled64::get(GainCycleThreshold) || 815 Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) { 816 Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost; 817 ORmissL << "No select conversion in the loop due to small reduction of " 818 "loop's critical path. Gain=" 819 << Gain[1].toString() 820 << ", RelativeGain=" << RelativeGain.toString() << "%. "; 821 ORE->emit(ORmissL); 822 return false; 823 } 824 825 // If the loop's critical path involves loop-carried dependences, the gradient 826 // of the gain needs to be at least GainGradientThreshold% (defaults to 25%). 827 // This check ensures that the latency reduction for the loop's critical path 828 // keeps decreasing with sufficient rate beyond the two analyzed loop 829 // iterations. 830 if (Gain[1] > Gain[0]) { 831 Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) / 832 (LoopCost[1].PredCost - LoopCost[0].PredCost); 833 if (GradientGain < Scaled64::get(GainGradientThreshold)) { 834 ORmissL << "No select conversion in the loop due to small gradient gain. " 835 "GradientGain=" 836 << GradientGain.toString() << "%. "; 837 ORE->emit(ORmissL); 838 return false; 839 } 840 } 841 // If the gain decreases it is not profitable to convert. 842 else if (Gain[1] < Gain[0]) { 843 ORmissL 844 << "No select conversion in the loop due to negative gradient gain. "; 845 ORE->emit(ORmissL); 846 return false; 847 } 848 849 // Non-predicated version of the loop is more profitable than its 850 // predicated version. 851 return true; 852 } 853 854 // Computes instruction and loop-critical-path costs for both the predicated 855 // and non-predicated version of the given loop. 856 // Returns false if unable to compute these costs due to invalid cost of loop 857 // instruction(s). 858 bool SelectOptimize::computeLoopCosts( 859 const Loop *L, const SelectGroups &SIGroups, 860 DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) { 861 const auto &SIset = getSIset(SIGroups); 862 // Compute instruction and loop-critical-path costs across two iterations for 863 // both predicated and non-predicated version. 864 const unsigned Iterations = 2; 865 for (unsigned Iter = 0; Iter < Iterations; ++Iter) { 866 // Cost of the loop's critical path. 867 CostInfo &MaxCost = LoopCost[Iter]; 868 for (BasicBlock *BB : L->getBlocks()) { 869 for (const Instruction &I : *BB) { 870 if (I.isDebugOrPseudoInst()) 871 continue; 872 // Compute the predicated and non-predicated cost of the instruction. 873 Scaled64 IPredCost = Scaled64::getZero(), 874 INonPredCost = Scaled64::getZero(); 875 876 // Assume infinite resources that allow to fully exploit the available 877 // instruction-level parallelism. 878 // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost) 879 for (const Use &U : I.operands()) { 880 auto UI = dyn_cast<Instruction>(U.get()); 881 if (!UI) 882 continue; 883 if (InstCostMap.count(UI)) { 884 IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost); 885 INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost); 886 } 887 } 888 auto ILatency = computeInstCost(&I); 889 if (!ILatency) { 890 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I); 891 ORmissL << "Invalid instruction cost preventing analysis and " 892 "optimization of the inner-most loop containing this " 893 "instruction. "; 894 ORE->emit(ORmissL); 895 return false; 896 } 897 IPredCost += Scaled64::get(ILatency.value()); 898 INonPredCost += Scaled64::get(ILatency.value()); 899 900 // For a select that can be converted to branch, 901 // compute its cost as a branch (non-predicated cost). 902 // 903 // BranchCost = PredictedPathCost + MispredictCost 904 // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb 905 // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate 906 if (SIset.contains(&I)) { 907 auto SI = dyn_cast<SelectInst>(&I); 908 909 Scaled64 TrueOpCost = Scaled64::getZero(), 910 FalseOpCost = Scaled64::getZero(); 911 if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) 912 if (InstCostMap.count(TI)) 913 TrueOpCost = InstCostMap[TI].NonPredCost; 914 if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) 915 if (InstCostMap.count(FI)) 916 FalseOpCost = InstCostMap[FI].NonPredCost; 917 Scaled64 PredictedPathCost = 918 getPredictedPathCost(TrueOpCost, FalseOpCost, SI); 919 920 Scaled64 CondCost = Scaled64::getZero(); 921 if (auto *CI = dyn_cast<Instruction>(SI->getCondition())) 922 if (InstCostMap.count(CI)) 923 CondCost = InstCostMap[CI].NonPredCost; 924 Scaled64 MispredictCost = getMispredictionCost(SI, CondCost); 925 926 INonPredCost = PredictedPathCost + MispredictCost; 927 } 928 929 InstCostMap[&I] = {IPredCost, INonPredCost}; 930 MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost); 931 MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost); 932 } 933 } 934 } 935 return true; 936 } 937 938 SmallPtrSet<const Instruction *, 2> 939 SelectOptimize::getSIset(const SelectGroups &SIGroups) { 940 SmallPtrSet<const Instruction *, 2> SIset; 941 for (const SelectGroup &ASI : SIGroups) 942 for (const SelectInst *SI : ASI) 943 SIset.insert(SI); 944 return SIset; 945 } 946 947 Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) { 948 InstructionCost ICost = 949 TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency); 950 if (auto OC = ICost.getValue()) 951 return Optional<uint64_t>(*OC); 952 return None; 953 } 954 955 ScaledNumber<uint64_t> 956 SelectOptimize::getMispredictionCost(const SelectInst *SI, 957 const Scaled64 CondCost) { 958 uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; 959 960 // Account for the default misprediction rate when using a branch 961 // (conservatively set to 25% by default). 962 uint64_t MispredictRate = MispredictDefaultRate; 963 // If the select condition is obviously predictable, then the misprediction 964 // rate is zero. 965 if (isSelectHighlyPredictable(SI)) 966 MispredictRate = 0; 967 968 // CondCost is included to account for cases where the computation of the 969 // condition is part of a long dependence chain (potentially loop-carried) 970 // that would delay detection of a misprediction and increase its cost. 971 Scaled64 MispredictCost = 972 std::max(Scaled64::get(MispredictPenalty), CondCost) * 973 Scaled64::get(MispredictRate); 974 MispredictCost /= Scaled64::get(100); 975 976 return MispredictCost; 977 } 978 979 // Returns the cost of a branch when the prediction is correct. 980 // TrueCost * TrueProbability + FalseCost * FalseProbability. 981 ScaledNumber<uint64_t> 982 SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, 983 const SelectInst *SI) { 984 Scaled64 PredPathCost; 985 uint64_t TrueWeight, FalseWeight; 986 if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) { 987 uint64_t SumWeight = TrueWeight + FalseWeight; 988 if (SumWeight != 0) { 989 PredPathCost = TrueCost * Scaled64::get(TrueWeight) + 990 FalseCost * Scaled64::get(FalseWeight); 991 PredPathCost /= Scaled64::get(SumWeight); 992 return PredPathCost; 993 } 994 } 995 // Without branch weight metadata, we assume 75% for the one path and 25% for 996 // the other, and pick the result with the biggest cost. 997 PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost, 998 FalseCost * Scaled64::get(3) + TrueCost); 999 PredPathCost /= Scaled64::get(4); 1000 return PredPathCost; 1001 } 1002 1003 bool SelectOptimize::isSelectKindSupported(SelectInst *SI) { 1004 bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); 1005 if (VectorCond) 1006 return false; 1007 TargetLowering::SelectSupportKind SelectKind; 1008 if (SI->getType()->isVectorTy()) 1009 SelectKind = TargetLowering::ScalarCondVectorVal; 1010 else 1011 SelectKind = TargetLowering::ScalarValSelect; 1012 return TLI->isSelectSupported(SelectKind); 1013 } 1014