//===- FunctionSpecialization.cpp - Function Specialization ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/FunctionSpecialization.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/SCCPSolver.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <cmath>

using namespace llvm;

#define DEBUG_TYPE "function-specialization"

STATISTIC(NumSpecsCreated, "Number of specializations created");

static cl::opt<bool> ForceSpecialization(
    "force-specialization", cl::init(false), cl::Hidden, cl::desc(
    "Force function specialization for every call site with a constant "
    "argument"));

static cl::opt<unsigned> MaxClones(
    "funcspec-max-clones", cl::init(3), cl::Hidden, cl::desc(
    "The maximum number of clones allowed for a single function "
    "specialization"));

static cl::opt<unsigned> MaxIncomingPhiValues(
    "funcspec-max-incoming-phi-values", cl::init(4), cl::Hidden, cl::desc(
    "The maximum number of incoming values a PHI node can have to be "
    "considered during the specialization bonus estimation"));

static cl::opt<unsigned> MaxBlockPredecessors(
    "funcspec-max-block-predecessors", cl::init(2), cl::Hidden, cl::desc(
    "The maximum number of predecessors a basic block can have to be "
    "considered during the estimation of dead code"));

static cl::opt<unsigned> MinFunctionSize(
    "funcspec-min-function-size", cl::init(300), cl::Hidden, cl::desc(
    "Don't specialize functions that have less than this number of "
    "instructions"));

static cl::opt<unsigned> MaxCodeSizeGrowth(
    "funcspec-max-codesize-growth", cl::init(3), cl::Hidden, cl::desc(
    "Maximum codesize growth allowed per function"));

static cl::opt<unsigned> MinCodeSizeSavings(
    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, cl::desc(
    "Reject specializations whose codesize savings are less than this "
    "much percent of the original function size"));

static cl::opt<unsigned> MinLatencySavings(
    "funcspec-min-latency-savings", cl::init(70), cl::Hidden, cl::desc(
    "Reject specializations whose latency savings are less than this "
    "much percent of the original function size"));

static cl::opt<unsigned> MinInliningBonus(
    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc(
    "Reject specializations whose inlining bonus is less than this "
    "much percent of the original function size"));

static cl::opt<bool> SpecializeOnAddress(
    "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc(
    "Enable function specialization on the address of global values"));

// Disabled by default as it can significantly increase compilation times.
//
// https://llvm-compile-time-tracker.com
// https://github.com/nikic/llvm-compile-time-tracker
static cl::opt<bool> SpecializeLiteralConstant(
    "funcspec-for-literal-constant", cl::init(false), cl::Hidden, cl::desc(
    "Enable specialization of functions that take a literal constant as an "
    "argument"));

bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
                                            DenseSet<BasicBlock *> &DeadBlocks) {
  unsigned I = 0;
  return all_of(predecessors(Succ),
                [&I, BB, Succ, &DeadBlocks] (BasicBlock *Pred) {
    return I++ < MaxBlockPredecessors &&
        (Pred == BB || Pred == Succ || DeadBlocks.contains(Pred));
  });
}

// Estimates the codesize savings due to dead code after constant propagation.
// \p WorkList represents the basic blocks of a specialization which will
// eventually become dead once we replace instructions that are known to be
// constants. The successors of such blocks are added to the list as long as
// the \p Solver found they were executable prior to specialization, and only
// if all their predecessors are dead.
Cost InstCostVisitor::estimateBasicBlocks(
    SmallVectorImpl<BasicBlock *> &WorkList) {
  Cost CodeSize = 0;
  // Accumulate the instruction cost of each basic block weighted by frequency.
  while (!WorkList.empty()) {
    BasicBlock *BB = WorkList.pop_back_val();

    // These blocks are considered dead as far as the InstCostVisitor
    // is concerned. They haven't been proven dead yet by the Solver,
    // but may become dead if we propagate the specialization arguments.
    if (!DeadBlocks.insert(BB).second)
      continue;

    for (Instruction &I : *BB) {
      // Disregard SSA copies.
      if (auto *II = dyn_cast<IntrinsicInst>(&I))
        if (II->getIntrinsicID() == Intrinsic::ssa_copy)
          continue;
      // If it's a known constant we have already accounted for it.
      if (KnownConstants.contains(&I))
        continue;

      Cost C = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);

      LLVM_DEBUG(dbgs() << "FnSpecialization: CodeSize " << C
                        << " for user " << I << "\n");
      CodeSize += C;
    }

    // Keep adding dead successors to the list as long as they are
    // executable and only reachable from dead blocks.
    for (BasicBlock *SuccBB : successors(BB))
      if (isBlockExecutable(SuccBB) &&
          canEliminateSuccessor(BB, SuccBB, DeadBlocks))
        WorkList.push_back(SuccBB);
  }
  return CodeSize;
}

static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
  if (auto *C = dyn_cast<Constant>(V))
    return C;
  return KnownConstants.lookup(V);
}

Bonus InstCostVisitor::getBonusFromPendingPHIs() {
  Bonus B;
  while (!PendingPHIs.empty()) {
    Instruction *Phi = PendingPHIs.pop_back_val();
    // The pending PHIs could have been proven dead by now.
    if (isBlockExecutable(Phi->getParent()))
      B += getUserBonus(Phi);
  }
  return B;
}

/// Compute a bonus for replacing argument \p A with constant \p C.
Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
  LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                    << C->getNameOrAsOperand() << "\n");
  Bonus B;
  for (auto *U : A->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (isBlockExecutable(UI->getParent()))
        B += getUserBonus(UI, A, C);

  LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
                    << B.CodeSize << ", Latency = " << B.Latency
                    << "} for argument " << *A << "\n");
  return B;
}

Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
  // We have already propagated a constant for this user.
  if (KnownConstants.contains(User))
    return {0, 0};

  // Cache the iterator before visiting.
  LastVisited = Use ? KnownConstants.insert({Use, C}).first
                    : KnownConstants.end();

  Cost CodeSize = 0;
  if (auto *I = dyn_cast<SwitchInst>(User)) {
    CodeSize = estimateSwitchInst(*I);
  } else if (auto *I = dyn_cast<BranchInst>(User)) {
    CodeSize = estimateBranchInst(*I);
  } else {
    C = visit(*User);
    if (!C)
      return {0, 0};
  }

  // Even though it doesn't make sense to bind switch and branch instructions
  // with a constant, unlike any other instruction type, it prevents estimating
  // their bonus multiple times.
  KnownConstants.insert({User, C});

  CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);

  uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
                    BFI.getEntryFreq();

  Cost Latency = Weight *
      TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);

  LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
                    << ", Latency = " << Latency << "} for user "
                    << *User << "\n");

  Bonus B(CodeSize, Latency);
  for (auto *U : User->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (UI != User && isBlockExecutable(UI->getParent()))
        B += getUserBonus(UI, User, C);

  return B;
}

Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  auto *C = dyn_cast<ConstantInt>(LastVisited->second);
  if (!C)
    return 0;

  BasicBlock *Succ = I.findCaseValue(C)->getCaseSuccessor();
  // Initialize the worklist with the dead basic blocks. These are the
  // destination labels which are different from the one corresponding
  // to \p C. They should be executable and have a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  for (const auto &Case : I.cases()) {
    BasicBlock *BB = Case.getCaseSuccessor();
    if (BB != Succ && isBlockExecutable(BB) &&
        canEliminateSuccessor(I.getParent(), BB, DeadBlocks))
      WorkList.push_back(BB);
  }

  return estimateBasicBlocks(WorkList);
}

Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  BasicBlock *Succ = I.getSuccessor(LastVisited->second->isOneValue());
  // Initialize the worklist with the dead successor as long as
  // it is executable and has a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  if (isBlockExecutable(Succ) &&
      canEliminateSuccessor(I.getParent(), Succ, DeadBlocks))
    WorkList.push_back(Succ);

  return estimateBasicBlocks(WorkList);
}

Constant *InstCostVisitor::visitPHINode(PHINode &I) {
  if (I.getNumIncomingValues() > MaxIncomingPhiValues)
    return nullptr;

  bool Inserted = VisitedPHIs.insert(&I).second;
  Constant *Const = nullptr;

  for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) {
    Value *V = I.getIncomingValue(Idx);
    if (auto *Inst = dyn_cast<Instruction>(V))
      if (Inst == &I || DeadBlocks.contains(I.getIncomingBlock(Idx)))
        continue;
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C) {
      if (Inserted)
        PendingPHIs.push_back(&I);
      return nullptr;
    }
    if (!Const)
      Const = C;
    else if (C != Const)
      return nullptr;
  }
  return Const;
}

Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isGuaranteedNotToBeUndefOrPoison(LastVisited->second))
    return LastVisited->second;
  return nullptr;
}

Constant *InstCostVisitor::visitCallBase(CallBase &I) {
  Function *F = I.getCalledFunction();
  if (!F || !canConstantFoldCallTo(&I, F))
    return nullptr;

  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldCall(&I, F, Ops);
}

Constant *InstCostVisitor::visitLoadInst(LoadInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isa<ConstantPointerNull>(LastVisited->second))
    return nullptr;
  return ConstantFoldLoadFromConstPtr(LastVisited->second, I.getType(), DL);
}

Constant *InstCostVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands(); Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldInstOperands(&I, Ops, DL);
}

Constant *InstCostVisitor::visitSelectInst(SelectInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return nullptr;

  Value *V = LastVisited->second->isZeroValue() ? I.getFalseValue()
                                                : I.getTrueValue();
  Constant *C = findConstantFor(V, KnownConstants);
  return C;
}

Constant *InstCostVisitor::visitCastInst(CastInst &I) {
  return ConstantFoldCastOperand(I.getOpcode(), LastVisited->second,
                                 I.getType(), DL);
}

Constant *InstCostVisitor::visitCmpInst(CmpInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool Swap = I.getOperand(1) == LastVisited->first;
  Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V, KnownConstants);
  if (!Other)
    return nullptr;

  Constant *Const = LastVisited->second;
  return Swap ?
        ConstantFoldCompareInstOperands(I.getPredicate(), Other, Const, DL)
      : ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL);
}

Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  return ConstantFoldUnaryOpOperand(I.getOpcode(), LastVisited->second, DL);
}

Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool Swap = I.getOperand(1) == LastVisited->first;
  Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V, KnownConstants);
  if (!Other)
    return nullptr;

  Constant *Const = LastVisited->second;
  return dyn_cast_or_null<Constant>(Swap ?
        simplifyBinOp(I.getOpcode(), Other, Const, SimplifyQuery(DL))
      : simplifyBinOp(I.getOpcode(), Const, Other, SimplifyQuery(DL)));
}

Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca,
                                                   CallInst *Call) {
  Value *StoreValue = nullptr;
  for (auto *User : Alloca->users()) {
    // We can't use llvm::isAllocaPromotable() as that would fail because of
    // the usage in the CallInst, which is what we check here.
    if (User == Call)
      continue;
    if (auto *Bitcast = dyn_cast<BitCastInst>(User)) {
      if (!Bitcast->hasOneUse() || *Bitcast->user_begin() != Call)
        return nullptr;
      continue;
    }

    if (auto *Store = dyn_cast<StoreInst>(User)) {
      // This is a duplicate store, bail out.
      if (StoreValue || Store->isVolatile())
        return nullptr;
      StoreValue = Store->getValueOperand();
      continue;
    }
    // Bail if there is any other unknown usage.
    return nullptr;
  }

  if (!StoreValue)
    return nullptr;

  return getCandidateConstant(StoreValue);
}

// A constant stack value is an AllocaInst that has a single constant
// value stored to it. Return this constant if such an alloca stack value
// is a function argument.
Constant *FunctionSpecializer::getConstantStackValue(CallInst *Call,
                                                     Value *Val) {
  if (!Val)
    return nullptr;
  Val = Val->stripPointerCasts();
  if (auto *ConstVal = dyn_cast<ConstantInt>(Val))
    return ConstVal;
  auto *Alloca = dyn_cast<AllocaInst>(Val);
  if (!Alloca || !Alloca->getAllocatedType()->isIntegerTy())
    return nullptr;
  return getPromotableAlloca(Alloca, Call);
}

// To support specializing recursive functions, it is important to propagate
// constant arguments because after a first iteration of specialisation, a
// reduced example may look like this:
//
//     define internal void @RecursiveFn(i32* arg1) {
//       %temp = alloca i32, align 4
//       store i32 2, i32* %temp, align 4
//       call void @RecursiveFn.1(i32* nonnull %temp)
//       ret void
//     }
//
// Before the next iteration, we need to propagate the constant like so,
// which allows further specialization in subsequent iterations:
//
//     @funcspec.arg = internal constant i32 2
//
//     define internal void @someFunc(i32* arg1) {
//       call void @otherFunc(i32* nonnull @funcspec.arg)
//       ret void
//     }
//
// See if there are any new constant values for the callers of \p F via
// stack variables and promote them to global variables.
void FunctionSpecializer::promoteConstantStackValues(Function *F) {
  for (User *U : F->users()) {

    auto *Call = dyn_cast<CallInst>(U);
    if (!Call)
      continue;

    if (!Solver.isBlockExecutable(Call->getParent()))
      continue;

    for (const Use &U : Call->args()) {
      unsigned Idx = Call->getArgOperandNo(&U);
      Value *ArgOp = Call->getArgOperand(Idx);
      Type *ArgOpType = ArgOp->getType();

      if (!Call->onlyReadsMemory(Idx) || !ArgOpType->isPointerTy())
        continue;

      auto *ConstVal = getConstantStackValue(Call, ArgOp);
      if (!ConstVal)
        continue;

      Value *GV = new GlobalVariable(M, ConstVal->getType(), true,
                                     GlobalValue::InternalLinkage, ConstVal,
                                     "specialized.arg." + Twine(++NGlobals));
      if (ArgOpType != ConstVal->getType())
        GV = ConstantExpr::getBitCast(cast<Constant>(GV), ArgOpType);

      Call->setArgOperand(Idx, GV);
    }
  }
}

// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics
// interfere with the promoteConstantStackValues() optimization.
static void removeSSACopy(Function &F) {
  for (BasicBlock &BB : F) {
    for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
      auto *II = dyn_cast<IntrinsicInst>(&Inst);
      if (!II)
        continue;
      if (II->getIntrinsicID() != Intrinsic::ssa_copy)
        continue;
      Inst.replaceAllUsesWith(II->getOperand(0));
      Inst.eraseFromParent();
    }
  }
}

/// Remove any ssa_copy intrinsics that may have been introduced.
void FunctionSpecializer::cleanUpSSA() {
  for (Function *F : Specializations)
    removeSSACopy(*F);
}


template <> struct llvm::DenseMapInfo<SpecSig> {
  static inline SpecSig getEmptyKey() { return {~0U, {}}; }

  static inline SpecSig getTombstoneKey() { return {~1U, {}}; }

  static unsigned getHashValue(const SpecSig &S) {
    return static_cast<unsigned>(hash_value(S));
  }

  static bool isEqual(const SpecSig &LHS, const SpecSig &RHS) {
    return LHS == RHS;
  }
};

FunctionSpecializer::~FunctionSpecializer() {
  LLVM_DEBUG(
    if (NumSpecsCreated > 0)
      dbgs() << "FnSpecialization: Created " << NumSpecsCreated
             << " specializations in module " << M.getName() << "\n");
  // Eliminate dead code.
  removeDeadFunctions();
  cleanUpSSA();
}

/// Attempt to specialize functions in the module to enable constant
/// propagation across function boundaries.
///
/// \returns true if at least one function is specialized.
bool FunctionSpecializer::run() {
  // Find possible specializations for each function.
  SpecMap SM;
  SmallVector<Spec, 32> AllSpecs;
  unsigned NumCandidates = 0;
  for (Function &F : M) {
    if (!isCandidateFunction(&F))
      continue;

    auto [It, Inserted] = FunctionMetrics.try_emplace(&F);
    CodeMetrics &Metrics = It->second;
    // Analyze the function.
    if (Inserted) {
      SmallPtrSet<const Value *, 32> EphValues;
      CodeMetrics::collectEphemeralValues(&F, &GetAC(F), EphValues);
      for (BasicBlock &BB : F)
        Metrics.analyzeBasicBlock(&BB, GetTTI(F), EphValues);
    }

    // If the code metrics reveal that we shouldn't duplicate the function,
    // or if the code size implies that this function is easy to get inlined,
    // then we shouldn't specialize it.
    if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() ||
        (!ForceSpecialization && !F.hasFnAttribute(Attribute::NoInline) &&
         Metrics.NumInsts < MinFunctionSize))
      continue;

    // TODO: For now only consider recursive functions when running multiple
    // times. This should change if specialization on literal constants gets
    // enabled.
    if (!Inserted && !Metrics.isRecursive && !SpecializeLiteralConstant)
      continue;

    int64_t Sz = *Metrics.NumInsts.getValue();
    assert(Sz > 0 && "CodeSize should be positive");
    // It is safe to down cast from int64_t, NumInsts is always positive.
    unsigned FuncSize = static_cast<unsigned>(Sz);

    LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
                      << F.getName() << " is " << FuncSize << "\n");

    if (Inserted && Metrics.isRecursive)
      promoteConstantStackValues(&F);

    if (!findSpecializations(&F, FuncSize, AllSpecs, SM)) {
      LLVM_DEBUG(
          dbgs() << "FnSpecialization: No possible specializations found for "
                 << F.getName() << "\n");
      continue;
    }

    ++NumCandidates;
  }

  if (!NumCandidates) {
    LLVM_DEBUG(
        dbgs()
        << "FnSpecialization: No possible specializations found in module\n");
    return false;
  }

  // Choose the most profitable specialisations that fit within the module
  // specialization budget, which is derived from the maximum number of
  // specializations per candidate function.
  auto CompareScore = [&AllSpecs](unsigned I, unsigned J) {
    return AllSpecs[I].Score > AllSpecs[J].Score;
  };
  const unsigned NSpecs =
      std::min(NumCandidates * MaxClones, unsigned(AllSpecs.size()));
  SmallVector<unsigned> BestSpecs(NSpecs + 1);
  std::iota(BestSpecs.begin(), BestSpecs.begin() + NSpecs, 0);
  if (AllSpecs.size() > NSpecs) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceeds "
                      << "the maximum number of clones threshold.\n"
                      << "FnSpecialization: Specializing the "
                      << NSpecs
                      << " most profitable candidates.\n");
    std::make_heap(BestSpecs.begin(), BestSpecs.begin() + NSpecs, CompareScore);
    for (unsigned I = NSpecs, N = AllSpecs.size(); I < N; ++I) {
      BestSpecs[NSpecs] = I;
      std::push_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
      std::pop_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
    }
  }

  LLVM_DEBUG(dbgs() << "FnSpecialization: List of specializations \n";
             for (unsigned I = 0; I < NSpecs; ++I) {
               const Spec &S = AllSpecs[BestSpecs[I]];
               dbgs() << "FnSpecialization: Function " << S.F->getName()
                      << " , score " << S.Score << "\n";
               for (const ArgInfo &Arg : S.Sig.Args)
                 dbgs() << "FnSpecialization: FormalArg = "
                        << Arg.Formal->getNameOrAsOperand()
                        << ", ActualArg = " << Arg.Actual->getNameOrAsOperand()
                        << "\n";
             });

  // Create the chosen specializations.
  SmallPtrSet<Function *, 8> OriginalFuncs;
  SmallVector<Function *> Clones;
  for (unsigned I = 0; I < NSpecs; ++I) {
    Spec &S = AllSpecs[BestSpecs[I]];
    S.Clone = createSpecialization(S.F, S.Sig);

    // Update the known call sites to call the clone.
    for (CallBase *Call : S.CallSites) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
                        << " to call " << S.Clone->getName() << "\n");
      Call->setCalledFunction(S.Clone);
    }

    Clones.push_back(S.Clone);
    OriginalFuncs.insert(S.F);
  }

  Solver.solveWhileResolvedUndefsIn(Clones);

  // Update the rest of the call sites - these are the recursive calls, calls
  // to discarded specialisations and calls that may match a specialisation
  // after the solver runs.
  for (Function *F : OriginalFuncs) {
    auto [Begin, End] = SM[F];
    updateCallSites(F, AllSpecs.begin() + Begin, AllSpecs.begin() + End);
  }

  for (Function *F : Clones) {
    if (F->getReturnType()->isVoidTy())
      continue;
    if (F->getReturnType()->isStructTy()) {
      auto *STy = cast<StructType>(F->getReturnType());
      if (!Solver.isStructLatticeConstant(F, STy))
        continue;
    } else {
      auto It = Solver.getTrackedRetVals().find(F);
      assert(It != Solver.getTrackedRetVals().end() &&
             "Return value ought to be tracked");
      if (SCCPSolver::isOverdefined(It->second))
        continue;
    }
    for (User *U : F->users()) {
      if (auto *CS = dyn_cast<CallBase>(U)) {
        // The user instruction does not call our function.
        if (CS->getCalledFunction() != F)
          continue;
        Solver.resetLatticeValueFor(CS);
      }
    }
  }

  // Rerun the solver to notify the users of the modified callsites.
  Solver.solveWhileResolvedUndefs();

  for (Function *F : OriginalFuncs)
    if (FunctionMetrics[F].isRecursive)
      promoteConstantStackValues(F);

  return true;
}

void FunctionSpecializer::removeDeadFunctions() {
  for (Function *F : FullySpecialized) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function "
                      << F->getName() << "\n");
    if (FAM)
      FAM->clear(*F, F->getName());
    F->eraseFromParent();
  }
  FullySpecialized.clear();
}

/// Clone the function \p F and remove the ssa_copy intrinsics added by
/// the SCCPSolver in the cloned version.
static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
  ValueToValueMapTy Mappings;
  Function *Clone = CloneFunction(F, Mappings);
  Clone->setName(F->getName() + ".specialized." + Twine(NSpecs));
  removeSSACopy(*Clone);
  return Clone;
}

bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
                                              SmallVectorImpl<Spec> &AllSpecs,
                                              SpecMap &SM) {
  // A mapping from a specialisation signature to the index of the respective
  // entry in the all specialisation array. Used to ensure uniqueness of
  // specialisations.
  DenseMap<SpecSig, unsigned> UniqueSpecs;

  // Get a list of interesting arguments.
  SmallVector<Argument *> Args;
  for (Argument &Arg : F->args())
    if (isArgumentInteresting(&Arg))
      Args.push_back(&Arg);

  if (Args.empty())
    return false;

  for (User *U : F->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto &CS = *cast<CallBase>(U);

    // The user instruction does not call our function.
    if (CS.getCalledFunction() != F)
      continue;

    // If the call site has attribute minsize set, that callsite won't be
    // specialized.
    if (CS.hasFnAttr(Attribute::MinSize))
      continue;

    // If the parent of the call site will never be executed, we don't need
    // to worry about the passed value.
    if (!Solver.isBlockExecutable(CS.getParent()))
      continue;

    // Examine arguments and create a specialisation candidate from the
    // constant operands of this call site.
    SpecSig S;
    for (Argument *A : Args) {
      Constant *C = getCandidateConstant(CS.getArgOperand(A->getArgNo()));
      if (!C)
        continue;
      LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument "
                        << A->getName() << " : " << C->getNameOrAsOperand()
                        << "\n");
      S.Args.push_back({A, C});
    }

    if (S.Args.empty())
      continue;

    // Check if we have encountered the same specialisation already.
    if (auto It = UniqueSpecs.find(S); It != UniqueSpecs.end()) {
      // Existing specialisation. Add the call to the list to rewrite, unless
      // it's a recursive call. A specialisation generated because of a
      // recursive call may end up not being the best specialisation for all
      // the cloned instances of this call, which result from specialising
      // functions. Hence we don't rewrite the call directly, but match it with
      // the best specialisation once all specialisations are known.
      if (CS.getFunction() == F)
        continue;
      const unsigned Index = It->second;
      AllSpecs[Index].CallSites.push_back(&CS);
    } else {
      // Calculate the specialisation gain.
      Bonus B;
      unsigned Score = 0;
      InstCostVisitor Visitor = getInstCostVisitorFor(F);
      for (ArgInfo &A : S.Args) {
        B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
        Score += getInliningBonus(A.Formal, A.Actual);
      }
      B += Visitor.getBonusFromPendingPHIs();

      LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
                        << B.CodeSize << ", Latency = " << B.Latency
                        << ", Inlining = " << Score << "}\n");

      FunctionGrowth[F] += FuncSize - B.CodeSize;

      auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
                             unsigned FuncGrowth) -> bool {
        // No check required.
        if (ForceSpecialization)
          return true;
        // Minimum inlining bonus.
        if (Score > MinInliningBonus * FuncSize / 100)
          return true;
        // Minimum codesize savings.
        if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
          return false;
        // Minimum latency savings.
        if (B.Latency < MinLatencySavings * FuncSize / 100)
          return false;
        // Maximum codesize growth.
        if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
          return false;
        return true;
      };

      // Discard unprofitable specialisations.
      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F]))
        continue;

      // Create a new specialisation entry.
      Score += std::max(B.CodeSize, B.Latency);
      auto &Spec = AllSpecs.emplace_back(F, S, Score);
      if (CS.getFunction() != F)
        Spec.CallSites.push_back(&CS);
      const unsigned Index = AllSpecs.size() - 1;
      UniqueSpecs[S] = Index;
      if (auto [It, Inserted] = SM.try_emplace(F, Index, Index + 1); !Inserted)
        It->second.second = Index + 1;
    }
  }

  return !UniqueSpecs.empty();
}

bool FunctionSpecializer::isCandidateFunction(Function *F) {
  if (F->isDeclaration() || F->arg_empty())
    return false;

  if (F->hasFnAttribute(Attribute::NoDuplicate))
    return false;

  // Do not specialize the cloned function again.
  if (Specializations.contains(F))
    return false;

  // If we're optimizing the function for size, we shouldn't specialize it.
  if (F->hasOptSize() ||
      shouldOptimizeForSize(F, nullptr, nullptr, PGSOQueryType::IRPass))
    return false;

  // Exit if the function is not executable. There's no point in specializing
  // a dead function.
  if (!Solver.isBlockExecutable(&F->getEntryBlock()))
    return false;

  // It wastes time to specialize a function which would eventually get
  // inlined anyway.
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return false;

  LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName()
                    << "\n");
  return true;
}

Function *FunctionSpecializer::createSpecialization(Function *F,
                                                    const SpecSig &S) {
  Function *Clone = cloneCandidateFunction(F, Specializations.size() + 1);

  // The original function does not necessarily have internal linkage, but the
  // clone must.
  Clone->setLinkage(GlobalValue::InternalLinkage);

  // Initialize the lattice state of the arguments of the function clone,
  // marking the argument on which we specialized the function constant
  // with the given value.
  Solver.setLatticeValueForSpecializationArguments(Clone, S.Args);
  Solver.markBlockExecutable(&Clone->front());
  Solver.addArgumentTrackedFunction(Clone);
  Solver.addTrackedFunction(Clone);

  // Mark all the specialized functions.
  Specializations.insert(Clone);
  ++NumSpecsCreated;

  return Clone;
}

/// Compute the inlining bonus for replacing argument \p A with constant \p C.
/// The below heuristic is only concerned with exposing inlining
/// opportunities via indirect call promotion. If the argument is not a
/// (potentially casted) function pointer, give up.
unsigned FunctionSpecializer::getInliningBonus(Argument *A, Constant *C) {
  Function *CalledFunction = dyn_cast<Function>(C->stripPointerCasts());
  if (!CalledFunction)
    return 0;

  // Get TTI for the called function (used for the inline cost).
  auto &CalleeTTI = (GetTTI)(*CalledFunction);

  // Look at all the call sites whose called value is the argument.
  // Specializing the function on the argument would allow these indirect
  // calls to be promoted to direct calls. If the indirect call promotion
  // would likely enable the called function to be inlined, specializing is a
  // good idea.
  int InliningBonus = 0;
  for (User *U : A->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto *CS = cast<CallBase>(U);
    if (CS->getCalledOperand() != A)
      continue;
    if (CS->getFunctionType() != CalledFunction->getFunctionType())
      continue;

    // Get the cost of inlining the called function at this call site. Note
    // that this is only an estimate. The called function may eventually
    // change in a way that leads to it not being inlined here, even though
    // inlining looks profitable now. For example, one of its called
    // functions may be inlined into it, making the called function too large
    // to be inlined into this call site.
    //
    // We apply a boost for performing indirect call promotion by increasing
    // the default threshold by the threshold for indirect calls.
    auto Params = getInlineParams();
    Params.DefaultThreshold += InlineConstants::IndirectCallThreshold;
    InlineCost IC =
        getInlineCost(*CS, CalledFunction, Params, CalleeTTI, GetAC, GetTLI);

    // We clamp the bonus for this call to be between zero and the default
    // threshold.
    if (IC.isAlways())
      InliningBonus += Params.DefaultThreshold;
    else if (IC.isVariable() && IC.getCostDelta() > 0)
      InliningBonus += IC.getCostDelta();

    LLVM_DEBUG(dbgs() << "FnSpecialization: Inlining bonus " << InliningBonus
                      << " for user " << *U << "\n");
  }

  return InliningBonus > 0 ? static_cast<unsigned>(InliningBonus) : 0;
}

/// Determine if it is possible to specialise the function for constant values
/// of the formal parameter \p A.
bool FunctionSpecializer::isArgumentInteresting(Argument *A) {
  // No point in specialization if the argument is unused.
  if (A->user_empty())
    return false;

  Type *Ty = A->getType();
  if (!Ty->isPointerTy() && (!SpecializeLiteralConstant ||
      (!Ty->isIntegerTy() && !Ty->isFloatingPointTy() && !Ty->isStructTy())))
    return false;

  // SCCP solver does not record an argument that will be constructed on
  // stack.
  if (A->hasByValAttr() && !A->getParent()->onlyReadsMemory())
    return false;

  // For non-argument-tracked functions every argument is overdefined.
  if (!Solver.isArgumentTrackedFunction(A->getParent()))
    return true;

  // Check the lattice value and decide if we should attempt to specialize,
  // based on this argument. No point in specialization, if the lattice value
  // is already a constant.
  bool IsOverdefined = Ty->isStructTy()
    ? any_of(Solver.getStructLatticeValueFor(A), SCCPSolver::isOverdefined)
    : SCCPSolver::isOverdefined(Solver.getLatticeValueFor(A));

  LLVM_DEBUG(
    if (IsOverdefined)
      dbgs() << "FnSpecialization: Found interesting parameter "
             << A->getNameOrAsOperand() << "\n";
    else
      dbgs() << "FnSpecialization: Nothing to do, parameter "
             << A->getNameOrAsOperand() << " is already constant\n";
  );
  return IsOverdefined;
}

/// Check if the value \p V (an actual argument) is a constant or can only
/// have a constant value. Return that constant.
Constant *FunctionSpecializer::getCandidateConstant(Value *V) {
  if (isa<PoisonValue>(V))
    return nullptr;

  // Select for possible specialisation values that are constants or
  // are deduced to be constants or constant ranges with a single element.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    C = Solver.getConstantOrNull(V);

  // Don't specialize on (anything derived from) the address of a non-constant
  // global variable, unless explicitly enabled.
  if (C && C->getType()->isPointerTy() && !C->isNullValue())
    if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C));
        GV && !(GV->isConstant() || SpecializeOnAddress))
      return nullptr;

  return C;
}

void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
                                          const Spec *End) {
  // Collect the call sites that need updating.
  SmallVector<CallBase *> ToUpdate;
  for (User *U : F->users())
    if (auto *CS = dyn_cast<CallBase>(U);
        CS && CS->getCalledFunction() == F &&
        Solver.isBlockExecutable(CS->getParent()))
      ToUpdate.push_back(CS);

  unsigned NCallsLeft = ToUpdate.size();
  for (CallBase *CS : ToUpdate) {
    bool ShouldDecrementCount = CS->getFunction() == F;

    // Find the best matching specialisation.
    const Spec *BestSpec = nullptr;
    for (const Spec &S : make_range(Begin, End)) {
      if (!S.Clone || (BestSpec && S.Score <= BestSpec->Score))
        continue;

      if (any_of(S.Sig.Args, [CS, this](const ArgInfo &Arg) {
            unsigned ArgNo = Arg.Formal->getArgNo();
            return getCandidateConstant(CS->getArgOperand(ArgNo)) != Arg.Actual;
          }))
        continue;

      BestSpec = &S;
    }

    if (BestSpec) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *CS
                        << " to call " << BestSpec->Clone->getName() << "\n");
      CS->setCalledFunction(BestSpec->Clone);
      ShouldDecrementCount = true;
    }

    if (ShouldDecrementCount)
      --NCallsLeft;
  }

  // If the function has been completely specialized, the original function
  // is no longer needed. Mark it unreachable.
  if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) {
    Solver.markFunctionUnreachable(F);
    FullySpecialized.insert(F);
  }
}