//===- FunctionSpecialization.cpp - Function Specialization ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/FunctionSpecialization.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/SCCPSolver.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <cmath>

using namespace llvm;

#define DEBUG_TYPE "function-specialization"

STATISTIC(NumSpecsCreated, "Number of specializations created");

static cl::opt<bool> ForceSpecialization(
    "force-specialization", cl::init(false), cl::Hidden, cl::desc(
    "Force function specialization for every call site with a constant "
    "argument"));

static cl::opt<unsigned> MaxClones(
    "funcspec-max-clones", cl::init(3), cl::Hidden, cl::desc(
    "The maximum number of clones allowed for a single function "
    "specialization"));

static cl::opt<unsigned> MaxIncomingPhiValues(
    "funcspec-max-incoming-phi-values", cl::init(4), cl::Hidden, cl::desc(
    "The maximum number of incoming values a PHI node can have to be "
    "considered during the specialization bonus estimation"));

static cl::opt<unsigned> MaxBlockPredecessors(
    "funcspec-max-block-predecessors", cl::init(2), cl::Hidden, cl::desc(
    "The maximum number of predecessors a basic block can have to be "
    "considered during the estimation of dead code"));

static cl::opt<unsigned> MinFunctionSize(
    "funcspec-min-function-size", cl::init(300), cl::Hidden, cl::desc(
    "Don't specialize functions that have less than this number of "
    "instructions"));

static cl::opt<unsigned> MaxCodeSizeGrowth(
    "funcspec-max-codesize-growth", cl::init(3), cl::Hidden, cl::desc(
    "Maximum codesize growth allowed per function"));

static cl::opt<unsigned> MinCodeSizeSavings(
    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, cl::desc(
    "Reject specializations whose codesize savings are less than this "
    "much percent of the original function size"));

static cl::opt<unsigned> MinLatencySavings(
    "funcspec-min-latency-savings", cl::init(70), cl::Hidden, cl::desc(
    "Reject specializations whose latency savings are less than this "
    "much percent of the original function size"));

static cl::opt<unsigned> MinInliningBonus(
    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc(
    "Reject specializations whose inlining bonus is less than this "
    "much percent of the original function size"));

static cl::opt<bool> SpecializeOnAddress(
    "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc(
    "Enable function specialization on the address of global values"));

// Disabled by default as it can significantly increase compilation times.
//
// https://llvm-compile-time-tracker.com
// https://github.com/nikic/llvm-compile-time-tracker
static cl::opt<bool> SpecializeLiteralConstant(
    "funcspec-for-literal-constant", cl::init(false), cl::Hidden, cl::desc(
    "Enable specialization of functions that take a literal constant as an "
    "argument"));
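// Returns true if \p Succ has no more than MaxBlockPredecessors
// predecessors, each of which is \p BB, \p Succ itself (a self-loop), or a
// block already recorded in \p DeadBlocks.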
bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
                                            DenseSet<BasicBlock *> &DeadBlocks) {
  unsigned I = 0;
  return all_of(predecessors(Succ),
                [&I, BB, Succ, &DeadBlocks] (BasicBlock *Pred) {
    return I++ < MaxBlockPredecessors &&
           (Pred == BB || Pred == Succ || DeadBlocks.contains(Pred));
  });
}

// Estimates the codesize savings due to dead code after constant propagation.
// \p WorkList represents the basic blocks of a specialization which will
// eventually become dead once we replace instructions that are known to be
// constants. The successors of such blocks are added to the list as long as
// the \p Solver found they were executable prior to specialization, and only
// if all their predecessors are dead.
Cost InstCostVisitor::estimateBasicBlocks(
    SmallVectorImpl<BasicBlock *> &WorkList) {
  Cost CodeSize = 0;
  // Accumulate the instruction cost of each basic block weighted by frequency.
  while (!WorkList.empty()) {
    BasicBlock *BB = WorkList.pop_back_val();

    // These blocks are considered dead as far as the InstCostVisitor
    // is concerned. They haven't been proven dead yet by the Solver,
    // but may become dead if we propagate the specialization arguments.
    if (!DeadBlocks.insert(BB).second)
      continue;

    for (Instruction &I : *BB) {
      // Disregard SSA copies.
      if (auto *II = dyn_cast<IntrinsicInst>(&I))
        if (II->getIntrinsicID() == Intrinsic::ssa_copy)
          continue;
      // If it's a known constant we have already accounted for it.
      if (KnownConstants.contains(&I))
        continue;

      Cost C = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);

      LLVM_DEBUG(dbgs() << "FnSpecialization: CodeSize " << C
                        << " for user " << I << "\n");
      CodeSize += C;
    }

    // Keep adding dead successors to the list as long as they are
    // executable and only reachable from dead blocks.
    for (BasicBlock *SuccBB : successors(BB))
      if (isBlockExecutable(SuccBB) &&
          canEliminateSuccessor(BB, SuccBB, DeadBlocks))
        WorkList.push_back(SuccBB);
  }
  return CodeSize;
}

static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
  if (auto *C = dyn_cast<Constant>(V))
    return C;
  return KnownConstants.lookup(V);
}

Bonus InstCostVisitor::getBonusFromPendingPHIs() {
  Bonus B;
  while (!PendingPHIs.empty()) {
    Instruction *Phi = PendingPHIs.pop_back_val();
    // The pending PHIs could have been proven dead by now.
    if (isBlockExecutable(Phi->getParent()))
      B += getUserBonus(Phi);
  }
  return B;
}
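// For illustration, suppose we are considering a specialization on %n = 7
// of a function containing:
//
//     %cmp = icmp eq i32 %n, 7
//     br i1 %cmp, label %then, label %else
//
// getUserBonus() folds %cmp to true via visitCmpInst(), and the recursive
// walk over its users reaches the branch, where estimateBranchInst() seeds
// the worklist with the untaken successor %else (provided it has no other
// live predecessors). estimateBasicBlocks() then credits the codesize of
// that dead region as additional savings.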
/// Compute a bonus for replacing argument \p A with constant \p C.
Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
  LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                    << C->getNameOrAsOperand() << "\n");
  Bonus B;
  for (auto *U : A->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (isBlockExecutable(UI->getParent()))
        B += getUserBonus(UI, A, C);

  LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
                    << B.CodeSize << ", Latency = " << B.Latency
                    << "} for argument " << *A << "\n");
  return B;
}

Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
  // We have already propagated a constant for this user.
  if (KnownConstants.contains(User))
    return {0, 0};

  // Cache the iterator before visiting.
  LastVisited = Use ? KnownConstants.insert({Use, C}).first
                    : KnownConstants.end();

  Cost CodeSize = 0;
  if (auto *I = dyn_cast<SwitchInst>(User)) {
    CodeSize = estimateSwitchInst(*I);
  } else if (auto *I = dyn_cast<BranchInst>(User)) {
    CodeSize = estimateBranchInst(*I);
  } else {
    C = visit(*User);
    if (!C)
      return {0, 0};
  }

  // Even though it doesn't make sense to bind switch and branch instructions
  // with a constant, unlike any other instruction type, it prevents estimating
  // their bonus multiple times.
  KnownConstants.insert({User, C});

  CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);

  uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
                    BFI.getEntryFreq().getFrequency();

  Cost Latency = Weight *
      TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);

  LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
                    << ", Latency = " << Latency << "} for user "
                    << *User << "\n");

  Bonus B(CodeSize, Latency);
  for (auto *U : User->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (UI != User && isBlockExecutable(UI->getParent()))
        B += getUserBonus(UI, User, C);

  return B;
}
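// The two terminator estimators below handle the case where the recursive
// walk has just folded a branch or switch condition to the constant cached
// in LastVisited. Every successor other than the one actually taken is a
// candidate for the dead-block worklist, whose codesize is then credited as
// savings by estimateBasicBlocks().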
Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  auto *C = dyn_cast<ConstantInt>(LastVisited->second);
  if (!C)
    return 0;

  BasicBlock *Succ = I.findCaseValue(C)->getCaseSuccessor();
  // Initialize the worklist with the dead basic blocks. These are the
  // destination labels which are different from the one corresponding
  // to \p C. They should be executable and have a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  for (const auto &Case : I.cases()) {
    BasicBlock *BB = Case.getCaseSuccessor();
    if (BB != Succ && isBlockExecutable(BB) &&
        canEliminateSuccessor(I.getParent(), BB, DeadBlocks))
      WorkList.push_back(BB);
  }

  return estimateBasicBlocks(WorkList);
}

Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  BasicBlock *Succ = I.getSuccessor(LastVisited->second->isOneValue());
  // Initialize the worklist with the dead successor as long as
  // it is executable and has a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  if (isBlockExecutable(Succ) &&
      canEliminateSuccessor(I.getParent(), Succ, DeadBlocks))
    WorkList.push_back(Succ);

  return estimateBasicBlocks(WorkList);
}
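// A PHI node only folds when all of its live incoming values agree on a
// single constant. If some incoming value is not yet known, the PHI is
// parked on PendingPHIs (at most once) and revisited later by
// getBonusFromPendingPHIs(), after the remaining arguments have been
// processed. PHIs with more than MaxIncomingPhiValues incoming values are
// not considered.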
Constant *InstCostVisitor::visitPHINode(PHINode &I) {
  if (I.getNumIncomingValues() > MaxIncomingPhiValues)
    return nullptr;

  bool Inserted = VisitedPHIs.insert(&I).second;
  Constant *Const = nullptr;

  for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) {
    Value *V = I.getIncomingValue(Idx);
    if (auto *Inst = dyn_cast<Instruction>(V))
      if (Inst == &I || DeadBlocks.contains(I.getIncomingBlock(Idx)))
        continue;
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C) {
      if (Inserted)
        PendingPHIs.push_back(&I);
      return nullptr;
    }
    if (!Const)
      Const = C;
    else if (C != Const)
      return nullptr;
  }
  return Const;
}

Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isGuaranteedNotToBeUndefOrPoison(LastVisited->second))
    return LastVisited->second;
  return nullptr;
}

Constant *InstCostVisitor::visitCallBase(CallBase &I) {
  Function *F = I.getCalledFunction();
  if (!F || !canConstantFoldCallTo(&I, F))
    return nullptr;

  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldCall(&I, F, Ops);
}

Constant *InstCostVisitor::visitLoadInst(LoadInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isa<ConstantPointerNull>(LastVisited->second))
    return nullptr;
  return ConstantFoldLoadFromConstPtr(LastVisited->second, I.getType(), DL);
}

Constant *InstCostVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands(); Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldInstOperands(&I, Ops, DL);
}

Constant *InstCostVisitor::visitSelectInst(SelectInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return nullptr;

  Value *V = LastVisited->second->isZeroValue() ? I.getFalseValue()
                                                : I.getTrueValue();
  Constant *C = findConstantFor(V, KnownConstants);
  return C;
}

Constant *InstCostVisitor::visitCastInst(CastInst &I) {
  return ConstantFoldCastOperand(I.getOpcode(), LastVisited->second,
                                 I.getType(), DL);
}

Constant *InstCostVisitor::visitCmpInst(CmpInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool Swap = I.getOperand(1) == LastVisited->first;
  Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V, KnownConstants);
  if (!Other)
    return nullptr;

  Constant *Const = LastVisited->second;
  return Swap ?
        ConstantFoldCompareInstOperands(I.getPredicate(), Other, Const, DL)
      : ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL);
}

Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  return ConstantFoldUnaryOpOperand(I.getOpcode(), LastVisited->second, DL);
}

Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool Swap = I.getOperand(1) == LastVisited->first;
  Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V, KnownConstants);
  if (!Other)
    return nullptr;

  Constant *Const = LastVisited->second;
  return dyn_cast_or_null<Constant>(Swap ?
        simplifyBinOp(I.getOpcode(), Other, Const, SimplifyQuery(DL))
      : simplifyBinOp(I.getOpcode(), Const, Other, SimplifyQuery(DL)));
}
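// Check whether \p Alloca can be treated as a constant for the purposes of
// specializing the callee at \p Call: apart from the use in the call itself
// (possibly through a single-use bitcast), its only permitted use is exactly
// one non-volatile store of a candidate constant. Returns that constant, or
// nullptr otherwise.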
Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca,
                                                   CallInst *Call) {
  Value *StoreValue = nullptr;
  for (auto *User : Alloca->users()) {
    // We can't use llvm::isAllocaPromotable() as that would fail because of
    // the usage in the CallInst, which is what we check here.
    if (User == Call)
      continue;
    if (auto *Bitcast = dyn_cast<BitCastInst>(User)) {
      if (!Bitcast->hasOneUse() || *Bitcast->user_begin() != Call)
        return nullptr;
      continue;
    }

    if (auto *Store = dyn_cast<StoreInst>(User)) {
      // This is a duplicate store, bail out.
      if (StoreValue || Store->isVolatile())
        return nullptr;
      StoreValue = Store->getValueOperand();
      continue;
    }
    // Bail if there is any other unknown usage.
    return nullptr;
  }

  if (!StoreValue)
    return nullptr;

  return getCandidateConstant(StoreValue);
}

// A constant stack value is an AllocaInst that has a single constant
// value stored to it. Return this constant if such an alloca stack value
// is a function argument.
Constant *FunctionSpecializer::getConstantStackValue(CallInst *Call,
                                                     Value *Val) {
  if (!Val)
    return nullptr;
  Val = Val->stripPointerCasts();
  if (auto *ConstVal = dyn_cast<ConstantInt>(Val))
    return ConstVal;
  auto *Alloca = dyn_cast<AllocaInst>(Val);
  if (!Alloca || !Alloca->getAllocatedType()->isIntegerTy())
    return nullptr;
  return getPromotableAlloca(Alloca, Call);
}

// To support specializing recursive functions, it is important to propagate
// constant arguments because after a first iteration of specialisation, a
// reduced example may look like this:
//
//     define internal void @RecursiveFn(i32* arg1) {
//       %temp = alloca i32, align 4
//       store i32 2, i32* %temp, align 4
//       call void @RecursiveFn.1(i32* nonnull %temp)
//       ret void
//     }
//
// Before the next iteration, we need to propagate the constant like so,
// which allows further specialization in subsequent iterations:
//
//     @funcspec.arg = internal constant i32 2
//
//     define internal void @RecursiveFn(i32* arg1) {
//       call void @RecursiveFn.1(i32* nonnull @funcspec.arg)
//       ret void
//     }
//
// See if there are any new constant values for the callers of \p F via
// stack variables and promote them to global variables.
void FunctionSpecializer::promoteConstantStackValues(Function *F) {
  for (User *U : F->users()) {

    auto *Call = dyn_cast<CallInst>(U);
    if (!Call)
      continue;

    if (!Solver.isBlockExecutable(Call->getParent()))
      continue;

    for (const Use &U : Call->args()) {
      unsigned Idx = Call->getArgOperandNo(&U);
      Value *ArgOp = Call->getArgOperand(Idx);
      Type *ArgOpType = ArgOp->getType();

      if (!Call->onlyReadsMemory(Idx) || !ArgOpType->isPointerTy())
        continue;

      auto *ConstVal = getConstantStackValue(Call, ArgOp);
      if (!ConstVal)
        continue;

      Value *GV = new GlobalVariable(M, ConstVal->getType(), true,
                                     GlobalValue::InternalLinkage, ConstVal,
                                     "specialized.arg." + Twine(++NGlobals));
      Call->setArgOperand(Idx, GV);
    }
  }
}

// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics
// interfere with the promoteConstantStackValues() optimization.
static void removeSSACopy(Function &F) {
  for (BasicBlock &BB : F) {
    for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
      auto *II = dyn_cast<IntrinsicInst>(&Inst);
      if (!II)
        continue;
      if (II->getIntrinsicID() != Intrinsic::ssa_copy)
        continue;
      Inst.replaceAllUsesWith(II->getOperand(0));
      Inst.eraseFromParent();
    }
  }
}

/// Remove any ssa_copy intrinsics that may have been introduced.
void FunctionSpecializer::cleanUpSSA() {
  for (Function *F : Specializations)
    removeSSACopy(*F);
}
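// Make SpecSig usable as a DenseMap key. DenseMap requires two sentinel
// keys that never compare equal to a real key; the reserved signatures
// {~0U, {}} and {~1U, {}} serve as the empty and tombstone keys
// respectively.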
template <> struct llvm::DenseMapInfo<SpecSig> {
  static inline SpecSig getEmptyKey() { return {~0U, {}}; }

  static inline SpecSig getTombstoneKey() { return {~1U, {}}; }

  static unsigned getHashValue(const SpecSig &S) {
    return static_cast<unsigned>(hash_value(S));
  }

  static bool isEqual(const SpecSig &LHS, const SpecSig &RHS) {
    return LHS == RHS;
  }
};

FunctionSpecializer::~FunctionSpecializer() {
  LLVM_DEBUG(
    if (NumSpecsCreated > 0)
      dbgs() << "FnSpecialization: Created " << NumSpecsCreated
             << " specializations in module " << M.getName() << "\n");
  // Eliminate dead code.
  removeDeadFunctions();
  cleanUpSSA();
}

/// Attempt to specialize functions in the module to enable constant
/// propagation across function boundaries.
///
/// \returns true if at least one function is specialized.
bool FunctionSpecializer::run() {
  // Find possible specializations for each function.
  SpecMap SM;
  SmallVector<Spec, 32> AllSpecs;
  unsigned NumCandidates = 0;
  for (Function &F : M) {
    if (!isCandidateFunction(&F))
      continue;

    auto [It, Inserted] = FunctionMetrics.try_emplace(&F);
    CodeMetrics &Metrics = It->second;
    // Analyze the function.
    if (Inserted) {
      SmallPtrSet<const Value *, 32> EphValues;
      CodeMetrics::collectEphemeralValues(&F, &GetAC(F), EphValues);
      for (BasicBlock &BB : F)
        Metrics.analyzeBasicBlock(&BB, GetTTI(F), EphValues);
    }

    // If the code metrics reveal that we shouldn't duplicate the function,
    // or if the code size implies that this function is easy to get inlined,
    // then we shouldn't specialize it.
    if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() ||
        (!ForceSpecialization && !F.hasFnAttribute(Attribute::NoInline) &&
         Metrics.NumInsts < MinFunctionSize))
      continue;

    // TODO: For now only consider recursive functions when running multiple
    // times. This should change if specialization on literal constants gets
    // enabled.
    if (!Inserted && !Metrics.isRecursive && !SpecializeLiteralConstant)
      continue;

    int64_t Sz = *Metrics.NumInsts.getValue();
    assert(Sz > 0 && "CodeSize should be positive");
    // It is safe to down cast from int64_t, NumInsts is always positive.
    unsigned FuncSize = static_cast<unsigned>(Sz);

    LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
                      << F.getName() << " is " << FuncSize << "\n");

    if (Inserted && Metrics.isRecursive)
      promoteConstantStackValues(&F);

    if (!findSpecializations(&F, FuncSize, AllSpecs, SM)) {
      LLVM_DEBUG(
          dbgs() << "FnSpecialization: No possible specializations found for "
                 << F.getName() << "\n");
      continue;
    }

    ++NumCandidates;
  }

  if (!NumCandidates) {
    LLVM_DEBUG(
        dbgs()
        << "FnSpecialization: No possible specializations found in module\n");
    return false;
  }

  // Choose the most profitable specialisations that fit in the module
  // specialization budget, which is derived from the maximum number of
  // specializations allowed per candidate function.
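  // BestSpecs holds the indices of the NSpecs best-scoring specializations
  // plus one spare slot. CompareScore orders indices by descending score, so
  // the heap keeps its lowest-scoring element at the front: each remaining
  // candidate is placed in the spare slot, pushed into the heap, and the
  // front element (the worst of the NSpecs + 1) is popped off again.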
  auto CompareScore = [&AllSpecs](unsigned I, unsigned J) {
    return AllSpecs[I].Score > AllSpecs[J].Score;
  };
  const unsigned NSpecs =
      std::min(NumCandidates * MaxClones, unsigned(AllSpecs.size()));
  SmallVector<unsigned> BestSpecs(NSpecs + 1);
  std::iota(BestSpecs.begin(), BestSpecs.begin() + NSpecs, 0);
  if (AllSpecs.size() > NSpecs) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceeds "
                      << "the maximum number of clones threshold.\n"
                      << "FnSpecialization: Specializing the "
                      << NSpecs
                      << " most profitable candidates.\n");
    std::make_heap(BestSpecs.begin(), BestSpecs.begin() + NSpecs, CompareScore);
    for (unsigned I = NSpecs, N = AllSpecs.size(); I < N; ++I) {
      BestSpecs[NSpecs] = I;
      std::push_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
      std::pop_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
    }
  }

  LLVM_DEBUG(dbgs() << "FnSpecialization: List of specializations\n";
             for (unsigned I = 0; I < NSpecs; ++I) {
               const Spec &S = AllSpecs[BestSpecs[I]];
               dbgs() << "FnSpecialization: Function " << S.F->getName()
                      << " , score " << S.Score << "\n";
               for (const ArgInfo &Arg : S.Sig.Args)
                 dbgs() << "FnSpecialization: FormalArg = "
                        << Arg.Formal->getNameOrAsOperand()
                        << ", ActualArg = " << Arg.Actual->getNameOrAsOperand()
                        << "\n";
             });

  // Create the chosen specializations.
  SmallPtrSet<Function *, 8> OriginalFuncs;
  SmallVector<Function *> Clones;
  for (unsigned I = 0; I < NSpecs; ++I) {
    Spec &S = AllSpecs[BestSpecs[I]];
    S.Clone = createSpecialization(S.F, S.Sig);

    // Update the known call sites to call the clone.
    for (CallBase *Call : S.CallSites) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
                        << " to call " << S.Clone->getName() << "\n");
      Call->setCalledFunction(S.Clone);
    }

    Clones.push_back(S.Clone);
    OriginalFuncs.insert(S.F);
  }

  Solver.solveWhileResolvedUndefsIn(Clones);

  // Update the rest of the call sites - these are the recursive calls, calls
  // to discarded specialisations and calls that may match a specialisation
  // after the solver runs.
  for (Function *F : OriginalFuncs) {
    auto [Begin, End] = SM[F];
    updateCallSites(F, AllSpecs.begin() + Begin, AllSpecs.begin() + End);
  }

  for (Function *F : Clones) {
    if (F->getReturnType()->isVoidTy())
      continue;
    if (F->getReturnType()->isStructTy()) {
      auto *STy = cast<StructType>(F->getReturnType());
      if (!Solver.isStructLatticeConstant(F, STy))
        continue;
    } else {
      auto It = Solver.getTrackedRetVals().find(F);
      assert(It != Solver.getTrackedRetVals().end() &&
             "Return value ought to be tracked");
      if (SCCPSolver::isOverdefined(It->second))
        continue;
    }
    for (User *U : F->users()) {
      if (auto *CS = dyn_cast<CallBase>(U)) {
        // The user instruction does not call our function.
        if (CS->getCalledFunction() != F)
          continue;
        Solver.resetLatticeValueFor(CS);
      }
    }
  }

  // Rerun the solver to notify the users of the modified callsites.
  Solver.solveWhileResolvedUndefs();

  for (Function *F : OriginalFuncs)
    if (FunctionMetrics[F].isRecursive)
      promoteConstantStackValues(F);

  return true;
}

void FunctionSpecializer::removeDeadFunctions() {
  for (Function *F : FullySpecialized) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function "
                      << F->getName() << "\n");
    if (FAM)
      FAM->clear(*F, F->getName());
    F->eraseFromParent();
  }
  FullySpecialized.clear();
}

/// Clone the function \p F and remove the ssa_copy intrinsics added by
/// the SCCPSolver in the cloned version.
static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
  ValueToValueMapTy Mappings;
  Function *Clone = CloneFunction(F, Mappings);
  Clone->setName(F->getName() + ".specialized." + Twine(NSpecs));
  removeSSACopy(*Clone);
  return Clone;
}
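/// Look for calls to \p F whose actual arguments let us form a
/// specialisation signature. One Spec entry is created per distinct
/// combination of constant arguments, scored, and appended to \p AllSpecs
/// if deemed profitable; the [begin, end) index range of \p F's entries is
/// recorded in \p SM. \p FuncSize is the codesize of \p F, used by the
/// profitability checks.
///
/// \returns true if at least one specialisation was found.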
bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
                                              SmallVectorImpl<Spec> &AllSpecs,
                                              SpecMap &SM) {
  // A mapping from a specialisation signature to the index of the respective
  // entry in the all specialisation array. Used to ensure uniqueness of
  // specialisations.
  DenseMap<SpecSig, unsigned> UniqueSpecs;

  // Get a list of interesting arguments.
  SmallVector<Argument *> Args;
  for (Argument &Arg : F->args())
    if (isArgumentInteresting(&Arg))
      Args.push_back(&Arg);

  if (Args.empty())
    return false;

  for (User *U : F->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto &CS = *cast<CallBase>(U);

    // The user instruction does not call our function.
    if (CS.getCalledFunction() != F)
      continue;

    // If the call site has the minsize attribute, it won't be specialized.
    if (CS.hasFnAttr(Attribute::MinSize))
      continue;

    // If the parent of the call site will never be executed, we don't need
    // to worry about the passed value.
    if (!Solver.isBlockExecutable(CS.getParent()))
      continue;

    // Examine arguments and create a specialisation candidate from the
    // constant operands of this call site.
    SpecSig S;
    for (Argument *A : Args) {
      Constant *C = getCandidateConstant(CS.getArgOperand(A->getArgNo()));
      if (!C)
        continue;
      LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument "
                        << A->getName() << " : " << C->getNameOrAsOperand()
                        << "\n");
      S.Args.push_back({A, C});
    }

    if (S.Args.empty())
      continue;

    // Check if we have encountered the same specialisation already.
    if (auto It = UniqueSpecs.find(S); It != UniqueSpecs.end()) {
      // Existing specialisation. Add the call to the list to rewrite, unless
      // it's a recursive call. A specialisation generated because of a
      // recursive call may not end up being the best specialisation for all
      // the cloned instances of this call, which result from specialising
      // other functions. Hence we don't rewrite the call directly, but match
      // it with the best specialisation once all specialisations are known.
      if (CS.getFunction() == F)
        continue;
      const unsigned Index = It->second;
      AllSpecs[Index].CallSites.push_back(&CS);
    } else {
      // Calculate the specialisation gain.
      Bonus B;
      unsigned Score = 0;
      InstCostVisitor Visitor = getInstCostVisitorFor(F);
      for (ArgInfo &A : S.Args) {
        B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
        Score += getInliningBonus(A.Formal, A.Actual);
      }
      B += Visitor.getBonusFromPendingPHIs();

      LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
                        << B.CodeSize << ", Latency = " << B.Latency
                        << ", Inlining = " << Score << "}\n");

      FunctionGrowth[F] += FuncSize - B.CodeSize;
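      // The checks below are percentages of FuncSize. For example, with the
      // default thresholds and a hypothetical FuncSize of 300: an inlining
      // bonus above 900 accepts the specialization outright; otherwise a
      // codesize saving below 60 or a latency saving below 210 rejects it,
      // as does cumulative growth beyond 3x the original function size.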
      auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
                             unsigned FuncGrowth) -> bool {
        // No check required.
        if (ForceSpecialization)
          return true;
        // Minimum inlining bonus.
        if (Score > MinInliningBonus * FuncSize / 100)
          return true;
        // Minimum codesize savings.
        if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
          return false;
        // Minimum latency savings.
        if (B.Latency < MinLatencySavings * FuncSize / 100)
          return false;
        // Maximum codesize growth.
        if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
          return false;
        return true;
      };

      // Discard unprofitable specialisations.
      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F]))
        continue;

      // Create a new specialisation entry.
      Score += std::max(B.CodeSize, B.Latency);
      auto &Spec = AllSpecs.emplace_back(F, S, Score);
      if (CS.getFunction() != F)
        Spec.CallSites.push_back(&CS);
      const unsigned Index = AllSpecs.size() - 1;
      UniqueSpecs[S] = Index;
      if (auto [It, Inserted] = SM.try_emplace(F, Index, Index + 1); !Inserted)
        It->second.second = Index + 1;
    }
  }

  return !UniqueSpecs.empty();
}

bool FunctionSpecializer::isCandidateFunction(Function *F) {
  if (F->isDeclaration() || F->arg_empty())
    return false;

  if (F->hasFnAttribute(Attribute::NoDuplicate))
    return false;

  // Do not specialize the cloned function again.
  if (Specializations.contains(F))
    return false;

  // If we're optimizing the function for size, we shouldn't specialize it.
  if (F->hasOptSize() ||
      shouldOptimizeForSize(F, nullptr, nullptr, PGSOQueryType::IRPass))
    return false;

  // Exit if the function is not executable. There's no point in specializing
  // a dead function.
  if (!Solver.isBlockExecutable(&F->getEntryBlock()))
    return false;

  // It's wasteful to specialize a function that would be inlined later anyway.
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return false;

  LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName()
                    << "\n");
  return true;
}
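/// Clone \p F for the signature \p S and register the clone with the SCCP
/// solver: the specialised arguments receive their constant lattice values,
/// the entry block is marked executable, and the clone's arguments and
/// return value are tracked like those of any other candidate function.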
Function *FunctionSpecializer::createSpecialization(Function *F,
                                                    const SpecSig &S) {
  Function *Clone = cloneCandidateFunction(F, Specializations.size() + 1);

  // The original function does not necessarily have internal linkage, but the
  // clone must.
  Clone->setLinkage(GlobalValue::InternalLinkage);

  // Initialize the lattice state of the arguments of the function clone,
  // marking the argument on which we specialized the function constant
  // with the given value.
  Solver.setLatticeValueForSpecializationArguments(Clone, S.Args);
  Solver.markBlockExecutable(&Clone->front());
  Solver.addArgumentTrackedFunction(Clone);
  Solver.addTrackedFunction(Clone);

  // Mark all the specialized functions.
  Specializations.insert(Clone);
  ++NumSpecsCreated;

  return Clone;
}

/// Compute the inlining bonus for replacing argument \p A with constant \p C.
/// The below heuristic is only concerned with exposing inlining
/// opportunities via indirect call promotion. If the argument is not a
/// (potentially casted) function pointer, give up.
unsigned FunctionSpecializer::getInliningBonus(Argument *A, Constant *C) {
  Function *CalledFunction = dyn_cast<Function>(C->stripPointerCasts());
  if (!CalledFunction)
    return 0;

  // Get TTI for the called function (used for the inline cost).
  auto &CalleeTTI = (GetTTI)(*CalledFunction);

  // Look at all the call sites whose called value is the argument.
  // Specializing the function on the argument would allow these indirect
  // calls to be promoted to direct calls. If the indirect call promotion
  // would likely enable the called function to be inlined, specializing is a
  // good idea.
  int InliningBonus = 0;
  for (User *U : A->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto *CS = cast<CallBase>(U);
    if (CS->getCalledOperand() != A)
      continue;
    if (CS->getFunctionType() != CalledFunction->getFunctionType())
      continue;

    // Get the cost of inlining the called function at this call site. Note
    // that this is only an estimate. The called function may eventually
    // change in a way that leads to it not being inlined here, even though
    // inlining looks profitable now. For example, one of its called
    // functions may be inlined into it, making the called function too large
    // to be inlined into this call site.
    //
    // We apply a boost for performing indirect call promotion by increasing
    // the default threshold by the threshold for indirect calls.
    auto Params = getInlineParams();
    Params.DefaultThreshold += InlineConstants::IndirectCallThreshold;
    InlineCost IC =
        getInlineCost(*CS, CalledFunction, Params, CalleeTTI, GetAC, GetTLI);

    // We clamp the bonus for this call to be between zero and the default
    // threshold.
    if (IC.isAlways())
      InliningBonus += Params.DefaultThreshold;
    else if (IC.isVariable() && IC.getCostDelta() > 0)
      InliningBonus += IC.getCostDelta();

    LLVM_DEBUG(dbgs() << "FnSpecialization: Inlining bonus " << InliningBonus
                      << " for user " << *U << "\n");
  }

  return InliningBonus > 0 ? static_cast<unsigned>(InliningBonus) : 0;
}

/// Determine if it is possible to specialise the function for constant values
/// of the formal parameter \p A.
bool FunctionSpecializer::isArgumentInteresting(Argument *A) {
  // No point in specialization if the argument is unused.
  if (A->user_empty())
    return false;

  Type *Ty = A->getType();
  if (!Ty->isPointerTy() && (!SpecializeLiteralConstant ||
      (!Ty->isIntegerTy() && !Ty->isFloatingPointTy() && !Ty->isStructTy())))
    return false;

  // The SCCP solver does not record an argument that will be constructed on
  // the stack.
  if (A->hasByValAttr() && !A->getParent()->onlyReadsMemory())
    return false;

  // For non-argument-tracked functions every argument is overdefined.
  if (!Solver.isArgumentTrackedFunction(A->getParent()))
    return true;

  // Check the lattice value and decide if we should attempt to specialize
  // based on this argument. No point in specialization if the lattice value
  // is already a constant.
  bool IsOverdefined = Ty->isStructTy()
      ? any_of(Solver.getStructLatticeValueFor(A), SCCPSolver::isOverdefined)
      : SCCPSolver::isOverdefined(Solver.getLatticeValueFor(A));

  LLVM_DEBUG(
    if (IsOverdefined)
      dbgs() << "FnSpecialization: Found interesting parameter "
             << A->getNameOrAsOperand() << "\n";
    else
      dbgs() << "FnSpecialization: Nothing to do, parameter "
             << A->getNameOrAsOperand() << " is already constant\n";
  );
  return IsOverdefined;
}

/// Check if the value \p V (an actual argument) is a constant or can only
/// have a constant value. Return that constant.
Constant *FunctionSpecializer::getCandidateConstant(Value *V) {
  if (isa<PoisonValue>(V))
    return nullptr;

  // Select for possible specialisation values that are constants or
  // are deduced to be constants or constant ranges with a single element.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    C = Solver.getConstantOrNull(V);

  // Don't specialize on (anything derived from) the address of a non-constant
  // global variable, unless explicitly enabled.
  if (C && C->getType()->isPointerTy() && !C->isNullValue())
    if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C));
        GV && !(GV->isConstant() || SpecializeOnAddress))
      return nullptr;

  return C;
}
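/// Redirect the remaining (not yet rewritten) call sites of \p F to the
/// highest-scoring matching specialisation among \p Begin .. \p End. If no
/// live call site targets the original function afterwards, mark it
/// unreachable so it can be removed later.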
void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
                                          const Spec *End) {
  // Collect the call sites that need updating.
  SmallVector<CallBase *> ToUpdate;
  for (User *U : F->users())
    if (auto *CS = dyn_cast<CallBase>(U);
        CS && CS->getCalledFunction() == F &&
        Solver.isBlockExecutable(CS->getParent()))
      ToUpdate.push_back(CS);

  unsigned NCallsLeft = ToUpdate.size();
  for (CallBase *CS : ToUpdate) {
    bool ShouldDecrementCount = CS->getFunction() == F;

    // Find the best matching specialisation.
    const Spec *BestSpec = nullptr;
    for (const Spec &S : make_range(Begin, End)) {
      if (!S.Clone || (BestSpec && S.Score <= BestSpec->Score))
        continue;

      if (any_of(S.Sig.Args, [CS, this](const ArgInfo &Arg) {
            unsigned ArgNo = Arg.Formal->getArgNo();
            return getCandidateConstant(CS->getArgOperand(ArgNo)) != Arg.Actual;
          }))
        continue;

      BestSpec = &S;
    }

    if (BestSpec) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *CS
                        << " to call " << BestSpec->Clone->getName() << "\n");
      CS->setCalledFunction(BestSpec->Clone);
      ShouldDecrementCount = true;
    }

    if (ShouldDecrementCount)
      --NCallsLeft;
  }

  // If the function has been completely specialized, the original function
  // is no longer needed. Mark it unreachable.
  if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) {
    Solver.markFunctionUnreachable(F);
    FullySpecialized.insert(F);
  }
}