Lines Matching +full:compute +full:- +full:cb

1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
63 #define DEBUG_TYPE "partial-inlining"
75 // Command line option to disable partial-inlining. The default is false:
77 DisablePartialInlining("disable-partial-inlining", cl::init(false),
79 // Command line option to disable multi-region partial-inlining. The default is
82 "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
83 cl::desc("Disable multi-region partial inlining"));
88 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
94 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
98 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
107 "min-region-size-ratio", cl::init(0.1), cl::Hidden,
113 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
119 "cold-branch-ratio", cl::init(0.1), cl::Hidden,
123 "max-num-inline-blocks", cl::init(5), cl::Hidden,
127 // for the module. The default value of -1 means no limit.
129 "max-partial-inlining", cl::init(-1), cl::Hidden,
136 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
142 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
215 // multi-region outlining.
254 // ClonedOI is specific to outlining non-early return blocks.
280 // Return true if the callee of CB should be partially inlined with
282 bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
291 // Compute the mapping from use site of DuplicateFunction to the enclosing
298 return (MaxNumPartialInlining != -1 &&
315 CallBase *CB = getOneCallSiteTo(F);
316 DebugLoc DLoc = CB->getDebugLoc();
317 BasicBlock *Block = CB->getParent();
322 // - The first value is the non-weighted runtime cost for making the call
325 // - The second value is the estimated size of the new call sequence in
330 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
370 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
378 &Succ->front())
380 << ore::NV("Block", BlockList.front()->getName())
394 return BFI->getBlockProfileCount(BB).value_or(0);
397 // Use the same computeBBInlineCost function to compute the cost savings of
430 // not-cold (default: part of the top 99.99% of all block counters)
445 LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
446 << SI->getName()
455 if (!DominateVector.front()->hasNPredecessors(1)) {
456 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
465 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
472 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
480 &SI->front())
482 << " inline cost-savings smaller than "
493 // at inner regions because the outer region may have live-exit
499 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
502 OutliningInfo->ORI.push_back(RegInfo);
504 << DominateVector.front()->getName() << "\n";);
519 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
520 if (!BR || BR->isUnconditional())
529 Instruction *TI = BB->getTerminator();
561 if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
574 OutliningInfo->Entries.push_back(CurrEntry);
575 OutliningInfo->ReturnBlock = ReturnBlock;
576 OutliningInfo->NonReturnBlock = NonReturnBlock;
587 OutliningInfo->Entries.push_back(CurrEntry);
596 assert(OutliningInfo->Entries[0] == &F.front() &&
599 for (BasicBlock *E : OutliningInfo->Entries)
613 for (BasicBlock *E : OutliningInfo->Entries) {
617 if (Succ == OutliningInfo->ReturnBlock)
618 OutliningInfo->ReturnBlockPreds.push_back(E);
619 else if (Succ != OutliningInfo->NonReturnBlock)
634 while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
635 BasicBlock *Cand = OutliningInfo->NonReturnBlock;
647 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
650 if (NonReturnBlock->getSinglePredecessor() != Cand)
654 OutliningInfo->Entries.push_back(Cand);
655 OutliningInfo->NonReturnBlock = NonReturnBlock;
656 OutliningInfo->ReturnBlockPreds.push_back(Cand);
669 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
670 if (!BR || BR->isUnconditional())
682 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
684 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
699 // guess the branch direction right (taken/non-taken), but the guessed
702 // to be made higher (more biased) to not under-estimate the cost of
719 CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
723 Function *Callee = CB.getCalledFunction();
729 Function *Caller = CB.getCaller();
732 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
735 getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
740 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
749 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
759 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
767 const DataLayout &DL = Caller->getDataLayout();
770 int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL);
777 &CB)
791 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
807 const DataLayout &DL = BB->getDataLayout();
809 for (Instruction &I : BB->instructionsWithoutDebug()) {
819 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
830 Intrinsic::ID IID = II->getIntrinsicID();
833 for (Value *Val : II->args())
834 Tys.push_back(Val->getType());
837 FMF = FPMO->getFastMathFlags();
839 IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
840 InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
855 InlineCost += (SI->getNumCases() + 1) * InstrCost;
870 // Now compute the cost of the call sequence to the outlined function
876 // Now compute the cost of the extracted/outlined function itself:
886 OutlinedFunctionCost -=
891 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
903 std::vector<User *> Users(DuplicateFunction->user_begin(),
904 DuplicateFunction->user_end());
927 CallBase *CB = getSupportedCallBase(User);
928 Function *Caller = CB->getCaller();
935 BasicBlock *CallBB = CB->getParent();
936 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
955 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
956 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
957 for (BasicBlock *BB : OI->Entries)
958 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
960 for (BasicBlock *E : OI->ReturnBlockPreds) {
962 ClonedOI->ReturnBlockPreds.push_back(NewE);
966 F->replaceAllUsesWith(ClonedFunc);
984 OI->ORI) {
996 ClonedOMRI->ORI.push_back(MappedRegionInfo);
1000 F->replaceAllUsesWith(ClonedFunc);
1005 BasicBlock::iterator I = BB->begin();
1007 while (I != BB->end()) {
1019 // Shouldn't need to normalize PHIs if we're not outlining non-early return
1025 // one extracted block. For simplicity, just split the PHIs into a two-level
1028 BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1031 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1033 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1036 auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1037 if (llvm::all_equal(PN->incoming_values()))
1038 return PN->getIncomingValue(0);
1042 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1043 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1044 BasicBlock::iterator I = PreReturn->begin();
1045 BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin();
1047 while (I != PreReturn->end()) {
1053 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "");
1054 RetPhi->insertBefore(Ins);
1055 OldPhi->replaceAllUsesWith(RetPhi);
1056 Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();
1058 RetPhi->addIncoming(&*I, PreReturn);
1059 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1060 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1061 OldPhi->removeIncomingValue(E);
1066 // region which is live-out, causing unnecessary overhead (load, store
1069 OldPhi->replaceAllUsesWith(OldPhiVal);
1075 DP->eraseFromParent();
1077 for (auto *E : ClonedOI->ReturnBlockPreds)
1078 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1084 [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
1087 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1093 if (ClonedOMRI->ORI.empty())
1105 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1110 ClonedOMRI->ORI) {
1116 LookupAC(*RegionInfo.EntryBlock->getParent()),
1136 BasicBlock *OutliningCallBB = OCS->getParent();
1137 assert(OutliningCallBB->getParent() == ClonedFunc);
1143 OutlinedFunc->setCallingConv(CallingConv::Cold);
1144 OCS->setCallingConv(CallingConv::Cold);
1149 &RegionInfo.Region.front()->front())
1163 return BB == ClonedOI->ReturnBlock ||
1164 llvm::is_contained(ClonedOI->Entries, BB);
1180 ToExtract.push_back(ClonedOI->NonReturnBlock);
1182 ClonedOI->NonReturnBlock, ClonedFuncTTI);
1183 for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock()))
1184 if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
1203 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1204 assert(OutliningCallBB->getParent() == ClonedFunc);
1209 &ToExtract.front()->front())
1220 ClonedFunc->replaceAllUsesWith(OrigFunc);
1221 ClonedFunc->eraseFromParent();
1227 Func->eraseFromParent();
1271 Cloner.ClonedFunc->print(dbgs());
1281 // Fall-thru to regular partial inlining if we:
1351 assert(Cloner.OrigFunc->users().empty() &&
1354 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1355 Cloner.ClonedFunc->user_end());
1358 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1363 (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1371 CallBase *CB = getSupportedCallBase(User);
1376 OptimizationRemarkEmitter CallerORE(CB->getCaller());
1377 if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1382 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1384 << ore::NV("Caller", CB->getCaller());
1389 if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
1400 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1415 Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1416 CalleeEntryCountV, CalleeEntryCount->getType()));
1442 if (CurrFunc->use_empty())
1458 auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1462 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1466 auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1470 auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1474 auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {