1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/GlobalsModRef.h" 24 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 25 #include "llvm/Analysis/TargetLibraryInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/InstVisitor.h" 32 #include "llvm/IR/InstrTypes.h" 33 #include "llvm/IR/Instruction.h" 34 #include "llvm/IR/Instructions.h" 35 #include "llvm/IR/LLVMContext.h" 36 #include "llvm/IR/PassManager.h" 37 #include "llvm/IR/Type.h" 38 #include "llvm/InitializePasses.h" 39 #include "llvm/Pass.h" 40 #include "llvm/PassRegistry.h" 41 #include "llvm/ProfileData/InstrProf.h" 42 #define INSTR_PROF_VALUE_PROF_MEMOP_API 43 #include "llvm/ProfileData/InstrProfData.inc" 44 #include "llvm/Support/Casting.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/Debug.h" 47 #include "llvm/Support/ErrorHandling.h" 48 #include "llvm/Support/MathExtras.h" 49 #include "llvm/Transforms/Instrumentation.h" 50 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 51 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 52 #include <cassert> 53 #include <cstdint> 54 #include <vector> 55 56 using namespace llvm; 57 58 #define DEBUG_TYPE "pgo-memop-opt" 59 60 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 61 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 62 63 // The minimum call count to optimize memory intrinsic calls. 64 static cl::opt<unsigned> 65 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 66 cl::init(1000), 67 cl::desc("The minimum count to optimize memory " 68 "intrinsic calls")); 69 70 // Command line option to disable memory intrinsic optimization. The default is 71 // false. This is for debug purpose. 72 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 73 cl::Hidden, cl::desc("Disable optimize")); 74 75 // The percent threshold to optimize memory intrinsic calls. 76 static cl::opt<unsigned> 77 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 78 cl::Hidden, cl::ZeroOrMore, 79 cl::desc("The percentage threshold for the " 80 "memory intrinsic calls optimization")); 81 82 // Maximum number of versions for optimizing memory intrinsic call. 83 static cl::opt<unsigned> 84 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 85 cl::ZeroOrMore, 86 cl::desc("The max version for the optimized memory " 87 " intrinsic calls")); 88 89 // Scale the counts from the annotation using the BB count value. 90 static cl::opt<bool> 91 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 92 cl::desc("Scale the memop size counts using the basic " 93 " block count value")); 94 95 cl::opt<bool> 96 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), 97 cl::Hidden, 98 cl::desc("Size-specialize memcmp and bcmp calls")); 99 100 static cl::opt<unsigned> 101 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), 102 cl::desc("Optimize the memop size <= this value")); 103 104 namespace { 105 class PGOMemOPSizeOptLegacyPass : public FunctionPass { 106 public: 107 static char ID; 108 109 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { 110 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); 111 } 112 113 StringRef getPassName() const override { return "PGOMemOPSize"; } 114 115 private: 116 bool runOnFunction(Function &F) override; 117 void getAnalysisUsage(AnalysisUsage &AU) const override { 118 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 119 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 120 AU.addPreserved<GlobalsAAWrapperPass>(); 121 AU.addPreserved<DominatorTreeWrapperPass>(); 122 AU.addRequired<TargetLibraryInfoWrapperPass>(); 123 } 124 }; 125 } // end anonymous namespace 126 127 char PGOMemOPSizeOptLegacyPass::ID = 0; 128 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 129 "Optimize memory intrinsic using its size value profile", 130 false, false) 131 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 132 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 133 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 134 "Optimize memory intrinsic using its size value profile", 135 false, false) 136 137 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { 138 return new PGOMemOPSizeOptLegacyPass(); 139 } 140 141 namespace { 142 143 static const char *getMIName(const MemIntrinsic *MI) { 144 switch (MI->getIntrinsicID()) { 145 case Intrinsic::memcpy: 146 return "memcpy"; 147 case Intrinsic::memmove: 148 return "memmove"; 149 case Intrinsic::memset: 150 return "memset"; 151 default: 152 return "unknown"; 153 } 154 } 155 156 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). 157 struct MemOp { 158 Instruction *I; 159 MemOp(MemIntrinsic *MI) : I(MI) {} 160 MemOp(CallInst *CI) : I(CI) {} 161 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); } 162 CallInst *asCI() { return cast<CallInst>(I); } 163 MemOp clone() { 164 if (auto MI = asMI()) 165 return MemOp(cast<MemIntrinsic>(MI->clone())); 166 return MemOp(cast<CallInst>(asCI()->clone())); 167 } 168 Value *getLength() { 169 if (auto MI = asMI()) 170 return MI->getLength(); 171 return asCI()->getArgOperand(2); 172 } 173 void setLength(Value *Length) { 174 if (auto MI = asMI()) 175 return MI->setLength(Length); 176 asCI()->setArgOperand(2, Length); 177 } 178 StringRef getFuncName() { 179 if (auto MI = asMI()) 180 return MI->getCalledFunction()->getName(); 181 return asCI()->getCalledFunction()->getName(); 182 } 183 bool isMemmove() { 184 if (auto MI = asMI()) 185 if (MI->getIntrinsicID() == Intrinsic::memmove) 186 return true; 187 return false; 188 } 189 bool isMemcmp(TargetLibraryInfo &TLI) { 190 LibFunc Func; 191 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 192 Func == LibFunc_memcmp) { 193 return true; 194 } 195 return false; 196 } 197 bool isBcmp(TargetLibraryInfo &TLI) { 198 LibFunc Func; 199 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 200 Func == LibFunc_bcmp) { 201 return true; 202 } 203 return false; 204 } 205 const char *getName(TargetLibraryInfo &TLI) { 206 if (auto MI = asMI()) 207 return getMIName(MI); 208 LibFunc Func; 209 if (TLI.getLibFunc(*asCI(), Func)) { 210 if (Func == LibFunc_memcmp) 211 return "memcmp"; 212 if (Func == LibFunc_bcmp) 213 return "bcmp"; 214 } 215 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); 216 return nullptr; 217 } 218 }; 219 220 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 221 public: 222 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 223 OptimizationRemarkEmitter &ORE, DominatorTree *DT, 224 TargetLibraryInfo &TLI) 225 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { 226 ValueDataArray = 227 std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); 228 } 229 bool isChanged() const { return Changed; } 230 void perform() { 231 WorkList.clear(); 232 visit(Func); 233 234 for (auto &MO : WorkList) { 235 ++NumOfPGOMemOPAnnotate; 236 if (perform(MO)) { 237 Changed = true; 238 ++NumOfPGOMemOPOpt; 239 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() 240 << "is Transformed.\n"); 241 } 242 } 243 } 244 245 void visitMemIntrinsic(MemIntrinsic &MI) { 246 Value *Length = MI.getLength(); 247 // Not perform on constant length calls. 248 if (dyn_cast<ConstantInt>(Length)) 249 return; 250 WorkList.push_back(MemOp(&MI)); 251 } 252 253 void visitCallInst(CallInst &CI) { 254 LibFunc Func; 255 if (TLI.getLibFunc(CI, Func) && 256 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && 257 !isa<ConstantInt>(CI.getArgOperand(2))) { 258 WorkList.push_back(MemOp(&CI)); 259 } 260 } 261 262 private: 263 Function &Func; 264 BlockFrequencyInfo &BFI; 265 OptimizationRemarkEmitter &ORE; 266 DominatorTree *DT; 267 TargetLibraryInfo &TLI; 268 bool Changed; 269 std::vector<MemOp> WorkList; 270 // The space to read the profile annotation. 271 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 272 bool perform(MemOp MO); 273 }; 274 275 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 276 assert(Count <= TotalCount); 277 if (Count < MemOPCountThreshold) 278 return false; 279 if (Count < TotalCount * MemOPPercentThreshold / 100) 280 return false; 281 return true; 282 } 283 284 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 285 uint64_t Denom) { 286 if (!MemOPScaleCount) 287 return Count; 288 bool Overflowed; 289 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 290 return ScaleCount / Denom; 291 } 292 293 bool MemOPSizeOpt::perform(MemOp MO) { 294 assert(MO.I); 295 if (MO.isMemmove()) 296 return false; 297 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) 298 return false; 299 300 uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; 301 uint64_t TotalCount; 302 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions, 303 ValueDataArray.get(), NumVals, TotalCount)) 304 return false; 305 306 uint64_t ActualCount = TotalCount; 307 uint64_t SavedTotalCount = TotalCount; 308 if (MemOPScaleCount) { 309 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); 310 if (!BBEdgeCount) 311 return false; 312 ActualCount = *BBEdgeCount; 313 } 314 315 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 316 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 317 << ActualCount << "\n"); 318 LLVM_DEBUG( 319 for (auto &VD 320 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 321 322 if (ActualCount < MemOPCountThreshold) 323 return false; 324 // Skip if the total value profiled count is 0, in which case we can't 325 // scale up the counts properly (and there is no profitable transformation). 326 if (TotalCount == 0) 327 return false; 328 329 TotalCount = ActualCount; 330 if (MemOPScaleCount) 331 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 332 << " denominator = " << SavedTotalCount << "\n"); 333 334 // Keeping track of the count of the default case: 335 uint64_t RemainCount = TotalCount; 336 uint64_t SavedRemainCount = SavedTotalCount; 337 SmallVector<uint64_t, 16> SizeIds; 338 SmallVector<uint64_t, 16> CaseCounts; 339 uint64_t MaxCount = 0; 340 unsigned Version = 0; 341 // Default case is in the front -- save the slot here. 342 CaseCounts.push_back(0); 343 for (auto &VD : VDs) { 344 int64_t V = VD.Value; 345 uint64_t C = VD.Count; 346 if (MemOPScaleCount) 347 C = getScaledCount(C, ActualCount, SavedTotalCount); 348 349 if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) 350 continue; 351 352 // ValueCounts are sorted on the count. Break at the first un-profitable 353 // value. 354 if (!isProfitable(C, RemainCount)) 355 break; 356 357 SizeIds.push_back(V); 358 CaseCounts.push_back(C); 359 if (C > MaxCount) 360 MaxCount = C; 361 362 assert(RemainCount >= C); 363 RemainCount -= C; 364 assert(SavedRemainCount >= VD.Count); 365 SavedRemainCount -= VD.Count; 366 367 if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) 368 break; 369 } 370 371 if (Version == 0) 372 return false; 373 374 CaseCounts[0] = RemainCount; 375 if (RemainCount > MaxCount) 376 MaxCount = RemainCount; 377 378 uint64_t SumForOpt = TotalCount - RemainCount; 379 380 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 381 << " Versions (covering " << SumForOpt << " out of " 382 << TotalCount << ")\n"); 383 384 // mem_op(..., size) 385 // ==> 386 // switch (size) { 387 // case s1: 388 // mem_op(..., s1); 389 // goto merge_bb; 390 // case s2: 391 // mem_op(..., s2); 392 // goto merge_bb; 393 // ... 394 // default: 395 // mem_op(..., size); 396 // goto merge_bb; 397 // } 398 // merge_bb: 399 400 BasicBlock *BB = MO.I->getParent(); 401 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 402 LLVM_DEBUG(dbgs() << *BB << "\n"); 403 auto OrigBBFreq = BFI.getBlockFreq(BB); 404 405 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); 406 BasicBlock::iterator It(*MO.I); 407 ++It; 408 assert(It != DefaultBB->end()); 409 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 410 MergeBB->setName("MemOP.Merge"); 411 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 412 DefaultBB->setName("MemOP.Default"); 413 414 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 415 auto &Ctx = Func.getContext(); 416 IRBuilder<> IRB(BB); 417 BB->getTerminator()->eraseFromParent(); 418 Value *SizeVar = MO.getLength(); 419 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 420 Type *MemOpTy = MO.I->getType(); 421 PHINode *PHI = nullptr; 422 if (!MemOpTy->isVoidTy()) { 423 // Insert a phi for the return values at the merge block. 424 IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); 425 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); 426 MO.I->replaceAllUsesWith(PHI); 427 PHI->addIncoming(MO.I, DefaultBB); 428 } 429 430 // Clear the value profile data. 431 MO.I->setMetadata(LLVMContext::MD_prof, nullptr); 432 // If all promoted, we don't need the MD.prof metadata. 433 if (SavedRemainCount > 0 || Version != NumVals) 434 // Otherwise we need update with the un-promoted records back. 435 annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version), 436 SavedRemainCount, IPVK_MemOPSize, NumVals); 437 438 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 439 440 std::vector<DominatorTree::UpdateType> Updates; 441 if (DT) 442 Updates.reserve(2 * SizeIds.size()); 443 444 for (uint64_t SizeId : SizeIds) { 445 BasicBlock *CaseBB = BasicBlock::Create( 446 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 447 MemOp NewMO = MO.clone(); 448 // Fix the argument. 449 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType()); 450 assert(SizeType && "Expected integer type size argument."); 451 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 452 NewMO.setLength(CaseSizeId); 453 CaseBB->getInstList().push_back(NewMO.I); 454 IRBuilder<> IRBCase(CaseBB); 455 IRBCase.CreateBr(MergeBB); 456 SI->addCase(CaseSizeId, CaseBB); 457 if (!MemOpTy->isVoidTy()) 458 PHI->addIncoming(NewMO.I, CaseBB); 459 if (DT) { 460 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 461 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 462 } 463 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 464 } 465 DTU.applyUpdates(Updates); 466 Updates.clear(); 467 468 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 469 470 LLVM_DEBUG(dbgs() << *BB << "\n"); 471 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 472 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 473 474 ORE.emit([&]() { 475 using namespace ore; 476 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) 477 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " 478 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) 479 << " for " << NV("Versions", Version) << " versions"; 480 }); 481 482 return true; 483 } 484 } // namespace 485 486 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 487 OptimizationRemarkEmitter &ORE, 488 DominatorTree *DT, TargetLibraryInfo &TLI) { 489 if (DisableMemOPOPT) 490 return false; 491 492 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 493 return false; 494 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); 495 MemOPSizeOpt.perform(); 496 return MemOPSizeOpt.isChanged(); 497 } 498 499 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { 500 BlockFrequencyInfo &BFI = 501 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 502 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 503 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 504 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; 505 TargetLibraryInfo &TLI = 506 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); 507 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 508 } 509 510 namespace llvm { 511 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; 512 513 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 514 FunctionAnalysisManager &FAM) { 515 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 516 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 517 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 518 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); 519 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 520 if (!Changed) 521 return PreservedAnalyses::all(); 522 auto PA = PreservedAnalyses(); 523 PA.preserve<GlobalsAA>(); 524 PA.preserve<DominatorTreeAnalysis>(); 525 return PA; 526 } 527 } // namespace llvm 528