1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/GlobalsModRef.h" 24 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/DerivedTypes.h" 27 #include "llvm/IR/Dominators.h" 28 #include "llvm/IR/Function.h" 29 #include "llvm/IR/IRBuilder.h" 30 #include "llvm/IR/InstVisitor.h" 31 #include "llvm/IR/InstrTypes.h" 32 #include "llvm/IR/Instruction.h" 33 #include "llvm/IR/Instructions.h" 34 #include "llvm/IR/LLVMContext.h" 35 #include "llvm/IR/PassManager.h" 36 #include "llvm/IR/Type.h" 37 #include "llvm/InitializePasses.h" 38 #include "llvm/Pass.h" 39 #include "llvm/PassRegistry.h" 40 #include "llvm/ProfileData/InstrProf.h" 41 #include "llvm/Support/Casting.h" 42 #include "llvm/Support/CommandLine.h" 43 #include "llvm/Support/Debug.h" 44 #include "llvm/Support/ErrorHandling.h" 45 #include "llvm/Support/MathExtras.h" 46 #include "llvm/Transforms/Instrumentation.h" 47 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 48 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 49 #include <cassert> 50 #include <cstdint> 51 #include <vector> 52 53 using namespace llvm; 54 55 #define DEBUG_TYPE "pgo-memop-opt" 56 57 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 58 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 59 60 // The minimum call count to optimize memory intrinsic calls. 61 static cl::opt<unsigned> 62 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 63 cl::init(1000), 64 cl::desc("The minimum count to optimize memory " 65 "intrinsic calls")); 66 67 // Command line option to disable memory intrinsic optimization. The default is 68 // false. This is for debug purpose. 69 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 70 cl::Hidden, cl::desc("Disable optimize")); 71 72 // The percent threshold to optimize memory intrinsic calls. 73 static cl::opt<unsigned> 74 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 75 cl::Hidden, cl::ZeroOrMore, 76 cl::desc("The percentage threshold for the " 77 "memory intrinsic calls optimization")); 78 79 // Maximum number of versions for optimizing memory intrinsic call. 80 static cl::opt<unsigned> 81 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 82 cl::ZeroOrMore, 83 cl::desc("The max version for the optimized memory " 84 " intrinsic calls")); 85 86 // Scale the counts from the annotation using the BB count value. 87 static cl::opt<bool> 88 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 89 cl::desc("Scale the memop size counts using the basic " 90 " block count value")); 91 92 // This option sets the rangge of precise profile memop sizes. 93 extern cl::opt<std::string> MemOPSizeRange; 94 95 // This option sets the value that groups large memop sizes 96 extern cl::opt<unsigned> MemOPSizeLarge; 97 98 cl::opt<bool> 99 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), 100 cl::Hidden, 101 cl::desc("Size-specialize memcmp and bcmp calls")); 102 103 namespace { 104 class PGOMemOPSizeOptLegacyPass : public FunctionPass { 105 public: 106 static char ID; 107 108 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { 109 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); 110 } 111 112 StringRef getPassName() const override { return "PGOMemOPSize"; } 113 114 private: 115 bool runOnFunction(Function &F) override; 116 void getAnalysisUsage(AnalysisUsage &AU) const override { 117 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 118 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 119 AU.addPreserved<GlobalsAAWrapperPass>(); 120 AU.addPreserved<DominatorTreeWrapperPass>(); 121 AU.addRequired<TargetLibraryInfoWrapperPass>(); 122 } 123 }; 124 } // end anonymous namespace 125 126 char PGOMemOPSizeOptLegacyPass::ID = 0; 127 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 128 "Optimize memory intrinsic using its size value profile", 129 false, false) 130 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 131 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 132 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 133 "Optimize memory intrinsic using its size value profile", 134 false, false) 135 136 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { 137 return new PGOMemOPSizeOptLegacyPass(); 138 } 139 140 namespace { 141 142 static const char *getMIName(const MemIntrinsic *MI) { 143 switch (MI->getIntrinsicID()) { 144 case Intrinsic::memcpy: 145 return "memcpy"; 146 case Intrinsic::memmove: 147 return "memmove"; 148 case Intrinsic::memset: 149 return "memset"; 150 default: 151 return "unknown"; 152 } 153 } 154 155 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). 156 struct MemOp { 157 Instruction *I; 158 MemOp(MemIntrinsic *MI) : I(MI) {} 159 MemOp(CallInst *CI) : I(CI) {} 160 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); } 161 CallInst *asCI() { return cast<CallInst>(I); } 162 MemOp clone() { 163 if (auto MI = asMI()) 164 return MemOp(cast<MemIntrinsic>(MI->clone())); 165 return MemOp(cast<CallInst>(asCI()->clone())); 166 } 167 Value *getLength() { 168 if (auto MI = asMI()) 169 return MI->getLength(); 170 return asCI()->getArgOperand(2); 171 } 172 void setLength(Value *Length) { 173 if (auto MI = asMI()) 174 return MI->setLength(Length); 175 asCI()->setArgOperand(2, Length); 176 } 177 StringRef getFuncName() { 178 if (auto MI = asMI()) 179 return MI->getCalledFunction()->getName(); 180 return asCI()->getCalledFunction()->getName(); 181 } 182 bool isMemmove() { 183 if (auto MI = asMI()) 184 if (MI->getIntrinsicID() == Intrinsic::memmove) 185 return true; 186 return false; 187 } 188 bool isMemcmp(TargetLibraryInfo &TLI) { 189 LibFunc Func; 190 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 191 Func == LibFunc_memcmp) { 192 return true; 193 } 194 return false; 195 } 196 bool isBcmp(TargetLibraryInfo &TLI) { 197 LibFunc Func; 198 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 199 Func == LibFunc_bcmp) { 200 return true; 201 } 202 return false; 203 } 204 const char *getName(TargetLibraryInfo &TLI) { 205 if (auto MI = asMI()) 206 return getMIName(MI); 207 LibFunc Func; 208 if (TLI.getLibFunc(*asCI(), Func)) { 209 if (Func == LibFunc_memcmp) 210 return "memcmp"; 211 if (Func == LibFunc_bcmp) 212 return "bcmp"; 213 } 214 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); 215 return nullptr; 216 } 217 }; 218 219 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 220 public: 221 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 222 OptimizationRemarkEmitter &ORE, DominatorTree *DT, 223 TargetLibraryInfo &TLI) 224 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { 225 ValueDataArray = 226 std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); 227 // Get the MemOPSize range information from option MemOPSizeRange, 228 getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, 229 PreciseRangeLast); 230 } 231 bool isChanged() const { return Changed; } 232 void perform() { 233 WorkList.clear(); 234 visit(Func); 235 236 for (auto &MO : WorkList) { 237 ++NumOfPGOMemOPAnnotate; 238 if (perform(MO)) { 239 Changed = true; 240 ++NumOfPGOMemOPOpt; 241 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() 242 << "is Transformed.\n"); 243 } 244 } 245 } 246 247 void visitMemIntrinsic(MemIntrinsic &MI) { 248 Value *Length = MI.getLength(); 249 // Not perform on constant length calls. 250 if (dyn_cast<ConstantInt>(Length)) 251 return; 252 WorkList.push_back(MemOp(&MI)); 253 } 254 255 void visitCallInst(CallInst &CI) { 256 LibFunc Func; 257 if (TLI.getLibFunc(CI, Func) && 258 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && 259 !dyn_cast<ConstantInt>(CI.getArgOperand(2))) { 260 WorkList.push_back(MemOp(&CI)); 261 } 262 } 263 264 private: 265 Function &Func; 266 BlockFrequencyInfo &BFI; 267 OptimizationRemarkEmitter &ORE; 268 DominatorTree *DT; 269 TargetLibraryInfo &TLI; 270 bool Changed; 271 std::vector<MemOp> WorkList; 272 // Start of the previse range. 273 int64_t PreciseRangeStart; 274 // Last value of the previse range. 275 int64_t PreciseRangeLast; 276 // The space to read the profile annotation. 277 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 278 bool perform(MemOp MO); 279 280 // This kind shows which group the value falls in. For PreciseValue, we have 281 // the profile count for that value. LargeGroup groups the values that are in 282 // range [LargeValue, +inf). NonLargeGroup groups the rest of values. 283 enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup }; 284 285 MemOPSizeKind getMemOPSizeKind(int64_t Value) const { 286 if (Value == MemOPSizeLarge && MemOPSizeLarge != 0) 287 return LargeGroup; 288 if (Value == PreciseRangeLast + 1) 289 return NonLargeGroup; 290 return PreciseValue; 291 } 292 }; 293 294 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 295 assert(Count <= TotalCount); 296 if (Count < MemOPCountThreshold) 297 return false; 298 if (Count < TotalCount * MemOPPercentThreshold / 100) 299 return false; 300 return true; 301 } 302 303 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 304 uint64_t Denom) { 305 if (!MemOPScaleCount) 306 return Count; 307 bool Overflowed; 308 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 309 return ScaleCount / Denom; 310 } 311 312 bool MemOPSizeOpt::perform(MemOp MO) { 313 assert(MO.I); 314 if (MO.isMemmove()) 315 return false; 316 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) 317 return false; 318 319 uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; 320 uint64_t TotalCount; 321 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions, 322 ValueDataArray.get(), NumVals, TotalCount)) 323 return false; 324 325 uint64_t ActualCount = TotalCount; 326 uint64_t SavedTotalCount = TotalCount; 327 if (MemOPScaleCount) { 328 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); 329 if (!BBEdgeCount) 330 return false; 331 ActualCount = *BBEdgeCount; 332 } 333 334 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 335 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 336 << ActualCount << "\n"); 337 LLVM_DEBUG( 338 for (auto &VD 339 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 340 341 if (ActualCount < MemOPCountThreshold) 342 return false; 343 // Skip if the total value profiled count is 0, in which case we can't 344 // scale up the counts properly (and there is no profitable transformation). 345 if (TotalCount == 0) 346 return false; 347 348 TotalCount = ActualCount; 349 if (MemOPScaleCount) 350 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 351 << " denominator = " << SavedTotalCount << "\n"); 352 353 // Keeping track of the count of the default case: 354 uint64_t RemainCount = TotalCount; 355 uint64_t SavedRemainCount = SavedTotalCount; 356 SmallVector<uint64_t, 16> SizeIds; 357 SmallVector<uint64_t, 16> CaseCounts; 358 uint64_t MaxCount = 0; 359 unsigned Version = 0; 360 // Default case is in the front -- save the slot here. 361 CaseCounts.push_back(0); 362 for (auto &VD : VDs) { 363 int64_t V = VD.Value; 364 uint64_t C = VD.Count; 365 if (MemOPScaleCount) 366 C = getScaledCount(C, ActualCount, SavedTotalCount); 367 368 // Only care precise value here. 369 if (getMemOPSizeKind(V) != PreciseValue) 370 continue; 371 372 // ValueCounts are sorted on the count. Break at the first un-profitable 373 // value. 374 if (!isProfitable(C, RemainCount)) 375 break; 376 377 SizeIds.push_back(V); 378 CaseCounts.push_back(C); 379 if (C > MaxCount) 380 MaxCount = C; 381 382 assert(RemainCount >= C); 383 RemainCount -= C; 384 assert(SavedRemainCount >= VD.Count); 385 SavedRemainCount -= VD.Count; 386 387 if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) 388 break; 389 } 390 391 if (Version == 0) 392 return false; 393 394 CaseCounts[0] = RemainCount; 395 if (RemainCount > MaxCount) 396 MaxCount = RemainCount; 397 398 uint64_t SumForOpt = TotalCount - RemainCount; 399 400 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 401 << " Versions (covering " << SumForOpt << " out of " 402 << TotalCount << ")\n"); 403 404 // mem_op(..., size) 405 // ==> 406 // switch (size) { 407 // case s1: 408 // mem_op(..., s1); 409 // goto merge_bb; 410 // case s2: 411 // mem_op(..., s2); 412 // goto merge_bb; 413 // ... 414 // default: 415 // mem_op(..., size); 416 // goto merge_bb; 417 // } 418 // merge_bb: 419 420 BasicBlock *BB = MO.I->getParent(); 421 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 422 LLVM_DEBUG(dbgs() << *BB << "\n"); 423 auto OrigBBFreq = BFI.getBlockFreq(BB); 424 425 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); 426 BasicBlock::iterator It(*MO.I); 427 ++It; 428 assert(It != DefaultBB->end()); 429 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 430 MergeBB->setName("MemOP.Merge"); 431 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 432 DefaultBB->setName("MemOP.Default"); 433 434 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 435 auto &Ctx = Func.getContext(); 436 IRBuilder<> IRB(BB); 437 BB->getTerminator()->eraseFromParent(); 438 Value *SizeVar = MO.getLength(); 439 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 440 Type *MemOpTy = MO.I->getType(); 441 PHINode *PHI = nullptr; 442 if (!MemOpTy->isVoidTy()) { 443 // Insert a phi for the return values at the merge block. 444 IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); 445 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); 446 MO.I->replaceAllUsesWith(PHI); 447 PHI->addIncoming(MO.I, DefaultBB); 448 } 449 450 // Clear the value profile data. 451 MO.I->setMetadata(LLVMContext::MD_prof, nullptr); 452 // If all promoted, we don't need the MD.prof metadata. 453 if (SavedRemainCount > 0 || Version != NumVals) 454 // Otherwise we need update with the un-promoted records back. 455 annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version), 456 SavedRemainCount, IPVK_MemOPSize, NumVals); 457 458 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 459 460 std::vector<DominatorTree::UpdateType> Updates; 461 if (DT) 462 Updates.reserve(2 * SizeIds.size()); 463 464 for (uint64_t SizeId : SizeIds) { 465 BasicBlock *CaseBB = BasicBlock::Create( 466 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 467 MemOp NewMO = MO.clone(); 468 // Fix the argument. 469 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType()); 470 assert(SizeType && "Expected integer type size argument."); 471 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 472 NewMO.setLength(CaseSizeId); 473 CaseBB->getInstList().push_back(NewMO.I); 474 IRBuilder<> IRBCase(CaseBB); 475 IRBCase.CreateBr(MergeBB); 476 SI->addCase(CaseSizeId, CaseBB); 477 if (!MemOpTy->isVoidTy()) 478 PHI->addIncoming(NewMO.I, CaseBB); 479 if (DT) { 480 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 481 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 482 } 483 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 484 } 485 DTU.applyUpdates(Updates); 486 Updates.clear(); 487 488 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 489 490 LLVM_DEBUG(dbgs() << *BB << "\n"); 491 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 492 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 493 494 ORE.emit([&]() { 495 using namespace ore; 496 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) 497 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " 498 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) 499 << " for " << NV("Versions", Version) << " versions"; 500 }); 501 502 return true; 503 } 504 } // namespace 505 506 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 507 OptimizationRemarkEmitter &ORE, 508 DominatorTree *DT, TargetLibraryInfo &TLI) { 509 if (DisableMemOPOPT) 510 return false; 511 512 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 513 return false; 514 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); 515 MemOPSizeOpt.perform(); 516 return MemOPSizeOpt.isChanged(); 517 } 518 519 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { 520 BlockFrequencyInfo &BFI = 521 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 522 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 523 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 524 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; 525 TargetLibraryInfo &TLI = 526 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); 527 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 528 } 529 530 namespace llvm { 531 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; 532 533 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 534 FunctionAnalysisManager &FAM) { 535 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 536 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 537 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 538 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); 539 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 540 if (!Changed) 541 return PreservedAnalyses::all(); 542 auto PA = PreservedAnalyses(); 543 PA.preserve<GlobalsAA>(); 544 PA.preserve<DominatorTreeAnalysis>(); 545 return PA; 546 } 547 } // namespace llvm 548