1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/GlobalsModRef.h" 24 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 25 #include "llvm/Analysis/TargetLibraryInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/InstVisitor.h" 32 #include "llvm/IR/InstrTypes.h" 33 #include "llvm/IR/Instruction.h" 34 #include "llvm/IR/Instructions.h" 35 #include "llvm/IR/LLVMContext.h" 36 #include "llvm/IR/PassManager.h" 37 #include "llvm/IR/Type.h" 38 #include "llvm/InitializePasses.h" 39 #include "llvm/Pass.h" 40 #include "llvm/PassRegistry.h" 41 #include "llvm/ProfileData/InstrProf.h" 42 #define INSTR_PROF_VALUE_PROF_MEMOP_API 43 #include "llvm/ProfileData/InstrProfData.inc" 44 #include "llvm/Support/Casting.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/Debug.h" 47 #include "llvm/Support/ErrorHandling.h" 48 #include "llvm/Support/MathExtras.h" 49 #include "llvm/Support/WithColor.h" 50 #include "llvm/Transforms/Instrumentation.h" 51 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 52 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 53 #include <cassert> 54 #include <cstdint> 55 #include <vector> 56 57 using namespace llvm; 58 59 #define DEBUG_TYPE "pgo-memop-opt" 60 61 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 62 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 63 64 // The minimum call count to optimize memory intrinsic calls. 65 static cl::opt<unsigned> 66 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 67 cl::init(1000), 68 cl::desc("The minimum count to optimize memory " 69 "intrinsic calls")); 70 71 // Command line option to disable memory intrinsic optimization. The default is 72 // false. This is for debug purpose. 73 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 74 cl::Hidden, cl::desc("Disable optimize")); 75 76 // The percent threshold to optimize memory intrinsic calls. 77 static cl::opt<unsigned> 78 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 79 cl::Hidden, cl::ZeroOrMore, 80 cl::desc("The percentage threshold for the " 81 "memory intrinsic calls optimization")); 82 83 // Maximum number of versions for optimizing memory intrinsic call. 84 static cl::opt<unsigned> 85 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 86 cl::ZeroOrMore, 87 cl::desc("The max version for the optimized memory " 88 " intrinsic calls")); 89 90 // Scale the counts from the annotation using the BB count value. 91 static cl::opt<bool> 92 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 93 cl::desc("Scale the memop size counts using the basic " 94 " block count value")); 95 96 cl::opt<bool> 97 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), 98 cl::Hidden, 99 cl::desc("Size-specialize memcmp and bcmp calls")); 100 101 static cl::opt<unsigned> 102 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), 103 cl::desc("Optimize the memop size <= this value")); 104 105 namespace { 106 class PGOMemOPSizeOptLegacyPass : public FunctionPass { 107 public: 108 static char ID; 109 110 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { 111 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); 112 } 113 114 StringRef getPassName() const override { return "PGOMemOPSize"; } 115 116 private: 117 bool runOnFunction(Function &F) override; 118 void getAnalysisUsage(AnalysisUsage &AU) const override { 119 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 120 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 121 AU.addPreserved<GlobalsAAWrapperPass>(); 122 AU.addPreserved<DominatorTreeWrapperPass>(); 123 AU.addRequired<TargetLibraryInfoWrapperPass>(); 124 } 125 }; 126 } // end anonymous namespace 127 128 char PGOMemOPSizeOptLegacyPass::ID = 0; 129 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 130 "Optimize memory intrinsic using its size value profile", 131 false, false) 132 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 133 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 134 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 135 "Optimize memory intrinsic using its size value profile", 136 false, false) 137 138 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { 139 return new PGOMemOPSizeOptLegacyPass(); 140 } 141 142 namespace { 143 144 static const char *getMIName(const MemIntrinsic *MI) { 145 switch (MI->getIntrinsicID()) { 146 case Intrinsic::memcpy: 147 return "memcpy"; 148 case Intrinsic::memmove: 149 return "memmove"; 150 case Intrinsic::memset: 151 return "memset"; 152 default: 153 return "unknown"; 154 } 155 } 156 157 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). 158 struct MemOp { 159 Instruction *I; 160 MemOp(MemIntrinsic *MI) : I(MI) {} 161 MemOp(CallInst *CI) : I(CI) {} 162 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); } 163 CallInst *asCI() { return cast<CallInst>(I); } 164 MemOp clone() { 165 if (auto MI = asMI()) 166 return MemOp(cast<MemIntrinsic>(MI->clone())); 167 return MemOp(cast<CallInst>(asCI()->clone())); 168 } 169 Value *getLength() { 170 if (auto MI = asMI()) 171 return MI->getLength(); 172 return asCI()->getArgOperand(2); 173 } 174 void setLength(Value *Length) { 175 if (auto MI = asMI()) 176 return MI->setLength(Length); 177 asCI()->setArgOperand(2, Length); 178 } 179 StringRef getFuncName() { 180 if (auto MI = asMI()) 181 return MI->getCalledFunction()->getName(); 182 return asCI()->getCalledFunction()->getName(); 183 } 184 bool isMemmove() { 185 if (auto MI = asMI()) 186 if (MI->getIntrinsicID() == Intrinsic::memmove) 187 return true; 188 return false; 189 } 190 bool isMemcmp(TargetLibraryInfo &TLI) { 191 LibFunc Func; 192 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 193 Func == LibFunc_memcmp) { 194 return true; 195 } 196 return false; 197 } 198 bool isBcmp(TargetLibraryInfo &TLI) { 199 LibFunc Func; 200 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 201 Func == LibFunc_bcmp) { 202 return true; 203 } 204 return false; 205 } 206 const char *getName(TargetLibraryInfo &TLI) { 207 if (auto MI = asMI()) 208 return getMIName(MI); 209 LibFunc Func; 210 if (TLI.getLibFunc(*asCI(), Func)) { 211 if (Func == LibFunc_memcmp) 212 return "memcmp"; 213 if (Func == LibFunc_bcmp) 214 return "bcmp"; 215 } 216 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); 217 return nullptr; 218 } 219 }; 220 221 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 222 public: 223 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 224 OptimizationRemarkEmitter &ORE, DominatorTree *DT, 225 TargetLibraryInfo &TLI) 226 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { 227 ValueDataArray = 228 std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS); 229 } 230 bool isChanged() const { return Changed; } 231 void perform() { 232 WorkList.clear(); 233 visit(Func); 234 235 for (auto &MO : WorkList) { 236 ++NumOfPGOMemOPAnnotate; 237 if (perform(MO)) { 238 Changed = true; 239 ++NumOfPGOMemOPOpt; 240 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() 241 << "is Transformed.\n"); 242 } 243 } 244 } 245 246 void visitMemIntrinsic(MemIntrinsic &MI) { 247 Value *Length = MI.getLength(); 248 // Not perform on constant length calls. 249 if (isa<ConstantInt>(Length)) 250 return; 251 WorkList.push_back(MemOp(&MI)); 252 } 253 254 void visitCallInst(CallInst &CI) { 255 LibFunc Func; 256 if (TLI.getLibFunc(CI, Func) && 257 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && 258 !isa<ConstantInt>(CI.getArgOperand(2))) { 259 WorkList.push_back(MemOp(&CI)); 260 } 261 } 262 263 private: 264 Function &Func; 265 BlockFrequencyInfo &BFI; 266 OptimizationRemarkEmitter &ORE; 267 DominatorTree *DT; 268 TargetLibraryInfo &TLI; 269 bool Changed; 270 std::vector<MemOp> WorkList; 271 // The space to read the profile annotation. 272 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 273 bool perform(MemOp MO); 274 }; 275 276 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 277 assert(Count <= TotalCount); 278 if (Count < MemOPCountThreshold) 279 return false; 280 if (Count < TotalCount * MemOPPercentThreshold / 100) 281 return false; 282 return true; 283 } 284 285 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 286 uint64_t Denom) { 287 if (!MemOPScaleCount) 288 return Count; 289 bool Overflowed; 290 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 291 return ScaleCount / Denom; 292 } 293 294 bool MemOPSizeOpt::perform(MemOp MO) { 295 assert(MO.I); 296 if (MO.isMemmove()) 297 return false; 298 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) 299 return false; 300 301 uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS; 302 uint64_t TotalCount; 303 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, 304 ValueDataArray.get(), NumVals, TotalCount)) 305 return false; 306 307 uint64_t ActualCount = TotalCount; 308 uint64_t SavedTotalCount = TotalCount; 309 if (MemOPScaleCount) { 310 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); 311 if (!BBEdgeCount) 312 return false; 313 ActualCount = *BBEdgeCount; 314 } 315 316 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 317 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 318 << ActualCount << "\n"); 319 LLVM_DEBUG( 320 for (auto &VD 321 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 322 323 if (ActualCount < MemOPCountThreshold) 324 return false; 325 // Skip if the total value profiled count is 0, in which case we can't 326 // scale up the counts properly (and there is no profitable transformation). 327 if (TotalCount == 0) 328 return false; 329 330 TotalCount = ActualCount; 331 if (MemOPScaleCount) 332 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 333 << " denominator = " << SavedTotalCount << "\n"); 334 335 // Keeping track of the count of the default case: 336 uint64_t RemainCount = TotalCount; 337 uint64_t SavedRemainCount = SavedTotalCount; 338 SmallVector<uint64_t, 16> SizeIds; 339 SmallVector<uint64_t, 16> CaseCounts; 340 uint64_t MaxCount = 0; 341 unsigned Version = 0; 342 int64_t LastV = -1; 343 // Default case is in the front -- save the slot here. 344 CaseCounts.push_back(0); 345 SmallVector<InstrProfValueData, 24> RemainingVDs; 346 for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { 347 auto &VD = *I; 348 int64_t V = VD.Value; 349 uint64_t C = VD.Count; 350 if (MemOPScaleCount) 351 C = getScaledCount(C, ActualCount, SavedTotalCount); 352 353 if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) { 354 RemainingVDs.push_back(VD); 355 continue; 356 } 357 358 // ValueCounts are sorted on the count. Break at the first un-profitable 359 // value. 360 if (!isProfitable(C, RemainCount)) { 361 RemainingVDs.insert(RemainingVDs.end(), I, E); 362 break; 363 } 364 365 if (V == LastV) { 366 LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName() 367 << ": Two consecutive, identical values in MemOp value" 368 "counts.\n"); 369 return false; 370 } 371 372 LastV = V; 373 374 SizeIds.push_back(V); 375 CaseCounts.push_back(C); 376 if (C > MaxCount) 377 MaxCount = C; 378 379 assert(RemainCount >= C); 380 RemainCount -= C; 381 assert(SavedRemainCount >= VD.Count); 382 SavedRemainCount -= VD.Count; 383 384 if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { 385 RemainingVDs.insert(RemainingVDs.end(), I + 1, E); 386 break; 387 } 388 } 389 390 if (Version == 0) 391 return false; 392 393 CaseCounts[0] = RemainCount; 394 if (RemainCount > MaxCount) 395 MaxCount = RemainCount; 396 397 uint64_t SumForOpt = TotalCount - RemainCount; 398 399 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 400 << " Versions (covering " << SumForOpt << " out of " 401 << TotalCount << ")\n"); 402 403 // mem_op(..., size) 404 // ==> 405 // switch (size) { 406 // case s1: 407 // mem_op(..., s1); 408 // goto merge_bb; 409 // case s2: 410 // mem_op(..., s2); 411 // goto merge_bb; 412 // ... 413 // default: 414 // mem_op(..., size); 415 // goto merge_bb; 416 // } 417 // merge_bb: 418 419 BasicBlock *BB = MO.I->getParent(); 420 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 421 LLVM_DEBUG(dbgs() << *BB << "\n"); 422 auto OrigBBFreq = BFI.getBlockFreq(BB); 423 424 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); 425 BasicBlock::iterator It(*MO.I); 426 ++It; 427 assert(It != DefaultBB->end()); 428 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 429 MergeBB->setName("MemOP.Merge"); 430 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 431 DefaultBB->setName("MemOP.Default"); 432 433 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 434 auto &Ctx = Func.getContext(); 435 IRBuilder<> IRB(BB); 436 BB->getTerminator()->eraseFromParent(); 437 Value *SizeVar = MO.getLength(); 438 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 439 Type *MemOpTy = MO.I->getType(); 440 PHINode *PHI = nullptr; 441 if (!MemOpTy->isVoidTy()) { 442 // Insert a phi for the return values at the merge block. 443 IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); 444 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); 445 MO.I->replaceAllUsesWith(PHI); 446 PHI->addIncoming(MO.I, DefaultBB); 447 } 448 449 // Clear the value profile data. 450 MO.I->setMetadata(LLVMContext::MD_prof, nullptr); 451 // If all promoted, we don't need the MD.prof metadata. 452 if (SavedRemainCount > 0 || Version != NumVals) { 453 // Otherwise we need update with the un-promoted records back. 454 ArrayRef<InstrProfValueData> RemVDs(RemainingVDs); 455 annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount, 456 IPVK_MemOPSize, NumVals); 457 } 458 459 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 460 461 std::vector<DominatorTree::UpdateType> Updates; 462 if (DT) 463 Updates.reserve(2 * SizeIds.size()); 464 465 for (uint64_t SizeId : SizeIds) { 466 BasicBlock *CaseBB = BasicBlock::Create( 467 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 468 MemOp NewMO = MO.clone(); 469 // Fix the argument. 470 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType()); 471 assert(SizeType && "Expected integer type size argument."); 472 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 473 NewMO.setLength(CaseSizeId); 474 CaseBB->getInstList().push_back(NewMO.I); 475 IRBuilder<> IRBCase(CaseBB); 476 IRBCase.CreateBr(MergeBB); 477 SI->addCase(CaseSizeId, CaseBB); 478 if (!MemOpTy->isVoidTy()) 479 PHI->addIncoming(NewMO.I, CaseBB); 480 if (DT) { 481 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 482 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 483 } 484 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 485 } 486 DTU.applyUpdates(Updates); 487 Updates.clear(); 488 489 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 490 491 LLVM_DEBUG(dbgs() << *BB << "\n"); 492 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 493 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 494 495 ORE.emit([&]() { 496 using namespace ore; 497 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) 498 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " 499 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) 500 << " for " << NV("Versions", Version) << " versions"; 501 }); 502 503 return true; 504 } 505 } // namespace 506 507 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 508 OptimizationRemarkEmitter &ORE, 509 DominatorTree *DT, TargetLibraryInfo &TLI) { 510 if (DisableMemOPOPT) 511 return false; 512 513 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 514 return false; 515 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); 516 MemOPSizeOpt.perform(); 517 return MemOPSizeOpt.isChanged(); 518 } 519 520 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { 521 BlockFrequencyInfo &BFI = 522 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 523 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 524 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 525 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; 526 TargetLibraryInfo &TLI = 527 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); 528 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 529 } 530 531 namespace llvm { 532 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; 533 534 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 535 FunctionAnalysisManager &FAM) { 536 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 537 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 538 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 539 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); 540 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 541 if (!Changed) 542 return PreservedAnalyses::all(); 543 auto PA = PreservedAnalyses(); 544 PA.preserve<DominatorTreeAnalysis>(); 545 return PA; 546 } 547 } // namespace llvm 548