1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/GlobalsModRef.h" 24 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/CallSite.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/InstVisitor.h" 32 #include "llvm/IR/InstrTypes.h" 33 #include "llvm/IR/Instruction.h" 34 #include "llvm/IR/Instructions.h" 35 #include "llvm/IR/LLVMContext.h" 36 #include "llvm/IR/PassManager.h" 37 #include "llvm/IR/Type.h" 38 #include "llvm/Pass.h" 39 #include "llvm/PassRegistry.h" 40 #include "llvm/PassSupport.h" 41 #include "llvm/ProfileData/InstrProf.h" 42 #include "llvm/Support/Casting.h" 43 #include "llvm/Support/CommandLine.h" 44 #include "llvm/Support/Debug.h" 45 #include "llvm/Support/ErrorHandling.h" 46 #include "llvm/Support/MathExtras.h" 47 #include "llvm/Transforms/Instrumentation.h" 48 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 49 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 50 #include <cassert> 51 #include <cstdint> 52 #include <vector> 53 54 using namespace llvm; 55 56 #define DEBUG_TYPE "pgo-memop-opt" 57 58 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 59 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 60 61 // The minimum call count to optimize memory intrinsic calls. 62 static cl::opt<unsigned> 63 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 64 cl::init(1000), 65 cl::desc("The minimum count to optimize memory " 66 "intrinsic calls")); 67 68 // Command line option to disable memory intrinsic optimization. The default is 69 // false. This is for debug purpose. 70 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 71 cl::Hidden, cl::desc("Disable optimize")); 72 73 // The percent threshold to optimize memory intrinsic calls. 74 static cl::opt<unsigned> 75 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 76 cl::Hidden, cl::ZeroOrMore, 77 cl::desc("The percentage threshold for the " 78 "memory intrinsic calls optimization")); 79 80 // Maximum number of versions for optimizing memory intrinsic call. 81 static cl::opt<unsigned> 82 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 83 cl::ZeroOrMore, 84 cl::desc("The max version for the optimized memory " 85 " intrinsic calls")); 86 87 // Scale the counts from the annotation using the BB count value. 88 static cl::opt<bool> 89 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 90 cl::desc("Scale the memop size counts using the basic " 91 " block count value")); 92 93 // This option sets the rangge of precise profile memop sizes. 94 extern cl::opt<std::string> MemOPSizeRange; 95 96 // This option sets the value that groups large memop sizes 97 extern cl::opt<unsigned> MemOPSizeLarge; 98 99 namespace { 100 class PGOMemOPSizeOptLegacyPass : public FunctionPass { 101 public: 102 static char ID; 103 104 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { 105 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); 106 } 107 108 StringRef getPassName() const override { return "PGOMemOPSize"; } 109 110 private: 111 bool runOnFunction(Function &F) override; 112 void getAnalysisUsage(AnalysisUsage &AU) const override { 113 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 114 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 115 AU.addPreserved<GlobalsAAWrapperPass>(); 116 AU.addPreserved<DominatorTreeWrapperPass>(); 117 } 118 }; 119 } // end anonymous namespace 120 121 char PGOMemOPSizeOptLegacyPass::ID = 0; 122 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 123 "Optimize memory intrinsic using its size value profile", 124 false, false) 125 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 126 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 127 "Optimize memory intrinsic using its size value profile", 128 false, false) 129 130 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { 131 return new PGOMemOPSizeOptLegacyPass(); 132 } 133 134 namespace { 135 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 136 public: 137 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 138 OptimizationRemarkEmitter &ORE, DominatorTree *DT) 139 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) { 140 ValueDataArray = 141 llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); 142 // Get the MemOPSize range information from option MemOPSizeRange, 143 getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, 144 PreciseRangeLast); 145 } 146 bool isChanged() const { return Changed; } 147 void perform() { 148 WorkList.clear(); 149 visit(Func); 150 151 for (auto &MI : WorkList) { 152 ++NumOfPGOMemOPAnnotate; 153 if (perform(MI)) { 154 Changed = true; 155 ++NumOfPGOMemOPOpt; 156 LLVM_DEBUG(dbgs() << "MemOP call: " 157 << MI->getCalledFunction()->getName() 158 << "is Transformed.\n"); 159 } 160 } 161 } 162 163 void visitMemIntrinsic(MemIntrinsic &MI) { 164 Value *Length = MI.getLength(); 165 // Not perform on constant length calls. 166 if (dyn_cast<ConstantInt>(Length)) 167 return; 168 WorkList.push_back(&MI); 169 } 170 171 private: 172 Function &Func; 173 BlockFrequencyInfo &BFI; 174 OptimizationRemarkEmitter &ORE; 175 DominatorTree *DT; 176 bool Changed; 177 std::vector<MemIntrinsic *> WorkList; 178 // Start of the previse range. 179 int64_t PreciseRangeStart; 180 // Last value of the previse range. 181 int64_t PreciseRangeLast; 182 // The space to read the profile annotation. 183 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 184 bool perform(MemIntrinsic *MI); 185 186 // This kind shows which group the value falls in. For PreciseValue, we have 187 // the profile count for that value. LargeGroup groups the values that are in 188 // range [LargeValue, +inf). NonLargeGroup groups the rest of values. 189 enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup }; 190 191 MemOPSizeKind getMemOPSizeKind(int64_t Value) const { 192 if (Value == MemOPSizeLarge && MemOPSizeLarge != 0) 193 return LargeGroup; 194 if (Value == PreciseRangeLast + 1) 195 return NonLargeGroup; 196 return PreciseValue; 197 } 198 }; 199 200 static const char *getMIName(const MemIntrinsic *MI) { 201 switch (MI->getIntrinsicID()) { 202 case Intrinsic::memcpy: 203 return "memcpy"; 204 case Intrinsic::memmove: 205 return "memmove"; 206 case Intrinsic::memset: 207 return "memset"; 208 default: 209 return "unknown"; 210 } 211 } 212 213 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 214 assert(Count <= TotalCount); 215 if (Count < MemOPCountThreshold) 216 return false; 217 if (Count < TotalCount * MemOPPercentThreshold / 100) 218 return false; 219 return true; 220 } 221 222 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 223 uint64_t Denom) { 224 if (!MemOPScaleCount) 225 return Count; 226 bool Overflowed; 227 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 228 return ScaleCount / Denom; 229 } 230 231 bool MemOPSizeOpt::perform(MemIntrinsic *MI) { 232 assert(MI); 233 if (MI->getIntrinsicID() == Intrinsic::memmove) 234 return false; 235 236 uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; 237 uint64_t TotalCount; 238 if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, 239 ValueDataArray.get(), NumVals, TotalCount)) 240 return false; 241 242 uint64_t ActualCount = TotalCount; 243 uint64_t SavedTotalCount = TotalCount; 244 if (MemOPScaleCount) { 245 auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); 246 if (!BBEdgeCount) 247 return false; 248 ActualCount = *BBEdgeCount; 249 } 250 251 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 252 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 253 << ActualCount << "\n"); 254 LLVM_DEBUG( 255 for (auto &VD 256 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 257 258 if (ActualCount < MemOPCountThreshold) 259 return false; 260 // Skip if the total value profiled count is 0, in which case we can't 261 // scale up the counts properly (and there is no profitable transformation). 262 if (TotalCount == 0) 263 return false; 264 265 TotalCount = ActualCount; 266 if (MemOPScaleCount) 267 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 268 << " denominator = " << SavedTotalCount << "\n"); 269 270 // Keeping track of the count of the default case: 271 uint64_t RemainCount = TotalCount; 272 uint64_t SavedRemainCount = SavedTotalCount; 273 SmallVector<uint64_t, 16> SizeIds; 274 SmallVector<uint64_t, 16> CaseCounts; 275 uint64_t MaxCount = 0; 276 unsigned Version = 0; 277 // Default case is in the front -- save the slot here. 278 CaseCounts.push_back(0); 279 for (auto &VD : VDs) { 280 int64_t V = VD.Value; 281 uint64_t C = VD.Count; 282 if (MemOPScaleCount) 283 C = getScaledCount(C, ActualCount, SavedTotalCount); 284 285 // Only care precise value here. 286 if (getMemOPSizeKind(V) != PreciseValue) 287 continue; 288 289 // ValueCounts are sorted on the count. Break at the first un-profitable 290 // value. 291 if (!isProfitable(C, RemainCount)) 292 break; 293 294 SizeIds.push_back(V); 295 CaseCounts.push_back(C); 296 if (C > MaxCount) 297 MaxCount = C; 298 299 assert(RemainCount >= C); 300 RemainCount -= C; 301 assert(SavedRemainCount >= VD.Count); 302 SavedRemainCount -= VD.Count; 303 304 if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) 305 break; 306 } 307 308 if (Version == 0) 309 return false; 310 311 CaseCounts[0] = RemainCount; 312 if (RemainCount > MaxCount) 313 MaxCount = RemainCount; 314 315 uint64_t SumForOpt = TotalCount - RemainCount; 316 317 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 318 << " Versions (covering " << SumForOpt << " out of " 319 << TotalCount << ")\n"); 320 321 // mem_op(..., size) 322 // ==> 323 // switch (size) { 324 // case s1: 325 // mem_op(..., s1); 326 // goto merge_bb; 327 // case s2: 328 // mem_op(..., s2); 329 // goto merge_bb; 330 // ... 331 // default: 332 // mem_op(..., size); 333 // goto merge_bb; 334 // } 335 // merge_bb: 336 337 BasicBlock *BB = MI->getParent(); 338 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 339 LLVM_DEBUG(dbgs() << *BB << "\n"); 340 auto OrigBBFreq = BFI.getBlockFreq(BB); 341 342 BasicBlock *DefaultBB = SplitBlock(BB, MI, DT); 343 BasicBlock::iterator It(*MI); 344 ++It; 345 assert(It != DefaultBB->end()); 346 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 347 MergeBB->setName("MemOP.Merge"); 348 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 349 DefaultBB->setName("MemOP.Default"); 350 351 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 352 auto &Ctx = Func.getContext(); 353 IRBuilder<> IRB(BB); 354 BB->getTerminator()->eraseFromParent(); 355 Value *SizeVar = MI->getLength(); 356 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 357 358 // Clear the value profile data. 359 MI->setMetadata(LLVMContext::MD_prof, nullptr); 360 // If all promoted, we don't need the MD.prof metadata. 361 if (SavedRemainCount > 0 || Version != NumVals) 362 // Otherwise we need update with the un-promoted records back. 363 annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version), 364 SavedRemainCount, IPVK_MemOPSize, NumVals); 365 366 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 367 368 std::vector<DominatorTree::UpdateType> Updates; 369 if (DT) 370 Updates.reserve(2 * SizeIds.size()); 371 372 for (uint64_t SizeId : SizeIds) { 373 BasicBlock *CaseBB = BasicBlock::Create( 374 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 375 Instruction *NewInst = MI->clone(); 376 // Fix the argument. 377 MemIntrinsic * MemI = dyn_cast<MemIntrinsic>(NewInst); 378 IntegerType *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType()); 379 assert(SizeType && "Expected integer type size argument."); 380 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 381 MemI->setLength(CaseSizeId); 382 CaseBB->getInstList().push_back(NewInst); 383 IRBuilder<> IRBCase(CaseBB); 384 IRBCase.CreateBr(MergeBB); 385 SI->addCase(CaseSizeId, CaseBB); 386 if (DT) { 387 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 388 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 389 } 390 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 391 } 392 DTU.applyUpdates(Updates); 393 Updates.clear(); 394 395 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 396 397 LLVM_DEBUG(dbgs() << *BB << "\n"); 398 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 399 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 400 401 ORE.emit([&]() { 402 using namespace ore; 403 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI) 404 << "optimized " << NV("Intrinsic", StringRef(getMIName(MI))) 405 << " with count " << NV("Count", SumForOpt) << " out of " 406 << NV("Total", TotalCount) << " for " << NV("Versions", Version) 407 << " versions"; 408 }); 409 410 return true; 411 } 412 } // namespace 413 414 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 415 OptimizationRemarkEmitter &ORE, 416 DominatorTree *DT) { 417 if (DisableMemOPOPT) 418 return false; 419 420 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 421 return false; 422 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT); 423 MemOPSizeOpt.perform(); 424 return MemOPSizeOpt.isChanged(); 425 } 426 427 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { 428 BlockFrequencyInfo &BFI = 429 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 430 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 431 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 432 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; 433 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT); 434 } 435 436 namespace llvm { 437 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; 438 439 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 440 FunctionAnalysisManager &FAM) { 441 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 442 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 443 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 444 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT); 445 if (!Changed) 446 return PreservedAnalyses::all(); 447 auto PA = PreservedAnalyses(); 448 PA.preserve<GlobalsAA>(); 449 PA.preserve<DominatorTreeAnalysis>(); 450 return PA; 451 } 452 } // namespace llvm 453