//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the transformation that optimizes memory intrinsics
// such as memcpy using the size value profile. When memory intrinsic size
// value profile metadata is available, a single memory intrinsic is expanded
// to a sequence of guarded specialized versions that are called with the
// hottest size(s), for later expansion into more optimal inline sequences.
//
//===----------------------------------------------------------------------===//
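
// A minimal sketch of the rewrite, with made-up names and sizes (the generic
// form of the expansion is shown again inside MemOPSizeOpt::perform below):
//
//   memcpy(dst, src, n)    // n is not a compile-time constant; the profile
//                          // says n == 8 dominates the size distribution
//
// is conceptually rewritten to
//
//   switch (n) {
//   case 8:
//     memcpy(dst, src, 8); // constant size, candidate for inline expansion
//     break;
//   default:
//     memcpy(dst, src, n);
//   }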

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/ProfileData/InstrProf.h"
#define INSTR_PROF_VALUE_PROF_MEMOP_API
#include "llvm/ProfileData/InstrProfData.inc"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <cassert>
#include <cstdint>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "pgo-memop-opt"

STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");

// The minimum call count to optimize memory intrinsic calls.
static cl::opt<unsigned>
    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000),
                        cl::desc("The minimum count to optimize memory "
                                 "intrinsic calls"));

// Command line option to disable memory intrinsic optimization. The default is
// false. This is for debugging purposes.
static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
                                     cl::Hidden, cl::desc("Disable optimize"));

// The percent threshold to optimize memory intrinsic calls.
static cl::opt<unsigned>
    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
                          cl::Hidden,
                          cl::desc("The percentage threshold for the "
                                   "memory intrinsic calls optimization"));

// Maximum number of versions for optimizing memory intrinsic calls.
static cl::opt<unsigned>
    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
                    cl::desc("The max version for the optimized memory "
                             "intrinsic calls"));

// Scale the counts from the annotation using the BB count value.
static cl::opt<bool>
    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
                    cl::desc("Scale the memop size counts using the basic "
                             "block count value"));

cl::opt<bool>
    MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
                       cl::Hidden,
                       cl::desc("Size-specialize memcmp and bcmp calls"));

static cl::opt<unsigned>
    MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
                    cl::desc("Optimize the memop size <= this value"));

namespace {

static const char *getMIName(const MemIntrinsic *MI) {
  switch (MI->getIntrinsicID()) {
  case Intrinsic::memcpy:
    return "memcpy";
  case Intrinsic::memmove:
    return "memmove";
  case Intrinsic::memset:
    return "memset";
  default:
    return "unknown";
  }
}

// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
struct MemOp {
  Instruction *I;
  MemOp(MemIntrinsic *MI) : I(MI) {}
  MemOp(CallInst *CI) : I(CI) {}
  MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
  CallInst *asCI() { return cast<CallInst>(I); }
  MemOp clone() {
    if (auto MI = asMI())
      return MemOp(cast<MemIntrinsic>(MI->clone()));
    return MemOp(cast<CallInst>(asCI()->clone()));
  }
  Value *getLength() {
    if (auto MI = asMI())
      return MI->getLength();
    return asCI()->getArgOperand(2);
  }
  void setLength(Value *Length) {
    if (auto MI = asMI())
      return MI->setLength(Length);
    asCI()->setArgOperand(2, Length);
  }
  StringRef getFuncName() {
    if (auto MI = asMI())
      return MI->getCalledFunction()->getName();
    return asCI()->getCalledFunction()->getName();
  }
  bool isMemmove() {
    if (auto MI = asMI())
      if (MI->getIntrinsicID() == Intrinsic::memmove)
        return true;
    return false;
  }
  bool isMemcmp(TargetLibraryInfo &TLI) {
    LibFunc Func;
    if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
        Func == LibFunc_memcmp) {
      return true;
    }
    return false;
  }
  bool isBcmp(TargetLibraryInfo &TLI) {
    LibFunc Func;
    if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
        Func == LibFunc_bcmp) {
      return true;
    }
    return false;
  }
  const char *getName(TargetLibraryInfo &TLI) {
    if (auto MI = asMI())
      return getMIName(MI);
    LibFunc Func;
    if (TLI.getLibFunc(*asCI(), Func)) {
      if (Func == LibFunc_memcmp)
        return "memcmp";
      if (Func == LibFunc_bcmp)
        return "bcmp";
    }
    llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
    return nullptr;
  }
};

class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
public:
  MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
               OptimizationRemarkEmitter &ORE, DominatorTree *DT,
               TargetLibraryInfo &TLI)
      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {}
  bool isChanged() const { return Changed; }
  void perform() {
    WorkList.clear();
    visit(Func);

    for (auto &MO : WorkList) {
      ++NumOfPGOMemOPAnnotate;
      if (perform(MO)) {
        Changed = true;
        ++NumOfPGOMemOPOpt;
        LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
                          << " is transformed.\n");
      }
    }
  }

  void visitMemIntrinsic(MemIntrinsic &MI) {
    Value *Length = MI.getLength();
    // Don't perform the optimization on constant-length calls.
    if (isa<ConstantInt>(Length))
      return;
    WorkList.push_back(MemOp(&MI));
  }

  void visitCallInst(CallInst &CI) {
    LibFunc Func;
    if (TLI.getLibFunc(CI, Func) &&
        (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
        !isa<ConstantInt>(CI.getArgOperand(2))) {
      WorkList.push_back(MemOp(&CI));
    }
  }

private:
  Function &Func;
  BlockFrequencyInfo &BFI;
  OptimizationRemarkEmitter &ORE;
  DominatorTree *DT;
  TargetLibraryInfo &TLI;
  bool Changed;
  std::vector<MemOp> WorkList;
  bool perform(MemOp MO);
};

static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
  assert(Count <= TotalCount);
  if (Count < MemOPCountThreshold)
    return false;
  if (Count < TotalCount * MemOPPercentThreshold / 100)
    return false;
  return true;
}
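
// isProfitable, illustrated with made-up numbers: with the default thresholds
// (pgo-memop-count-threshold = 1000, pgo-memop-percent-threshold = 40) and a
// (remaining) total count of 10000, a size whose count is 4500 is profitable
// (4500 >= 1000 and 4500 >= 40% of 10000); a count of 3000 fails the
// percentage check, and a count of 900 fails the absolute count threshold.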

static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
                                      uint64_t Denom) {
  if (!MemOPScaleCount)
    return Count;
  bool Overflowed;
  uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
  return ScaleCount / Denom;
}
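
// getScaledCount, illustrated with made-up numbers: if the value profile
// recorded a total of 2000 size samples but the enclosing block's profile
// count is only 1000, each per-size count is scaled by 1000/2000, i.e. halved,
// before the profitability checks in MemOPSizeOpt::perform below.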

bool MemOPSizeOpt::perform(MemOp MO) {
  assert(MO.I);
  if (MO.isMemmove())
    return false;
  if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
    return false;

  uint32_t MaxNumVals = INSTR_PROF_NUM_BUCKETS;
  uint64_t TotalCount;
  auto VDs =
      getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, TotalCount);
  if (VDs.empty())
    return false;

  uint64_t ActualCount = TotalCount;
  uint64_t SavedTotalCount = TotalCount;
  if (MemOPScaleCount) {
    auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
    if (!BBEdgeCount)
      return false;
    ActualCount = *BBEdgeCount;
  }

  LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
                    << ActualCount << "\n");
  LLVM_DEBUG(
      for (auto &VD
           : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });

  if (ActualCount < MemOPCountThreshold)
    return false;
  // Skip if the total value profiled count is 0, in which case we can't
  // scale up the counts properly (and there is no profitable transformation).
  if (TotalCount == 0)
    return false;

  TotalCount = ActualCount;
  if (MemOPScaleCount)
    LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
                      << " denominator = " << SavedTotalCount << "\n");

  // Keep track of the count of the default case:
  uint64_t RemainCount = TotalCount;
  uint64_t SavedRemainCount = SavedTotalCount;
  SmallVector<uint64_t, 16> SizeIds;
  SmallVector<uint64_t, 16> CaseCounts;
  SmallDenseSet<uint64_t, 16> SeenSizeId;
  uint64_t MaxCount = 0;
  unsigned Version = 0;
  // The default case is at the front -- save the slot for it here.
  CaseCounts.push_back(0);
  SmallVector<InstrProfValueData, 24> RemainingVDs;
  for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
    auto &VD = *I;
    int64_t V = VD.Value;
    uint64_t C = VD.Count;
    if (MemOPScaleCount)
      C = getScaledCount(C, ActualCount, SavedTotalCount);

    if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) {
      RemainingVDs.push_back(VD);
      continue;
    }

    // ValueCounts are sorted on the count. Break at the first unprofitable
    // value.
    if (!isProfitable(C, RemainCount)) {
      RemainingVDs.insert(RemainingVDs.end(), I, E);
      break;
    }

    if (!SeenSizeId.insert(V).second) {
      errs() << "warning: Invalid Profile Data in Function " << Func.getName()
             << ": Two identical values in MemOp value counts.\n";
      return false;
    }

    SizeIds.push_back(V);
    CaseCounts.push_back(C);
    if (C > MaxCount)
      MaxCount = C;

    assert(RemainCount >= C);
    RemainCount -= C;
    assert(SavedRemainCount >= VD.Count);
    SavedRemainCount -= VD.Count;

    if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
      RemainingVDs.insert(RemainingVDs.end(), I + 1, E);
      break;
    }
  }

  if (Version == 0)
    return false;

  CaseCounts[0] = RemainCount;
  if (RemainCount > MaxCount)
    MaxCount = RemainCount;

  uint64_t SumForOpt = TotalCount - RemainCount;

  LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
                    << " Versions (covering " << SumForOpt << " out of "
                    << TotalCount << ")\n");

  // mem_op(..., size)
  // ==>
  // switch (size) {
  // case s1:
  //    mem_op(..., s1);
  //    goto merge_bb;
  // case s2:
  //    mem_op(..., s2);
  //    goto merge_bb;
  // ...
  // default:
  //    mem_op(..., size);
  //    goto merge_bb;
  // }
  // merge_bb:

  BasicBlock *BB = MO.I->getParent();
  LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
  LLVM_DEBUG(dbgs() << *BB << "\n");
  auto OrigBBFreq = BFI.getBlockFreq(BB);

  BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
  BasicBlock::iterator It(*MO.I);
  ++It;
  assert(It != DefaultBB->end());
  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
  MergeBB->setName("MemOP.Merge");
  BFI.setBlockFreq(MergeBB, OrigBBFreq);
  DefaultBB->setName("MemOP.Default");

  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  auto &Ctx = Func.getContext();
  IRBuilder<> IRB(BB);
  BB->getTerminator()->eraseFromParent();
  Value *SizeVar = MO.getLength();
  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
  Type *MemOpTy = MO.I->getType();
  PHINode *PHI = nullptr;
  if (!MemOpTy->isVoidTy()) {
    // Insert a phi for the return values at the merge block.
    IRBuilder<> IRBM(MergeBB, MergeBB->getFirstNonPHIIt());
    PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
    MO.I->replaceAllUsesWith(PHI);
    PHI->addIncoming(MO.I, DefaultBB);
  }

  // Clear the value profile data.
  MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
  // If all records were promoted, we don't need the !prof metadata.
  if (SavedRemainCount > 0 || Version != VDs.size()) {
    // Otherwise, annotate the un-promoted records back onto the call.
    annotateValueSite(*Func.getParent(), *MO.I, RemainingVDs, SavedRemainCount,
                      IPVK_MemOPSize, VDs.size());
  }

  LLVM_DEBUG(dbgs() << "\n\n== Basic Block After ==\n");

  std::vector<DominatorTree::UpdateType> Updates;
  if (DT)
    Updates.reserve(2 * SizeIds.size());

  for (uint64_t SizeId : SizeIds) {
    BasicBlock *CaseBB = BasicBlock::Create(
        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
    MemOp NewMO = MO.clone();
    // Fix the length argument to the constant case size.
    auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
    assert(SizeType && "Expected integer type size argument.");
    ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
    NewMO.setLength(CaseSizeId);
    NewMO.I->insertInto(CaseBB, CaseBB->end());
    IRBuilder<> IRBCase(CaseBB);
    IRBCase.CreateBr(MergeBB);
    SI->addCase(CaseSizeId, CaseBB);
    if (!MemOpTy->isVoidTy())
      PHI->addIncoming(NewMO.I, CaseBB);
    if (DT) {
      Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
      Updates.push_back({DominatorTree::Insert, BB, CaseBB});
    }
    LLVM_DEBUG(dbgs() << *CaseBB << "\n");
  }
  DTU.applyUpdates(Updates);
  Updates.clear();

  if (MaxCount)
    setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);

  LLVM_DEBUG(dbgs() << *BB << "\n");
  LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
  LLVM_DEBUG(dbgs() << *MergeBB << "\n");

  ORE.emit([&]() {
    using namespace ore;
    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
           << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
           << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
           << " for " << NV("Versions", Version) << " versions";
  });

  return true;
}
} // namespace

static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
                                OptimizationRemarkEmitter &ORE,
                                DominatorTree *DT, TargetLibraryInfo &TLI) {
  if (DisableMemOPOPT)
    return false;

  if (F.hasFnAttribute(Attribute::OptimizeForSize))
    return false;
  MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
  MemOPSizeOpt.perform();
  return MemOPSizeOpt.isChanged();
}

PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
                                       FunctionAnalysisManager &FAM) {
  auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
  auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
  bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
  if (!Changed)
    return PreservedAnalyses::all();
  auto PA = PreservedAnalyses();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}