xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp (revision 2eb4d8dc723da3cf7d735a3226ae49da4c8c5dbc)
1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/Analysis/TargetLibraryInfo.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstVisitor.h"
32 #include "llvm/IR/InstrTypes.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/LLVMContext.h"
36 #include "llvm/IR/PassManager.h"
37 #include "llvm/IR/Type.h"
38 #include "llvm/InitializePasses.h"
39 #include "llvm/Pass.h"
40 #include "llvm/PassRegistry.h"
41 #include "llvm/ProfileData/InstrProf.h"
42 #define INSTR_PROF_VALUE_PROF_MEMOP_API
43 #include "llvm/ProfileData/InstrProfData.inc"
44 #include "llvm/Support/Casting.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Support/ErrorHandling.h"
48 #include "llvm/Support/MathExtras.h"
49 #include "llvm/Transforms/Instrumentation.h"
50 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
51 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
52 #include <cassert>
53 #include <cstdint>
54 #include <vector>
55 
56 using namespace llvm;
57 
58 #define DEBUG_TYPE "pgo-memop-opt"
59 
60 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
61 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
62 
63 // The minimum call count to optimize memory intrinsic calls.
64 static cl::opt<unsigned>
65     MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
66                         cl::init(1000),
67                         cl::desc("The minimum count to optimize memory "
68                                  "intrinsic calls"));
69 
70 // Command line option to disable memory intrinsic optimization. The default is
71 // false. This is for debug purpose.
72 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
73                                      cl::Hidden, cl::desc("Disable optimize"));
74 
75 // The percent threshold to optimize memory intrinsic calls.
76 static cl::opt<unsigned>
77     MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
78                           cl::Hidden, cl::ZeroOrMore,
79                           cl::desc("The percentage threshold for the "
80                                    "memory intrinsic calls optimization"));
81 
82 // Maximum number of versions for optimizing memory intrinsic call.
83 static cl::opt<unsigned>
84     MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
85                     cl::ZeroOrMore,
86                     cl::desc("The max version for the optimized memory "
87                              " intrinsic calls"));
88 
89 // Scale the counts from the annotation using the BB count value.
90 static cl::opt<bool>
91     MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
92                     cl::desc("Scale the memop size counts using the basic "
93                              " block count value"));
94 
95 cl::opt<bool>
96     MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
97                        cl::Hidden,
98                        cl::desc("Size-specialize memcmp and bcmp calls"));
99 
100 static cl::opt<unsigned>
101     MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
102                     cl::desc("Optimize the memop size <= this value"));
103 
104 namespace {
105 class PGOMemOPSizeOptLegacyPass : public FunctionPass {
106 public:
107   static char ID;
108 
109   PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
110     initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
111   }
112 
113   StringRef getPassName() const override { return "PGOMemOPSize"; }
114 
115 private:
116   bool runOnFunction(Function &F) override;
117   void getAnalysisUsage(AnalysisUsage &AU) const override {
118     AU.addRequired<BlockFrequencyInfoWrapperPass>();
119     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
120     AU.addPreserved<GlobalsAAWrapperPass>();
121     AU.addPreserved<DominatorTreeWrapperPass>();
122     AU.addRequired<TargetLibraryInfoWrapperPass>();
123   }
124 };
125 } // end anonymous namespace
126 
127 char PGOMemOPSizeOptLegacyPass::ID = 0;
128 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
129                       "Optimize memory intrinsic using its size value profile",
130                       false, false)
131 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
132 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
133 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
134                     "Optimize memory intrinsic using its size value profile",
135                     false, false)
136 
137 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
138   return new PGOMemOPSizeOptLegacyPass();
139 }
140 
141 namespace {
142 
143 static const char *getMIName(const MemIntrinsic *MI) {
144   switch (MI->getIntrinsicID()) {
145   case Intrinsic::memcpy:
146     return "memcpy";
147   case Intrinsic::memmove:
148     return "memmove";
149   case Intrinsic::memset:
150     return "memset";
151   default:
152     return "unknown";
153   }
154 }
155 
156 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
157 struct MemOp {
158   Instruction *I;
159   MemOp(MemIntrinsic *MI) : I(MI) {}
160   MemOp(CallInst *CI) : I(CI) {}
161   MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
162   CallInst *asCI() { return cast<CallInst>(I); }
163   MemOp clone() {
164     if (auto MI = asMI())
165       return MemOp(cast<MemIntrinsic>(MI->clone()));
166     return MemOp(cast<CallInst>(asCI()->clone()));
167   }
168   Value *getLength() {
169     if (auto MI = asMI())
170       return MI->getLength();
171     return asCI()->getArgOperand(2);
172   }
173   void setLength(Value *Length) {
174     if (auto MI = asMI())
175       return MI->setLength(Length);
176     asCI()->setArgOperand(2, Length);
177   }
178   StringRef getFuncName() {
179     if (auto MI = asMI())
180       return MI->getCalledFunction()->getName();
181     return asCI()->getCalledFunction()->getName();
182   }
183   bool isMemmove() {
184     if (auto MI = asMI())
185       if (MI->getIntrinsicID() == Intrinsic::memmove)
186         return true;
187     return false;
188   }
189   bool isMemcmp(TargetLibraryInfo &TLI) {
190     LibFunc Func;
191     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
192         Func == LibFunc_memcmp) {
193       return true;
194     }
195     return false;
196   }
197   bool isBcmp(TargetLibraryInfo &TLI) {
198     LibFunc Func;
199     if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
200         Func == LibFunc_bcmp) {
201       return true;
202     }
203     return false;
204   }
205   const char *getName(TargetLibraryInfo &TLI) {
206     if (auto MI = asMI())
207       return getMIName(MI);
208     LibFunc Func;
209     if (TLI.getLibFunc(*asCI(), Func)) {
210       if (Func == LibFunc_memcmp)
211         return "memcmp";
212       if (Func == LibFunc_bcmp)
213         return "bcmp";
214     }
215     llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
216     return nullptr;
217   }
218 };
219 
220 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
221 public:
222   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
223                OptimizationRemarkEmitter &ORE, DominatorTree *DT,
224                TargetLibraryInfo &TLI)
225       : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
226     ValueDataArray =
227         std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
228   }
229   bool isChanged() const { return Changed; }
230   void perform() {
231     WorkList.clear();
232     visit(Func);
233 
234     for (auto &MO : WorkList) {
235       ++NumOfPGOMemOPAnnotate;
236       if (perform(MO)) {
237         Changed = true;
238         ++NumOfPGOMemOPOpt;
239         LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
240                           << "is Transformed.\n");
241       }
242     }
243   }
244 
245   void visitMemIntrinsic(MemIntrinsic &MI) {
246     Value *Length = MI.getLength();
247     // Not perform on constant length calls.
248     if (dyn_cast<ConstantInt>(Length))
249       return;
250     WorkList.push_back(MemOp(&MI));
251   }
252 
253   void visitCallInst(CallInst &CI) {
254     LibFunc Func;
255     if (TLI.getLibFunc(CI, Func) &&
256         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
257         !isa<ConstantInt>(CI.getArgOperand(2))) {
258       WorkList.push_back(MemOp(&CI));
259     }
260   }
261 
262 private:
263   Function &Func;
264   BlockFrequencyInfo &BFI;
265   OptimizationRemarkEmitter &ORE;
266   DominatorTree *DT;
267   TargetLibraryInfo &TLI;
268   bool Changed;
269   std::vector<MemOp> WorkList;
270   // The space to read the profile annotation.
271   std::unique_ptr<InstrProfValueData[]> ValueDataArray;
272   bool perform(MemOp MO);
273 };
274 
275 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
276   assert(Count <= TotalCount);
277   if (Count < MemOPCountThreshold)
278     return false;
279   if (Count < TotalCount * MemOPPercentThreshold / 100)
280     return false;
281   return true;
282 }
283 
284 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
285                                       uint64_t Denom) {
286   if (!MemOPScaleCount)
287     return Count;
288   bool Overflowed;
289   uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
290   return ScaleCount / Denom;
291 }
292 
293 bool MemOPSizeOpt::perform(MemOp MO) {
294   assert(MO.I);
295   if (MO.isMemmove())
296     return false;
297   if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
298     return false;
299 
300   uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
301   uint64_t TotalCount;
302   if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions,
303                                 ValueDataArray.get(), NumVals, TotalCount))
304     return false;
305 
306   uint64_t ActualCount = TotalCount;
307   uint64_t SavedTotalCount = TotalCount;
308   if (MemOPScaleCount) {
309     auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
310     if (!BBEdgeCount)
311       return false;
312     ActualCount = *BBEdgeCount;
313   }
314 
315   ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
316   LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
317                     << ActualCount << "\n");
318   LLVM_DEBUG(
319       for (auto &VD
320            : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
321 
322   if (ActualCount < MemOPCountThreshold)
323     return false;
324   // Skip if the total value profiled count is 0, in which case we can't
325   // scale up the counts properly (and there is no profitable transformation).
326   if (TotalCount == 0)
327     return false;
328 
329   TotalCount = ActualCount;
330   if (MemOPScaleCount)
331     LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
332                       << " denominator = " << SavedTotalCount << "\n");
333 
334   // Keeping track of the count of the default case:
335   uint64_t RemainCount = TotalCount;
336   uint64_t SavedRemainCount = SavedTotalCount;
337   SmallVector<uint64_t, 16> SizeIds;
338   SmallVector<uint64_t, 16> CaseCounts;
339   uint64_t MaxCount = 0;
340   unsigned Version = 0;
341   // Default case is in the front -- save the slot here.
342   CaseCounts.push_back(0);
343   for (auto &VD : VDs) {
344     int64_t V = VD.Value;
345     uint64_t C = VD.Count;
346     if (MemOPScaleCount)
347       C = getScaledCount(C, ActualCount, SavedTotalCount);
348 
349     if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize)
350       continue;
351 
352     // ValueCounts are sorted on the count. Break at the first un-profitable
353     // value.
354     if (!isProfitable(C, RemainCount))
355       break;
356 
357     SizeIds.push_back(V);
358     CaseCounts.push_back(C);
359     if (C > MaxCount)
360       MaxCount = C;
361 
362     assert(RemainCount >= C);
363     RemainCount -= C;
364     assert(SavedRemainCount >= VD.Count);
365     SavedRemainCount -= VD.Count;
366 
367     if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
368       break;
369   }
370 
371   if (Version == 0)
372     return false;
373 
374   CaseCounts[0] = RemainCount;
375   if (RemainCount > MaxCount)
376     MaxCount = RemainCount;
377 
378   uint64_t SumForOpt = TotalCount - RemainCount;
379 
380   LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
381                     << " Versions (covering " << SumForOpt << " out of "
382                     << TotalCount << ")\n");
383 
384   // mem_op(..., size)
385   // ==>
386   // switch (size) {
387   //   case s1:
388   //      mem_op(..., s1);
389   //      goto merge_bb;
390   //   case s2:
391   //      mem_op(..., s2);
392   //      goto merge_bb;
393   //   ...
394   //   default:
395   //      mem_op(..., size);
396   //      goto merge_bb;
397   // }
398   // merge_bb:
399 
400   BasicBlock *BB = MO.I->getParent();
401   LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
402   LLVM_DEBUG(dbgs() << *BB << "\n");
403   auto OrigBBFreq = BFI.getBlockFreq(BB);
404 
405   BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
406   BasicBlock::iterator It(*MO.I);
407   ++It;
408   assert(It != DefaultBB->end());
409   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
410   MergeBB->setName("MemOP.Merge");
411   BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
412   DefaultBB->setName("MemOP.Default");
413 
414   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
415   auto &Ctx = Func.getContext();
416   IRBuilder<> IRB(BB);
417   BB->getTerminator()->eraseFromParent();
418   Value *SizeVar = MO.getLength();
419   SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
420   Type *MemOpTy = MO.I->getType();
421   PHINode *PHI = nullptr;
422   if (!MemOpTy->isVoidTy()) {
423     // Insert a phi for the return values at the merge block.
424     IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
425     PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
426     MO.I->replaceAllUsesWith(PHI);
427     PHI->addIncoming(MO.I, DefaultBB);
428   }
429 
430   // Clear the value profile data.
431   MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
432   // If all promoted, we don't need the MD.prof metadata.
433   if (SavedRemainCount > 0 || Version != NumVals)
434     // Otherwise we need update with the un-promoted records back.
435     annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version),
436                       SavedRemainCount, IPVK_MemOPSize, NumVals);
437 
438   LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
439 
440   std::vector<DominatorTree::UpdateType> Updates;
441   if (DT)
442     Updates.reserve(2 * SizeIds.size());
443 
444   for (uint64_t SizeId : SizeIds) {
445     BasicBlock *CaseBB = BasicBlock::Create(
446         Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
447     MemOp NewMO = MO.clone();
448     // Fix the argument.
449     auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
450     assert(SizeType && "Expected integer type size argument.");
451     ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
452     NewMO.setLength(CaseSizeId);
453     CaseBB->getInstList().push_back(NewMO.I);
454     IRBuilder<> IRBCase(CaseBB);
455     IRBCase.CreateBr(MergeBB);
456     SI->addCase(CaseSizeId, CaseBB);
457     if (!MemOpTy->isVoidTy())
458       PHI->addIncoming(NewMO.I, CaseBB);
459     if (DT) {
460       Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
461       Updates.push_back({DominatorTree::Insert, BB, CaseBB});
462     }
463     LLVM_DEBUG(dbgs() << *CaseBB << "\n");
464   }
465   DTU.applyUpdates(Updates);
466   Updates.clear();
467 
468   setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
469 
470   LLVM_DEBUG(dbgs() << *BB << "\n");
471   LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
472   LLVM_DEBUG(dbgs() << *MergeBB << "\n");
473 
474   ORE.emit([&]() {
475     using namespace ore;
476     return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
477            << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
478            << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
479            << " for " << NV("Versions", Version) << " versions";
480   });
481 
482   return true;
483 }
484 } // namespace
485 
486 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
487                                 OptimizationRemarkEmitter &ORE,
488                                 DominatorTree *DT, TargetLibraryInfo &TLI) {
489   if (DisableMemOPOPT)
490     return false;
491 
492   if (F.hasFnAttribute(Attribute::OptimizeForSize))
493     return false;
494   MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
495   MemOPSizeOpt.perform();
496   return MemOPSizeOpt.isChanged();
497 }
498 
499 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
500   BlockFrequencyInfo &BFI =
501       getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
502   auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
503   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
504   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
505   TargetLibraryInfo &TLI =
506       getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
507   return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
508 }
509 
510 namespace llvm {
511 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
512 
513 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
514                                        FunctionAnalysisManager &FAM) {
515   auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
516   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
517   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
518   auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
519   bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
520   if (!Changed)
521     return PreservedAnalyses::all();
522   auto PA = PreservedAnalyses();
523   PA.preserve<GlobalsAA>();
524   PA.preserve<DominatorTreeAnalysis>();
525   return PA;
526 }
527 } // namespace llvm
528