xref: /llvm-project/bolt/lib/Passes/FrameOptimizer.cpp (revision 2430a354bfb9e8c08e0dd5f294012b40afb75ce0)
1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the FrameOptimizerPass class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/FrameOptimizer.h"
14 #include "bolt/Core/BinaryFunctionCallGraph.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/DataflowInfoManager.h"
17 #include "bolt/Passes/ShrinkWrapping.h"
18 #include "bolt/Passes/StackAvailableExpressions.h"
19 #include "bolt/Passes/StackReachingUses.h"
20 #include "bolt/Utils/CommandLineOpts.h"
21 #include "llvm/Support/Timer.h"
22 #include <deque>
23 
24 #define DEBUG_TYPE "fop"
25 
26 using namespace llvm;
27 
28 namespace opts {
29 extern cl::opt<unsigned> Verbosity;
30 extern cl::opt<bool> TimeOpts;
31 extern cl::OptionCategory BoltOptCategory;
32 
33 using namespace bolt;
34 
35 cl::opt<FrameOptimizationType>
36 FrameOptimization("frame-opt",
37   cl::init(FOP_NONE),
38   cl::desc("optimize stack frame accesses"),
39   cl::values(
40     clEnumValN(FOP_NONE, "none", "do not perform frame optimization"),
41     clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"),
42     clEnumValN(FOP_ALL, "all", "perform FOP on all functions")),
43   cl::ZeroOrMore,
44   cl::cat(BoltOptCategory));
45 
46 cl::opt<bool> RemoveStores(
47     "frame-opt-rm-stores", cl::init(FOP_NONE),
48     cl::desc("apply additional analysis to remove stores (experimental)"),
49     cl::cat(BoltOptCategory));
50 
51 } // namespace opts
52 
53 namespace llvm {
54 namespace bolt {
55 
removeUnnecessaryLoads(const RegAnalysis & RA,const FrameAnalysis & FA,BinaryFunction & BF)56 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
57                                                 const FrameAnalysis &FA,
58                                                 BinaryFunction &BF) {
59   StackAvailableExpressions SAE(RA, FA, BF);
60   SAE.run();
61 
62   LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
63   std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
64   bool Changed = false;
65   const auto ExprEnd = SAE.expr_end();
66   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
67   for (BinaryBasicBlock &BB : BF) {
68     LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
69     const MCInst *Prev = nullptr;
70     for (MCInst &Inst : BB) {
71       LLVM_DEBUG({
72         dbgs() << "\t\tNow at ";
73         Inst.dump();
74         for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
75              I != ExprEnd; ++I) {
76           dbgs() << "\t\t\tReached by: ";
77           (*I)->dump();
78         }
79       });
80       // if Inst is a load from stack and the current available expressions show
81       // this value is available in a register or immediate, replace this load
82       // with move from register or from immediate.
83       ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
84       if (!FIEX) {
85         Prev = &Inst;
86         continue;
87       }
88       // FIXME: Change to remove IsSimple == 0. We're being conservative here,
89       // but once replaceMemOperandWithReg is ready, we should feed it with all
90       // sorts of complex instructions.
91       if (FIEX->IsLoad == false || FIEX->IsSimple == false ||
92           FIEX->StackOffset >= 0) {
93         Prev = &Inst;
94         continue;
95       }
96 
97       for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
98            I != ExprEnd; ++I) {
99         const MCInst *AvailableInst = *I;
100         ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst);
101         if (!FIEY)
102           continue;
103         assert(FIEY->IsStore && FIEY->IsSimple);
104         if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size)
105           continue;
106         // TODO: Change push/pops to stack adjustment instruction
107         if (MIB->isPop(Inst))
108           continue;
109 
110         ++NumRedundantLoads;
111         FreqRedundantLoads += BB.getKnownExecutionCount();
112         Changed = true;
113         LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
114         LLVM_DEBUG(Inst.dump());
115         LLVM_DEBUG(dbgs() << "Related store instruction: ");
116         LLVM_DEBUG(AvailableInst->dump());
117         LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
118         // Replace load
119         if (FIEY->IsStoreFromReg) {
120           if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) {
121             LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
122             break;
123           }
124           FreqLoadsChangedToReg += BB.getKnownExecutionCount();
125           MIB->removeAnnotation(Inst, "FrameAccessEntry");
126           LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
127           if (MIB->isRedundantMove(Inst)) {
128             ++NumLoadsDeleted;
129             FreqLoadsDeleted += BB.getKnownExecutionCount();
130             LLVM_DEBUG(dbgs() << "Created a redundant move\n");
131             // Delete it!
132             ToErase.push_front(std::make_pair(&BB, &Inst));
133           }
134         } else {
135           char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
136           support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
137           LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
138           if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
139             LLVM_DEBUG(dbgs() << "FAILED\n");
140           } else {
141             FreqLoadsChangedToImm += BB.getKnownExecutionCount();
142             MIB->removeAnnotation(Inst, "FrameAccessEntry");
143             LLVM_DEBUG(dbgs() << "Ok\n");
144           }
145         }
146         LLVM_DEBUG(dbgs() << "Changed to: ");
147         LLVM_DEBUG(Inst.dump());
148         break;
149       }
150       Prev = &Inst;
151     }
152   }
153   if (Changed)
154     LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
155 
156   // TODO: Implement an interface of eraseInstruction that works out the
157   // complete list of elements to remove.
158   for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
159     I.first->eraseInstruction(I.first->findInstruction(I.second));
160 }
161 
removeUnusedStores(const FrameAnalysis & FA,BinaryFunction & BF)162 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
163                                             BinaryFunction &BF) {
164   StackReachingUses SRU(FA, BF);
165   SRU.run();
166 
167   LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
168   std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
169   bool Changed = false;
170   for (BinaryBasicBlock &BB : BF) {
171     LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
172     const MCInst *Prev = nullptr;
173     for (MCInst &Inst : llvm::reverse(BB)) {
174       LLVM_DEBUG({
175         dbgs() << "\t\tNow at ";
176         Inst.dump();
177         for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
178              I != SRU.expr_end(); ++I) {
179           dbgs() << "\t\t\tReached by: ";
180           (*I)->dump();
181         }
182       });
183       ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
184       if (!FIEX) {
185         Prev = &Inst;
186         continue;
187       }
188       if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) {
189         Prev = &Inst;
190         continue;
191       }
192 
193       if (SRU.isStoreUsed(*FIEX,
194                           Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) {
195         Prev = &Inst;
196         continue;
197       }
198       // TODO: Change push/pops to stack adjustment instruction
199       if (BF.getBinaryContext().MIB->isPush(Inst))
200         continue;
201 
202       ++NumRedundantStores;
203       FreqRedundantStores += BB.getKnownExecutionCount();
204       Changed = true;
205       LLVM_DEBUG(dbgs() << "Unused store instruction: ");
206       LLVM_DEBUG(Inst.dump());
207       LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
208       LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
209                         << " size = " << (int)FIEX->Size << "\n");
210       // Delete it!
211       ToErase.emplace_back(&BB, &Inst);
212       Prev = &Inst;
213     }
214   }
215 
216   for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
217     I.first->eraseInstruction(I.first->findInstruction(I.second));
218 
219   if (Changed)
220     LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
221 }
222 
runOnFunctions(BinaryContext & BC)223 Error FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
224   if (opts::FrameOptimization == FOP_NONE)
225     return Error::success();
226 
227   std::unique_ptr<BinaryFunctionCallGraph> CG;
228   std::unique_ptr<FrameAnalysis> FA;
229   std::unique_ptr<RegAnalysis> RA;
230 
231   {
232     NamedRegionTimer T1("callgraph", "create call graph", "FOP",
233                         "FOP breakdown", opts::TimeOpts);
234     CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
235   }
236 
237   {
238     NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
239                         "FOP breakdown", opts::TimeOpts);
240     FA = std::make_unique<FrameAnalysis>(BC, *CG);
241   }
242 
243   {
244     NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
245                         opts::TimeOpts);
246     RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
247   }
248 
249   // Perform caller-saved register optimizations, then callee-saved register
250   // optimizations (shrink wrapping)
251   for (auto &I : BC.getBinaryFunctions()) {
252     if (!FA->hasFrameInfo(I.second))
253       continue;
254     // Restrict pass execution if user asked to only run on hot functions
255     if (opts::FrameOptimization == FOP_HOT) {
256       if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
257         continue;
258       LLVM_DEBUG(
259           dbgs() << "Considering " << I.second.getPrintName()
260                  << " for frame optimizations because its execution count ( "
261                  << I.second.getKnownExecutionCount()
262                  << " ) exceeds our hotness threshold ( "
263                  << BC.getHotThreshold() << " )\n");
264     }
265 
266     {
267       NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
268                           opts::TimeOpts);
269       if (!FA->hasStackArithmetic(I.second))
270         removeUnnecessaryLoads(*RA, *FA, I.second);
271     }
272 
273     if (opts::RemoveStores) {
274       NamedRegionTimer T1("removestores", "remove stores", "FOP",
275                           "FOP breakdown", opts::TimeOpts);
276       if (!FA->hasStackArithmetic(I.second))
277         removeUnusedStores(*FA, I.second);
278     }
279     // Don't even start shrink wrapping if no profiling info is available
280     if (I.second.getKnownExecutionCount() == 0)
281       continue;
282   }
283 
284   {
285     NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
286                         "FOP breakdown", opts::TimeOpts);
287     if (Error E = performShrinkWrapping(*RA, *FA, BC))
288       return Error(std::move(E));
289   }
290 
291   BC.outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
292             << " redundant load(s) and " << NumRedundantStores
293             << " unused store(s)\n";
294   BC.outs() << "BOLT-INFO: Frequency of redundant loads is "
295             << FreqRedundantLoads << " and frequency of unused stores is "
296             << FreqRedundantStores << "\n";
297   BC.outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
298             << FreqLoadsChangedToReg
299             << " and frequency of loads changed to use an immediate is "
300             << FreqLoadsChangedToImm << "\n";
301   BC.outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
302             << " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
303             << NumRedundantStores << " store(s)\n";
304   FA->printStats();
305   ShrinkWrapping::printStats(BC);
306   return Error::success();
307 }
308 
performShrinkWrapping(const RegAnalysis & RA,const FrameAnalysis & FA,BinaryContext & BC)309 Error FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
310                                                 const FrameAnalysis &FA,
311                                                 BinaryContext &BC) {
312   // Initialize necessary annotations to allow safe parallel accesses to
313   // annotation index in MIB
314   BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
315   BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
316   BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
317   BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
318   BC.MIB->getOrCreateAnnotationIndex(
319       StackLayoutModifier::getOffsetCFIRegTagName());
320   BC.MIB->getOrCreateAnnotationIndex("ReachingDefs");
321   BC.MIB->getOrCreateAnnotationIndex("ReachingUses");
322   BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis");
323   BC.MIB->getOrCreateAnnotationIndex("StackReachingUses");
324   BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis");
325   BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis");
326   BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
327   BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
328   BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions");
329   BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis");
330   BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
331   BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
332   BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward");
333   BC.MIB->getOrCreateAnnotationIndex("ReachingInsns");
334   BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
335   BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
336 
337   std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs;
338   auto LogFunc = [&](BinaryFunction &BF) {
339     auto Lower = llvm::lower_bound(
340         Top10Funcs, BF.getKnownExecutionCount(),
341         [](const std::pair<uint64_t, const BinaryFunction *> &Elmt,
342            uint64_t Value) { return Elmt.first > Value; });
343     if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10)
344       return;
345     Top10Funcs.insert(Lower,
346                       std::make_pair<>(BF.getKnownExecutionCount(), &BF));
347     if (Top10Funcs.size() > 10)
348       Top10Funcs.resize(10);
349   };
350   (void)LogFunc;
351 
352   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
353     if (BF.getFunctionScore() == 0)
354       return true;
355 
356     return false;
357   };
358 
359   const bool HotOnly = opts::FrameOptimization == FOP_HOT;
360 
361   Error SWError = Error::success();
362 
363   ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
364       [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
365         DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
366         ShrinkWrapping SW(FA, BF, Info, AllocatorId);
367 
368         auto ChangedOrErr = SW.perform(HotOnly);
369         if (auto E = ChangedOrErr.takeError()) {
370           std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
371           SWError = joinErrors(std::move(SWError), Error(std::move(E)));
372           return;
373         }
374         const bool Changed = *ChangedOrErr;
375         if (Changed) {
376           std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
377           FuncsChanged.insert(&BF);
378           LLVM_DEBUG(LogFunc(BF));
379         }
380       };
381 
382   ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
383       BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
384       SkipPredicate, "shrink-wrapping");
385 
386   if (!Top10Funcs.empty()) {
387     BC.outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
388     for (const auto &Elmt : Top10Funcs)
389       BC.outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
390   }
391   return SWError;
392 }
393 
394 } // namespace bolt
395 } // namespace llvm
396