1 //===- SpeculateAroundPHIs.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
10 #include "llvm/ADT/PostOrderIterator.h"
11 #include "llvm/ADT/Sequence.h"
12 #include "llvm/ADT/SetVector.h"
13 #include "llvm/ADT/Statistic.h"
14 #include "llvm/Analysis/TargetTransformInfo.h"
15 #include "llvm/Analysis/ValueTracking.h"
16 #include "llvm/IR/BasicBlock.h"
17 #include "llvm/IR/IRBuilder.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/Support/Debug.h"
21 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
22
23 using namespace llvm;
24
25 #define DEBUG_TYPE "spec-phis"
26
27 STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
28 STATISTIC(NumEdgesSplit,
29 "Number of critical edges which were split for speculation");
30 STATISTIC(NumSpeculatedInstructions,
31 "Number of instructions we speculated around the PHI nodes");
32 STATISTIC(NumNewRedundantInstructions,
33 "Number of new, redundant instructions inserted");
34
35 /// Check whether speculating the users of a PHI node around the PHI
36 /// will be safe.
37 ///
38 /// This checks both that all of the users are safe and also that all of their
39 /// operands are either recursively safe or already available along an incoming
40 /// edge to the PHI.
41 ///
42 /// This routine caches both all the safe nodes explored in `PotentialSpecSet`
43 /// and the chain of nodes that definitively reach any unsafe node in
44 /// `UnsafeSet`. By preserving these between repeated calls to this routine for
45 /// PHIs in the same basic block, the exploration here can be reused. However,
46 /// these caches must no be reused for PHIs in a different basic block as they
47 /// reflect what is available along incoming edges.
48 static bool
isSafeToSpeculatePHIUsers(PHINode & PN,DominatorTree & DT,SmallPtrSetImpl<Instruction * > & PotentialSpecSet,SmallPtrSetImpl<Instruction * > & UnsafeSet)49 isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
50 SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
51 SmallPtrSetImpl<Instruction *> &UnsafeSet) {
52 auto *PhiBB = PN.getParent();
53 SmallPtrSet<Instruction *, 4> Visited;
54 SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
55
56 // Walk each user of the PHI node.
57 for (Use &U : PN.uses()) {
58 auto *UI = cast<Instruction>(U.getUser());
59
60 // Ensure the use post-dominates the PHI node. This ensures that, in the
61 // absence of unwinding, the use will actually be reached.
62 // FIXME: We use a blunt hammer of requiring them to be in the same basic
63 // block. We should consider using actual post-dominance here in the
64 // future.
65 if (UI->getParent() != PhiBB) {
66 LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
67 return false;
68 }
69
70 if (const auto *CS = dyn_cast<CallBase>(UI)) {
71 if (CS->isConvergent() || CS->cannotDuplicate()) {
72 LLVM_DEBUG(dbgs() << " Unsafe: convergent "
73 "callsite cannot de duplicated: " << *UI << '\n');
74 return false;
75 }
76 }
77
78 // FIXME: This check is much too conservative. We're not going to move these
79 // instructions onto new dynamic paths through the program unless there is
80 // a call instruction between the use and the PHI node. And memory isn't
81 // changing unless there is a store in that same sequence. We should
82 // probably change this to do at least a limited scan of the intervening
83 // instructions and allow handling stores in easily proven safe cases.
84 if (mayBeMemoryDependent(*UI)) {
85 LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
86 return false;
87 }
88
89 // Now do a depth-first search of everything these users depend on to make
90 // sure they are transitively safe. This is a depth-first search, but we
91 // check nodes in preorder to minimize the amount of checking.
92 Visited.insert(UI);
93 DFSStack.push_back({UI, UI->value_op_begin()});
94 do {
95 User::value_op_iterator OpIt;
96 std::tie(UI, OpIt) = DFSStack.pop_back_val();
97
98 while (OpIt != UI->value_op_end()) {
99 auto *OpI = dyn_cast<Instruction>(*OpIt);
100 // Increment to the next operand for whenever we continue.
101 ++OpIt;
102 // No need to visit non-instructions, which can't form dependencies.
103 if (!OpI)
104 continue;
105
106 // Now do the main pre-order checks that this operand is a viable
107 // dependency of something we want to speculate.
108
109 // First do a few checks for instructions that won't require
110 // speculation at all because they are trivially available on the
111 // incoming edge (either through dominance or through an incoming value
112 // to a PHI).
113 //
114 // The cases in the current block will be trivially dominated by the
115 // edge.
116 auto *ParentBB = OpI->getParent();
117 if (ParentBB == PhiBB) {
118 if (isa<PHINode>(OpI)) {
119 // We can trivially map through phi nodes in the same block.
120 continue;
121 }
122 } else if (DT.dominates(ParentBB, PhiBB)) {
123 // Instructions from dominating blocks are already available.
124 continue;
125 }
126
127 // Once we know that we're considering speculating the operand, check
128 // if we've already explored this subgraph and found it to be safe.
129 if (PotentialSpecSet.count(OpI))
130 continue;
131
132 // If we've already explored this subgraph and found it unsafe, bail.
133 // If when we directly test whether this is safe it fails, bail.
134 if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
135 mayBeMemoryDependent(*OpI)) {
136 LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
137 << *OpI << "\n");
138 // Record the stack of instructions which reach this node as unsafe
139 // so we prune subsequent searches.
140 UnsafeSet.insert(OpI);
141 for (auto &StackPair : DFSStack) {
142 Instruction *I = StackPair.first;
143 UnsafeSet.insert(I);
144 }
145 return false;
146 }
147
148 // Skip any operands we're already recursively checking.
149 if (!Visited.insert(OpI).second)
150 continue;
151
152 // Push onto the stack and descend. We can directly continue this
153 // loop when ascending.
154 DFSStack.push_back({UI, OpIt});
155 UI = OpI;
156 OpIt = OpI->value_op_begin();
157 }
158
159 // This node and all its operands are safe. Go ahead and cache that for
160 // reuse later.
161 PotentialSpecSet.insert(UI);
162
163 // Continue with the next node on the stack.
164 } while (!DFSStack.empty());
165 }
166
167 #ifndef NDEBUG
168 // Every visited operand should have been marked as safe for speculation at
169 // this point. Verify this and return success.
170 for (auto *I : Visited)
171 assert(PotentialSpecSet.count(I) &&
172 "Failed to mark a visited instruction as safe!");
173 #endif
174 return true;
175 }
176
177 /// Check whether, in isolation, a given PHI node is both safe and profitable
178 /// to speculate users around.
179 ///
180 /// This handles checking whether there are any constant operands to a PHI
181 /// which could represent a useful speculation candidate, whether the users of
182 /// the PHI are safe to speculate including all their transitive dependencies,
183 /// and whether after speculation there will be some cost savings (profit) to
184 /// folding the operands into the users of the PHI node. Returns true if both
185 /// safe and profitable with relevant cost savings updated in the map and with
186 /// an update to the `PotentialSpecSet`. Returns false if either safety or
187 /// profitability are absent. Some new entries may be made to the
188 /// `PotentialSpecSet` even when this routine returns false, but they remain
189 /// conservatively correct.
190 ///
191 /// The profitability check here is a local one, but it checks this in an
192 /// interesting way. Beyond checking that the total cost of materializing the
193 /// constants will be less than the cost of folding them into their users, it
194 /// also checks that no one incoming constant will have a higher cost when
195 /// folded into its users rather than materialized. This higher cost could
196 /// result in a dynamic *path* that is more expensive even when the total cost
197 /// is lower. Currently, all of the interesting cases where this optimization
198 /// should fire are ones where it is a no-loss operation in this sense. If we
199 /// ever want to be more aggressive here, we would need to balance the
200 /// different incoming edges' cost by looking at their respective
201 /// probabilities.
isSafeAndProfitableToSpeculateAroundPHI(PHINode & PN,SmallDenseMap<PHINode *,InstructionCost,16> & CostSavingsMap,SmallPtrSetImpl<Instruction * > & PotentialSpecSet,SmallPtrSetImpl<Instruction * > & UnsafeSet,DominatorTree & DT,TargetTransformInfo & TTI)202 static bool isSafeAndProfitableToSpeculateAroundPHI(
203 PHINode &PN, SmallDenseMap<PHINode *, InstructionCost, 16> &CostSavingsMap,
204 SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
205 SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
206 TargetTransformInfo &TTI) {
207 // First see whether there is any cost savings to speculating around this
208 // PHI, and build up a map of the constant inputs to how many times they
209 // occur.
210 bool NonFreeMat = false;
211 struct CostsAndCount {
212 InstructionCost MatCost = TargetTransformInfo::TCC_Free;
213 InstructionCost FoldedCost = TargetTransformInfo::TCC_Free;
214 int Count = 0;
215 };
216 SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
217 SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
218 for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
219 auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
220 if (!IncomingC)
221 continue;
222
223 // Only visit each incoming edge with a constant input once.
224 if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
225 continue;
226
227 auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
228 // Count how many edges share a given incoming costant.
229 ++InsertResult.first->second.Count;
230 // Only compute the cost the first time we see a particular constant.
231 if (!InsertResult.second)
232 continue;
233
234 InstructionCost &MatCost = InsertResult.first->second.MatCost;
235 MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType(),
236 TargetTransformInfo::TCK_SizeAndLatency);
237 NonFreeMat |= MatCost != TTI.TCC_Free;
238 }
239 if (!NonFreeMat) {
240 LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
241 // No profit in free materialization.
242 return false;
243 }
244
245 // Now check that the uses of this PHI can actually be speculated,
246 // otherwise we'll still have to materialize the PHI value.
247 if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
248 LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
249 return false;
250 }
251
252 // Compute how much (if any) savings are available by speculating around this
253 // PHI.
254 for (Use &U : PN.uses()) {
255 auto *UserI = cast<Instruction>(U.getUser());
256 // Now check whether there is any savings to folding the incoming constants
257 // into this use.
258 unsigned Idx = U.getOperandNo();
259
260 // If we have a binary operator that is commutative, an actual constant
261 // operand would end up on the RHS, so pretend the use of the PHI is on the
262 // RHS.
263 //
264 // Technically, this is a bit weird if *both* operands are PHIs we're
265 // speculating. But if that is the case, giving an "optimistic" cost isn't
266 // a bad thing because after speculation it will constant fold. And
267 // moreover, such cases should likely have been constant folded already by
268 // some other pass, so we shouldn't worry about "modeling" them terribly
269 // accurately here. Similarly, if the other operand is a constant, it still
270 // seems fine to be "optimistic" in our cost modeling, because when the
271 // incoming operand from the PHI node is also a constant, we will end up
272 // constant folding.
273 if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
274 // Assume we will commute the constant to the RHS to be canonical.
275 Idx = 1;
276
277 // Get the intrinsic ID if this user is an intrinsic.
278 Intrinsic::ID IID = Intrinsic::not_intrinsic;
279 if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
280 IID = UserII->getIntrinsicID();
281
282 for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
283 ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
284 InstructionCost MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
285 InstructionCost &FoldedCost =
286 IncomingConstantAndCostsAndCount.second.FoldedCost;
287 if (IID)
288 FoldedCost +=
289 TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
290 IncomingC->getType(),
291 TargetTransformInfo::TCK_SizeAndLatency);
292 else
293 FoldedCost +=
294 TTI.getIntImmCostInst(UserI->getOpcode(), Idx,
295 IncomingC->getValue(), IncomingC->getType(),
296 TargetTransformInfo::TCK_SizeAndLatency);
297
298 // If we accumulate more folded cost for this incoming constant than
299 // materialized cost, then we'll regress any edge with this constant so
300 // just bail. We're only interested in cases where folding the incoming
301 // constants is at least break-even on all paths.
302 if (FoldedCost > MatCost) {
303 LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
304 << "\n"
305 " Materializing cost: "
306 << MatCost
307 << "\n"
308 " Accumulated folded cost: "
309 << FoldedCost << "\n");
310 return false;
311 }
312 }
313 }
314
315 // Compute the total cost savings afforded by this PHI node.
316 InstructionCost TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
317 for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
318 InstructionCost MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
319 InstructionCost FoldedCost =
320 IncomingConstantAndCostsAndCount.second.FoldedCost;
321 int Count = IncomingConstantAndCostsAndCount.second.Count;
322
323 TotalMatCost += MatCost * Count;
324 TotalFoldedCost += FoldedCost * Count;
325 }
326 assert(TotalMatCost.isValid() && "Constants must be materializable");
327 assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
328 "less that its materialized cost, "
329 "the sum must be as well.");
330 LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
331 << ": " << PN << "\n");
332 CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
333 return true;
334 }
335
336 /// Simple helper to walk all the users of a list of phis depth first, and call
337 /// a visit function on each one in post-order.
338 ///
339 /// All of the PHIs should be in the same basic block, and this is primarily
340 /// used to make a single depth-first walk across their collective users
341 /// without revisiting any subgraphs. Callers should provide a fast, idempotent
342 /// callable to test whether a node has been visited and the more important
343 /// callable to actually visit a particular node.
344 ///
345 /// Depth-first and postorder here refer to the *operand* graph -- we start
346 /// from a collection of users of PHI nodes and walk "up" the operands
347 /// depth-first.
348 template <typename IsVisitedT, typename VisitT>
visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode * > PNs,IsVisitedT IsVisited,VisitT Visit)349 static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
350 IsVisitedT IsVisited,
351 VisitT Visit) {
352 SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
353 for (auto *PN : PNs)
354 for (Use &U : PN->uses()) {
355 auto *UI = cast<Instruction>(U.getUser());
356 if (IsVisited(UI))
357 // Already visited this user, continue across the roots.
358 continue;
359
360 // Otherwise, walk the operand graph depth-first and visit each
361 // dependency in postorder.
362 DFSStack.push_back({UI, UI->value_op_begin()});
363 do {
364 User::value_op_iterator OpIt;
365 std::tie(UI, OpIt) = DFSStack.pop_back_val();
366 while (OpIt != UI->value_op_end()) {
367 auto *OpI = dyn_cast<Instruction>(*OpIt);
368 // Increment to the next operand for whenever we continue.
369 ++OpIt;
370 // No need to visit non-instructions, which can't form dependencies,
371 // or instructions outside of our potential dependency set that we
372 // were given. Finally, if we've already visited the node, continue
373 // to the next.
374 if (!OpI || IsVisited(OpI))
375 continue;
376
377 // Push onto the stack and descend. We can directly continue this
378 // loop when ascending.
379 DFSStack.push_back({UI, OpIt});
380 UI = OpI;
381 OpIt = OpI->value_op_begin();
382 }
383
384 // Finished visiting children, visit this node.
385 assert(!IsVisited(UI) && "Should not have already visited a node!");
386 Visit(UI);
387 } while (!DFSStack.empty());
388 }
389 }
390
391 /// Find profitable PHIs to speculate.
392 ///
393 /// For a PHI node to be profitable, we need the cost of speculating its users
394 /// (and their dependencies) to not exceed the savings of folding the PHI's
395 /// constant operands into the speculated users.
396 ///
397 /// Computing this is surprisingly challenging. Because users of two different
398 /// PHI nodes can depend on each other or on common other instructions, it may
399 /// be profitable to speculate two PHI nodes together even though neither one
400 /// in isolation is profitable. The straightforward way to find all the
401 /// profitable PHIs would be to check each combination of PHIs' cost, but this
402 /// is exponential in complexity.
403 ///
404 /// Even if we assume that we only care about cases where we can consider each
405 /// PHI node in isolation (rather than considering cases where none are
406 /// profitable in isolation but some subset are profitable as a set), we still
407 /// have a challenge. The obvious way to find all individually profitable PHIs
408 /// is to iterate until reaching a fixed point, but this will be quadratic in
409 /// complexity. =/
410 ///
411 /// This code currently uses a linear-to-compute order for a greedy approach.
412 /// It won't find cases where a set of PHIs must be considered together, but it
413 /// handles most cases of order dependence without quadratic iteration. The
414 /// specific order used is the post-order across the operand DAG. When the last
415 /// user of a PHI is visited in this postorder walk, we check it for
416 /// profitability.
417 ///
418 /// There is an orthogonal extra complexity to all of this: computing the cost
419 /// itself can easily become a linear computation making everything again (at
420 /// best) quadratic. Using a postorder over the operand graph makes it
421 /// particularly easy to avoid this through dynamic programming. As we do the
422 /// postorder walk, we build the transitive cost of that subgraph. It is also
423 /// straightforward to then update these costs when we mark a PHI for
424 /// speculation so that subsequent PHIs don't re-pay the cost of already
425 /// speculated instructions.
findProfitablePHIs(ArrayRef<PHINode * > PNs,const SmallDenseMap<PHINode *,InstructionCost,16> & CostSavingsMap,const SmallPtrSetImpl<Instruction * > & PotentialSpecSet,int NumPreds,DominatorTree & DT,TargetTransformInfo & TTI)426 static SmallVector<PHINode *, 16> findProfitablePHIs(
427 ArrayRef<PHINode *> PNs,
428 const SmallDenseMap<PHINode *, InstructionCost, 16> &CostSavingsMap,
429 const SmallPtrSetImpl<Instruction *> &PotentialSpecSet, int NumPreds,
430 DominatorTree &DT, TargetTransformInfo &TTI) {
431 SmallVector<PHINode *, 16> SpecPNs;
432
433 // First, establish a reverse mapping from immediate users of the PHI nodes
434 // to the nodes themselves, and count how many users each PHI node has in
435 // a way we can update while processing them.
436 SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
437 SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
438 SmallPtrSet<Instruction *, 16> UserSet;
439 for (auto *PN : PNs) {
440 assert(UserSet.empty() && "Must start with an empty user set!");
441 for (Use &U : PN->uses())
442 UserSet.insert(cast<Instruction>(U.getUser()));
443 PNUserCountMap[PN] = UserSet.size();
444 for (auto *UI : UserSet)
445 UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
446 UserSet.clear();
447 }
448
449 // Now do a DFS across the operand graph of the users, computing cost as we
450 // go and when all costs for a given PHI are known, checking that PHI for
451 // profitability.
452 SmallDenseMap<Instruction *, InstructionCost, 16> SpecCostMap;
453 visitPHIUsersAndDepsInPostOrder(
454 PNs,
455 /*IsVisited*/
456 [&](Instruction *I) {
457 // We consider anything that isn't potentially speculated to be
458 // "visited" as it is already handled. Similarly, anything that *is*
459 // potentially speculated but for which we have an entry in our cost
460 // map, we're done.
461 return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
462 },
463 /*Visit*/
464 [&](Instruction *I) {
465 // We've fully visited the operands, so sum their cost with this node
466 // and update the cost map.
467 InstructionCost Cost = TTI.TCC_Free;
468 for (Value *OpV : I->operand_values())
469 if (auto *OpI = dyn_cast<Instruction>(OpV)) {
470 auto CostMapIt = SpecCostMap.find(OpI);
471 if (CostMapIt != SpecCostMap.end())
472 Cost += CostMapIt->second;
473 }
474 Cost += TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
475 bool Inserted = SpecCostMap.insert({I, Cost}).second;
476 (void)Inserted;
477 assert(Inserted && "Must not re-insert a cost during the DFS!");
478
479 // Now check if this node had a corresponding PHI node using it. If so,
480 // we need to decrement the outstanding user count for it.
481 auto UserPNsIt = UserToPNMap.find(I);
482 if (UserPNsIt == UserToPNMap.end())
483 return;
484 auto &UserPNs = UserPNsIt->second;
485 auto UserPNsSplitIt = std::stable_partition(
486 UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
487 int &PNUserCount = PNUserCountMap.find(UserPN)->second;
488 assert(
489 PNUserCount > 0 &&
490 "Should never re-visit a PN after its user count hits zero!");
491 --PNUserCount;
492 return PNUserCount != 0;
493 });
494
495 // FIXME: Rather than one at a time, we should sum the savings as the
496 // cost will be completely shared.
497 SmallVector<Instruction *, 16> SpecWorklist;
498 for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
499 InstructionCost SpecCost = TTI.TCC_Free;
500 for (Use &U : PN->uses())
501 SpecCost +=
502 SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
503 SpecCost *= (NumPreds - 1);
504 // When the user count of a PHI node hits zero, we should check its
505 // profitability. If profitable, we should mark it for speculation
506 // and zero out the cost of everything it depends on.
507 InstructionCost CostSavings = CostSavingsMap.find(PN)->second;
508 if (SpecCost > CostSavings) {
509 LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
510 << "\n"
511 " Cost savings: "
512 << CostSavings
513 << "\n"
514 " Speculation cost: "
515 << SpecCost << "\n");
516 continue;
517 }
518
519 // We're going to speculate this user-associated PHI. Copy it out and
520 // add its users to the worklist to update their cost.
521 SpecPNs.push_back(PN);
522 for (Use &U : PN->uses()) {
523 auto *UI = cast<Instruction>(U.getUser());
524 auto CostMapIt = SpecCostMap.find(UI);
525 if (CostMapIt->second == 0)
526 continue;
527 // Zero out this cost entry to avoid duplicates.
528 CostMapIt->second = 0;
529 SpecWorklist.push_back(UI);
530 }
531 }
532
533 // Now walk all the operands of the users in the worklist transitively
534 // to zero out all the memoized costs.
535 while (!SpecWorklist.empty()) {
536 Instruction *SpecI = SpecWorklist.pop_back_val();
537 assert(SpecCostMap.find(SpecI)->second == 0 &&
538 "Didn't zero out a cost!");
539
540 // Walk the operands recursively to zero out their cost as well.
541 for (auto *OpV : SpecI->operand_values()) {
542 auto *OpI = dyn_cast<Instruction>(OpV);
543 if (!OpI)
544 continue;
545 auto CostMapIt = SpecCostMap.find(OpI);
546 if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
547 continue;
548 CostMapIt->second = 0;
549 SpecWorklist.push_back(OpI);
550 }
551 }
552 });
553
554 return SpecPNs;
555 }
556
557 /// Speculate users around a set of PHI nodes.
558 ///
559 /// This routine does the actual speculation around a set of PHI nodes where we
560 /// have determined this to be both safe and profitable.
561 ///
562 /// This routine handles any spliting of critical edges necessary to create
563 /// a safe block to speculate into as well as cloning the instructions and
564 /// rewriting all uses.
speculatePHIs(ArrayRef<PHINode * > SpecPNs,SmallPtrSetImpl<Instruction * > & PotentialSpecSet,SmallSetVector<BasicBlock *,16> & PredSet,DominatorTree & DT)565 static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
566 SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
567 SmallSetVector<BasicBlock *, 16> &PredSet,
568 DominatorTree &DT) {
569 LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
570 NumPHIsSpeculated += SpecPNs.size();
571
572 // Split any critical edges so that we have a block to hoist into.
573 auto *ParentBB = SpecPNs[0]->getParent();
574 SmallVector<BasicBlock *, 16> SpecPreds;
575 SpecPreds.reserve(PredSet.size());
576 for (auto *PredBB : PredSet) {
577 auto *NewPredBB = SplitCriticalEdge(
578 PredBB, ParentBB,
579 CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
580 if (NewPredBB) {
581 ++NumEdgesSplit;
582 LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
583 << "\n");
584 SpecPreds.push_back(NewPredBB);
585 } else {
586 assert(PredBB->getSingleSuccessor() == ParentBB &&
587 "We need a non-critical predecessor to speculate into.");
588 assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
589 "Cannot have a non-critical invoke!");
590
591 // Already non-critical, use existing pred.
592 SpecPreds.push_back(PredBB);
593 }
594 }
595
596 SmallPtrSet<Instruction *, 16> SpecSet;
597 SmallVector<Instruction *, 16> SpecList;
598 visitPHIUsersAndDepsInPostOrder(SpecPNs,
599 /*IsVisited*/
600 [&](Instruction *I) {
601 // This is visited if we don't need to
602 // speculate it or we already have
603 // speculated it.
604 return !PotentialSpecSet.count(I) ||
605 SpecSet.count(I);
606 },
607 /*Visit*/
608 [&](Instruction *I) {
609 // All operands scheduled, schedule this
610 // node.
611 SpecSet.insert(I);
612 SpecList.push_back(I);
613 });
614
615 int NumSpecInsts = SpecList.size() * SpecPreds.size();
616 int NumRedundantInsts = NumSpecInsts - SpecList.size();
617 LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
618 << " speculated instructions, " << NumRedundantInsts
619 << " redundancies\n");
620 NumSpeculatedInstructions += NumSpecInsts;
621 NumNewRedundantInstructions += NumRedundantInsts;
622
623 // Each predecessor is numbered by its index in `SpecPreds`, so for each
624 // instruction we speculate, the speculated instruction is stored in that
625 // index of the vector associated with the original instruction. We also
626 // store the incoming values for each predecessor from any PHIs used.
627 SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
628
629 // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
630 // value. This handles both the PHIs we are speculating around and any other
631 // PHIs that happen to be used.
632 for (auto *OrigI : SpecList)
633 for (auto *OpV : OrigI->operand_values()) {
634 auto *OpPN = dyn_cast<PHINode>(OpV);
635 if (!OpPN || OpPN->getParent() != ParentBB)
636 continue;
637
638 auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
639 if (!InsertResult.second)
640 continue;
641
642 auto &SpeculatedVals = InsertResult.first->second;
643
644 // Populating our structure for mapping is particularly annoying because
645 // finding an incoming value for a particular predecessor block in a PHI
646 // node is a linear time operation! To avoid quadratic behavior, we build
647 // a map for this PHI node's incoming values and then translate it into
648 // the more compact representation used below.
649 SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
650 for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
651 IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
652
653 for (auto *PredBB : SpecPreds)
654 SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
655 }
656
657 // Speculate into each predecessor.
658 for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
659 auto *PredBB = SpecPreds[PredIdx];
660 assert(PredBB->getSingleSuccessor() == ParentBB &&
661 "We need a non-critical predecessor to speculate into.");
662
663 for (auto *OrigI : SpecList) {
664 auto *NewI = OrigI->clone();
665 NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
666 NewI->insertBefore(PredBB->getTerminator());
667
668 // Rewrite all the operands to the previously speculated instructions.
669 // Because we're walking in-order, the defs must precede the uses and we
670 // should already have these mappings.
671 for (Use &U : NewI->operands()) {
672 auto *OpI = dyn_cast<Instruction>(U.get());
673 if (!OpI)
674 continue;
675 auto MapIt = SpeculatedValueMap.find(OpI);
676 if (MapIt == SpeculatedValueMap.end())
677 continue;
678 const auto &SpeculatedVals = MapIt->second;
679 assert(SpeculatedVals[PredIdx] &&
680 "Must have a speculated value for this predecessor!");
681 assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
682 "Speculated value has the wrong type!");
683
684 // Rewrite the use to this predecessor's speculated instruction.
685 U.set(SpeculatedVals[PredIdx]);
686 }
687
688 // Commute instructions which now have a constant in the LHS but not the
689 // RHS.
690 if (NewI->isBinaryOp() && NewI->isCommutative() &&
691 isa<Constant>(NewI->getOperand(0)) &&
692 !isa<Constant>(NewI->getOperand(1)))
693 NewI->getOperandUse(0).swap(NewI->getOperandUse(1));
694
695 SpeculatedValueMap[OrigI].push_back(NewI);
696 assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
697 "Mismatched speculated instruction index!");
698 }
699 }
700
701 // Walk the speculated instruction list and if they have uses, insert a PHI
702 // for them from the speculated versions, and replace the uses with the PHI.
703 // Then erase the instructions as they have been fully speculated. The walk
704 // needs to be in reverse so that we don't think there are users when we'll
705 // actually eventually remove them later.
706 IRBuilder<> IRB(SpecPNs[0]);
707 for (auto *OrigI : llvm::reverse(SpecList)) {
708 // Check if we need a PHI for any remaining users and if so, insert it.
709 if (!OrigI->use_empty()) {
710 auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
711 Twine(OrigI->getName()) + ".phi");
712 // Add the incoming values we speculated.
713 auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
714 for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
715 SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);
716
717 // And replace the uses with the PHI node.
718 OrigI->replaceAllUsesWith(SpecIPN);
719 }
720
721 // It is important to immediately erase this so that it stops using other
722 // instructions. This avoids inserting needless PHIs of them.
723 OrigI->eraseFromParent();
724 }
725
726 // All of the uses of the speculated phi nodes should be removed at this
727 // point, so erase them.
728 for (auto *SpecPN : SpecPNs) {
729 assert(SpecPN->use_empty() && "All users should have been speculated!");
730 SpecPN->eraseFromParent();
731 }
732 }
733
734 /// Try to speculate around a series of PHIs from a single basic block.
735 ///
736 /// This routine checks whether any of these PHIs are profitable to speculate
737 /// users around. If safe and profitable, it does the speculation. It returns
738 /// true when at least some speculation occurs.
tryToSpeculatePHIs(SmallVectorImpl<PHINode * > & PNs,DominatorTree & DT,TargetTransformInfo & TTI)739 static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
740 DominatorTree &DT, TargetTransformInfo &TTI) {
741 LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
742
743 // Savings in cost from speculating around a PHI node.
744 SmallDenseMap<PHINode *, InstructionCost, 16> CostSavingsMap;
745
746 // Remember the set of instructions that are candidates for speculation so
747 // that we can quickly walk things within that space. This prunes out
748 // instructions already available along edges, etc.
749 SmallPtrSet<Instruction *, 16> PotentialSpecSet;
750
751 // Remember the set of instructions that are (transitively) unsafe to
752 // speculate into the incoming edges of this basic block. This avoids
753 // recomputing them for each PHI node we check. This set is specific to this
754 // block though as things are pruned out of it based on what is available
755 // along incoming edges.
756 SmallPtrSet<Instruction *, 16> UnsafeSet;
757
758 // For each PHI node in this block, check whether there are immediate folding
759 // opportunities from speculation, and whether that speculation will be
760 // valid. This determise the set of safe PHIs to speculate.
761 llvm::erase_if(PNs, [&](PHINode *PN) {
762 return !isSafeAndProfitableToSpeculateAroundPHI(
763 *PN, CostSavingsMap, PotentialSpecSet, UnsafeSet, DT, TTI);
764 });
765 // If no PHIs were profitable, skip.
766 if (PNs.empty()) {
767 LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
768 return false;
769 }
770
771 // We need to know how much speculation will cost which is determined by how
772 // many incoming edges will need a copy of each speculated instruction.
773 SmallSetVector<BasicBlock *, 16> PredSet;
774 for (auto *PredBB : PNs[0]->blocks()) {
775 if (!PredSet.insert(PredBB))
776 continue;
777
778 // We cannot speculate when a predecessor is an indirect branch.
779 // FIXME: We also can't reliably create a non-critical edge block for
780 // speculation if the predecessor is an invoke. This doesn't seem
781 // fundamental and we should probably be splitting critical edges
782 // differently.
783 const auto *TermInst = PredBB->getTerminator();
784 if (isa<IndirectBrInst>(TermInst) ||
785 isa<InvokeInst>(TermInst) ||
786 isa<CallBrInst>(TermInst)) {
787 LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
788 << PredBB->getName() << "\n");
789 return false;
790 }
791 }
792 if (PredSet.size() < 2) {
793 LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
794 return false;
795 }
796
797 SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
798 PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
799 if (SpecPNs.empty())
800 // Nothing to do.
801 return false;
802
803 speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
804 return true;
805 }
806
run(Function & F,FunctionAnalysisManager & AM)807 PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
808 FunctionAnalysisManager &AM) {
809 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
810 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
811
812 bool Changed = false;
813 for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) {
814 SmallVector<PHINode *, 16> PNs;
815 auto BBI = BB->begin();
816 while (auto *PN = dyn_cast<PHINode>(&*BBI)) {
817 PNs.push_back(PN);
818 ++BBI;
819 }
820
821 if (PNs.empty())
822 continue;
823
824 Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
825 }
826
827 if (!Changed)
828 return PreservedAnalyses::all();
829
830 PreservedAnalyses PA;
831 return PA;
832 }
833