1 //===- LoopFlatten.cpp - Loop flattening pass------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass flattens pairs nested loops into a single loop.
10 //
11 // The intention is to optimise loop nests like this, which together access an
12 // array linearly:
13 // for (int i = 0; i < N; ++i)
14 // for (int j = 0; j < M; ++j)
15 // f(A[i*M+j]);
16 // into one loop:
17 // for (int i = 0; i < (N*M); ++i)
18 // f(A[i]);
19 //
20 // It can also flatten loops where the induction variables are not used in the
21 // loop. This is only worth doing if the induction variables are only used in an
22 // expression like i*M+j. If they had any other uses, we would have to insert a
23 // div/mod to reconstruct the original values, so this wouldn't be profitable.
24 //
25 // We also need to prove that N*M will not overflow.
26 //
27 //===----------------------------------------------------------------------===//
28
29 #include "llvm/Transforms/Scalar/LoopFlatten.h"
30 #include "llvm/Analysis/AssumptionCache.h"
31 #include "llvm/Analysis/LoopInfo.h"
32 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
33 #include "llvm/Analysis/ScalarEvolution.h"
34 #include "llvm/Analysis/TargetTransformInfo.h"
35 #include "llvm/Analysis/ValueTracking.h"
36 #include "llvm/IR/Dominators.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/IRBuilder.h"
39 #include "llvm/IR/Module.h"
40 #include "llvm/IR/PatternMatch.h"
41 #include "llvm/IR/Verifier.h"
42 #include "llvm/InitializePasses.h"
43 #include "llvm/Pass.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include "llvm/Transforms/Scalar.h"
47 #include "llvm/Transforms/Utils/Local.h"
48 #include "llvm/Transforms/Utils/LoopUtils.h"
49 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
50 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
51
52 #define DEBUG_TYPE "loop-flatten"
53
54 using namespace llvm;
55 using namespace llvm::PatternMatch;
56
57 static cl::opt<unsigned> RepeatedInstructionThreshold(
58 "loop-flatten-cost-threshold", cl::Hidden, cl::init(2),
59 cl::desc("Limit on the cost of instructions that can be repeated due to "
60 "loop flattening"));
61
62 static cl::opt<bool>
63 AssumeNoOverflow("loop-flatten-assume-no-overflow", cl::Hidden,
64 cl::init(false),
65 cl::desc("Assume that the product of the two iteration "
66 "limits will never overflow"));
67
68 static cl::opt<bool>
69 WidenIV("loop-flatten-widen-iv", cl::Hidden,
70 cl::init(true),
71 cl::desc("Widen the loop induction variables, if possible, so "
72 "overflow checks won't reject flattening"));
73
74 struct FlattenInfo {
75 Loop *OuterLoop = nullptr;
76 Loop *InnerLoop = nullptr;
77 PHINode *InnerInductionPHI = nullptr;
78 PHINode *OuterInductionPHI = nullptr;
79 Value *InnerLimit = nullptr;
80 Value *OuterLimit = nullptr;
81 BinaryOperator *InnerIncrement = nullptr;
82 BinaryOperator *OuterIncrement = nullptr;
83 BranchInst *InnerBranch = nullptr;
84 BranchInst *OuterBranch = nullptr;
85 SmallPtrSet<Value *, 4> LinearIVUses;
86 SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
87
88 // Whether this holds the flatten info before or after widening.
89 bool Widened = false;
90
FlattenInfoFlattenInfo91 FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
92 };
93
94 // Finds the induction variable, increment and limit for a simple loop that we
95 // can flatten.
findLoopComponents(Loop * L,SmallPtrSetImpl<Instruction * > & IterationInstructions,PHINode * & InductionPHI,Value * & Limit,BinaryOperator * & Increment,BranchInst * & BackBranch,ScalarEvolution * SE)96 static bool findLoopComponents(
97 Loop *L, SmallPtrSetImpl<Instruction *> &IterationInstructions,
98 PHINode *&InductionPHI, Value *&Limit, BinaryOperator *&Increment,
99 BranchInst *&BackBranch, ScalarEvolution *SE) {
100 LLVM_DEBUG(dbgs() << "Finding components of loop: " << L->getName() << "\n");
101
102 if (!L->isLoopSimplifyForm()) {
103 LLVM_DEBUG(dbgs() << "Loop is not in normal form\n");
104 return false;
105 }
106
107 // There must be exactly one exiting block, and it must be the same at the
108 // latch.
109 BasicBlock *Latch = L->getLoopLatch();
110 if (L->getExitingBlock() != Latch) {
111 LLVM_DEBUG(dbgs() << "Exiting and latch block are different\n");
112 return false;
113 }
114 // Latch block must end in a conditional branch.
115 BackBranch = dyn_cast<BranchInst>(Latch->getTerminator());
116 if (!BackBranch || !BackBranch->isConditional()) {
117 LLVM_DEBUG(dbgs() << "Could not find back-branch\n");
118 return false;
119 }
120 IterationInstructions.insert(BackBranch);
121 LLVM_DEBUG(dbgs() << "Found back branch: "; BackBranch->dump());
122 bool ContinueOnTrue = L->contains(BackBranch->getSuccessor(0));
123
124 // Find the induction PHI. If there is no induction PHI, we can't do the
125 // transformation. TODO: could other variables trigger this? Do we have to
126 // search for the best one?
127 InductionPHI = nullptr;
128 for (PHINode &PHI : L->getHeader()->phis()) {
129 InductionDescriptor ID;
130 if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) {
131 InductionPHI = &PHI;
132 LLVM_DEBUG(dbgs() << "Found induction PHI: "; InductionPHI->dump());
133 break;
134 }
135 }
136 if (!InductionPHI) {
137 LLVM_DEBUG(dbgs() << "Could not find induction PHI\n");
138 return false;
139 }
140
141 auto IsValidPredicate = [&](ICmpInst::Predicate Pred) {
142 if (ContinueOnTrue)
143 return Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT;
144 else
145 return Pred == CmpInst::ICMP_EQ;
146 };
147
148 // Find Compare and make sure it is valid
149 ICmpInst *Compare = dyn_cast<ICmpInst>(BackBranch->getCondition());
150 if (!Compare || !IsValidPredicate(Compare->getUnsignedPredicate()) ||
151 Compare->hasNUsesOrMore(2)) {
152 LLVM_DEBUG(dbgs() << "Could not find valid comparison\n");
153 return false;
154 }
155 IterationInstructions.insert(Compare);
156 LLVM_DEBUG(dbgs() << "Found comparison: "; Compare->dump());
157
158 // Find increment and limit from the compare
159 Increment = nullptr;
160 if (match(Compare->getOperand(0),
161 m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
162 Increment = dyn_cast<BinaryOperator>(Compare->getOperand(0));
163 Limit = Compare->getOperand(1);
164 } else if (Compare->getUnsignedPredicate() == CmpInst::ICMP_NE &&
165 match(Compare->getOperand(1),
166 m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
167 Increment = dyn_cast<BinaryOperator>(Compare->getOperand(1));
168 Limit = Compare->getOperand(0);
169 }
170 if (!Increment || Increment->hasNUsesOrMore(3)) {
171 LLVM_DEBUG(dbgs() << "Cound not find valid increment\n");
172 return false;
173 }
174 IterationInstructions.insert(Increment);
175 LLVM_DEBUG(dbgs() << "Found increment: "; Increment->dump());
176 LLVM_DEBUG(dbgs() << "Found limit: "; Limit->dump());
177
178 assert(InductionPHI->getNumIncomingValues() == 2);
179
180 if (InductionPHI->getIncomingValueForBlock(Latch) != Increment) {
181 LLVM_DEBUG(
182 dbgs() << "Incoming value from latch is not the increment inst\n");
183 return false;
184 }
185
186 auto *CI = dyn_cast<ConstantInt>(
187 InductionPHI->getIncomingValueForBlock(L->getLoopPreheader()));
188 if (!CI || !CI->isZero()) {
189 LLVM_DEBUG(dbgs() << "PHI value is not zero: "; CI->dump());
190 return false;
191 }
192
193 LLVM_DEBUG(dbgs() << "Successfully found all loop components\n");
194 return true;
195 }
196
checkPHIs(FlattenInfo & FI,const TargetTransformInfo * TTI)197 static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) {
198 // All PHIs in the inner and outer headers must either be:
199 // - The induction PHI, which we are going to rewrite as one induction in
200 // the new loop. This is already checked by findLoopComponents.
201 // - An outer header PHI with all incoming values from outside the loop.
202 // LoopSimplify guarantees we have a pre-header, so we don't need to
203 // worry about that here.
204 // - Pairs of PHIs in the inner and outer headers, which implement a
205 // loop-carried dependency that will still be valid in the new loop. To
206 // be valid, this variable must be modified only in the inner loop.
207
208 // The set of PHI nodes in the outer loop header that we know will still be
209 // valid after the transformation. These will not need to be modified (with
210 // the exception of the induction variable), but we do need to check that
211 // there are no unsafe PHI nodes.
212 SmallPtrSet<PHINode *, 4> SafeOuterPHIs;
213 SafeOuterPHIs.insert(FI.OuterInductionPHI);
214
215 // Check that all PHI nodes in the inner loop header match one of the valid
216 // patterns.
217 for (PHINode &InnerPHI : FI.InnerLoop->getHeader()->phis()) {
218 // The induction PHIs break these rules, and that's OK because we treat
219 // them specially when doing the transformation.
220 if (&InnerPHI == FI.InnerInductionPHI)
221 continue;
222
223 // Each inner loop PHI node must have two incoming values/blocks - one
224 // from the pre-header, and one from the latch.
225 assert(InnerPHI.getNumIncomingValues() == 2);
226 Value *PreHeaderValue =
227 InnerPHI.getIncomingValueForBlock(FI.InnerLoop->getLoopPreheader());
228 Value *LatchValue =
229 InnerPHI.getIncomingValueForBlock(FI.InnerLoop->getLoopLatch());
230
231 // The incoming value from the outer loop must be the PHI node in the
232 // outer loop header, with no modifications made in the top of the outer
233 // loop.
234 PHINode *OuterPHI = dyn_cast<PHINode>(PreHeaderValue);
235 if (!OuterPHI || OuterPHI->getParent() != FI.OuterLoop->getHeader()) {
236 LLVM_DEBUG(dbgs() << "value modified in top of outer loop\n");
237 return false;
238 }
239
240 // The other incoming value must come from the inner loop, without any
241 // modifications in the tail end of the outer loop. We are in LCSSA form,
242 // so this will actually be a PHI in the inner loop's exit block, which
243 // only uses values from inside the inner loop.
244 PHINode *LCSSAPHI = dyn_cast<PHINode>(
245 OuterPHI->getIncomingValueForBlock(FI.OuterLoop->getLoopLatch()));
246 if (!LCSSAPHI) {
247 LLVM_DEBUG(dbgs() << "could not find LCSSA PHI\n");
248 return false;
249 }
250
251 // The value used by the LCSSA PHI must be the same one that the inner
252 // loop's PHI uses.
253 if (LCSSAPHI->hasConstantValue() != LatchValue) {
254 LLVM_DEBUG(
255 dbgs() << "LCSSA PHI incoming value does not match latch value\n");
256 return false;
257 }
258
259 LLVM_DEBUG(dbgs() << "PHI pair is safe:\n");
260 LLVM_DEBUG(dbgs() << " Inner: "; InnerPHI.dump());
261 LLVM_DEBUG(dbgs() << " Outer: "; OuterPHI->dump());
262 SafeOuterPHIs.insert(OuterPHI);
263 FI.InnerPHIsToTransform.insert(&InnerPHI);
264 }
265
266 for (PHINode &OuterPHI : FI.OuterLoop->getHeader()->phis()) {
267 if (!SafeOuterPHIs.count(&OuterPHI)) {
268 LLVM_DEBUG(dbgs() << "found unsafe PHI in outer loop: "; OuterPHI.dump());
269 return false;
270 }
271 }
272
273 LLVM_DEBUG(dbgs() << "checkPHIs: OK\n");
274 return true;
275 }
276
277 static bool
checkOuterLoopInsts(FlattenInfo & FI,SmallPtrSetImpl<Instruction * > & IterationInstructions,const TargetTransformInfo * TTI)278 checkOuterLoopInsts(FlattenInfo &FI,
279 SmallPtrSetImpl<Instruction *> &IterationInstructions,
280 const TargetTransformInfo *TTI) {
281 // Check for instructions in the outer but not inner loop. If any of these
282 // have side-effects then this transformation is not legal, and if there is
283 // a significant amount of code here which can't be optimised out that it's
284 // not profitable (as these instructions would get executed for each
285 // iteration of the inner loop).
286 InstructionCost RepeatedInstrCost = 0;
287 for (auto *B : FI.OuterLoop->getBlocks()) {
288 if (FI.InnerLoop->contains(B))
289 continue;
290
291 for (auto &I : *B) {
292 if (!isa<PHINode>(&I) && !I.isTerminator() &&
293 !isSafeToSpeculativelyExecute(&I)) {
294 LLVM_DEBUG(dbgs() << "Cannot flatten because instruction may have "
295 "side effects: ";
296 I.dump());
297 return false;
298 }
299 // The execution count of the outer loop's iteration instructions
300 // (increment, compare and branch) will be increased, but the
301 // equivalent instructions will be removed from the inner loop, so
302 // they make a net difference of zero.
303 if (IterationInstructions.count(&I))
304 continue;
305 // The uncoditional branch to the inner loop's header will turn into
306 // a fall-through, so adds no cost.
307 BranchInst *Br = dyn_cast<BranchInst>(&I);
308 if (Br && Br->isUnconditional() &&
309 Br->getSuccessor(0) == FI.InnerLoop->getHeader())
310 continue;
311 // Multiplies of the outer iteration variable and inner iteration
312 // count will be optimised out.
313 if (match(&I, m_c_Mul(m_Specific(FI.OuterInductionPHI),
314 m_Specific(FI.InnerLimit))))
315 continue;
316 InstructionCost Cost =
317 TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
318 LLVM_DEBUG(dbgs() << "Cost " << Cost << ": "; I.dump());
319 RepeatedInstrCost += Cost;
320 }
321 }
322
323 LLVM_DEBUG(dbgs() << "Cost of instructions that will be repeated: "
324 << RepeatedInstrCost << "\n");
325 // Bail out if flattening the loops would cause instructions in the outer
326 // loop but not in the inner loop to be executed extra times.
327 if (RepeatedInstrCost > RepeatedInstructionThreshold) {
328 LLVM_DEBUG(dbgs() << "checkOuterLoopInsts: not profitable, bailing.\n");
329 return false;
330 }
331
332 LLVM_DEBUG(dbgs() << "checkOuterLoopInsts: OK\n");
333 return true;
334 }
335
checkIVUsers(FlattenInfo & FI)336 static bool checkIVUsers(FlattenInfo &FI) {
337 // We require all uses of both induction variables to match this pattern:
338 //
339 // (OuterPHI * InnerLimit) + InnerPHI
340 //
341 // Any uses of the induction variables not matching that pattern would
342 // require a div/mod to reconstruct in the flattened loop, so the
343 // transformation wouldn't be profitable.
344
345 Value *InnerLimit = FI.InnerLimit;
346 if (FI.Widened &&
347 (isa<SExtInst>(InnerLimit) || isa<ZExtInst>(InnerLimit)))
348 InnerLimit = cast<Instruction>(InnerLimit)->getOperand(0);
349
350 // Check that all uses of the inner loop's induction variable match the
351 // expected pattern, recording the uses of the outer IV.
352 SmallPtrSet<Value *, 4> ValidOuterPHIUses;
353 for (User *U : FI.InnerInductionPHI->users()) {
354 if (U == FI.InnerIncrement)
355 continue;
356
357 // After widening the IVs, a trunc instruction might have been introduced, so
358 // look through truncs.
359 if (isa<TruncInst>(U)) {
360 if (!U->hasOneUse())
361 return false;
362 U = *U->user_begin();
363 }
364
365 LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
366
367 Value *MatchedMul;
368 Value *MatchedItCount;
369 bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI),
370 m_Value(MatchedMul))) &&
371 match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI),
372 m_Value(MatchedItCount)));
373
374 // Matches the same pattern as above, except it also looks for truncs
375 // on the phi, which can be the result of widening the induction variables.
376 bool IsAddTrunc = match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
377 m_Value(MatchedMul))) &&
378 match(MatchedMul,
379 m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
380 m_Value(MatchedItCount)));
381
382 if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerLimit) {
383 LLVM_DEBUG(dbgs() << "Use is optimisable\n");
384 ValidOuterPHIUses.insert(MatchedMul);
385 FI.LinearIVUses.insert(U);
386 } else {
387 LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
388 return false;
389 }
390 }
391
392 // Check that there are no uses of the outer IV other than the ones found
393 // as part of the pattern above.
394 for (User *U : FI.OuterInductionPHI->users()) {
395 if (U == FI.OuterIncrement)
396 continue;
397
398 auto IsValidOuterPHIUses = [&] (User *U) -> bool {
399 LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
400 if (!ValidOuterPHIUses.count(U)) {
401 LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
402 return false;
403 }
404 LLVM_DEBUG(dbgs() << "Use is optimisable\n");
405 return true;
406 };
407
408 if (auto *V = dyn_cast<TruncInst>(U)) {
409 for (auto *K : V->users()) {
410 if (!IsValidOuterPHIUses(K))
411 return false;
412 }
413 continue;
414 }
415
416 if (!IsValidOuterPHIUses(U))
417 return false;
418 }
419
420 LLVM_DEBUG(dbgs() << "checkIVUsers: OK\n";
421 dbgs() << "Found " << FI.LinearIVUses.size()
422 << " value(s) that can be replaced:\n";
423 for (Value *V : FI.LinearIVUses) {
424 dbgs() << " ";
425 V->dump();
426 });
427 return true;
428 }
429
430 // Return an OverflowResult dependant on if overflow of the multiplication of
431 // InnerLimit and OuterLimit can be assumed not to happen.
checkOverflow(FlattenInfo & FI,DominatorTree * DT,AssumptionCache * AC)432 static OverflowResult checkOverflow(FlattenInfo &FI, DominatorTree *DT,
433 AssumptionCache *AC) {
434 Function *F = FI.OuterLoop->getHeader()->getParent();
435 const DataLayout &DL = F->getParent()->getDataLayout();
436
437 // For debugging/testing.
438 if (AssumeNoOverflow)
439 return OverflowResult::NeverOverflows;
440
441 // Check if the multiply could not overflow due to known ranges of the
442 // input values.
443 OverflowResult OR = computeOverflowForUnsignedMul(
444 FI.InnerLimit, FI.OuterLimit, DL, AC,
445 FI.OuterLoop->getLoopPreheader()->getTerminator(), DT);
446 if (OR != OverflowResult::MayOverflow)
447 return OR;
448
449 for (Value *V : FI.LinearIVUses) {
450 for (Value *U : V->users()) {
451 if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
452 // The IV is used as the operand of a GEP, and the IV is at least as
453 // wide as the address space of the GEP. In this case, the GEP would
454 // wrap around the address space before the IV increment wraps, which
455 // would be UB.
456 if (GEP->isInBounds() &&
457 V->getType()->getIntegerBitWidth() >=
458 DL.getPointerTypeSizeInBits(GEP->getType())) {
459 LLVM_DEBUG(
460 dbgs() << "use of linear IV would be UB if overflow occurred: ";
461 GEP->dump());
462 return OverflowResult::NeverOverflows;
463 }
464 }
465 }
466 }
467
468 return OverflowResult::MayOverflow;
469 }
470
CanFlattenLoopPair(FlattenInfo & FI,DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,const TargetTransformInfo * TTI)471 static bool CanFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
472 ScalarEvolution *SE, AssumptionCache *AC,
473 const TargetTransformInfo *TTI) {
474 SmallPtrSet<Instruction *, 8> IterationInstructions;
475 if (!findLoopComponents(FI.InnerLoop, IterationInstructions, FI.InnerInductionPHI,
476 FI.InnerLimit, FI.InnerIncrement, FI.InnerBranch, SE))
477 return false;
478 if (!findLoopComponents(FI.OuterLoop, IterationInstructions, FI.OuterInductionPHI,
479 FI.OuterLimit, FI.OuterIncrement, FI.OuterBranch, SE))
480 return false;
481
482 // Both of the loop limit values must be invariant in the outer loop
483 // (non-instructions are all inherently invariant).
484 if (!FI.OuterLoop->isLoopInvariant(FI.InnerLimit)) {
485 LLVM_DEBUG(dbgs() << "inner loop limit not invariant\n");
486 return false;
487 }
488 if (!FI.OuterLoop->isLoopInvariant(FI.OuterLimit)) {
489 LLVM_DEBUG(dbgs() << "outer loop limit not invariant\n");
490 return false;
491 }
492
493 if (!checkPHIs(FI, TTI))
494 return false;
495
496 // FIXME: it should be possible to handle different types correctly.
497 if (FI.InnerInductionPHI->getType() != FI.OuterInductionPHI->getType())
498 return false;
499
500 if (!checkOuterLoopInsts(FI, IterationInstructions, TTI))
501 return false;
502
503 // Find the values in the loop that can be replaced with the linearized
504 // induction variable, and check that there are no other uses of the inner
505 // or outer induction variable. If there were, we could still do this
506 // transformation, but we'd have to insert a div/mod to calculate the
507 // original IVs, so it wouldn't be profitable.
508 if (!checkIVUsers(FI))
509 return false;
510
511 LLVM_DEBUG(dbgs() << "CanFlattenLoopPair: OK\n");
512 return true;
513 }
514
DoFlattenLoopPair(FlattenInfo & FI,DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,const TargetTransformInfo * TTI)515 static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
516 ScalarEvolution *SE, AssumptionCache *AC,
517 const TargetTransformInfo *TTI) {
518 Function *F = FI.OuterLoop->getHeader()->getParent();
519 LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n");
520 {
521 using namespace ore;
522 OptimizationRemark Remark(DEBUG_TYPE, "Flattened", FI.InnerLoop->getStartLoc(),
523 FI.InnerLoop->getHeader());
524 OptimizationRemarkEmitter ORE(F);
525 Remark << "Flattened into outer loop";
526 ORE.emit(Remark);
527 }
528
529 Value *NewTripCount =
530 BinaryOperator::CreateMul(FI.InnerLimit, FI.OuterLimit, "flatten.tripcount",
531 FI.OuterLoop->getLoopPreheader()->getTerminator());
532 LLVM_DEBUG(dbgs() << "Created new trip count in preheader: ";
533 NewTripCount->dump());
534
535 // Fix up PHI nodes that take values from the inner loop back-edge, which
536 // we are about to remove.
537 FI.InnerInductionPHI->removeIncomingValue(FI.InnerLoop->getLoopLatch());
538
539 // The old Phi will be optimised away later, but for now we can't leave
540 // leave it in an invalid state, so are updating them too.
541 for (PHINode *PHI : FI.InnerPHIsToTransform)
542 PHI->removeIncomingValue(FI.InnerLoop->getLoopLatch());
543
544 // Modify the trip count of the outer loop to be the product of the two
545 // trip counts.
546 cast<User>(FI.OuterBranch->getCondition())->setOperand(1, NewTripCount);
547
548 // Replace the inner loop backedge with an unconditional branch to the exit.
549 BasicBlock *InnerExitBlock = FI.InnerLoop->getExitBlock();
550 BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock();
551 InnerExitingBlock->getTerminator()->eraseFromParent();
552 BranchInst::Create(InnerExitBlock, InnerExitingBlock);
553 DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
554
555 // Replace all uses of the polynomial calculated from the two induction
556 // variables with the one new one.
557 IRBuilder<> Builder(FI.OuterInductionPHI->getParent()->getTerminator());
558 for (Value *V : FI.LinearIVUses) {
559 Value *OuterValue = FI.OuterInductionPHI;
560 if (FI.Widened)
561 OuterValue = Builder.CreateTrunc(FI.OuterInductionPHI, V->getType(),
562 "flatten.trunciv");
563
564 LLVM_DEBUG(dbgs() << "Replacing: "; V->dump();
565 dbgs() << "with: "; OuterValue->dump());
566 V->replaceAllUsesWith(OuterValue);
567 }
568
569 // Tell LoopInfo, SCEV and the pass manager that the inner loop has been
570 // deleted, and any information that have about the outer loop invalidated.
571 SE->forgetLoop(FI.OuterLoop);
572 SE->forgetLoop(FI.InnerLoop);
573 LI->erase(FI.InnerLoop);
574 return true;
575 }
576
CanWidenIV(FlattenInfo & FI,DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,const TargetTransformInfo * TTI)577 static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
578 ScalarEvolution *SE, AssumptionCache *AC,
579 const TargetTransformInfo *TTI) {
580 if (!WidenIV) {
581 LLVM_DEBUG(dbgs() << "Widening the IVs is disabled\n");
582 return false;
583 }
584
585 LLVM_DEBUG(dbgs() << "Try widening the IVs\n");
586 Module *M = FI.InnerLoop->getHeader()->getParent()->getParent();
587 auto &DL = M->getDataLayout();
588 auto *InnerType = FI.InnerInductionPHI->getType();
589 auto *OuterType = FI.OuterInductionPHI->getType();
590 unsigned MaxLegalSize = DL.getLargestLegalIntTypeSizeInBits();
591 auto *MaxLegalType = DL.getLargestLegalIntType(M->getContext());
592
593 // If both induction types are less than the maximum legal integer width,
594 // promote both to the widest type available so we know calculating
595 // (OuterLimit * InnerLimit) as the new trip count is safe.
596 if (InnerType != OuterType ||
597 InnerType->getScalarSizeInBits() >= MaxLegalSize ||
598 MaxLegalType->getScalarSizeInBits() < InnerType->getScalarSizeInBits() * 2) {
599 LLVM_DEBUG(dbgs() << "Can't widen the IV\n");
600 return false;
601 }
602
603 SCEVExpander Rewriter(*SE, DL, "loopflatten");
604 SmallVector<WideIVInfo, 2> WideIVs;
605 SmallVector<WeakTrackingVH, 4> DeadInsts;
606 WideIVs.push_back( {FI.InnerInductionPHI, MaxLegalType, false });
607 WideIVs.push_back( {FI.OuterInductionPHI, MaxLegalType, false });
608 unsigned ElimExt = 0;
609 unsigned Widened = 0;
610
611 for (const auto &WideIV : WideIVs) {
612 PHINode *WidePhi = createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts,
613 ElimExt, Widened, true /* HasGuards */,
614 true /* UsePostIncrementRanges */);
615 if (!WidePhi)
616 return false;
617 LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump());
618 LLVM_DEBUG(dbgs() << "Deleting old phi: "; WideIV.NarrowIV->dump());
619 RecursivelyDeleteDeadPHINode(WideIV.NarrowIV);
620 }
621 // After widening, rediscover all the loop components.
622 assert(Widened && "Widened IV expected");
623 FI.Widened = true;
624 return CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
625 }
626
FlattenLoopPair(FlattenInfo & FI,DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,const TargetTransformInfo * TTI)627 static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
628 ScalarEvolution *SE, AssumptionCache *AC,
629 const TargetTransformInfo *TTI) {
630 LLVM_DEBUG(
631 dbgs() << "Loop flattening running on outer loop "
632 << FI.OuterLoop->getHeader()->getName() << " and inner loop "
633 << FI.InnerLoop->getHeader()->getName() << " in "
634 << FI.OuterLoop->getHeader()->getParent()->getName() << "\n");
635
636 if (!CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI))
637 return false;
638
639 // Check if we can widen the induction variables to avoid overflow checks.
640 if (CanWidenIV(FI, DT, LI, SE, AC, TTI))
641 return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
642
643 // Check if the new iteration variable might overflow. In this case, we
644 // need to version the loop, and select the original version at runtime if
645 // the iteration space is too large.
646 // TODO: We currently don't version the loop.
647 OverflowResult OR = checkOverflow(FI, DT, AC);
648 if (OR == OverflowResult::AlwaysOverflowsHigh ||
649 OR == OverflowResult::AlwaysOverflowsLow) {
650 LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
651 return false;
652 } else if (OR == OverflowResult::MayOverflow) {
653 LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
654 return false;
655 }
656
657 LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
658 return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
659 }
660
Flatten(DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,TargetTransformInfo * TTI)661 bool Flatten(DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE,
662 AssumptionCache *AC, TargetTransformInfo *TTI) {
663 bool Changed = false;
664 for (auto *InnerLoop : LI->getLoopsInPreorder()) {
665 auto *OuterLoop = InnerLoop->getParentLoop();
666 if (!OuterLoop)
667 continue;
668 FlattenInfo FI(OuterLoop, InnerLoop);
669 Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI);
670 }
671 return Changed;
672 }
673
run(Function & F,FunctionAnalysisManager & AM)674 PreservedAnalyses LoopFlattenPass::run(Function &F,
675 FunctionAnalysisManager &AM) {
676 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
677 auto *LI = &AM.getResult<LoopAnalysis>(F);
678 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
679 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
680 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
681
682 bool Changed = false;
683
684 // The loop flattening pass requires loops to be
685 // in simplified form, and also needs LCSSA. Running
686 // this pass will simplify all loops that contain inner loops,
687 // regardless of whether anything ends up being flattened.
688 for (const auto &L : *LI) {
689 if (L->isInnermost())
690 continue;
691 Changed |=
692 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
693 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
694 }
695
696 Changed |= Flatten(DT, LI, SE, AC, TTI);
697
698 if (!Changed)
699 return PreservedAnalyses::all();
700
701 return PreservedAnalyses::none();
702 }
703
704 namespace {
705 class LoopFlattenLegacyPass : public FunctionPass {
706 public:
707 static char ID; // Pass ID, replacement for typeid
LoopFlattenLegacyPass()708 LoopFlattenLegacyPass() : FunctionPass(ID) {
709 initializeLoopFlattenLegacyPassPass(*PassRegistry::getPassRegistry());
710 }
711
712 // Possibly flatten loop L into its child.
713 bool runOnFunction(Function &F) override;
714
getAnalysisUsage(AnalysisUsage & AU) const715 void getAnalysisUsage(AnalysisUsage &AU) const override {
716 getLoopAnalysisUsage(AU);
717 AU.addRequired<TargetTransformInfoWrapperPass>();
718 AU.addPreserved<TargetTransformInfoWrapperPass>();
719 AU.addRequired<AssumptionCacheTracker>();
720 AU.addPreserved<AssumptionCacheTracker>();
721 }
722 };
723 } // namespace
724
725 char LoopFlattenLegacyPass::ID = 0;
726 INITIALIZE_PASS_BEGIN(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
727 false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)728 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
729 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
730 INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
731 false, false)
732
733 FunctionPass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); }
734
runOnFunction(Function & F)735 bool LoopFlattenLegacyPass::runOnFunction(Function &F) {
736 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
737 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
738 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
739 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
740 auto &TTIP = getAnalysis<TargetTransformInfoWrapperPass>();
741 auto *TTI = &TTIP.getTTI(F);
742 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
743 return Flatten(DT, LI, SE, AC, TTI);
744 }
745