1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop unroll and jam as a routine, much like
10 // LoopUnroll.cpp implements loop unroll.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/Optional.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/Sequence.h"
19 #include "llvm/ADT/SmallPtrSet.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/ADT/StringRef.h"
23 #include "llvm/ADT/Twine.h"
24 #include "llvm/ADT/iterator_range.h"
25 #include "llvm/Analysis/AssumptionCache.h"
26 #include "llvm/Analysis/DependenceAnalysis.h"
27 #include "llvm/Analysis/DomTreeUpdater.h"
28 #include "llvm/Analysis/LoopInfo.h"
29 #include "llvm/Analysis/LoopIterator.h"
30 #include "llvm/Analysis/MustExecute.h"
31 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
32 #include "llvm/Analysis/ScalarEvolution.h"
33 #include "llvm/IR/BasicBlock.h"
34 #include "llvm/IR/DebugInfoMetadata.h"
35 #include "llvm/IR/DebugLoc.h"
36 #include "llvm/IR/DiagnosticInfo.h"
37 #include "llvm/IR/Dominators.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicInst.h"
42 #include "llvm/IR/Use.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/IR/Value.h"
45 #include "llvm/IR/ValueHandle.h"
46 #include "llvm/IR/ValueMap.h"
47 #include "llvm/Support/Casting.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/ErrorHandling.h"
50 #include "llvm/Support/GenericDomTree.h"
51 #include "llvm/Support/raw_ostream.h"
52 #include "llvm/Transforms/Scalar/LoopPassManager.h"
53 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
54 #include "llvm/Transforms/Utils/Cloning.h"
55 #include "llvm/Transforms/Utils/LoopUtils.h"
56 #include "llvm/Transforms/Utils/UnrollLoop.h"
57 #include "llvm/Transforms/Utils/ValueMapper.h"
58 #include <assert.h>
59 #include <memory>
60 #include <type_traits>
61 #include <vector>
62
63 using namespace llvm;
64
65 #define DEBUG_TYPE "loop-unroll-and-jam"
66
67 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
68 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
69
70 typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
71
72 // Partition blocks in an outer/inner loop pair into blocks before and after
73 // the loop
partitionLoopBlocks(Loop & L,BasicBlockSet & ForeBlocks,BasicBlockSet & AftBlocks,DominatorTree & DT)74 static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
75 BasicBlockSet &AftBlocks, DominatorTree &DT) {
76 Loop *SubLoop = L.getSubLoops()[0];
77 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
78
79 for (BasicBlock *BB : L.blocks()) {
80 if (!SubLoop->contains(BB)) {
81 if (DT.dominates(SubLoopLatch, BB))
82 AftBlocks.insert(BB);
83 else
84 ForeBlocks.insert(BB);
85 }
86 }
87
88 // Check that all blocks in ForeBlocks together dominate the subloop
89 // TODO: This might ideally be done better with a dominator/postdominators.
90 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
91 for (BasicBlock *BB : ForeBlocks) {
92 if (BB == SubLoopPreHeader)
93 continue;
94 Instruction *TI = BB->getTerminator();
95 for (BasicBlock *Succ : successors(TI))
96 if (!ForeBlocks.count(Succ))
97 return false;
98 }
99
100 return true;
101 }
102
103 /// Partition blocks in a loop nest into blocks before and after each inner
104 /// loop.
partitionOuterLoopBlocks(Loop & Root,Loop & JamLoop,BasicBlockSet & JamLoopBlocks,DenseMap<Loop *,BasicBlockSet> & ForeBlocksMap,DenseMap<Loop *,BasicBlockSet> & AftBlocksMap,DominatorTree & DT)105 static bool partitionOuterLoopBlocks(
106 Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
107 DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
108 DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
109 JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
110
111 for (Loop *L : Root.getLoopsInPreorder()) {
112 if (L == &JamLoop)
113 break;
114
115 if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
116 return false;
117 }
118
119 return true;
120 }
121
122 // TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more
123 // than 2 levels loop.
partitionOuterLoopBlocks(Loop * L,Loop * SubLoop,BasicBlockSet & ForeBlocks,BasicBlockSet & SubLoopBlocks,BasicBlockSet & AftBlocks,DominatorTree * DT)124 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
125 BasicBlockSet &ForeBlocks,
126 BasicBlockSet &SubLoopBlocks,
127 BasicBlockSet &AftBlocks,
128 DominatorTree *DT) {
129 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
130 return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
131 }
132
133 // Looks at the phi nodes in Header for values coming from Latch. For these
134 // instructions and all their operands calls Visit on them, keeping going for
135 // all the operands in AftBlocks. Returns false if Visit returns false,
136 // otherwise returns true. This is used to process the instructions in the
137 // Aft blocks that need to be moved before the subloop. It is used in two
138 // places. One to check that the required set of instructions can be moved
139 // before the loop. Then to collect the instructions to actually move in
140 // moveHeaderPhiOperandsToForeBlocks.
141 template <typename T>
processHeaderPhiOperands(BasicBlock * Header,BasicBlock * Latch,BasicBlockSet & AftBlocks,T Visit)142 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
143 BasicBlockSet &AftBlocks, T Visit) {
144 SmallVector<Instruction *, 8> Worklist;
145 SmallPtrSet<Instruction *, 8> VisitedInstr;
146 for (auto &Phi : Header->phis()) {
147 Value *V = Phi.getIncomingValueForBlock(Latch);
148 if (Instruction *I = dyn_cast<Instruction>(V))
149 Worklist.push_back(I);
150 }
151
152 while (!Worklist.empty()) {
153 Instruction *I = Worklist.pop_back_val();
154 if (!Visit(I))
155 return false;
156 VisitedInstr.insert(I);
157
158 if (AftBlocks.count(I->getParent()))
159 for (auto &U : I->operands())
160 if (Instruction *II = dyn_cast<Instruction>(U))
161 if (!VisitedInstr.count(II))
162 Worklist.push_back(II);
163 }
164
165 return true;
166 }
167
168 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
moveHeaderPhiOperandsToForeBlocks(BasicBlock * Header,BasicBlock * Latch,Instruction * InsertLoc,BasicBlockSet & AftBlocks)169 static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
170 BasicBlock *Latch,
171 Instruction *InsertLoc,
172 BasicBlockSet &AftBlocks) {
173 // We need to ensure we move the instructions in the correct order,
174 // starting with the earliest required instruction and moving forward.
175 std::vector<Instruction *> Visited;
176 processHeaderPhiOperands(Header, Latch, AftBlocks,
177 [&Visited, &AftBlocks](Instruction *I) {
178 if (AftBlocks.count(I->getParent()))
179 Visited.push_back(I);
180 return true;
181 });
182
183 // Move all instructions in program order to before the InsertLoc
184 BasicBlock *InsertLocBB = InsertLoc->getParent();
185 for (Instruction *I : reverse(Visited)) {
186 if (I->getParent() != InsertLocBB)
187 I->moveBefore(InsertLoc);
188 }
189 }
190
/*
  This method performs Unroll and Jam. For a simple loop like:
  for (i = ..)
    Fore(i)
    for (j = ..)
      SubLoop(i, j)
    Aft(i)

  Instead of doing normal inner or outer unrolling, we do:
  for (i = .., i+=2)
    Fore(i)
    Fore(i+1)
    for (j = ..)
      SubLoop(i, j)
      SubLoop(i+1, j)
    Aft(i)
    Aft(i+1)

  So the outer loop is essentially unrolled and then the inner loops are fused
  ("jammed") together into a single loop. This can increase speed when there
  are loads in SubLoop that are invariant to i, as they become shared between
  the now jammed inner loops.

  We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
  Fore blocks are those before the inner loop, Aft are those after. Normal
  Unroll code is used to copy each of these sets of blocks and the results are
  combined together into the final form above.

  isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitability is also advisable.

  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
  necessary to create one and not fully unrolled).
*/
LoopUnrollResult llvm::UnrollAndJamLoop(
    Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
    bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    AssumptionCache *AC, const TargetTransformInfo *TTI,
    OptimizationRemarkEmitter *ORE, LPMUpdater *U, Loop **EpilogueLoop) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(Header && "No header.");
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, /*ForgetAllSCEV*/ false,
                                    LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  assert(Preheader && "No preheader");
  assert(LatchBlock && "No latch block");
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(BI && !BI->isUnconditional());
  // Which successor of the (rotated) latch branch continues the loop, and
  // likewise for the subloop latch.
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions from fore phi operands from AftBlocks into Fore.
  // This ensures the outer-loop induction updates live in Fore, so the cloned
  // Fore blocks of later iterations can use the previous iteration's values.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  // When a FSDiscriminator is enabled, we don't need to add the multiply
  // factors to the discriminators.
  if (Header->getParent()->isDebugInfoForProfiling() && !EnableFSDiscriminator)
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc()) {
            auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
            if (NewDIL)
              I.setDebugLoc(NewDIL.getValue());
            else
              LLVM_DEBUG(dbgs()
                         << "Failed to create new discriminator: "
                         << DIL->getFilename() << " Line: " << DIL->getLine());
          }

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    SmallVector<BasicBlock *, 8> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;
    SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
    NewLoops[L] = L;
    NewLoops[SubLoop] = SubLoop;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      // Tell LI about New.
      addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);

      // Record the first/last clone of each region for this iteration.
      if (ForeBlocks.count(*BB)) {
        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (SubLoopBlocks.count(*BB)) {
        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (AftBlocks.count(*BB)) {
        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    remapInstructionsInBlocks(NewBlocks, LastValueMap);
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        if (auto *II = dyn_cast<AssumeInst>(&I))
          AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      // Collapse to a single incoming edge from the previous Fore iteration.
      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // is the last unrolled iterations values.

  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
  ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);

  if (CompletelyUnroll) {
    // The loop control is gone; header phis take their preheader value.
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
    ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],
                                            ForeBlocksLast.back());
  SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
                                            SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
                                               ForeBlocksLast.back());
    SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
                                               SubLoopBlocksLast.back());
    // The jammed inner loops share one header; collect all their phis there.
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, AftTerm);
    AftTerm->eraseFromParent();
  } else {
    AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
    assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
           "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
  }
  AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
                                        SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
                                           SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DTU.applyUpdatesPermissive(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());

  MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);

  // Apply updates to the DomTree.
  DT = &DTU.getDomTree();

  // At this point, the code is well formed. We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
                          TTI);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll) {
    if (U)
      U->markLoopAsDeleted(*L, std::string(L->getName()));
    LI->erase(L);
  }

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OutestLoop = SubLoop->getParentLoop()
                         ? SubLoop->getParentLoop()->getParentLoop()
                               ? SubLoop->getParentLoop()->getParentLoop()
                               : SubLoop->getParentLoop()
                         : SubLoop;
  assert(DT->verify());
  LI->verify(*DT);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  SE->verify();
#endif

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}
633
getLoadsAndStores(BasicBlockSet & Blocks,SmallVector<Instruction *,4> & MemInstr)634 static bool getLoadsAndStores(BasicBlockSet &Blocks,
635 SmallVector<Instruction *, 4> &MemInstr) {
636 // Scan the BBs and collect legal loads and stores.
637 // Returns false if non-simple loads/stores are found.
638 for (BasicBlock *BB : Blocks) {
639 for (Instruction &I : *BB) {
640 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
641 if (!Ld->isSimple())
642 return false;
643 MemInstr.push_back(&I);
644 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
645 if (!St->isSimple())
646 return false;
647 MemInstr.push_back(&I);
648 } else if (I.mayReadOrWriteMemory()) {
649 return false;
650 }
651 }
652 }
653 return true;
654 }
655
preservesForwardDependence(Instruction * Src,Instruction * Dst,unsigned UnrollLevel,unsigned JamLevel,bool Sequentialized,Dependence * D)656 static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
657 unsigned UnrollLevel, unsigned JamLevel,
658 bool Sequentialized, Dependence *D) {
659 // UnrollLevel might carry the dependency Src --> Dst
660 // Does a different loop after unrolling?
661 for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
662 ++CurLoopDepth) {
663 auto JammedDir = D->getDirection(CurLoopDepth);
664 if (JammedDir == Dependence::DVEntry::LT)
665 return true;
666
667 if (JammedDir & Dependence::DVEntry::GT)
668 return false;
669 }
670
671 return true;
672 }
673
preservesBackwardDependence(Instruction * Src,Instruction * Dst,unsigned UnrollLevel,unsigned JamLevel,bool Sequentialized,Dependence * D)674 static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
675 unsigned UnrollLevel, unsigned JamLevel,
676 bool Sequentialized, Dependence *D) {
677 // UnrollLevel might carry the dependency Dst --> Src
678 for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
679 ++CurLoopDepth) {
680 auto JammedDir = D->getDirection(CurLoopDepth);
681 if (JammedDir == Dependence::DVEntry::GT)
682 return true;
683
684 if (JammedDir & Dependence::DVEntry::LT)
685 return false;
686 }
687
688 // Backward dependencies are only preserved if not interleaved.
689 return Sequentialized;
690 }
691
// Check whether it is semantically safe to execute Src and Dst considering any
// potential dependency between them.
//
// @param UnrollLevel The level of the loop being unrolled
// @param JamLevel    The level of the loop being jammed; if Src and Dst are on
//                    different levels, the outermost common loop counts as
//                    jammed level
//
// @return true if it is safe and false if there is a dependency violation.
static bool checkDependency(Instruction *Src, Instruction *Dst,
                            unsigned UnrollLevel, unsigned JamLevel,
                            bool Sequentialized, DependenceInfo &DI) {
  assert(UnrollLevel <= JamLevel &&
         "Expecting JamLevel to be at least UnrollLevel");

  // A dependency of an instruction on itself is trivially preserved.
  if (Src == Dst)
    return true;
  // Ignore Input dependencies (load/load never constrains the transform).
  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
    return true;

  // Check whether unroll-and-jam may violate a dependency.
  // By construction, every dependency will be lexicographically non-negative
  // (if it were negative, it would violate the current execution order), such
  // as
  //   (0,0,>,*,*)
  // Unroll-and-jam changes the GT execution of two executions to the same
  // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
  // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
  //   (0,0,>=,*,*)
  // Now, the dependency is not necessarily non-negative anymore, i.e.
  // unroll-and-jam may violate correctness.
  std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
  if (!D)
    return true;
  assert(D->isOrdered() && "Expected an output, flow or anti dep.");

  // If the dependence test cannot classify the dependency, conservatively
  // treat it as a violation.
  if (D->isConfused()) {
    LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
                      << "  " << *Src << "\n"
                      << "  " << *Dst << "\n");
    return false;
  }

  // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
  // non-equal direction, then the locations accessed in the inner levels cannot
  // overlap in memory. We assumes the indexes never overlap into neighboring
  // dimensions.
  for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
    if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
      return true;

  auto UnrollDirection = D->getDirection(UnrollLevel);

  // If the distance carried by the unrolled loop is 0, then after unrolling
  // that distance will become non-zero resulting in non-overlapping accesses in
  // the inner loops.
  if (UnrollDirection == Dependence::DVEntry::EQ)
    return true;

  // A forward (LT) component must remain forward in some jammed level.
  if (UnrollDirection & Dependence::DVEntry::LT &&
      !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                  Sequentialized, D.get()))
    return false;

  // A backward (GT) component must remain backward (or be sequentialized).
  if (UnrollDirection & Dependence::DVEntry::GT &&
      !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                   Sequentialized, D.get()))
    return false;

  return true;
}
762
763 static bool
checkDependencies(Loop & Root,const BasicBlockSet & SubLoopBlocks,const DenseMap<Loop *,BasicBlockSet> & ForeBlocksMap,const DenseMap<Loop *,BasicBlockSet> & AftBlocksMap,DependenceInfo & DI,LoopInfo & LI)764 checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
765 const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
766 const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
767 DependenceInfo &DI, LoopInfo &LI) {
768 SmallVector<BasicBlockSet, 8> AllBlocks;
769 for (Loop *L : Root.getLoopsInPreorder())
770 if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
771 AllBlocks.push_back(ForeBlocksMap.lookup(L));
772 AllBlocks.push_back(SubLoopBlocks);
773 for (Loop *L : Root.getLoopsInPreorder())
774 if (AftBlocksMap.find(L) != AftBlocksMap.end())
775 AllBlocks.push_back(AftBlocksMap.lookup(L));
776
777 unsigned LoopDepth = Root.getLoopDepth();
778 SmallVector<Instruction *, 4> EarlierLoadsAndStores;
779 SmallVector<Instruction *, 4> CurrentLoadsAndStores;
780 for (BasicBlockSet &Blocks : AllBlocks) {
781 CurrentLoadsAndStores.clear();
782 if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
783 return false;
784
785 Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
786 unsigned CurLoopDepth = CurLoop->getLoopDepth();
787
788 for (auto *Earlier : EarlierLoadsAndStores) {
789 Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
790 unsigned EarlierDepth = EarlierLoop->getLoopDepth();
791 unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
792 for (auto *Later : CurrentLoadsAndStores) {
793 if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
794 DI))
795 return false;
796 }
797 }
798
799 size_t NumInsts = CurrentLoadsAndStores.size();
800 for (size_t I = 0; I < NumInsts; ++I) {
801 for (size_t J = I; J < NumInsts; ++J) {
802 if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
803 LoopDepth, CurLoopDepth, true, DI))
804 return false;
805 }
806 }
807
808 EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
809 CurrentLoadsAndStores.end());
810 }
811 return true;
812 }
813
isEligibleLoopForm(const Loop & Root)814 static bool isEligibleLoopForm(const Loop &Root) {
815 // Root must have a child.
816 if (Root.getSubLoops().size() != 1)
817 return false;
818
819 const Loop *L = &Root;
820 do {
821 // All loops in Root need to be in simplify and rotated form.
822 if (!L->isLoopSimplifyForm())
823 return false;
824
825 if (!L->isRotatedForm())
826 return false;
827
828 if (L->getHeader()->hasAddressTaken()) {
829 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
830 return false;
831 }
832
833 unsigned SubLoopsSize = L->getSubLoops().size();
834 if (SubLoopsSize == 0)
835 return true;
836
837 // Only one child is allowed.
838 if (SubLoopsSize != 1)
839 return false;
840
841 // Only loops with a single exit block can be unrolled and jammed.
842 // The function getExitBlock() is used for this check, rather than
843 // getUniqueExitBlock() to ensure loops with mulitple exit edges are
844 // disallowed.
845 if (!L->getExitBlock()) {
846 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit "
847 "blocks can be unrolled and jammed.\n");
848 return false;
849 }
850
851 // Only loops with a single exiting block can be unrolled and jammed.
852 if (!L->getExitingBlock()) {
853 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single "
854 "exiting blocks can be unrolled and jammed.\n");
855 return false;
856 }
857
858 L = L->getSubLoops()[0];
859 } while (L);
860
861 return true;
862 }
863
getInnerMostLoop(Loop * L)864 static Loop *getInnerMostLoop(Loop *L) {
865 while (!L->getSubLoops().empty())
866 L = L->getSubLoops()[0];
867 return L;
868 }
869
isSafeToUnrollAndJam(Loop * L,ScalarEvolution & SE,DominatorTree & DT,DependenceInfo & DI,LoopInfo & LI)870 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
871 DependenceInfo &DI, LoopInfo &LI) {
872 if (!isEligibleLoopForm(*L)) {
873 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
874 return false;
875 }
876
877 /* We currently handle outer loops like this:
878 |
879 ForeFirst <------\ }
880 Blocks | } ForeBlocks of L
881 ForeLast | }
882 | |
883 ... |
884 | |
885 ForeFirst <----\ | }
886 Blocks | | } ForeBlocks of a inner loop of L
887 ForeLast | | }
888 | | |
889 JamLoopFirst <\ | | }
890 Blocks | | | } JamLoopBlocks of the innermost loop
891 JamLoopLast -/ | | }
892 | | |
893 AftFirst | | }
894 Blocks | | } AftBlocks of a inner loop of L
895 AftLast ------/ | }
896 | |
897 ... |
898 | |
899 AftFirst | }
900 Blocks | } AftBlocks of L
901 AftLast --------/ }
902 |
903
904 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
905 and AftBlocks, providing that there is one edge from Fores to SubLoops,
906 one edge from SubLoops to Afts and a single outer loop exit (from Afts).
907 In practice we currently limit Aft blocks to a single block, and limit
908 things further in the profitablility checks of the unroll and jam pass.
909
910 Because of the way we rearrange basic blocks, we also require that
911 the Fore blocks of L on all unrolled iterations are safe to move before the
912 blocks of the direct child of L of all iterations. So we require that the
913 phi node looping operands of ForeHeader can be moved to at least the end of
914 ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
915 match up Phi's correctly.
916
917 i.e. The old order of blocks used to be
918 (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
919 It needs to be safe to transform this to
920 (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
921
922 There are then a number of checks along the lines of no calls, no
923 exceptions, inner loop IV is consistent, etc. Note that for loops requiring
924 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
925 UnrollAndJamLoop if the trip count cannot be easily calculated.
926 */
927
928 // Split blocks into Fore/SubLoop/Aft based on dominators
929 Loop *JamLoop = getInnerMostLoop(L);
930 BasicBlockSet SubLoopBlocks;
931 DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
932 DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
933 if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
934 AftBlocksMap, DT)) {
935 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
936 return false;
937 }
938
939 // Aft blocks may need to move instructions to fore blocks, which becomes more
940 // difficult if there are multiple (potentially conditionally executed)
941 // blocks. For now we just exclude loops with multiple aft blocks.
942 if (AftBlocksMap[L].size() != 1) {
943 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
944 "multiple blocks after the loop\n");
945 return false;
946 }
947
948 // Check inner loop backedge count is consistent on all iterations of the
949 // outer loop
950 if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
951 return !hasIterationCountInvariantInParent(SubLoop, SE);
952 })) {
953 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
954 "not consistent on each iteration\n");
955 return false;
956 }
957
958 // Check the loop safety info for exceptions.
959 SimpleLoopSafetyInfo LSI;
960 LSI.computeLoopSafetyInfo(L);
961 if (LSI.anyBlockMayThrow()) {
962 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
963 return false;
964 }
965
966 // We've ruled out the easy stuff and now need to check that there are no
967 // interdependencies which may prevent us from moving the:
968 // ForeBlocks before Subloop and AftBlocks.
969 // Subloop before AftBlocks.
970 // ForeBlock phi operands before the subloop
971
972 // Make sure we can move all instructions we need to before the subloop
973 BasicBlock *Header = L->getHeader();
974 BasicBlock *Latch = L->getLoopLatch();
975 BasicBlockSet AftBlocks = AftBlocksMap[L];
976 Loop *SubLoop = L->getSubLoops()[0];
977 if (!processHeaderPhiOperands(
978 Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
979 if (SubLoop->contains(I->getParent()))
980 return false;
981 if (AftBlocks.count(I->getParent())) {
982 // If we hit a phi node in afts we know we are done (probably
983 // LCSSA)
984 if (isa<PHINode>(I))
985 return false;
986 // Can't move instructions with side effects or memory
987 // reads/writes
988 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
989 return false;
990 }
991 // Keep going
992 return true;
993 })) {
994 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
995 "instructions after subloop to before it\n");
996 return false;
997 }
998
999 // Check for memory dependencies which prohibit the unrolling we are doing.
1000 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
1001 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
1002 if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
1003 LI)) {
1004 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
1005 return false;
1006 }
1007
1008 return true;
1009 }
1010