xref: /netbsd-src/external/apache2/llvm/dist/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop unroll and jam as a routine, much like
10 // LoopUnroll.cpp implements loop unroll.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/Optional.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/Sequence.h"
19 #include "llvm/ADT/SmallPtrSet.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/ADT/StringRef.h"
23 #include "llvm/ADT/Twine.h"
24 #include "llvm/ADT/iterator_range.h"
25 #include "llvm/Analysis/AssumptionCache.h"
26 #include "llvm/Analysis/DependenceAnalysis.h"
27 #include "llvm/Analysis/DomTreeUpdater.h"
28 #include "llvm/Analysis/LoopInfo.h"
29 #include "llvm/Analysis/LoopIterator.h"
30 #include "llvm/Analysis/MustExecute.h"
31 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
32 #include "llvm/Analysis/ScalarEvolution.h"
33 #include "llvm/IR/BasicBlock.h"
34 #include "llvm/IR/DebugInfoMetadata.h"
35 #include "llvm/IR/DebugLoc.h"
36 #include "llvm/IR/DiagnosticInfo.h"
37 #include "llvm/IR/Dominators.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicInst.h"
42 #include "llvm/IR/Use.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/IR/Value.h"
45 #include "llvm/IR/ValueHandle.h"
46 #include "llvm/IR/ValueMap.h"
47 #include "llvm/Support/Casting.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/ErrorHandling.h"
50 #include "llvm/Support/GenericDomTree.h"
51 #include "llvm/Support/raw_ostream.h"
52 #include "llvm/Transforms/Scalar/LoopPassManager.h"
53 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
54 #include "llvm/Transforms/Utils/Cloning.h"
55 #include "llvm/Transforms/Utils/LoopUtils.h"
56 #include "llvm/Transforms/Utils/UnrollLoop.h"
57 #include "llvm/Transforms/Utils/ValueMapper.h"
58 #include <assert.h>
59 #include <memory>
60 #include <type_traits>
61 #include <vector>
62 
63 using namespace llvm;
64 
65 #define DEBUG_TYPE "loop-unroll-and-jam"
66 
67 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
68 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
69 
70 typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
71 
72 // Partition blocks in an outer/inner loop pair into blocks before and after
73 // the loop
partitionLoopBlocks(Loop & L,BasicBlockSet & ForeBlocks,BasicBlockSet & AftBlocks,DominatorTree & DT)74 static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
75                                 BasicBlockSet &AftBlocks, DominatorTree &DT) {
76   Loop *SubLoop = L.getSubLoops()[0];
77   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
78 
79   for (BasicBlock *BB : L.blocks()) {
80     if (!SubLoop->contains(BB)) {
81       if (DT.dominates(SubLoopLatch, BB))
82         AftBlocks.insert(BB);
83       else
84         ForeBlocks.insert(BB);
85     }
86   }
87 
88   // Check that all blocks in ForeBlocks together dominate the subloop
89   // TODO: This might ideally be done better with a dominator/postdominators.
90   BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
91   for (BasicBlock *BB : ForeBlocks) {
92     if (BB == SubLoopPreHeader)
93       continue;
94     Instruction *TI = BB->getTerminator();
95     for (BasicBlock *Succ : successors(TI))
96       if (!ForeBlocks.count(Succ))
97         return false;
98   }
99 
100   return true;
101 }
102 
103 /// Partition blocks in a loop nest into blocks before and after each inner
104 /// loop.
partitionOuterLoopBlocks(Loop & Root,Loop & JamLoop,BasicBlockSet & JamLoopBlocks,DenseMap<Loop *,BasicBlockSet> & ForeBlocksMap,DenseMap<Loop *,BasicBlockSet> & AftBlocksMap,DominatorTree & DT)105 static bool partitionOuterLoopBlocks(
106     Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
107     DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
108     DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
109   JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
110 
111   for (Loop *L : Root.getLoopsInPreorder()) {
112     if (L == &JamLoop)
113       break;
114 
115     if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
116       return false;
117   }
118 
119   return true;
120 }
121 
122 // TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more
123 // than 2 levels loop.
partitionOuterLoopBlocks(Loop * L,Loop * SubLoop,BasicBlockSet & ForeBlocks,BasicBlockSet & SubLoopBlocks,BasicBlockSet & AftBlocks,DominatorTree * DT)124 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
125                                      BasicBlockSet &ForeBlocks,
126                                      BasicBlockSet &SubLoopBlocks,
127                                      BasicBlockSet &AftBlocks,
128                                      DominatorTree *DT) {
129   SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
130   return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
131 }
132 
133 // Looks at the phi nodes in Header for values coming from Latch. For these
134 // instructions and all their operands calls Visit on them, keeping going for
135 // all the operands in AftBlocks. Returns false if Visit returns false,
136 // otherwise returns true. This is used to process the instructions in the
137 // Aft blocks that need to be moved before the subloop. It is used in two
138 // places. One to check that the required set of instructions can be moved
139 // before the loop. Then to collect the instructions to actually move in
140 // moveHeaderPhiOperandsToForeBlocks.
141 template <typename T>
processHeaderPhiOperands(BasicBlock * Header,BasicBlock * Latch,BasicBlockSet & AftBlocks,T Visit)142 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
143                                      BasicBlockSet &AftBlocks, T Visit) {
144   SmallVector<Instruction *, 8> Worklist;
145   SmallPtrSet<Instruction *, 8> VisitedInstr;
146   for (auto &Phi : Header->phis()) {
147     Value *V = Phi.getIncomingValueForBlock(Latch);
148     if (Instruction *I = dyn_cast<Instruction>(V))
149       Worklist.push_back(I);
150   }
151 
152   while (!Worklist.empty()) {
153     Instruction *I = Worklist.pop_back_val();
154     if (!Visit(I))
155       return false;
156     VisitedInstr.insert(I);
157 
158     if (AftBlocks.count(I->getParent()))
159       for (auto &U : I->operands())
160         if (Instruction *II = dyn_cast<Instruction>(U))
161           if (!VisitedInstr.count(II))
162             Worklist.push_back(II);
163   }
164 
165   return true;
166 }
167 
168 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
moveHeaderPhiOperandsToForeBlocks(BasicBlock * Header,BasicBlock * Latch,Instruction * InsertLoc,BasicBlockSet & AftBlocks)169 static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
170                                               BasicBlock *Latch,
171                                               Instruction *InsertLoc,
172                                               BasicBlockSet &AftBlocks) {
173   // We need to ensure we move the instructions in the correct order,
174   // starting with the earliest required instruction and moving forward.
175   std::vector<Instruction *> Visited;
176   processHeaderPhiOperands(Header, Latch, AftBlocks,
177                            [&Visited, &AftBlocks](Instruction *I) {
178                              if (AftBlocks.count(I->getParent()))
179                                Visited.push_back(I);
180                              return true;
181                            });
182 
183   // Move all instructions in program order to before the InsertLoc
184   BasicBlock *InsertLocBB = InsertLoc->getParent();
185   for (Instruction *I : reverse(Visited)) {
186     if (I->getParent() != InsertLocBB)
187       I->moveBefore(InsertLoc);
188   }
189 }
190 
191 /*
192   This method performs Unroll and Jam. For a simple loop like:
193   for (i = ..)
194     Fore(i)
195     for (j = ..)
196       SubLoop(i, j)
197     Aft(i)
198 
199   Instead of doing normal inner or outer unrolling, we do:
200   for (i = .., i+=2)
201     Fore(i)
202     Fore(i+1)
203     for (j = ..)
204       SubLoop(i, j)
205       SubLoop(i+1, j)
206     Aft(i)
207     Aft(i+1)
208 
209   So the outer loop is essetially unrolled and then the inner loops are fused
210   ("jammed") together into a single loop. This can increase speed when there
211   are loads in SubLoop that are invariant to i, as they become shared between
212   the now jammed inner loops.
213 
214   We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
215   Fore blocks are those before the inner loop, Aft are those after. Normal
216   Unroll code is used to copy each of these sets of blocks and the results are
217   combined together into the final form above.
218 
219   isSafeToUnrollAndJam should be used prior to calling this to make sure the
220   unrolling will be valid. Checking profitablility is also advisable.
221 
222   If EpilogueLoop is non-null, it receives the epilogue loop (if it was
223   necessary to create one and not fully unrolled).
224 */
UnrollAndJamLoop(Loop * L,unsigned Count,unsigned TripCount,unsigned TripMultiple,bool UnrollRemainder,LoopInfo * LI,ScalarEvolution * SE,DominatorTree * DT,AssumptionCache * AC,const TargetTransformInfo * TTI,OptimizationRemarkEmitter * ORE,LPMUpdater * U,Loop ** EpilogueLoop)225 LoopUnrollResult llvm::UnrollAndJamLoop(
226     Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
227     bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
228     AssumptionCache *AC, const TargetTransformInfo *TTI,
229     OptimizationRemarkEmitter *ORE, LPMUpdater *U, Loop **EpilogueLoop) {
230 
231   // When we enter here we should have already checked that it is safe
232   BasicBlock *Header = L->getHeader();
233   assert(Header && "No header.");
234   assert(L->getSubLoops().size() == 1);
235   Loop *SubLoop = *L->begin();
236 
237   // Don't enter the unroll code if there is nothing to do.
238   if (TripCount == 0 && Count < 2) {
239     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
240     return LoopUnrollResult::Unmodified;
241   }
242 
243   assert(Count > 0);
244   assert(TripMultiple > 0);
245   assert(TripCount == 0 || TripCount % TripMultiple == 0);
246 
247   // Are we eliminating the loop control altogether?
248   bool CompletelyUnroll = (Count == TripCount);
249 
250   // We use the runtime remainder in cases where we don't know trip multiple
251   if (TripMultiple == 1 || TripMultiple % Count != 0) {
252     if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
253                                     /*UseEpilogRemainder*/ true,
254                                     UnrollRemainder, /*ForgetAllSCEV*/ false,
255                                     LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
256       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
257                            "generated when assuming runtime trip count\n");
258       return LoopUnrollResult::Unmodified;
259     }
260   }
261 
262   // Notify ScalarEvolution that the loop will be substantially changed,
263   // if not outright eliminated.
264   if (SE) {
265     SE->forgetLoop(L);
266     SE->forgetLoop(SubLoop);
267   }
268 
269   using namespace ore;
270   // Report the unrolling decision.
271   if (CompletelyUnroll) {
272     LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
273                       << Header->getName() << " with trip count " << TripCount
274                       << "!\n");
275     ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
276                                  L->getHeader())
277               << "completely unroll and jammed loop with "
278               << NV("UnrollCount", TripCount) << " iterations");
279   } else {
280     auto DiagBuilder = [&]() {
281       OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
282                               L->getHeader());
283       return Diag << "unroll and jammed loop by a factor of "
284                   << NV("UnrollCount", Count);
285     };
286 
287     LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
288                       << " by " << Count);
289     if (TripMultiple != 1) {
290       LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
291       ORE->emit([&]() {
292         return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
293                              << " trips per branch";
294       });
295     } else {
296       LLVM_DEBUG(dbgs() << " with run-time trip count");
297       ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
298     }
299     LLVM_DEBUG(dbgs() << "!\n");
300   }
301 
302   BasicBlock *Preheader = L->getLoopPreheader();
303   BasicBlock *LatchBlock = L->getLoopLatch();
304   assert(Preheader && "No preheader");
305   assert(LatchBlock && "No latch block");
306   BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
307   assert(BI && !BI->isUnconditional());
308   bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
309   BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
310   bool SubLoopContinueOnTrue = SubLoop->contains(
311       SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
312 
313   // Partition blocks in an outer/inner loop pair into blocks before and after
314   // the loop
315   BasicBlockSet SubLoopBlocks;
316   BasicBlockSet ForeBlocks;
317   BasicBlockSet AftBlocks;
318   partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
319                            DT);
320 
321   // We keep track of the entering/first and exiting/last block of each of
322   // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
323   // blocks easier.
324   std::vector<BasicBlock *> ForeBlocksFirst;
325   std::vector<BasicBlock *> ForeBlocksLast;
326   std::vector<BasicBlock *> SubLoopBlocksFirst;
327   std::vector<BasicBlock *> SubLoopBlocksLast;
328   std::vector<BasicBlock *> AftBlocksFirst;
329   std::vector<BasicBlock *> AftBlocksLast;
330   ForeBlocksFirst.push_back(Header);
331   ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
332   SubLoopBlocksFirst.push_back(SubLoop->getHeader());
333   SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
334   AftBlocksFirst.push_back(SubLoop->getExitBlock());
335   AftBlocksLast.push_back(L->getExitingBlock());
336   // Maps Blocks[0] -> Blocks[It]
337   ValueToValueMapTy LastValueMap;
338 
339   // Move any instructions from fore phi operands from AftBlocks into Fore.
340   moveHeaderPhiOperandsToForeBlocks(
341       Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);
342 
343   // The current on-the-fly SSA update requires blocks to be processed in
344   // reverse postorder so that LastValueMap contains the correct value at each
345   // exit.
346   LoopBlocksDFS DFS(L);
347   DFS.perform(LI);
348   // Stash the DFS iterators before adding blocks to the loop.
349   LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
350   LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
351 
352   // When a FSDiscriminator is enabled, we don't need to add the multiply
353   // factors to the discriminators.
354   if (Header->getParent()->isDebugInfoForProfiling() && !EnableFSDiscriminator)
355     for (BasicBlock *BB : L->getBlocks())
356       for (Instruction &I : *BB)
357         if (!isa<DbgInfoIntrinsic>(&I))
358           if (const DILocation *DIL = I.getDebugLoc()) {
359             auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
360             if (NewDIL)
361               I.setDebugLoc(NewDIL.getValue());
362             else
363               LLVM_DEBUG(dbgs()
364                          << "Failed to create new discriminator: "
365                          << DIL->getFilename() << " Line: " << DIL->getLine());
366           }
367 
368   // Copy all blocks
369   for (unsigned It = 1; It != Count; ++It) {
370     SmallVector<BasicBlock *, 8> NewBlocks;
371     // Maps Blocks[It] -> Blocks[It-1]
372     DenseMap<Value *, Value *> PrevItValueMap;
373     SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
374     NewLoops[L] = L;
375     NewLoops[SubLoop] = SubLoop;
376 
377     for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
378       ValueToValueMapTy VMap;
379       BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
380       Header->getParent()->getBasicBlockList().push_back(New);
381 
382       // Tell LI about New.
383       addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
384 
385       if (ForeBlocks.count(*BB)) {
386         if (*BB == ForeBlocksFirst[0])
387           ForeBlocksFirst.push_back(New);
388         if (*BB == ForeBlocksLast[0])
389           ForeBlocksLast.push_back(New);
390       } else if (SubLoopBlocks.count(*BB)) {
391         if (*BB == SubLoopBlocksFirst[0])
392           SubLoopBlocksFirst.push_back(New);
393         if (*BB == SubLoopBlocksLast[0])
394           SubLoopBlocksLast.push_back(New);
395       } else if (AftBlocks.count(*BB)) {
396         if (*BB == AftBlocksFirst[0])
397           AftBlocksFirst.push_back(New);
398         if (*BB == AftBlocksLast[0])
399           AftBlocksLast.push_back(New);
400       } else {
401         llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
402       }
403 
404       // Update our running maps of newest clones
405       PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
406       LastValueMap[*BB] = New;
407       for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
408            VI != VE; ++VI) {
409         PrevItValueMap[VI->second] =
410             const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
411         LastValueMap[VI->first] = VI->second;
412       }
413 
414       NewBlocks.push_back(New);
415 
416       // Update DomTree:
417       if (*BB == ForeBlocksFirst[0])
418         DT->addNewBlock(New, ForeBlocksLast[It - 1]);
419       else if (*BB == SubLoopBlocksFirst[0])
420         DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
421       else if (*BB == AftBlocksFirst[0])
422         DT->addNewBlock(New, AftBlocksLast[It - 1]);
423       else {
424         // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
425         // structure.
426         auto BBDomNode = DT->getNode(*BB);
427         auto BBIDom = BBDomNode->getIDom();
428         BasicBlock *OriginalBBIDom = BBIDom->getBlock();
429         assert(OriginalBBIDom);
430         assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
431         DT->addNewBlock(
432             New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
433       }
434     }
435 
436     // Remap all instructions in the most recent iteration
437     remapInstructionsInBlocks(NewBlocks, LastValueMap);
438     for (BasicBlock *NewBlock : NewBlocks) {
439       for (Instruction &I : *NewBlock) {
440         if (auto *II = dyn_cast<AssumeInst>(&I))
441           AC->registerAssumption(II);
442       }
443     }
444 
445     // Alter the ForeBlocks phi's, pointing them at the latest version of the
446     // value from the previous iteration's phis
447     for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
448       Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
449       assert(OldValue && "should have incoming edge from Aft[It]");
450       Value *NewValue = OldValue;
451       if (Value *PrevValue = PrevItValueMap[OldValue])
452         NewValue = PrevValue;
453 
454       assert(Phi.getNumOperands() == 2);
455       Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
456       Phi.setIncomingValue(0, NewValue);
457       Phi.removeIncomingValue(1);
458     }
459   }
460 
461   // Now that all the basic blocks for the unrolled iterations are in place,
462   // finish up connecting the blocks and phi nodes. At this point LastValueMap
463   // is the last unrolled iterations values.
464 
465   // Update Phis in BB from OldBB to point to NewBB and use the latest value
466   // from LastValueMap
467   auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
468                                      BasicBlock *NewBB,
469                                      ValueToValueMapTy &LastValueMap) {
470     for (PHINode &Phi : BB->phis()) {
471       for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
472         if (Phi.getIncomingBlock(b) == OldBB) {
473           Value *OldValue = Phi.getIncomingValue(b);
474           if (Value *LastValue = LastValueMap[OldValue])
475             Phi.setIncomingValue(b, LastValue);
476           Phi.setIncomingBlock(b, NewBB);
477           break;
478         }
479       }
480     }
481   };
482   // Move all the phis from Src into Dest
483   auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
484     Instruction *insertPoint = Dest->getFirstNonPHI();
485     while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
486       Phi->moveBefore(insertPoint);
487   };
488 
489   // Update the PHI values outside the loop to point to the last block
490   updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
491                            LastValueMap);
492 
493   // Update ForeBlocks successors and phi nodes
494   BranchInst *ForeTerm =
495       cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
496   assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
497   ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
498 
499   if (CompletelyUnroll) {
500     while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
501       Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
502       Phi->getParent()->getInstList().erase(Phi);
503     }
504   } else {
505     // Update the PHI values to point to the last aft block
506     updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
507                              AftBlocksLast.back(), LastValueMap);
508   }
509 
510   for (unsigned It = 1; It != Count; It++) {
511     // Remap ForeBlock successors from previous iteration to this
512     BranchInst *ForeTerm =
513         cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
514     assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
515     ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
516   }
517 
518   // Subloop successors and phis
519   BranchInst *SubTerm =
520       cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
521   SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
522   SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
523   SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],
524                                             ForeBlocksLast.back());
525   SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
526                                             SubLoopBlocksLast.back());
527 
528   for (unsigned It = 1; It != Count; It++) {
529     // Replace the conditional branch of the previous iteration subloop with an
530     // unconditional one to this one
531     BranchInst *SubTerm =
532         cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
533     BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
534     SubTerm->eraseFromParent();
535 
536     SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
537                                                ForeBlocksLast.back());
538     SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
539                                                SubLoopBlocksLast.back());
540     movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
541   }
542 
543   // Aft blocks successors and phis
544   BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
545   if (CompletelyUnroll) {
546     BranchInst::Create(LoopExit, AftTerm);
547     AftTerm->eraseFromParent();
548   } else {
549     AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
550     assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
551            "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
552   }
553   AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
554                                         SubLoopBlocksLast.back());
555 
556   for (unsigned It = 1; It != Count; It++) {
557     // Replace the conditional branch of the previous iteration subloop with an
558     // unconditional one to this one
559     BranchInst *AftTerm =
560         cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
561     BranchInst::Create(AftBlocksFirst[It], AftTerm);
562     AftTerm->eraseFromParent();
563 
564     AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
565                                            SubLoopBlocksLast.back());
566     movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
567   }
568 
569   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
570   // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
571   // new ones required.
572   if (Count != 1) {
573     SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
574     DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
575                            SubLoopBlocksFirst[0]);
576     DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
577                            SubLoopBlocksLast[0], AftBlocksFirst[0]);
578 
579     DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
580                            ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
581     DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
582                            SubLoopBlocksLast.back(), AftBlocksFirst[0]);
583     DTU.applyUpdatesPermissive(DTUpdates);
584   }
585 
586   // Merge adjacent basic blocks, if possible.
587   SmallPtrSet<BasicBlock *, 16> MergeBlocks;
588   MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
589   MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
590   MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
591 
592   MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);
593 
594   // Apply updates to the DomTree.
595   DT = &DTU.getDomTree();
596 
597   // At this point, the code is well formed.  We now do a quick sweep over the
598   // inserted code, doing constant propagation and dead code elimination as we
599   // go.
600   simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
601   simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
602                           TTI);
603 
604   NumCompletelyUnrolledAndJammed += CompletelyUnroll;
605   ++NumUnrolledAndJammed;
606 
607   // Update LoopInfo if the loop is completely removed.
608   if (CompletelyUnroll) {
609     if (U)
610       U->markLoopAsDeleted(*L, std::string(L->getName()));
611     LI->erase(L);
612   }
613 
614 #ifndef NDEBUG
615   // We shouldn't have done anything to break loop simplify form or LCSSA.
616   Loop *OutestLoop = SubLoop->getParentLoop()
617                          ? SubLoop->getParentLoop()->getParentLoop()
618                                ? SubLoop->getParentLoop()->getParentLoop()
619                                : SubLoop->getParentLoop()
620                          : SubLoop;
621   assert(DT->verify());
622   LI->verify(*DT);
623   assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
624   if (!CompletelyUnroll)
625     assert(L->isLoopSimplifyForm());
626   assert(SubLoop->isLoopSimplifyForm());
627   SE->verify();
628 #endif
629 
630   return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
631                           : LoopUnrollResult::PartiallyUnrolled;
632 }
633 
getLoadsAndStores(BasicBlockSet & Blocks,SmallVector<Instruction *,4> & MemInstr)634 static bool getLoadsAndStores(BasicBlockSet &Blocks,
635                               SmallVector<Instruction *, 4> &MemInstr) {
636   // Scan the BBs and collect legal loads and stores.
637   // Returns false if non-simple loads/stores are found.
638   for (BasicBlock *BB : Blocks) {
639     for (Instruction &I : *BB) {
640       if (auto *Ld = dyn_cast<LoadInst>(&I)) {
641         if (!Ld->isSimple())
642           return false;
643         MemInstr.push_back(&I);
644       } else if (auto *St = dyn_cast<StoreInst>(&I)) {
645         if (!St->isSimple())
646           return false;
647         MemInstr.push_back(&I);
648       } else if (I.mayReadOrWriteMemory()) {
649         return false;
650       }
651     }
652   }
653   return true;
654 }
655 
preservesForwardDependence(Instruction * Src,Instruction * Dst,unsigned UnrollLevel,unsigned JamLevel,bool Sequentialized,Dependence * D)656 static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
657                                        unsigned UnrollLevel, unsigned JamLevel,
658                                        bool Sequentialized, Dependence *D) {
659   // UnrollLevel might carry the dependency Src --> Dst
660   // Does a different loop after unrolling?
661   for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
662        ++CurLoopDepth) {
663     auto JammedDir = D->getDirection(CurLoopDepth);
664     if (JammedDir == Dependence::DVEntry::LT)
665       return true;
666 
667     if (JammedDir & Dependence::DVEntry::GT)
668       return false;
669   }
670 
671   return true;
672 }
673 
preservesBackwardDependence(Instruction * Src,Instruction * Dst,unsigned UnrollLevel,unsigned JamLevel,bool Sequentialized,Dependence * D)674 static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
675                                         unsigned UnrollLevel, unsigned JamLevel,
676                                         bool Sequentialized, Dependence *D) {
677   // UnrollLevel might carry the dependency Dst --> Src
678   for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
679        ++CurLoopDepth) {
680     auto JammedDir = D->getDirection(CurLoopDepth);
681     if (JammedDir == Dependence::DVEntry::GT)
682       return true;
683 
684     if (JammedDir & Dependence::DVEntry::LT)
685       return false;
686   }
687 
688   // Backward dependencies are only preserved if not interleaved.
689   return Sequentialized;
690 }
691 
692 // Check whether it is semantically safe Src and Dst considering any potential
693 // dependency between them.
694 //
695 // @param UnrollLevel The level of the loop being unrolled
696 // @param JamLevel    The level of the loop being jammed; if Src and Dst are on
697 // different levels, the outermost common loop counts as jammed level
698 //
699 // @return true if is safe and false if there is a dependency violation.
checkDependency(Instruction * Src,Instruction * Dst,unsigned UnrollLevel,unsigned JamLevel,bool Sequentialized,DependenceInfo & DI)700 static bool checkDependency(Instruction *Src, Instruction *Dst,
701                             unsigned UnrollLevel, unsigned JamLevel,
702                             bool Sequentialized, DependenceInfo &DI) {
703   assert(UnrollLevel <= JamLevel &&
704          "Expecting JamLevel to be at least UnrollLevel");
705 
706   if (Src == Dst)
707     return true;
708   // Ignore Input dependencies.
709   if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
710     return true;
711 
712   // Check whether unroll-and-jam may violate a dependency.
713   // By construction, every dependency will be lexicographically non-negative
714   // (if it was, it would violate the current execution order), such as
715   //   (0,0,>,*,*)
716   // Unroll-and-jam changes the GT execution of two executions to the same
717   // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
718   // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
719   //   (0,0,>=,*,*)
720   // Now, the dependency is not necessarily non-negative anymore, i.e.
721   // unroll-and-jam may violate correctness.
722   std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
723   if (!D)
724     return true;
725   assert(D->isOrdered() && "Expected an output, flow or anti dep.");
726 
727   if (D->isConfused()) {
728     LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
729                       << "  " << *Src << "\n"
730                       << "  " << *Dst << "\n");
731     return false;
732   }
733 
734   // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
735   // non-equal direction, then the locations accessed in the inner levels cannot
736   // overlap in memory. We assumes the indexes never overlap into neighboring
737   // dimensions.
738   for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
739     if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
740       return true;
741 
742   auto UnrollDirection = D->getDirection(UnrollLevel);
743 
744   // If the distance carried by the unrolled loop is 0, then after unrolling
745   // that distance will become non-zero resulting in non-overlapping accesses in
746   // the inner loops.
747   if (UnrollDirection == Dependence::DVEntry::EQ)
748     return true;
749 
750   if (UnrollDirection & Dependence::DVEntry::LT &&
751       !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
752                                   Sequentialized, D.get()))
753     return false;
754 
755   if (UnrollDirection & Dependence::DVEntry::GT &&
756       !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
757                                    Sequentialized, D.get()))
758     return false;
759 
760   return true;
761 }
762 
763 static bool
checkDependencies(Loop & Root,const BasicBlockSet & SubLoopBlocks,const DenseMap<Loop *,BasicBlockSet> & ForeBlocksMap,const DenseMap<Loop *,BasicBlockSet> & AftBlocksMap,DependenceInfo & DI,LoopInfo & LI)764 checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
765                   const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
766                   const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
767                   DependenceInfo &DI, LoopInfo &LI) {
768   SmallVector<BasicBlockSet, 8> AllBlocks;
769   for (Loop *L : Root.getLoopsInPreorder())
770     if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
771       AllBlocks.push_back(ForeBlocksMap.lookup(L));
772   AllBlocks.push_back(SubLoopBlocks);
773   for (Loop *L : Root.getLoopsInPreorder())
774     if (AftBlocksMap.find(L) != AftBlocksMap.end())
775       AllBlocks.push_back(AftBlocksMap.lookup(L));
776 
777   unsigned LoopDepth = Root.getLoopDepth();
778   SmallVector<Instruction *, 4> EarlierLoadsAndStores;
779   SmallVector<Instruction *, 4> CurrentLoadsAndStores;
780   for (BasicBlockSet &Blocks : AllBlocks) {
781     CurrentLoadsAndStores.clear();
782     if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
783       return false;
784 
785     Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
786     unsigned CurLoopDepth = CurLoop->getLoopDepth();
787 
788     for (auto *Earlier : EarlierLoadsAndStores) {
789       Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
790       unsigned EarlierDepth = EarlierLoop->getLoopDepth();
791       unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
792       for (auto *Later : CurrentLoadsAndStores) {
793         if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
794                              DI))
795           return false;
796       }
797     }
798 
799     size_t NumInsts = CurrentLoadsAndStores.size();
800     for (size_t I = 0; I < NumInsts; ++I) {
801       for (size_t J = I; J < NumInsts; ++J) {
802         if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
803                              LoopDepth, CurLoopDepth, true, DI))
804           return false;
805       }
806     }
807 
808     EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
809                                  CurrentLoadsAndStores.end());
810   }
811   return true;
812 }
813 
isEligibleLoopForm(const Loop & Root)814 static bool isEligibleLoopForm(const Loop &Root) {
815   // Root must have a child.
816   if (Root.getSubLoops().size() != 1)
817     return false;
818 
819   const Loop *L = &Root;
820   do {
821     // All loops in Root need to be in simplify and rotated form.
822     if (!L->isLoopSimplifyForm())
823       return false;
824 
825     if (!L->isRotatedForm())
826       return false;
827 
828     if (L->getHeader()->hasAddressTaken()) {
829       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
830       return false;
831     }
832 
833     unsigned SubLoopsSize = L->getSubLoops().size();
834     if (SubLoopsSize == 0)
835       return true;
836 
837     // Only one child is allowed.
838     if (SubLoopsSize != 1)
839       return false;
840 
841     // Only loops with a single exit block can be unrolled and jammed.
842     // The function getExitBlock() is used for this check, rather than
843     // getUniqueExitBlock() to ensure loops with mulitple exit edges are
844     // disallowed.
845     if (!L->getExitBlock()) {
846       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit "
847                            "blocks can be unrolled and jammed.\n");
848       return false;
849     }
850 
851     // Only loops with a single exiting block can be unrolled and jammed.
852     if (!L->getExitingBlock()) {
853       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single "
854                            "exiting blocks can be unrolled and jammed.\n");
855       return false;
856     }
857 
858     L = L->getSubLoops()[0];
859   } while (L);
860 
861   return true;
862 }
863 
getInnerMostLoop(Loop * L)864 static Loop *getInnerMostLoop(Loop *L) {
865   while (!L->getSubLoops().empty())
866     L = L->getSubLoops()[0];
867   return L;
868 }
869 
isSafeToUnrollAndJam(Loop * L,ScalarEvolution & SE,DominatorTree & DT,DependenceInfo & DI,LoopInfo & LI)870 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
871                                 DependenceInfo &DI, LoopInfo &LI) {
872   if (!isEligibleLoopForm(*L)) {
873     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
874     return false;
875   }
876 
877   /* We currently handle outer loops like this:
878         |
879     ForeFirst    <------\   }
880      Blocks             |   } ForeBlocks of L
881     ForeLast            |   }
882         |               |
883        ...              |
884         |               |
885     ForeFirst    <----\ |   }
886      Blocks           | |   } ForeBlocks of a inner loop of L
887     ForeLast          | |   }
888         |             | |
889     JamLoopFirst  <\  | |   }
890      Blocks        |  | |   } JamLoopBlocks of the innermost loop
891     JamLoopLast   -/  | |   }
892         |             | |
893     AftFirst          | |   }
894      Blocks           | |   } AftBlocks of a inner loop of L
895     AftLast     ------/ |   }
896         |               |
897        ...              |
898         |               |
899     AftFirst            |   }
900      Blocks             |   } AftBlocks of L
901     AftLast     --------/   }
902         |
903 
904     There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
905     and AftBlocks, providing that there is one edge from Fores to SubLoops,
906     one edge from SubLoops to Afts and a single outer loop exit (from Afts).
907     In practice we currently limit Aft blocks to a single block, and limit
908     things further in the profitablility checks of the unroll and jam pass.
909 
910     Because of the way we rearrange basic blocks, we also require that
911     the Fore blocks of L on all unrolled iterations are safe to move before the
912     blocks of the direct child of L of all iterations. So we require that the
913     phi node looping operands of ForeHeader can be moved to at least the end of
914     ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
915     match up Phi's correctly.
916 
917     i.e. The old order of blocks used to be
918            (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
919          It needs to be safe to transform this to
920            (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
921 
922     There are then a number of checks along the lines of no calls, no
923     exceptions, inner loop IV is consistent, etc. Note that for loops requiring
924     runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
925     UnrollAndJamLoop if the trip count cannot be easily calculated.
926   */
927 
928   // Split blocks into Fore/SubLoop/Aft based on dominators
929   Loop *JamLoop = getInnerMostLoop(L);
930   BasicBlockSet SubLoopBlocks;
931   DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
932   DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
933   if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
934                                 AftBlocksMap, DT)) {
935     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
936     return false;
937   }
938 
939   // Aft blocks may need to move instructions to fore blocks, which becomes more
940   // difficult if there are multiple (potentially conditionally executed)
941   // blocks. For now we just exclude loops with multiple aft blocks.
942   if (AftBlocksMap[L].size() != 1) {
943     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
944                          "multiple blocks after the loop\n");
945     return false;
946   }
947 
948   // Check inner loop backedge count is consistent on all iterations of the
949   // outer loop
950   if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
951         return !hasIterationCountInvariantInParent(SubLoop, SE);
952       })) {
953     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
954                          "not consistent on each iteration\n");
955     return false;
956   }
957 
958   // Check the loop safety info for exceptions.
959   SimpleLoopSafetyInfo LSI;
960   LSI.computeLoopSafetyInfo(L);
961   if (LSI.anyBlockMayThrow()) {
962     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
963     return false;
964   }
965 
966   // We've ruled out the easy stuff and now need to check that there are no
967   // interdependencies which may prevent us from moving the:
968   //  ForeBlocks before Subloop and AftBlocks.
969   //  Subloop before AftBlocks.
970   //  ForeBlock phi operands before the subloop
971 
972   // Make sure we can move all instructions we need to before the subloop
973   BasicBlock *Header = L->getHeader();
974   BasicBlock *Latch = L->getLoopLatch();
975   BasicBlockSet AftBlocks = AftBlocksMap[L];
976   Loop *SubLoop = L->getSubLoops()[0];
977   if (!processHeaderPhiOperands(
978           Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
979             if (SubLoop->contains(I->getParent()))
980               return false;
981             if (AftBlocks.count(I->getParent())) {
982               // If we hit a phi node in afts we know we are done (probably
983               // LCSSA)
984               if (isa<PHINode>(I))
985                 return false;
986               // Can't move instructions with side effects or memory
987               // reads/writes
988               if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
989                 return false;
990             }
991             // Keep going
992             return true;
993           })) {
994     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
995                          "instructions after subloop to before it\n");
996     return false;
997   }
998 
999   // Check for memory dependencies which prohibit the unrolling we are doing.
1000   // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
1001   // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
1002   if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
1003                          LI)) {
1004     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
1005     return false;
1006   }
1007 
1008   return true;
1009 }
1010