Lines Matching +full:sub +full:- +full:blocks
1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
60 #define DEBUG_TYPE "loop-unroll-and-jam"
67 // Partition blocks in an outer/inner loop pair into blocks before and after
72 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
74 for (BasicBlock *BB : L.blocks()) {
75 if (!SubLoop->contains(BB)) {
83 // Check that all blocks in ForeBlocks together dominate the subloop
85 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
89 Instruction *TI = BB->getTerminator();
98 /// Partition blocks in a loop nest into blocks before and after each inner
124 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
132 // Aft blocks that need to be moved before the subloop. It is used in two
147 if (AftBlocks.count(I->getParent()))
148 for (auto &U : I->operands())
156 for (auto &Phi : Header->phis()) {
175 if (AftBlocks.count(I->getParent()))
176 I->moveBefore(InsertLoc);
204 We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
205 Fore blocks are those before the inner loop, Aft are those after. Normal
206 Unroll code is used to copy each of these sets of blocks and the results are
212 If EpilogueLoop is non-null, it receives the epilogue loop (if it was
223 BasicBlock *Header = L->getHeader();
225 assert(L->getSubLoops().size() == 1);
226 Loop *SubLoop = *L->begin();
230 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
247 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
256 SE->forgetLoop(L);
257 SE->forgetBlockAndLoopDispositions();
264 << Header->getName() << " with trip count " << TripCount
266 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
267 L->getHeader())
272 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
273 L->getHeader());
278 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
282 ORE->emit([&]() {
287 LLVM_DEBUG(dbgs() << " with run-time trip count");
288 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
293 BasicBlock *Preheader = L->getLoopPreheader();
294 BasicBlock *LatchBlock = L->getLoopLatch();
297 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
298 assert(BI && !BI->isUnconditional());
299 bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
300 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
301 bool SubLoopContinueOnTrue = SubLoop->contains(
302 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
304 // Partition blocks in an outer/inner loop pair into blocks before and after
314 // blocks easier.
322 ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
323 SubLoopBlocksFirst.push_back(SubLoop->getHeader());
324 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
325 AftBlocksFirst.push_back(SubLoop->getExitBlock());
326 AftBlocksLast.push_back(L->getExitingBlock());
327 // Maps Blocks[0] -> Blocks[It]
332 Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);
334 // The current on-the-fly SSA update requires blocks to be processed in
339 // Stash the DFS iterators before adding blocks to the loop.
345 if (Header->getParent()->shouldEmitDebugInfoForProfiling() &&
347 for (BasicBlock *BB : L->getBlocks())
351 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
357 << DIL->getFilename() << " Line: " << DIL->getLine());
360 // Copy all blocks
363 // Maps Blocks[It] -> Blocks[It-1]
372 Header->getParent()->insert(Header->getParent()->end(), New);
393 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
401 PrevItValueMap[VI->second] =
402 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
403 LastValueMap[VI->first] = VI->second;
410 DT->addNewBlock(New, ForeBlocksLast[It - 1]);
412 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
414 DT->addNewBlock(New, AftBlocksLast[It - 1]);
416 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
418 auto BBDomNode = DT->getNode(*BB);
419 auto BBIDom = BBDomNode->getIDom();
420 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
423 DT->addNewBlock(
433 AC->registerAssumption(II);
439 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
447 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
453 // Now that all the basic blocks for the unrolled iterations are in place,
454 // finish up connecting the blocks and phi nodes. At this point LastValueMap
462 for (PHINode &Phi : BB->phis()) {
476 BasicBlock::iterator insertPoint = Dest->getFirstNonPHIIt();
477 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
478 Phi->moveBefore(*Dest, insertPoint);
487 cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
488 assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
489 ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
492 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
493 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
494 Phi->eraseFromParent();
505 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
506 assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
507 ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
512 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
513 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
514 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
515 SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],
517 SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
524 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
525 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm->getIterator());
526 SubTerm->eraseFromParent();
528 SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
530 SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
535 // Aft blocks successors and phis
536 BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
538 BranchInst::Create(LoopExit, AftTerm->getIterator());
539 AftTerm->eraseFromParent();
541 AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
542 assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
545 AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
552 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
553 BranchInst::Create(AftBlocksFirst[It], AftTerm->getIterator());
554 AftTerm->eraseFromParent();
556 AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
562 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
578 // Merge adjacent basic blocks, if possible.
601 LI->erase(L);
605 Loop *OutestLoop = SubLoop->getParentLoop()
606 ? SubLoop->getParentLoop()->getParentLoop()
607 ? SubLoop->getParentLoop()->getParentLoop()
608 : SubLoop->getParentLoop()
610 assert(DT->verify());
611 LI->verify(*DT);
612 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
614 assert(L->isLoopSimplifyForm());
615 assert(SubLoop->isLoopSimplifyForm());
616 SE->verify();
623 static bool getLoadsAndStores(BasicBlockSet &Blocks,
626 // Returns false if non-simple loads/stores are found.
627 for (BasicBlock *BB : Blocks) {
630 if (!Ld->isSimple())
634 if (!St->isSimple())
648 // UnrollLevel might carry the dependency Src --> Dst
652 auto JammedDir = D->getDirection(CurLoopDepth);
666 // UnrollLevel might carry the dependency Dst --> Src
669 auto JammedDir = D->getDirection(CurLoopDepth);
701 // Check whether unroll-and-jam may violate a dependency.
702 // By construction, every dependency will be lexicographically non-negative
705 // Unroll-and-jam changes the GT execution of two executions to the same
709 // Now, the dependency is not necessarily non-negative anymore, i.e.
710 // unroll-and-jam may violate correctness.
714 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
716 if (D->isConfused()) {
723 // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
724 // non-equal direction, then the locations accessed in the inner levels cannot
728 if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
731 auto UnrollDirection = D->getDirection(UnrollLevel);
734 // that distance will become non-zero resulting in non-overlapping accesses in
769 for (BasicBlockSet &Blocks : AllBlocks) {
771 if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
774 Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
775 unsigned CurLoopDepth = CurLoop->getLoopDepth();
778 Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
779 unsigned EarlierDepth = EarlierLoop->getLoopDepth();
811 if (!L->isLoopSimplifyForm())
814 if (!L->isRotatedForm())
817 if (L->getHeader()->hasAddressTaken()) {
818 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
822 unsigned SubLoopsSize = L->getSubLoops().size();
834 if (!L->getExitBlock()) {
835 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit "
836 "blocks can be unrolled and jammed.\n");
841 if (!L->getExitingBlock()) {
842 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single "
843 "exiting blocks can be unrolled and jammed.\n");
847 L = L->getSubLoops()[0];
854 while (!L->getSubLoops().empty())
855 L = L->getSubLoops()[0];
862 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
868 ForeFirst <------\ }
869 Blocks | } ForeBlocks of L
874 ForeFirst <----\ | }
875 Blocks | | } ForeBlocks of a inner loop of L
879 Blocks | | | } JamLoopBlocks of the innermost loop
880 JamLoopLast -/ | | }
883 Blocks | | } AftBlocks of a inner loop of L
884 AftLast ------/ | }
889 Blocks | } AftBlocks of L
890 AftLast --------/ }
893 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
896 In practice we currently limit Aft blocks to a single block, and limit
899 Because of the way we rearrange basic blocks, we also require that
900 the Fore blocks of L on all unrolled iterations are safe to move before the
901 blocks of the direct child of L of all iterations. So we require that the
903 ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
906 i.e. The old order of blocks used to be
917 // Split blocks into Fore/SubLoop/Aft based on dominators
924 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
928 // Aft blocks may need to move instructions to fore blocks, which becomes more
930 // blocks. For now we just exclude loops with multiple aft blocks.
932 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
933 "multiple blocks after the loop\n");
939 if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
942 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
951 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
962 BasicBlock *Header = L->getHeader();
963 BasicBlock *Latch = L->getLoopLatch();
965 Loop *SubLoop = L->getSubLoops()[0];
968 if (SubLoop->contains(I->getParent()))
970 if (AftBlocks.count(I->getParent())) {
977 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
983 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
989 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
990 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
993 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");