1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop unroll and jam as a routine, much like 10 // LoopUnroll.cpp implements loop unroll. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/ADT/SmallPtrSet.h" 15 #include "llvm/ADT/Statistic.h" 16 #include "llvm/Analysis/AssumptionCache.h" 17 #include "llvm/Analysis/DependenceAnalysis.h" 18 #include "llvm/Analysis/InstructionSimplify.h" 19 #include "llvm/Analysis/LoopAnalysisManager.h" 20 #include "llvm/Analysis/LoopIterator.h" 21 #include "llvm/Analysis/LoopPass.h" 22 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 23 #include "llvm/Analysis/ScalarEvolution.h" 24 #include "llvm/Analysis/ScalarEvolutionExpander.h" 25 #include "llvm/Analysis/Utils/Local.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/DataLayout.h" 28 #include "llvm/IR/DebugInfoMetadata.h" 29 #include "llvm/IR/Dominators.h" 30 #include "llvm/IR/IntrinsicInst.h" 31 #include "llvm/IR/LLVMContext.h" 32 #include "llvm/Support/Debug.h" 33 #include "llvm/Support/raw_ostream.h" 34 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 35 #include "llvm/Transforms/Utils/Cloning.h" 36 #include "llvm/Transforms/Utils/LoopSimplify.h" 37 #include "llvm/Transforms/Utils/LoopUtils.h" 38 #include "llvm/Transforms/Utils/SimplifyIndVar.h" 39 #include "llvm/Transforms/Utils/UnrollLoop.h" 40 using namespace llvm; 41 42 #define DEBUG_TYPE "loop-unroll-and-jam" 43 44 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed"); 45 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed"); 46 47 typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet; 48 49 // Partition blocks in an outer/inner loop pair into blocks before and after 50 // the loop 51 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, 52 BasicBlockSet &ForeBlocks, 53 BasicBlockSet &SubLoopBlocks, 54 BasicBlockSet &AftBlocks, 55 DominatorTree *DT) { 56 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); 57 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); 58 59 for (BasicBlock *BB : L->blocks()) { 60 if (!SubLoop->contains(BB)) { 61 if (DT->dominates(SubLoopLatch, BB)) 62 AftBlocks.insert(BB); 63 else 64 ForeBlocks.insert(BB); 65 } 66 } 67 68 // Check that all blocks in ForeBlocks together dominate the subloop 69 // TODO: This might ideally be done better with a dominator/postdominators. 70 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader(); 71 for (BasicBlock *BB : ForeBlocks) { 72 if (BB == SubLoopPreHeader) 73 continue; 74 Instruction *TI = BB->getTerminator(); 75 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) 76 if (!ForeBlocks.count(TI->getSuccessor(i))) 77 return false; 78 } 79 80 return true; 81 } 82 83 // Looks at the phi nodes in Header for values coming from Latch. For these 84 // instructions and all their operands calls Visit on them, keeping going for 85 // all the operands in AftBlocks. Returns false if Visit returns false, 86 // otherwise returns true. This is used to process the instructions in the 87 // Aft blocks that need to be moved before the subloop. It is used in two 88 // places. One to check that the required set of instructions can be moved 89 // before the loop. Then to collect the instructions to actually move in 90 // moveHeaderPhiOperandsToForeBlocks. 91 template <typename T> 92 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch, 93 BasicBlockSet &AftBlocks, T Visit) { 94 SmallVector<Instruction *, 8> Worklist; 95 for (auto &Phi : Header->phis()) { 96 Value *V = Phi.getIncomingValueForBlock(Latch); 97 if (Instruction *I = dyn_cast<Instruction>(V)) 98 Worklist.push_back(I); 99 } 100 101 while (!Worklist.empty()) { 102 Instruction *I = Worklist.back(); 103 Worklist.pop_back(); 104 if (!Visit(I)) 105 return false; 106 107 if (AftBlocks.count(I->getParent())) 108 for (auto &U : I->operands()) 109 if (Instruction *II = dyn_cast<Instruction>(U)) 110 Worklist.push_back(II); 111 } 112 113 return true; 114 } 115 116 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc. 117 static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, 118 BasicBlock *Latch, 119 Instruction *InsertLoc, 120 BasicBlockSet &AftBlocks) { 121 // We need to ensure we move the instructions in the correct order, 122 // starting with the earliest required instruction and moving forward. 123 std::vector<Instruction *> Visited; 124 processHeaderPhiOperands(Header, Latch, AftBlocks, 125 [&Visited, &AftBlocks](Instruction *I) { 126 if (AftBlocks.count(I->getParent())) 127 Visited.push_back(I); 128 return true; 129 }); 130 131 // Move all instructions in program order to before the InsertLoc 132 BasicBlock *InsertLocBB = InsertLoc->getParent(); 133 for (Instruction *I : reverse(Visited)) { 134 if (I->getParent() != InsertLocBB) 135 I->moveBefore(InsertLoc); 136 } 137 } 138 139 /* 140 This method performs Unroll and Jam. For a simple loop like: 141 for (i = ..) 142 Fore(i) 143 for (j = ..) 144 SubLoop(i, j) 145 Aft(i) 146 147 Instead of doing normal inner or outer unrolling, we do: 148 for (i = .., i+=2) 149 Fore(i) 150 Fore(i+1) 151 for (j = ..) 152 SubLoop(i, j) 153 SubLoop(i+1, j) 154 Aft(i) 155 Aft(i+1) 156 157 So the outer loop is essetially unrolled and then the inner loops are fused 158 ("jammed") together into a single loop. This can increase speed when there 159 are loads in SubLoop that are invariant to i, as they become shared between 160 the now jammed inner loops. 161 162 We do this by spliting the blocks in the loop into Fore, Subloop and Aft. 163 Fore blocks are those before the inner loop, Aft are those after. Normal 164 Unroll code is used to copy each of these sets of blocks and the results are 165 combined together into the final form above. 166 167 isSafeToUnrollAndJam should be used prior to calling this to make sure the 168 unrolling will be valid. Checking profitablility is also advisable. 169 170 If EpilogueLoop is non-null, it receives the epilogue loop (if it was 171 necessary to create one and not fully unrolled). 172 */ 173 LoopUnrollResult llvm::UnrollAndJamLoop( 174 Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, 175 bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 176 AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) { 177 178 // When we enter here we should have already checked that it is safe 179 BasicBlock *Header = L->getHeader(); 180 assert(L->getSubLoops().size() == 1); 181 Loop *SubLoop = *L->begin(); 182 183 // Don't enter the unroll code if there is nothing to do. 184 if (TripCount == 0 && Count < 2) { 185 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n"); 186 return LoopUnrollResult::Unmodified; 187 } 188 189 assert(Count > 0); 190 assert(TripMultiple > 0); 191 assert(TripCount == 0 || TripCount % TripMultiple == 0); 192 193 // Are we eliminating the loop control altogether? 194 bool CompletelyUnroll = (Count == TripCount); 195 196 // We use the runtime remainder in cases where we don't know trip multiple 197 if (TripMultiple == 1 || TripMultiple % Count != 0) { 198 if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false, 199 /*UseEpilogRemainder*/ true, 200 UnrollRemainder, /*ForgetAllSCEV*/ false, 201 LI, SE, DT, AC, true, EpilogueLoop)) { 202 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be " 203 "generated when assuming runtime trip count\n"); 204 return LoopUnrollResult::Unmodified; 205 } 206 } 207 208 // Notify ScalarEvolution that the loop will be substantially changed, 209 // if not outright eliminated. 210 if (SE) { 211 SE->forgetLoop(L); 212 SE->forgetLoop(SubLoop); 213 } 214 215 using namespace ore; 216 // Report the unrolling decision. 217 if (CompletelyUnroll) { 218 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %" 219 << Header->getName() << " with trip count " << TripCount 220 << "!\n"); 221 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(), 222 L->getHeader()) 223 << "completely unroll and jammed loop with " 224 << NV("UnrollCount", TripCount) << " iterations"); 225 } else { 226 auto DiagBuilder = [&]() { 227 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(), 228 L->getHeader()); 229 return Diag << "unroll and jammed loop by a factor of " 230 << NV("UnrollCount", Count); 231 }; 232 233 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName() 234 << " by " << Count); 235 if (TripMultiple != 1) { 236 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); 237 ORE->emit([&]() { 238 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple) 239 << " trips per branch"; 240 }); 241 } else { 242 LLVM_DEBUG(dbgs() << " with run-time trip count"); 243 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; }); 244 } 245 LLVM_DEBUG(dbgs() << "!\n"); 246 } 247 248 BasicBlock *Preheader = L->getLoopPreheader(); 249 BasicBlock *LatchBlock = L->getLoopLatch(); 250 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); 251 assert(Preheader && LatchBlock && Header); 252 assert(BI && !BI->isUnconditional()); 253 bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); 254 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); 255 bool SubLoopContinueOnTrue = SubLoop->contains( 256 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0)); 257 258 // Partition blocks in an outer/inner loop pair into blocks before and after 259 // the loop 260 BasicBlockSet SubLoopBlocks; 261 BasicBlockSet ForeBlocks; 262 BasicBlockSet AftBlocks; 263 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks, 264 DT); 265 266 // We keep track of the entering/first and exiting/last block of each of 267 // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of 268 // blocks easier. 269 std::vector<BasicBlock *> ForeBlocksFirst; 270 std::vector<BasicBlock *> ForeBlocksLast; 271 std::vector<BasicBlock *> SubLoopBlocksFirst; 272 std::vector<BasicBlock *> SubLoopBlocksLast; 273 std::vector<BasicBlock *> AftBlocksFirst; 274 std::vector<BasicBlock *> AftBlocksLast; 275 ForeBlocksFirst.push_back(Header); 276 ForeBlocksLast.push_back(SubLoop->getLoopPreheader()); 277 SubLoopBlocksFirst.push_back(SubLoop->getHeader()); 278 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock()); 279 AftBlocksFirst.push_back(SubLoop->getExitBlock()); 280 AftBlocksLast.push_back(L->getExitingBlock()); 281 // Maps Blocks[0] -> Blocks[It] 282 ValueToValueMapTy LastValueMap; 283 284 // Move any instructions from fore phi operands from AftBlocks into Fore. 285 moveHeaderPhiOperandsToForeBlocks( 286 Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(), 287 AftBlocks); 288 289 // The current on-the-fly SSA update requires blocks to be processed in 290 // reverse postorder so that LastValueMap contains the correct value at each 291 // exit. 292 LoopBlocksDFS DFS(L); 293 DFS.perform(LI); 294 // Stash the DFS iterators before adding blocks to the loop. 295 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); 296 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); 297 298 if (Header->getParent()->isDebugInfoForProfiling()) 299 for (BasicBlock *BB : L->getBlocks()) 300 for (Instruction &I : *BB) 301 if (!isa<DbgInfoIntrinsic>(&I)) 302 if (const DILocation *DIL = I.getDebugLoc()) { 303 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count); 304 if (NewDIL) 305 I.setDebugLoc(NewDIL.getValue()); 306 else 307 LLVM_DEBUG(dbgs() 308 << "Failed to create new discriminator: " 309 << DIL->getFilename() << " Line: " << DIL->getLine()); 310 } 311 312 // Copy all blocks 313 for (unsigned It = 1; It != Count; ++It) { 314 std::vector<BasicBlock *> NewBlocks; 315 // Maps Blocks[It] -> Blocks[It-1] 316 DenseMap<Value *, Value *> PrevItValueMap; 317 318 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { 319 ValueToValueMapTy VMap; 320 BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); 321 Header->getParent()->getBasicBlockList().push_back(New); 322 323 if (ForeBlocks.count(*BB)) { 324 L->addBasicBlockToLoop(New, *LI); 325 326 if (*BB == ForeBlocksFirst[0]) 327 ForeBlocksFirst.push_back(New); 328 if (*BB == ForeBlocksLast[0]) 329 ForeBlocksLast.push_back(New); 330 } else if (SubLoopBlocks.count(*BB)) { 331 SubLoop->addBasicBlockToLoop(New, *LI); 332 333 if (*BB == SubLoopBlocksFirst[0]) 334 SubLoopBlocksFirst.push_back(New); 335 if (*BB == SubLoopBlocksLast[0]) 336 SubLoopBlocksLast.push_back(New); 337 } else if (AftBlocks.count(*BB)) { 338 L->addBasicBlockToLoop(New, *LI); 339 340 if (*BB == AftBlocksFirst[0]) 341 AftBlocksFirst.push_back(New); 342 if (*BB == AftBlocksLast[0]) 343 AftBlocksLast.push_back(New); 344 } else { 345 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft"); 346 } 347 348 // Update our running maps of newest clones 349 PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]); 350 LastValueMap[*BB] = New; 351 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); 352 VI != VE; ++VI) { 353 PrevItValueMap[VI->second] = 354 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]); 355 LastValueMap[VI->first] = VI->second; 356 } 357 358 NewBlocks.push_back(New); 359 360 // Update DomTree: 361 if (*BB == ForeBlocksFirst[0]) 362 DT->addNewBlock(New, ForeBlocksLast[It - 1]); 363 else if (*BB == SubLoopBlocksFirst[0]) 364 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]); 365 else if (*BB == AftBlocksFirst[0]) 366 DT->addNewBlock(New, AftBlocksLast[It - 1]); 367 else { 368 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree 369 // structure. 370 auto BBDomNode = DT->getNode(*BB); 371 auto BBIDom = BBDomNode->getIDom(); 372 BasicBlock *OriginalBBIDom = BBIDom->getBlock(); 373 assert(OriginalBBIDom); 374 assert(LastValueMap[cast<Value>(OriginalBBIDom)]); 375 DT->addNewBlock( 376 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)])); 377 } 378 } 379 380 // Remap all instructions in the most recent iteration 381 for (BasicBlock *NewBlock : NewBlocks) { 382 for (Instruction &I : *NewBlock) { 383 ::remapInstruction(&I, LastValueMap); 384 if (auto *II = dyn_cast<IntrinsicInst>(&I)) 385 if (II->getIntrinsicID() == Intrinsic::assume) 386 AC->registerAssumption(II); 387 } 388 } 389 390 // Alter the ForeBlocks phi's, pointing them at the latest version of the 391 // value from the previous iteration's phis 392 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) { 393 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]); 394 assert(OldValue && "should have incoming edge from Aft[It]"); 395 Value *NewValue = OldValue; 396 if (Value *PrevValue = PrevItValueMap[OldValue]) 397 NewValue = PrevValue; 398 399 assert(Phi.getNumOperands() == 2); 400 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]); 401 Phi.setIncomingValue(0, NewValue); 402 Phi.removeIncomingValue(1); 403 } 404 } 405 406 // Now that all the basic blocks for the unrolled iterations are in place, 407 // finish up connecting the blocks and phi nodes. At this point LastValueMap 408 // is the last unrolled iterations values. 409 410 // Update Phis in BB from OldBB to point to NewBB 411 auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB, 412 BasicBlock *NewBB) { 413 for (PHINode &Phi : BB->phis()) { 414 int I = Phi.getBasicBlockIndex(OldBB); 415 Phi.setIncomingBlock(I, NewBB); 416 } 417 }; 418 // Update Phis in BB from OldBB to point to NewBB and use the latest value 419 // from LastValueMap 420 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB, 421 BasicBlock *NewBB, 422 ValueToValueMapTy &LastValueMap) { 423 for (PHINode &Phi : BB->phis()) { 424 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) { 425 if (Phi.getIncomingBlock(b) == OldBB) { 426 Value *OldValue = Phi.getIncomingValue(b); 427 if (Value *LastValue = LastValueMap[OldValue]) 428 Phi.setIncomingValue(b, LastValue); 429 Phi.setIncomingBlock(b, NewBB); 430 break; 431 } 432 } 433 } 434 }; 435 // Move all the phis from Src into Dest 436 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) { 437 Instruction *insertPoint = Dest->getFirstNonPHI(); 438 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin())) 439 Phi->moveBefore(insertPoint); 440 }; 441 442 // Update the PHI values outside the loop to point to the last block 443 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(), 444 LastValueMap); 445 446 // Update ForeBlocks successors and phi nodes 447 BranchInst *ForeTerm = 448 cast<BranchInst>(ForeBlocksLast.back()->getTerminator()); 449 BasicBlock *Dest = SubLoopBlocksFirst[0]; 450 ForeTerm->setSuccessor(0, Dest); 451 452 if (CompletelyUnroll) { 453 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) { 454 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader)); 455 Phi->getParent()->getInstList().erase(Phi); 456 } 457 } else { 458 // Update the PHI values to point to the last aft block 459 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0], 460 AftBlocksLast.back(), LastValueMap); 461 } 462 463 for (unsigned It = 1; It != Count; It++) { 464 // Remap ForeBlock successors from previous iteration to this 465 BranchInst *ForeTerm = 466 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator()); 467 BasicBlock *Dest = ForeBlocksFirst[It]; 468 ForeTerm->setSuccessor(0, Dest); 469 } 470 471 // Subloop successors and phis 472 BranchInst *SubTerm = 473 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator()); 474 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]); 475 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]); 476 updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0], 477 ForeBlocksLast.back()); 478 updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0], 479 SubLoopBlocksLast.back()); 480 481 for (unsigned It = 1; It != Count; It++) { 482 // Replace the conditional branch of the previous iteration subloop with an 483 // unconditional one to this one 484 BranchInst *SubTerm = 485 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator()); 486 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm); 487 SubTerm->eraseFromParent(); 488 489 updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It], 490 ForeBlocksLast.back()); 491 updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It], 492 SubLoopBlocksLast.back()); 493 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]); 494 } 495 496 // Aft blocks successors and phis 497 BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator()); 498 if (CompletelyUnroll) { 499 BranchInst::Create(LoopExit, Term); 500 Term->eraseFromParent(); 501 } else { 502 Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); 503 } 504 updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0], 505 SubLoopBlocksLast.back()); 506 507 for (unsigned It = 1; It != Count; It++) { 508 // Replace the conditional branch of the previous iteration subloop with an 509 // unconditional one to this one 510 BranchInst *AftTerm = 511 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator()); 512 BranchInst::Create(AftBlocksFirst[It], AftTerm); 513 AftTerm->eraseFromParent(); 514 515 updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It], 516 SubLoopBlocksLast.back()); 517 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]); 518 } 519 520 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); 521 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the 522 // new ones required. 523 if (Count != 1) { 524 SmallVector<DominatorTree::UpdateType, 4> DTUpdates; 525 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0], 526 SubLoopBlocksFirst[0]); 527 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, 528 SubLoopBlocksLast[0], AftBlocksFirst[0]); 529 530 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, 531 ForeBlocksLast.back(), SubLoopBlocksFirst[0]); 532 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, 533 SubLoopBlocksLast.back(), AftBlocksFirst[0]); 534 DTU.applyUpdatesPermissive(DTUpdates); 535 } 536 537 // Merge adjacent basic blocks, if possible. 538 SmallPtrSet<BasicBlock *, 16> MergeBlocks; 539 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); 540 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); 541 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end()); 542 while (!MergeBlocks.empty()) { 543 BasicBlock *BB = *MergeBlocks.begin(); 544 BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()); 545 if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) { 546 BasicBlock *Dest = Term->getSuccessor(0); 547 BasicBlock *Fold = Dest->getUniquePredecessor(); 548 if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) { 549 // Don't remove BB and add Fold as they are the same BB 550 assert(Fold == BB); 551 (void)Fold; 552 MergeBlocks.erase(Dest); 553 } else 554 MergeBlocks.erase(BB); 555 } else 556 MergeBlocks.erase(BB); 557 } 558 // Apply updates to the DomTree. 559 DT = &DTU.getDomTree(); 560 561 // At this point, the code is well formed. We now do a quick sweep over the 562 // inserted code, doing constant propagation and dead code elimination as we 563 // go. 564 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC); 565 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC); 566 567 NumCompletelyUnrolledAndJammed += CompletelyUnroll; 568 ++NumUnrolledAndJammed; 569 570 #ifndef NDEBUG 571 // We shouldn't have done anything to break loop simplify form or LCSSA. 572 Loop *OuterL = L->getParentLoop(); 573 Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop); 574 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI)); 575 if (!CompletelyUnroll) 576 assert(L->isLoopSimplifyForm()); 577 assert(SubLoop->isLoopSimplifyForm()); 578 assert(DT->verify()); 579 #endif 580 581 // Update LoopInfo if the loop is completely removed. 582 if (CompletelyUnroll) 583 LI->erase(L); 584 585 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled 586 : LoopUnrollResult::PartiallyUnrolled; 587 } 588 589 static bool getLoadsAndStores(BasicBlockSet &Blocks, 590 SmallVector<Value *, 4> &MemInstr) { 591 // Scan the BBs and collect legal loads and stores. 592 // Returns false if non-simple loads/stores are found. 593 for (BasicBlock *BB : Blocks) { 594 for (Instruction &I : *BB) { 595 if (auto *Ld = dyn_cast<LoadInst>(&I)) { 596 if (!Ld->isSimple()) 597 return false; 598 MemInstr.push_back(&I); 599 } else if (auto *St = dyn_cast<StoreInst>(&I)) { 600 if (!St->isSimple()) 601 return false; 602 MemInstr.push_back(&I); 603 } else if (I.mayReadOrWriteMemory()) { 604 return false; 605 } 606 } 607 } 608 return true; 609 } 610 611 static bool checkDependencies(SmallVector<Value *, 4> &Earlier, 612 SmallVector<Value *, 4> &Later, 613 unsigned LoopDepth, bool InnerLoop, 614 DependenceInfo &DI) { 615 // Use DA to check for dependencies between loads and stores that make unroll 616 // and jam invalid 617 for (Value *I : Earlier) { 618 for (Value *J : Later) { 619 Instruction *Src = cast<Instruction>(I); 620 Instruction *Dst = cast<Instruction>(J); 621 if (Src == Dst) 622 continue; 623 // Ignore Input dependencies. 624 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) 625 continue; 626 627 // Track dependencies, and if we find them take a conservative approach 628 // by allowing only = or < (not >), altough some > would be safe 629 // (depending upon unroll width). 630 // For the inner loop, we need to disallow any (> <) dependencies 631 // FIXME: Allow > so long as distance is less than unroll width 632 if (auto D = DI.depends(Src, Dst, true)) { 633 assert(D->isOrdered() && "Expected an output, flow or anti dep."); 634 635 if (D->isConfused()) { 636 LLVM_DEBUG(dbgs() << " Confused dependency between:\n" 637 << " " << *Src << "\n" 638 << " " << *Dst << "\n"); 639 return false; 640 } 641 if (!InnerLoop) { 642 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) { 643 LLVM_DEBUG(dbgs() << " > dependency between:\n" 644 << " " << *Src << "\n" 645 << " " << *Dst << "\n"); 646 return false; 647 } 648 } else { 649 assert(LoopDepth + 1 <= D->getLevels()); 650 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT && 651 D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) { 652 LLVM_DEBUG(dbgs() << " < > dependency between:\n" 653 << " " << *Src << "\n" 654 << " " << *Dst << "\n"); 655 return false; 656 } 657 } 658 } 659 } 660 } 661 return true; 662 } 663 664 static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks, 665 BasicBlockSet &SubLoopBlocks, 666 BasicBlockSet &AftBlocks, DependenceInfo &DI) { 667 // Get all loads/store pairs for each blocks 668 SmallVector<Value *, 4> ForeMemInstr; 669 SmallVector<Value *, 4> SubLoopMemInstr; 670 SmallVector<Value *, 4> AftMemInstr; 671 if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) || 672 !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) || 673 !getLoadsAndStores(AftBlocks, AftMemInstr)) 674 return false; 675 676 // Check for dependencies between any blocks that may change order 677 unsigned LoopDepth = L->getLoopDepth(); 678 return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false, 679 DI) && 680 checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) && 681 checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false, 682 DI) && 683 checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true, 684 DI); 685 } 686 687 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, 688 DependenceInfo &DI) { 689 /* We currently handle outer loops like this: 690 | 691 ForeFirst <----\ } 692 Blocks | } ForeBlocks 693 ForeLast | } 694 | | 695 SubLoopFirst <\ | } 696 Blocks | | } SubLoopBlocks 697 SubLoopLast -/ | } 698 | | 699 AftFirst | } 700 Blocks | } AftBlocks 701 AftLast ------/ } 702 | 703 704 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks 705 and AftBlocks, providing that there is one edge from Fores to SubLoops, 706 one edge from SubLoops to Afts and a single outer loop exit (from Afts). 707 In practice we currently limit Aft blocks to a single block, and limit 708 things further in the profitablility checks of the unroll and jam pass. 709 710 Because of the way we rearrange basic blocks, we also require that 711 the Fore blocks on all unrolled iterations are safe to move before the 712 SubLoop blocks of all iterations. So we require that the phi node looping 713 operands of ForeHeader can be moved to at least the end of ForeEnd, so that 714 we can arrange cloned Fore Blocks before the subloop and match up Phi's 715 correctly. 716 717 i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2. 718 It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2. 719 720 There are then a number of checks along the lines of no calls, no 721 exceptions, inner loop IV is consistent, etc. Note that for loops requiring 722 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in 723 UnrollAndJamLoop if the trip count cannot be easily calculated. 724 */ 725 726 if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1) 727 return false; 728 Loop *SubLoop = L->getSubLoops()[0]; 729 if (!SubLoop->isLoopSimplifyForm()) 730 return false; 731 732 BasicBlock *Header = L->getHeader(); 733 BasicBlock *Latch = L->getLoopLatch(); 734 BasicBlock *Exit = L->getExitingBlock(); 735 BasicBlock *SubLoopHeader = SubLoop->getHeader(); 736 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); 737 BasicBlock *SubLoopExit = SubLoop->getExitingBlock(); 738 739 if (Latch != Exit) 740 return false; 741 if (SubLoopLatch != SubLoopExit) 742 return false; 743 744 if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) { 745 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n"); 746 return false; 747 } 748 749 // Split blocks into Fore/SubLoop/Aft based on dominators 750 BasicBlockSet SubLoopBlocks; 751 BasicBlockSet ForeBlocks; 752 BasicBlockSet AftBlocks; 753 if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, 754 AftBlocks, &DT)) { 755 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n"); 756 return false; 757 } 758 759 // Aft blocks may need to move instructions to fore blocks, which becomes more 760 // difficult if there are multiple (potentially conditionally executed) 761 // blocks. For now we just exclude loops with multiple aft blocks. 762 if (AftBlocks.size() != 1) { 763 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle " 764 "multiple blocks after the loop\n"); 765 return false; 766 } 767 768 // Check inner loop backedge count is consistent on all iterations of the 769 // outer loop 770 if (!hasIterationCountInvariantInParent(SubLoop, SE)) { 771 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is " 772 "not consistent on each iteration\n"); 773 return false; 774 } 775 776 // Check the loop safety info for exceptions. 777 SimpleLoopSafetyInfo LSI; 778 LSI.computeLoopSafetyInfo(L); 779 if (LSI.anyBlockMayThrow()) { 780 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n"); 781 return false; 782 } 783 784 // We've ruled out the easy stuff and now need to check that there are no 785 // interdependencies which may prevent us from moving the: 786 // ForeBlocks before Subloop and AftBlocks. 787 // Subloop before AftBlocks. 788 // ForeBlock phi operands before the subloop 789 790 // Make sure we can move all instructions we need to before the subloop 791 if (!processHeaderPhiOperands( 792 Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) { 793 if (SubLoop->contains(I->getParent())) 794 return false; 795 if (AftBlocks.count(I->getParent())) { 796 // If we hit a phi node in afts we know we are done (probably 797 // LCSSA) 798 if (isa<PHINode>(I)) 799 return false; 800 // Can't move instructions with side effects or memory 801 // reads/writes 802 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory()) 803 return false; 804 } 805 // Keep going 806 return true; 807 })) { 808 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required " 809 "instructions after subloop to before it\n"); 810 return false; 811 } 812 813 // Check for memory dependencies which prohibit the unrolling we are doing. 814 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check 815 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub. 816 if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) { 817 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n"); 818 return false; 819 } 820 821 return true; 822 } 823