1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop unroll and jam as a routine, much like 10 // LoopUnroll.cpp implements loop unroll. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/ADT/ArrayRef.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/Optional.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/Sequence.h" 19 #include "llvm/ADT/SmallPtrSet.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/ADT/StringRef.h" 23 #include "llvm/ADT/Twine.h" 24 #include "llvm/ADT/iterator_range.h" 25 #include "llvm/Analysis/AssumptionCache.h" 26 #include "llvm/Analysis/DependenceAnalysis.h" 27 #include "llvm/Analysis/DomTreeUpdater.h" 28 #include "llvm/Analysis/LoopInfo.h" 29 #include "llvm/Analysis/LoopIterator.h" 30 #include "llvm/Analysis/MustExecute.h" 31 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 32 #include "llvm/Analysis/ScalarEvolution.h" 33 #include "llvm/IR/BasicBlock.h" 34 #include "llvm/IR/DebugInfoMetadata.h" 35 #include "llvm/IR/DebugLoc.h" 36 #include "llvm/IR/DiagnosticInfo.h" 37 #include "llvm/IR/Dominators.h" 38 #include "llvm/IR/Function.h" 39 #include "llvm/IR/Instruction.h" 40 #include "llvm/IR/Instructions.h" 41 #include "llvm/IR/IntrinsicInst.h" 42 #include "llvm/IR/Use.h" 43 #include "llvm/IR/User.h" 44 #include "llvm/IR/Value.h" 45 #include "llvm/IR/ValueHandle.h" 46 #include "llvm/IR/ValueMap.h" 47 #include "llvm/Support/Casting.h" 48 #include "llvm/Support/Debug.h" 49 #include "llvm/Support/ErrorHandling.h" 50 #include "llvm/Support/GenericDomTree.h" 51 
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <assert.h>
#include <memory>
#include <type_traits>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-unroll-and-jam"

STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
// Fix: this counter previously carried the exact same description string as
// NumUnrolledAndJammed, making the two entries indistinguishable in -stats
// output. Give the "completely" counter its own description.
STATISTIC(NumCompletelyUnrolledAndJammed,
          "Number of loops fully unroll and jammed");

typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;

// Partition blocks in an outer/inner loop pair into blocks before and after
// the inner loop.
//
// \param L          The outer loop; it must have exactly one subloop.
// \param ForeBlocks Receives the blocks of L executed before the subloop.
// \param AftBlocks  Receives the blocks of L executed after the subloop.
// \param DT         Dominator tree used to classify the blocks.
// \returns false if the Fore blocks do not form a region that exits only into
// the subloop (i.e. the layout is unsupported), true otherwise.
static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
                                BasicBlockSet &AftBlocks, DominatorTree &DT) {
  Loop *SubLoop = L.getSubLoops()[0];
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();

  // A block of L outside the subloop is "after" it iff the subloop's latch
  // dominates it; otherwise it runs before the subloop.
  for (BasicBlock *BB : L.blocks()) {
    if (!SubLoop->contains(BB)) {
      if (DT.dominates(SubLoopLatch, BB))
        AftBlocks.insert(BB);
      else
        ForeBlocks.insert(BB);
    }
  }

  // Check that all blocks in ForeBlocks together dominate the subloop:
  // every Fore block other than the subloop preheader may only branch to
  // other Fore blocks.
  // TODO: This might ideally be done better with a dominator/postdominators.
  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
  for (BasicBlock *BB : ForeBlocks) {
    if (BB == SubLoopPreHeader)
      continue;
    Instruction *TI = BB->getTerminator();
    for (BasicBlock *Succ : successors(TI))
      if (!ForeBlocks.count(Succ))
        return false;
  }

  return true;
}

/// Partition blocks in a loop nest into blocks before and after each inner
/// loop.
104 static bool partitionOuterLoopBlocks( 105 Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks, 106 DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap, 107 DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) { 108 JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end()); 109 110 for (Loop *L : Root.getLoopsInPreorder()) { 111 if (L == &JamLoop) 112 break; 113 114 if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT)) 115 return false; 116 } 117 118 return true; 119 } 120 121 // TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more 122 // than 2 levels loop. 123 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, 124 BasicBlockSet &ForeBlocks, 125 BasicBlockSet &SubLoopBlocks, 126 BasicBlockSet &AftBlocks, 127 DominatorTree *DT) { 128 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); 129 return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT); 130 } 131 132 // Looks at the phi nodes in Header for values coming from Latch. For these 133 // instructions and all their operands calls Visit on them, keeping going for 134 // all the operands in AftBlocks. Returns false if Visit returns false, 135 // otherwise returns true. This is used to process the instructions in the 136 // Aft blocks that need to be moved before the subloop. It is used in two 137 // places. One to check that the required set of instructions can be moved 138 // before the loop. Then to collect the instructions to actually move in 139 // moveHeaderPhiOperandsToForeBlocks. 
140 template <typename T> 141 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch, 142 BasicBlockSet &AftBlocks, T Visit) { 143 SmallVector<Instruction *, 8> Worklist; 144 SmallPtrSet<Instruction *, 8> VisitedInstr; 145 for (auto &Phi : Header->phis()) { 146 Value *V = Phi.getIncomingValueForBlock(Latch); 147 if (Instruction *I = dyn_cast<Instruction>(V)) 148 Worklist.push_back(I); 149 } 150 151 while (!Worklist.empty()) { 152 Instruction *I = Worklist.pop_back_val(); 153 if (!Visit(I)) 154 return false; 155 VisitedInstr.insert(I); 156 157 if (AftBlocks.count(I->getParent())) 158 for (auto &U : I->operands()) 159 if (Instruction *II = dyn_cast<Instruction>(U)) 160 if (!VisitedInstr.count(II)) 161 Worklist.push_back(II); 162 } 163 164 return true; 165 } 166 167 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc. 168 static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, 169 BasicBlock *Latch, 170 Instruction *InsertLoc, 171 BasicBlockSet &AftBlocks) { 172 // We need to ensure we move the instructions in the correct order, 173 // starting with the earliest required instruction and moving forward. 174 std::vector<Instruction *> Visited; 175 processHeaderPhiOperands(Header, Latch, AftBlocks, 176 [&Visited, &AftBlocks](Instruction *I) { 177 if (AftBlocks.count(I->getParent())) 178 Visited.push_back(I); 179 return true; 180 }); 181 182 // Move all instructions in program order to before the InsertLoc 183 BasicBlock *InsertLocBB = InsertLoc->getParent(); 184 for (Instruction *I : reverse(Visited)) { 185 if (I->getParent() != InsertLocBB) 186 I->moveBefore(InsertLoc); 187 } 188 } 189 190 /* 191 This method performs Unroll and Jam. For a simple loop like: 192 for (i = ..) 193 Fore(i) 194 for (j = ..) 195 SubLoop(i, j) 196 Aft(i) 197 198 Instead of doing normal inner or outer unrolling, we do: 199 for (i = .., i+=2) 200 Fore(i) 201 Fore(i+1) 202 for (j = ..) 
      SubLoop(i, j)
      SubLoop(i+1, j)
    Aft(i)
    Aft(i+1)

  So the outer loop is essentially unrolled and then the inner loops are fused
  ("jammed") together into a single loop. This can increase speed when there
  are loads in SubLoop that are invariant to i, as they become shared between
  the now jammed inner loops.

  We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
  Fore blocks are those before the inner loop, Aft are those after. Normal
  Unroll code is used to copy each of these sets of blocks and the results are
  combined together into the final form above.

  isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitability is also advisable.

  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
  necessary to create one and not fully unrolled).
*/
LoopUnrollResult
llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                       unsigned TripMultiple, bool UnrollRemainder,
                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, const TargetTransformInfo *TTI,
                       OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(Header && "No header.");
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, /*ForgetAllSCEV*/ false,
                                    LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  assert(Preheader && "No preheader");
  assert(LatchBlock && "No latch block");
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(BI && !BI->isUnconditional());
  // Which successor of the latch branch stays inside the loop, and which is
  // the exit. Same question for the subloop's latch.
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier. Index 0 holds the original iteration; clones are appended
  // as they are created below.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions from fore phi operands from AftBlocks into Fore.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  // Scale debug-location discriminators so profile counts stay meaningful
  // after the body is duplicated Count times.
  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc()) {
            auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
            if (NewDIL)
              I.setDebugLoc(NewDIL.getValue());
            else
              LLVM_DEBUG(dbgs()
                         << "Failed to create new discriminator: "
                         << DIL->getFilename() << " Line: " << DIL->getLine());
          }

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    SmallVector<BasicBlock *, 8> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;
    SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
    NewLoops[L] = L;
    NewLoops[SubLoop] = SubLoop;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      // Tell LI about New.
      addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);

      // Record the clone in the first/last tracking vectors for whichever
      // section (Fore/Sub/Aft) the original block belongs to.
      if (ForeBlocks.count(*BB)) {
        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (SubLoopBlocks.count(*BB)) {
        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (AftBlocks.count(*BB)) {
        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree: the entry block of each cloned section is dominated
      // by the exit of the previous iteration's matching section.
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
424 auto BBDomNode = DT->getNode(*BB); 425 auto BBIDom = BBDomNode->getIDom(); 426 BasicBlock *OriginalBBIDom = BBIDom->getBlock(); 427 assert(OriginalBBIDom); 428 assert(LastValueMap[cast<Value>(OriginalBBIDom)]); 429 DT->addNewBlock( 430 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)])); 431 } 432 } 433 434 // Remap all instructions in the most recent iteration 435 remapInstructionsInBlocks(NewBlocks, LastValueMap); 436 for (BasicBlock *NewBlock : NewBlocks) { 437 for (Instruction &I : *NewBlock) { 438 if (auto *II = dyn_cast<AssumeInst>(&I)) 439 AC->registerAssumption(II); 440 } 441 } 442 443 // Alter the ForeBlocks phi's, pointing them at the latest version of the 444 // value from the previous iteration's phis 445 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) { 446 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]); 447 assert(OldValue && "should have incoming edge from Aft[It]"); 448 Value *NewValue = OldValue; 449 if (Value *PrevValue = PrevItValueMap[OldValue]) 450 NewValue = PrevValue; 451 452 assert(Phi.getNumOperands() == 2); 453 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]); 454 Phi.setIncomingValue(0, NewValue); 455 Phi.removeIncomingValue(1); 456 } 457 } 458 459 // Now that all the basic blocks for the unrolled iterations are in place, 460 // finish up connecting the blocks and phi nodes. At this point LastValueMap 461 // is the last unrolled iterations values. 
462 463 // Update Phis in BB from OldBB to point to NewBB and use the latest value 464 // from LastValueMap 465 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB, 466 BasicBlock *NewBB, 467 ValueToValueMapTy &LastValueMap) { 468 for (PHINode &Phi : BB->phis()) { 469 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) { 470 if (Phi.getIncomingBlock(b) == OldBB) { 471 Value *OldValue = Phi.getIncomingValue(b); 472 if (Value *LastValue = LastValueMap[OldValue]) 473 Phi.setIncomingValue(b, LastValue); 474 Phi.setIncomingBlock(b, NewBB); 475 break; 476 } 477 } 478 } 479 }; 480 // Move all the phis from Src into Dest 481 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) { 482 Instruction *insertPoint = Dest->getFirstNonPHI(); 483 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin())) 484 Phi->moveBefore(insertPoint); 485 }; 486 487 // Update the PHI values outside the loop to point to the last block 488 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(), 489 LastValueMap); 490 491 // Update ForeBlocks successors and phi nodes 492 BranchInst *ForeTerm = 493 cast<BranchInst>(ForeBlocksLast.back()->getTerminator()); 494 assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor"); 495 ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]); 496 497 if (CompletelyUnroll) { 498 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) { 499 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader)); 500 Phi->getParent()->getInstList().erase(Phi); 501 } 502 } else { 503 // Update the PHI values to point to the last aft block 504 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0], 505 AftBlocksLast.back(), LastValueMap); 506 } 507 508 for (unsigned It = 1; It != Count; It++) { 509 // Remap ForeBlock successors from previous iteration to this 510 BranchInst *ForeTerm = 511 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator()); 512 assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one 
successor"); 513 ForeTerm->setSuccessor(0, ForeBlocksFirst[It]); 514 } 515 516 // Subloop successors and phis 517 BranchInst *SubTerm = 518 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator()); 519 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]); 520 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]); 521 SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0], 522 ForeBlocksLast.back()); 523 SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0], 524 SubLoopBlocksLast.back()); 525 526 for (unsigned It = 1; It != Count; It++) { 527 // Replace the conditional branch of the previous iteration subloop with an 528 // unconditional one to this one 529 BranchInst *SubTerm = 530 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator()); 531 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm); 532 SubTerm->eraseFromParent(); 533 534 SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It], 535 ForeBlocksLast.back()); 536 SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It], 537 SubLoopBlocksLast.back()); 538 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]); 539 } 540 541 // Aft blocks successors and phis 542 BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator()); 543 if (CompletelyUnroll) { 544 BranchInst::Create(LoopExit, AftTerm); 545 AftTerm->eraseFromParent(); 546 } else { 547 AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); 548 assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit && 549 "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit"); 550 } 551 AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0], 552 SubLoopBlocksLast.back()); 553 554 for (unsigned It = 1; It != Count; It++) { 555 // Replace the conditional branch of the previous iteration subloop with an 556 // unconditional one to this one 557 BranchInst *AftTerm = 558 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator()); 559 BranchInst::Create(AftBlocksFirst[It], 
AftTerm); 560 AftTerm->eraseFromParent(); 561 562 AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It], 563 SubLoopBlocksLast.back()); 564 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]); 565 } 566 567 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); 568 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the 569 // new ones required. 570 if (Count != 1) { 571 SmallVector<DominatorTree::UpdateType, 4> DTUpdates; 572 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0], 573 SubLoopBlocksFirst[0]); 574 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, 575 SubLoopBlocksLast[0], AftBlocksFirst[0]); 576 577 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, 578 ForeBlocksLast.back(), SubLoopBlocksFirst[0]); 579 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, 580 SubLoopBlocksLast.back(), AftBlocksFirst[0]); 581 DTU.applyUpdatesPermissive(DTUpdates); 582 } 583 584 // Merge adjacent basic blocks, if possible. 585 SmallPtrSet<BasicBlock *, 16> MergeBlocks; 586 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); 587 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); 588 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end()); 589 590 MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI); 591 592 // Apply updates to the DomTree. 593 DT = &DTU.getDomTree(); 594 595 // At this point, the code is well formed. We now do a quick sweep over the 596 // inserted code, doing constant propagation and dead code elimination as we 597 // go. 598 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI); 599 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC, 600 TTI); 601 602 NumCompletelyUnrolledAndJammed += CompletelyUnroll; 603 ++NumUnrolledAndJammed; 604 605 // Update LoopInfo if the loop is completely removed. 
606 if (CompletelyUnroll) 607 LI->erase(L); 608 609 #ifndef NDEBUG 610 // We shouldn't have done anything to break loop simplify form or LCSSA. 611 Loop *OutestLoop = SubLoop->getParentLoop() 612 ? SubLoop->getParentLoop()->getParentLoop() 613 ? SubLoop->getParentLoop()->getParentLoop() 614 : SubLoop->getParentLoop() 615 : SubLoop; 616 assert(DT->verify()); 617 LI->verify(*DT); 618 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI)); 619 if (!CompletelyUnroll) 620 assert(L->isLoopSimplifyForm()); 621 assert(SubLoop->isLoopSimplifyForm()); 622 SE->verify(); 623 #endif 624 625 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled 626 : LoopUnrollResult::PartiallyUnrolled; 627 } 628 629 static bool getLoadsAndStores(BasicBlockSet &Blocks, 630 SmallVector<Instruction *, 4> &MemInstr) { 631 // Scan the BBs and collect legal loads and stores. 632 // Returns false if non-simple loads/stores are found. 633 for (BasicBlock *BB : Blocks) { 634 for (Instruction &I : *BB) { 635 if (auto *Ld = dyn_cast<LoadInst>(&I)) { 636 if (!Ld->isSimple()) 637 return false; 638 MemInstr.push_back(&I); 639 } else if (auto *St = dyn_cast<StoreInst>(&I)) { 640 if (!St->isSimple()) 641 return false; 642 MemInstr.push_back(&I); 643 } else if (I.mayReadOrWriteMemory()) { 644 return false; 645 } 646 } 647 } 648 return true; 649 } 650 651 static bool preservesForwardDependence(Instruction *Src, Instruction *Dst, 652 unsigned UnrollLevel, unsigned JamLevel, 653 bool Sequentialized, Dependence *D) { 654 // UnrollLevel might carry the dependency Src --> Dst 655 // Does a different loop after unrolling? 
  // Walk the direction vector between the unrolled level and the jammed
  // level: a definite LT at some jammed level keeps the dependence forward,
  // while any possible GT component could reverse it after jamming.
  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
       ++CurLoopDepth) {
    auto JammedDir = D->getDirection(CurLoopDepth);
    if (JammedDir == Dependence::DVEntry::LT)
      return true;

    if (JammedDir & Dependence::DVEntry::GT)
      return false;
  }

  return true;
}

static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
                                        unsigned UnrollLevel, unsigned JamLevel,
                                        bool Sequentialized, Dependence *D) {
  // UnrollLevel might carry the dependency Dst --> Src
  // Mirror image of preservesForwardDependence: a definite GT at a jammed
  // level keeps the dependence backward; a possible LT could invert it.
  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
       ++CurLoopDepth) {
    auto JammedDir = D->getDirection(CurLoopDepth);
    if (JammedDir == Dependence::DVEntry::GT)
      return true;

    if (JammedDir & Dependence::DVEntry::LT)
      return false;
  }

  // Backward dependencies are only preserved if not interleaved.
  return Sequentialized;
}

// Check whether it is semantically safe Src and Dst considering any potential
// dependency between them.
//
// @param UnrollLevel The level of the loop being unrolled
// @param JamLevel    The level of the loop being jammed; if Src and Dst are on
//                    different levels, the outermost common loop counts as
//                    jammed level
//
// @return true if is safe and false if there is a dependency violation.
static bool checkDependency(Instruction *Src, Instruction *Dst,
                            unsigned UnrollLevel, unsigned JamLevel,
                            bool Sequentialized, DependenceInfo &DI) {
  assert(UnrollLevel <= JamLevel &&
         "Expecting JamLevel to be at least UnrollLevel");

  if (Src == Dst)
    return true;
  // Ignore Input dependencies.
  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
    return true;

  // Check whether unroll-and-jam may violate a dependency.
  // By construction, every dependency will be lexicographically non-negative
  // (if one were negative, it would violate the current execution order),
  // such as
  //   (0,0,>,*,*)
  // Unroll-and-jam changes the GT execution of two executions to the same
  // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
  // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
  //   (0,0,>=,*,*)
  // Now, the dependency is not necessarily non-negative anymore, i.e.
  // unroll-and-jam may violate correctness.
  std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
  if (!D)
    return true;
  assert(D->isOrdered() && "Expected an output, flow or anti dep.");

  if (D->isConfused()) {
    LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
                      << "  " << *Src << "\n"
                      << "  " << *Dst << "\n");
    return false;
  }

  // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
  // non-equal direction, then the locations accessed in the inner levels cannot
  // overlap in memory. We assume the indexes never overlap into neighboring
  // dimensions.
  for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
    if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
      return true;

  auto UnrollDirection = D->getDirection(UnrollLevel);

  // If the distance carried by the unrolled loop is 0, then after unrolling
  // that distance will become non-zero resulting in non-overlapping accesses in
  // the inner loops.
  if (UnrollDirection == Dependence::DVEntry::EQ)
    return true;

  if (UnrollDirection & Dependence::DVEntry::LT &&
      !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                  Sequentialized, D.get()))
    return false;

  if (UnrollDirection & Dependence::DVEntry::GT &&
      !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                   Sequentialized, D.get()))
    return false;

  return true;
}

// Check every pair of memory accesses in the nest (in their post-transform
// order: all Fore sections, then the jammed subloop, then all Aft sections)
// for dependences that unroll-and-jam would violate.
static bool
checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
                  const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
                  const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
                  DependenceInfo &DI, LoopInfo &LI) {
  SmallVector<BasicBlockSet, 8> AllBlocks;
  for (Loop *L : Root.getLoopsInPreorder())
    if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
      AllBlocks.push_back(ForeBlocksMap.lookup(L));
  AllBlocks.push_back(SubLoopBlocks);
  for (Loop *L : Root.getLoopsInPreorder())
    if (AftBlocksMap.find(L) != AftBlocksMap.end())
      AllBlocks.push_back(AftBlocksMap.lookup(L));

  unsigned LoopDepth = Root.getLoopDepth();
  SmallVector<Instruction *, 4> EarlierLoadsAndStores;
  SmallVector<Instruction *, 4> CurrentLoadsAndStores;
  for (BasicBlockSet &Blocks : AllBlocks) {
    CurrentLoadsAndStores.clear();
    // Bail out on any non-simple memory access.
    if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
      return false;

    Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
    unsigned CurLoopDepth = CurLoop->getLoopDepth();

    // Cross-section pairs: earlier sections will not be interleaved with the
    // current one after the transform (Sequentialized == false).
    for (auto *Earlier : EarlierLoadsAndStores) {
      Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
      unsigned EarlierDepth = EarlierLoop->getLoopDepth();
      unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
      for (auto *Later : CurrentLoadsAndStores) {
        if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
                             DI))
          return false;
      }
    }

    // Pairs within the same section; these stay in order (Sequentialized).
    size_t NumInsts = CurrentLoadsAndStores.size();
    for (size_t I = 0; I < NumInsts; ++I) {
      for (size_t J = I; J < NumInsts; ++J) {
        if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
                             LoopDepth, CurLoopDepth, true, DI))
          return false;
      }
    }

    EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
                                 CurrentLoadsAndStores.end());
  }
  return true;
}

// Structural pre-conditions on the loop nest for unroll and jam.
static bool isEligibleLoopForm(const Loop &Root) {
  // Root must have a child.
  if (Root.getSubLoops().size() != 1)
    return false;

  const Loop *L = &Root;
  do {
    // All loops in Root need to be in simplify and rotated form.
    if (!L->isLoopSimplifyForm())
      return false;

    if (!L->isRotatedForm())
      return false;

    if (L->getHeader()->hasAddressTaken()) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
      return false;
    }

    unsigned SubLoopsSize = L->getSubLoops().size();
    if (SubLoopsSize == 0)
      return true;

    // Only one child is allowed.
    if (SubLoopsSize != 1)
      return false;

    // Only loops with a single exit block can be unrolled and jammed.
    // The function getExitBlock() is used for this check, rather than
    // getUniqueExitBlock() to ensure loops with multiple exit edges are
    // disallowed.
    if (!L->getExitBlock()) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit "
                           "blocks can be unrolled and jammed.\n");
      return false;
    }

    // Only loops with a single exiting block can be unrolled and jammed.
    if (!L->getExitingBlock()) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single "
                           "exiting blocks can be unrolled and jammed.\n");
      return false;
    }

    // Descend to the only child and re-check the same conditions there.
    L = L->getSubLoops()[0];
  } while (L);

  return true;
}

// Follow the first-child chain down to the innermost loop of the nest.
static Loop *getInnerMostLoop(Loop *L) {
  while (!L->getSubLoops().empty())
    L = L->getSubLoops()[0];
  return L;
}

bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                                DependenceInfo &DI, LoopInfo &LI) {
  if (!isEligibleLoopForm(*L)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
    return false;
  }

  /* We currently handle outer loops like this:
               |
       ForeFirst    <------\        }
        Blocks      |      |        } ForeBlocks of L
       ForeLast     |      |        }
           |        |      |
          ...       |      |
           |        |      |
       ForeFirst  <----\   |        }
        Blocks      |  |   |        } ForeBlocks of an inner loop of L
       ForeLast     |  |   |        }
           |        |  |   |
       JamLoopFirst <\ |   |        }
        Blocks       | |   |        } JamLoopBlocks of the innermost loop
       JamLoopLast  -/ |   |        }
           |           |   |
       AftFirst        |   |        }
        Blocks         |   |        } AftBlocks of an inner loop of L
       AftLast  ------/    |        }
           |               |
          ...              |
           |               |
       AftFirst            |        }
        Blocks             |        } AftBlocks of L
       AftLast  --------/           }
           |

     There are (theoretically) any number of blocks in ForeBlocks,
     SubLoopBlocks and AftBlocks, providing that there is one edge from Fores
     to SubLoops, one edge from SubLoops to Afts and a single outer loop exit
     (from Afts). In practice we currently limit Aft blocks to a single block,
     and limit things further in the profitability checks of the unroll and
     jam pass.

     Because of the way we rearrange basic blocks, we also require that
     the Fore blocks of L on all unrolled iterations are safe to move before
     the blocks of the direct child of L of all iterations.
So we require that the
     phi node looping operands of ForeHeader can be moved to at least the end
     of ForeEnd, so that we can arrange cloned Fore Blocks before the subloop
     and match up Phi's correctly.

     i.e. The old order of blocks used to be
     (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
     It needs to be safe to transform this to
     (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.

     There are then a number of checks along the lines of no calls, no
     exceptions, inner loop IV is consistent, etc. Note that for loops requiring
     runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
     UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  // Split blocks into Fore/SubLoop/Aft based on dominators
  Loop *JamLoop = getInnerMostLoop(L);
  BasicBlockSet SubLoopBlocks;
  DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
  DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
  if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
                                AftBlocksMap, DT)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
    return false;
  }

  // Aft blocks may need to move instructions to fore blocks, which becomes more
  // difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocksMap[L].size() != 1) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
                         "multiple blocks after the loop\n");
    return false;
  }

  // Check inner loop backedge count is consistent on all iterations of the
  // outer loop. Jamming fuses corresponding inner-loop iterations, so the
  // inner trip count must not vary with the outer induction variable.
  if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
        return !hasIterationCountInvariantInParent(SubLoop, SE);
      })) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
                         "not consistent on each iteration\n");
    return false;
  }

  // Check the loop safety info for exceptions. A throwing block could exit
  // the nest mid-iteration, which the rearranged block order cannot honour.
  SimpleLoopSafetyInfo LSI;
  LSI.computeLoopSafetyInfo(L);
  if (LSI.anyBlockMayThrow()) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
    return false;
  }

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //  ForeBlocks before Subloop and AftBlocks.
  //  Subloop before AftBlocks.
  //  ForeBlock phi operands before the subloop

  // Make sure we can move all instructions we need to before the subloop.
  // The callback rejects any required operand that (a) lives inside the
  // subloop itself, or (b) lives in the aft blocks and is a phi or has side
  // effects / touches memory; everything else may keep being traced.
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlockSet AftBlocks = AftBlocksMap[L];
  Loop *SubLoop = L->getSubLoops()[0];
  if (!processHeaderPhiOperands(
          Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
            if (SubLoop->contains(I->getParent()))
              return false;
            if (AftBlocks.count(I->getParent())) {
              // If we hit a phi node in afts we know we are done (probably
              // LCSSA)
              if (isa<PHINode>(I))
                return false;
              // Can't move instructions with side effects or memory
              // reads/writes
              if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
                return false;
            }
            // Keep going
            return true;
          })) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
                         "instructions after subloop to before it\n");
    return false;
  }

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
                         LI)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
    return false;
  }

  return true;
}