1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop unroll and jam as a routine, much like 10 // LoopUnroll.cpp implements loop unroll. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/ADT/ArrayRef.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/Optional.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/Sequence.h" 19 #include "llvm/ADT/SmallPtrSet.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/ADT/StringRef.h" 23 #include "llvm/ADT/Twine.h" 24 #include "llvm/ADT/iterator_range.h" 25 #include "llvm/Analysis/AssumptionCache.h" 26 #include "llvm/Analysis/DependenceAnalysis.h" 27 #include "llvm/Analysis/DomTreeUpdater.h" 28 #include "llvm/Analysis/LoopInfo.h" 29 #include "llvm/Analysis/LoopIterator.h" 30 #include "llvm/Analysis/MustExecute.h" 31 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 32 #include "llvm/Analysis/ScalarEvolution.h" 33 #include "llvm/IR/BasicBlock.h" 34 #include "llvm/IR/DebugInfoMetadata.h" 35 #include "llvm/IR/DebugLoc.h" 36 #include "llvm/IR/DiagnosticInfo.h" 37 #include "llvm/IR/Dominators.h" 38 #include "llvm/IR/Function.h" 39 #include "llvm/IR/Instruction.h" 40 #include "llvm/IR/Instructions.h" 41 #include "llvm/IR/IntrinsicInst.h" 42 #include "llvm/IR/Use.h" 43 #include "llvm/IR/User.h" 44 #include "llvm/IR/Value.h" 45 #include "llvm/IR/ValueHandle.h" 46 #include "llvm/IR/ValueMap.h" 47 #include "llvm/Support/Casting.h" 48 #include "llvm/Support/Debug.h" 49 #include "llvm/Support/ErrorHandling.h" 50 #include "llvm/Support/GenericDomTree.h" 51 
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <assert.h>
#include <memory>
#include <type_traits>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-unroll-and-jam"

STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
// Previously this counter carried a copy-pasted description identical to
// NumUnrolledAndJammed, which made the two counters indistinguishable in
// -stats output. Give it its own description.
STATISTIC(NumCompletelyUnrolledAndJammed,
          "Number of loops completely unroll and jammed");

typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;

// Partition blocks in an outer/inner loop pair into blocks before and after
// the inner loop. A block of L that is not part of the subloop goes into
// AftBlocks if it is dominated by the subloop's latch (i.e. it can only run
// after the subloop has executed), otherwise into ForeBlocks.
//
// Returns false if the Fore blocks do not form a self-contained region that
// can only exit into the subloop's preheader, since in that case they cannot
// safely be duplicated before the subloop.
static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
                                BasicBlockSet &AftBlocks, DominatorTree &DT) {
  Loop *SubLoop = L.getSubLoops()[0];
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();

  for (BasicBlock *BB : L.blocks()) {
    if (!SubLoop->contains(BB)) {
      if (DT.dominates(SubLoopLatch, BB))
        AftBlocks.insert(BB);
      else
        ForeBlocks.insert(BB);
    }
  }

  // Check that all blocks in ForeBlocks together dominate the subloop
  // TODO: This might ideally be done better with a dominator/postdominators.
  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
  for (BasicBlock *BB : ForeBlocks) {
    if (BB == SubLoopPreHeader)
      continue;
    Instruction *TI = BB->getTerminator();
    // Every successor of a non-preheader Fore block must itself be a Fore
    // block; otherwise control could leave the Fore region early.
    for (BasicBlock *Succ : successors(TI))
      if (!ForeBlocks.count(Succ))
        return false;
  }

  return true;
}

/// Partition blocks in a loop nest into blocks before and after each inner
/// loop.
105 static bool partitionOuterLoopBlocks( 106 Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks, 107 DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap, 108 DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) { 109 JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end()); 110 111 for (Loop *L : Root.getLoopsInPreorder()) { 112 if (L == &JamLoop) 113 break; 114 115 if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT)) 116 return false; 117 } 118 119 return true; 120 } 121 122 // TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more 123 // than 2 levels loop. 124 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, 125 BasicBlockSet &ForeBlocks, 126 BasicBlockSet &SubLoopBlocks, 127 BasicBlockSet &AftBlocks, 128 DominatorTree *DT) { 129 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); 130 return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT); 131 } 132 133 // Looks at the phi nodes in Header for values coming from Latch. For these 134 // instructions and all their operands calls Visit on them, keeping going for 135 // all the operands in AftBlocks. Returns false if Visit returns false, 136 // otherwise returns true. This is used to process the instructions in the 137 // Aft blocks that need to be moved before the subloop. It is used in two 138 // places. One to check that the required set of instructions can be moved 139 // before the loop. Then to collect the instructions to actually move in 140 // moveHeaderPhiOperandsToForeBlocks. 
141 template <typename T> 142 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch, 143 BasicBlockSet &AftBlocks, T Visit) { 144 SmallVector<Instruction *, 8> Worklist; 145 SmallPtrSet<Instruction *, 8> VisitedInstr; 146 for (auto &Phi : Header->phis()) { 147 Value *V = Phi.getIncomingValueForBlock(Latch); 148 if (Instruction *I = dyn_cast<Instruction>(V)) 149 Worklist.push_back(I); 150 } 151 152 while (!Worklist.empty()) { 153 Instruction *I = Worklist.pop_back_val(); 154 if (!Visit(I)) 155 return false; 156 VisitedInstr.insert(I); 157 158 if (AftBlocks.count(I->getParent())) 159 for (auto &U : I->operands()) 160 if (Instruction *II = dyn_cast<Instruction>(U)) 161 if (!VisitedInstr.count(II)) 162 Worklist.push_back(II); 163 } 164 165 return true; 166 } 167 168 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc. 169 static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, 170 BasicBlock *Latch, 171 Instruction *InsertLoc, 172 BasicBlockSet &AftBlocks) { 173 // We need to ensure we move the instructions in the correct order, 174 // starting with the earliest required instruction and moving forward. 175 std::vector<Instruction *> Visited; 176 processHeaderPhiOperands(Header, Latch, AftBlocks, 177 [&Visited, &AftBlocks](Instruction *I) { 178 if (AftBlocks.count(I->getParent())) 179 Visited.push_back(I); 180 return true; 181 }); 182 183 // Move all instructions in program order to before the InsertLoc 184 BasicBlock *InsertLocBB = InsertLoc->getParent(); 185 for (Instruction *I : reverse(Visited)) { 186 if (I->getParent() != InsertLocBB) 187 I->moveBefore(InsertLoc); 188 } 189 } 190 191 /* 192 This method performs Unroll and Jam. For a simple loop like: 193 for (i = ..) 194 Fore(i) 195 for (j = ..) 196 SubLoop(i, j) 197 Aft(i) 198 199 Instead of doing normal inner or outer unrolling, we do: 200 for (i = .., i+=2) 201 Fore(i) 202 Fore(i+1) 203 for (j = ..) 
      SubLoop(i, j)
      SubLoop(i+1, j)
    Aft(i)
    Aft(i+1)

  So the outer loop is essentially unrolled and then the inner loops are fused
  ("jammed") together into a single loop. This can increase speed when there
  are loads in SubLoop that are invariant to i, as they become shared between
  the now jammed inner loops.

  We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
  Fore blocks are those before the inner loop, Aft are those after. Normal
  Unroll code is used to copy each of these sets of blocks and the results are
  combined together into the final form above.

  isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitability is also advisable.

  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
  necessary to create one and not fully unrolled).
*/
LoopUnrollResult llvm::UnrollAndJamLoop(
    Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
    bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    AssumptionCache *AC, const TargetTransformInfo *TTI,
    OptimizationRemarkEmitter *ORE, LPMUpdater *U, Loop **EpilogueLoop) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(Header && "No header.");
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, /*ForgetAllSCEV*/ false,
                                    LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  assert(Preheader && "No preheader");
  assert(LatchBlock && "No latch block");
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(BI && !BI->isUnconditional());
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier. Index 0 is always the original iteration; index It holds
  // the clone created for unrolled iteration It.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions from fore phi operands from AftBlocks into Fore.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  // When a FSDiscriminator is enabled, we don't need to add the multiply
  // factors to the discriminators.
  if (Header->getParent()->isDebugInfoForProfiling() && !EnableFSDiscriminator)
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc()) {
            auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
            if (NewDIL)
              I.setDebugLoc(NewDIL.getValue());
            else
              LLVM_DEBUG(dbgs()
                         << "Failed to create new discriminator: "
                         << DIL->getFilename() << " Line: " << DIL->getLine());
          }

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    SmallVector<BasicBlock *, 8> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;
    SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
    NewLoops[L] = L;
    NewLoops[SubLoop] = SubLoop;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      // Tell LI about New.
      addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);

      // Record the clone as the It'th version of whichever set (Fore/Sub/Aft)
      // the original block belongs to.
      if (ForeBlocks.count(*BB)) {
        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (SubLoopBlocks.count(*BB)) {
        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (AftBlocks.count(*BB)) {
        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        // const_cast: ValueToValueMapTy keys are const Value*, but
        // PrevItValueMap stores mutable pointers.
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    remapInstructionsInBlocks(NewBlocks, LastValueMap);
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        // Cloned assumes must be registered so AssumptionCache stays valid.
        if (auto *II = dyn_cast<AssumeInst>(&I))
          AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      // The cloned header phi is reduced to a single incoming edge: the
      // previous iteration's Fore blocks now feed straight into this one.
      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // is the last unrolled iterations values.

  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
  ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);

  if (CompletelyUnroll) {
    // No back-edge remains; header phis collapse to their preheader value.
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
    ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],
                                            ForeBlocksLast.back());
  SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
                                            SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
                                               ForeBlocksLast.back());
    SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
                                               SubLoopBlocksLast.back());
    // Jamming: all cloned subloop header phis live in the single fused header.
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, AftTerm);
    AftTerm->eraseFromParent();
  } else {
    AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
    assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
           "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
  }
  AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
                                        SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
                                           SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DTU.applyUpdatesPermissive(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());

  MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);

  // Apply updates to the DomTree.
  DT = &DTU.getDomTree();

  // At this point, the code is well formed. We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
                          TTI);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll) {
    if (U)
      U->markLoopAsDeleted(*L, std::string(L->getName()));
    // L is no longer a loop; erase it from LoopInfo. Do not dereference L
    // after this point except where guarded by !CompletelyUnroll.
    LI->erase(L);
  }

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  // Walk up at most two parents from the subloop to find the outermost loop
  // of the (two-level) nest that we touched.
  Loop *OutestLoop = SubLoop->getParentLoop()
                         ? SubLoop->getParentLoop()->getParentLoop()
                               ? SubLoop->getParentLoop()->getParentLoop()
                               : SubLoop->getParentLoop()
                         : SubLoop;
  assert(DT->verify());
  LI->verify(*DT);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  SE->verify();
#endif

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}

// Collect every load and store in Blocks into MemInstr.
// Returns false (rejecting unroll-and-jam) if a non-simple (atomic/volatile)
// load or store is found, or if any other instruction may read or write
// memory (calls, fences, etc.), since dependence analysis below only reasons
// about simple loads and stores.
static bool getLoadsAndStores(BasicBlockSet &Blocks,
                              SmallVector<Instruction *, 4> &MemInstr) {
  // Scan the BBs and collect legal loads and stores.
  // Returns false if non-simple loads/stores are found.
  for (BasicBlock *BB : Blocks) {
    for (Instruction &I : *BB) {
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
        if (!Ld->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
        if (!St->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (I.mayReadOrWriteMemory()) {
        return false;
      }
    }
  }
  return true;
}

// Returns true if the dependence Src --> Dst, possibly carried by the loop at
// depth UnrollLevel, is still respected after the levels up to JamLevel are
// jammed together.
static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
                                       unsigned UnrollLevel, unsigned JamLevel,
                                       bool Sequentialized, Dependence *D) {
  // UnrollLevel might carry the dependency Src --> Dst
  // Does a different loop after unrolling?
  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
       ++CurLoopDepth) {
    auto JammedDir = D->getDirection(CurLoopDepth);
    // A strict LT at a jammed level means the dependence direction is still
    // forward after jamming; it remains satisfied.
    if (JammedDir == Dependence::DVEntry::LT)
      return true;

    // Any GT component could flip the dependence backwards once the levels
    // are fused -- reject.
    if (JammedDir & Dependence::DVEntry::GT)
      return false;
  }

  return true;
}

// Mirror of preservesForwardDependence for a dependence Dst --> Src that the
// unrolled loop might carry backwards.
static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
                                        unsigned UnrollLevel, unsigned JamLevel,
                                        bool Sequentialized, Dependence *D) {
  // UnrollLevel might carry the dependency Dst --> Src
  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
       ++CurLoopDepth) {
    auto JammedDir = D->getDirection(CurLoopDepth);
    if (JammedDir == Dependence::DVEntry::GT)
      return true;

    if (JammedDir & Dependence::DVEntry::LT)
      return false;
  }

  // Backward dependencies are only preserved if not interleaved.
  return Sequentialized;
}

// Check whether it is semantically safe Src and Dst considering any potential
// dependency between them.
//
// @param UnrollLevel The level of the loop being unrolled
// @param JamLevel    The level of the loop being jammed; if Src and Dst are on
// different levels, the outermost common loop counts as jammed level
//
// @return true if is safe and false if there is a dependency violation.
static bool checkDependency(Instruction *Src, Instruction *Dst,
                            unsigned UnrollLevel, unsigned JamLevel,
                            bool Sequentialized, DependenceInfo &DI) {
  assert(UnrollLevel <= JamLevel &&
         "Expecting JamLevel to be at least UnrollLevel");

  // An instruction trivially depends on itself in a safe way.
  if (Src == Dst)
    return true;
  // Ignore Input dependencies.
  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
    return true;

  // Check whether unroll-and-jam may violate a dependency.
  // By construction, every dependency will be lexicographically non-negative
  // (if it was, it would violate the current execution order), such as
  //   (0,0,>,*,*)
  // Unroll-and-jam changes the GT execution of two executions to the same
  // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
  // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
  //   (0,0,>=,*,*)
  // Now, the dependency is not necessarily non-negative anymore, i.e.
  // unroll-and-jam may violate correctness.
  std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
  if (!D)
    return true;
  assert(D->isOrdered() && "Expected an output, flow or anti dep.");

  if (D->isConfused()) {
    LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
                      << "  " << *Src << "\n"
                      << "  " << *Dst << "\n");
    return false;
  }

  // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
  // non-equal direction, then the locations accessed in the inner levels cannot
  // overlap in memory. We assumes the indexes never overlap into neighboring
  // dimensions.
  for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
    if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
      return true;

  auto UnrollDirection = D->getDirection(UnrollLevel);

  // If the distance carried by the unrolled loop is 0, then after unrolling
  // that distance will become non-zero resulting in non-overlapping accesses in
  // the inner loops.
  if (UnrollDirection == Dependence::DVEntry::EQ)
    return true;

  if (UnrollDirection & Dependence::DVEntry::LT &&
      !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                  Sequentialized, D.get()))
    return false;

  if (UnrollDirection & Dependence::DVEntry::GT &&
      !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                   Sequentialized, D.get()))
    return false;

  return true;
}

// Check all memory dependencies between the block sets that unroll-and-jam
// will reorder (all Fore sets, the jammed subloop blocks, all Aft sets) and
// reject if any dependence would be violated.
static bool
checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
                  const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
                  const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
                  DependenceInfo &DI, LoopInfo &LI) {
  // Gather the block sets in the order they will execute after the transform:
  // Fore sets outermost-first, then the jammed subloop, then Aft sets.
  SmallVector<BasicBlockSet, 8> AllBlocks;
  for (Loop *L : Root.getLoopsInPreorder())
    if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
      AllBlocks.push_back(ForeBlocksMap.lookup(L));
  AllBlocks.push_back(SubLoopBlocks);
  for (Loop *L : Root.getLoopsInPreorder())
    if (AftBlocksMap.find(L) != AftBlocksMap.end())
      AllBlocks.push_back(AftBlocksMap.lookup(L));

  unsigned LoopDepth = Root.getLoopDepth();
  SmallVector<Instruction *, 4> EarlierLoadsAndStores;
  SmallVector<Instruction *, 4> CurrentLoadsAndStores;
  for (BasicBlockSet &Blocks : AllBlocks) {
    CurrentLoadsAndStores.clear();
    if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
      return false;

    Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
    unsigned CurLoopDepth = CurLoop->getLoopDepth();

    // Check dependencies between this set and everything executed earlier.
    // These pairs end up interleaved across iterations, hence
    // Sequentialized = false.
    for (auto *Earlier : EarlierLoadsAndStores) {
      Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
      unsigned EarlierDepth = EarlierLoop->getLoopDepth();
      unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
      for (auto *Later : CurrentLoadsAndStores) {
        if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
                             DI))
          return false;
      }
    }

    // Check dependencies within this set; copies of the same set stay in
    // program order relative to each other (Sequentialized = true).
    size_t NumInsts = CurrentLoadsAndStores.size();
    for (size_t I = 0; I < NumInsts; ++I) {
      for (size_t J = I; J < NumInsts; ++J) {
        if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
                             LoopDepth, CurLoopDepth, true, DI))
          return false;
      }
    }

    EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
                                 CurrentLoadsAndStores.end());
  }
  return true;
}

// Structural legality checks: the nest rooted at Root must be a single chain
// of loops, each in simplify + rotated form, with single exit/exiting blocks.
static bool isEligibleLoopForm(const Loop &Root) {
  // Root must have a child.
  if (Root.getSubLoops().size() != 1)
    return false;

  const Loop *L = &Root;
  do {
    // All loops in Root need to be in simplify and rotated form.
    if (!L->isLoopSimplifyForm())
      return false;

    if (!L->isRotatedForm())
      return false;

    if (L->getHeader()->hasAddressTaken()) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
      return false;
    }

    unsigned SubLoopsSize = L->getSubLoops().size();
    if (SubLoopsSize == 0)
      return true;

    // Only one child is allowed.
    if (SubLoopsSize != 1)
      return false;

    // Only loops with a single exit block can be unrolled and jammed.
    // The function getExitBlock() is used for this check, rather than
    // getUniqueExitBlock() to ensure loops with multiple exit edges are
    // disallowed.
    if (!L->getExitBlock()) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit "
                           "blocks can be unrolled and jammed.\n");
      return false;
    }

    // Only loops with a single exiting block can be unrolled and jammed.
852 if (!L->getExitingBlock()) { 853 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single " 854 "exiting blocks can be unrolled and jammed.\n"); 855 return false; 856 } 857 858 L = L->getSubLoops()[0]; 859 } while (L); 860 861 return true; 862 } 863 864 static Loop *getInnerMostLoop(Loop *L) { 865 while (!L->getSubLoops().empty()) 866 L = L->getSubLoops()[0]; 867 return L; 868 } 869 870 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, 871 DependenceInfo &DI, LoopInfo &LI) { 872 if (!isEligibleLoopForm(*L)) { 873 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n"); 874 return false; 875 } 876 877 /* We currently handle outer loops like this: 878 | 879 ForeFirst <------\ } 880 Blocks | } ForeBlocks of L 881 ForeLast | } 882 | | 883 ... | 884 | | 885 ForeFirst <----\ | } 886 Blocks | | } ForeBlocks of a inner loop of L 887 ForeLast | | } 888 | | | 889 JamLoopFirst <\ | | } 890 Blocks | | | } JamLoopBlocks of the innermost loop 891 JamLoopLast -/ | | } 892 | | | 893 AftFirst | | } 894 Blocks | | } AftBlocks of a inner loop of L 895 AftLast ------/ | } 896 | | 897 ... | 898 | | 899 AftFirst | } 900 Blocks | } AftBlocks of L 901 AftLast --------/ } 902 | 903 904 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks 905 and AftBlocks, providing that there is one edge from Fores to SubLoops, 906 one edge from SubLoops to Afts and a single outer loop exit (from Afts). 907 In practice we currently limit Aft blocks to a single block, and limit 908 things further in the profitablility checks of the unroll and jam pass. 909 910 Because of the way we rearrange basic blocks, we also require that 911 the Fore blocks of L on all unrolled iterations are safe to move before the 912 blocks of the direct child of L of all iterations. 
So we require that the 913 phi node looping operands of ForeHeader can be moved to at least the end of 914 ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and 915 match up Phi's correctly. 916 917 i.e. The old order of blocks used to be 918 (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2. 919 It needs to be safe to transform this to 920 (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2. 921 922 There are then a number of checks along the lines of no calls, no 923 exceptions, inner loop IV is consistent, etc. Note that for loops requiring 924 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in 925 UnrollAndJamLoop if the trip count cannot be easily calculated. 926 */ 927 928 // Split blocks into Fore/SubLoop/Aft based on dominators 929 Loop *JamLoop = getInnerMostLoop(L); 930 BasicBlockSet SubLoopBlocks; 931 DenseMap<Loop *, BasicBlockSet> ForeBlocksMap; 932 DenseMap<Loop *, BasicBlockSet> AftBlocksMap; 933 if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap, 934 AftBlocksMap, DT)) { 935 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n"); 936 return false; 937 } 938 939 // Aft blocks may need to move instructions to fore blocks, which becomes more 940 // difficult if there are multiple (potentially conditionally executed) 941 // blocks. For now we just exclude loops with multiple aft blocks. 
942 if (AftBlocksMap[L].size() != 1) { 943 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle " 944 "multiple blocks after the loop\n"); 945 return false; 946 } 947 948 // Check inner loop backedge count is consistent on all iterations of the 949 // outer loop 950 if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) { 951 return !hasIterationCountInvariantInParent(SubLoop, SE); 952 })) { 953 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is " 954 "not consistent on each iteration\n"); 955 return false; 956 } 957 958 // Check the loop safety info for exceptions. 959 SimpleLoopSafetyInfo LSI; 960 LSI.computeLoopSafetyInfo(L); 961 if (LSI.anyBlockMayThrow()) { 962 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n"); 963 return false; 964 } 965 966 // We've ruled out the easy stuff and now need to check that there are no 967 // interdependencies which may prevent us from moving the: 968 // ForeBlocks before Subloop and AftBlocks. 969 // Subloop before AftBlocks. 
970 // ForeBlock phi operands before the subloop 971 972 // Make sure we can move all instructions we need to before the subloop 973 BasicBlock *Header = L->getHeader(); 974 BasicBlock *Latch = L->getLoopLatch(); 975 BasicBlockSet AftBlocks = AftBlocksMap[L]; 976 Loop *SubLoop = L->getSubLoops()[0]; 977 if (!processHeaderPhiOperands( 978 Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) { 979 if (SubLoop->contains(I->getParent())) 980 return false; 981 if (AftBlocks.count(I->getParent())) { 982 // If we hit a phi node in afts we know we are done (probably 983 // LCSSA) 984 if (isa<PHINode>(I)) 985 return false; 986 // Can't move instructions with side effects or memory 987 // reads/writes 988 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory()) 989 return false; 990 } 991 // Keep going 992 return true; 993 })) { 994 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required " 995 "instructions after subloop to before it\n"); 996 return false; 997 } 998 999 // Check for memory dependencies which prohibit the unrolling we are doing. 1000 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check 1001 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub. 1002 if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI, 1003 LI)) { 1004 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n"); 1005 return false; 1006 } 1007 1008 return true; 1009 } 1010