1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements the loop fusion pass. 11 /// The implementation is largely based on the following document: 12 /// 13 /// Code Transformations to Augment the Scope of Loop Fusion in a 14 /// Production Compiler 15 /// Christopher Mark Barton 16 /// MSc Thesis 17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf 18 /// 19 /// The general approach taken is to collect sets of control flow equivalent 20 /// loops and test whether they can be fused. The necessary conditions for 21 /// fusion are: 22 /// 1. The loops must be adjacent (there cannot be any statements between 23 /// the two loops). 24 /// 2. The loops must be conforming (they must execute the same number of 25 /// iterations). 26 /// 3. The loops must be control flow equivalent (if one loop executes, the 27 /// other is guaranteed to execute). 28 /// 4. There cannot be any negative distance dependencies between the loops. 29 /// If all of these conditions are satisfied, it is safe to fuse the loops. 30 /// 31 /// This implementation creates FusionCandidates that represent the loop and the 32 /// necessary information needed by fusion. It then operates on the fusion 33 /// candidates, first confirming that the candidate is eligible for fusion. The 34 /// candidates are then collected into control flow equivalent sets, sorted in 35 /// dominance order. Each set of control flow equivalent candidates is then 36 /// traversed, attempting to fuse pairs of candidates in the set. If all 37 /// requirements for fusion are met, the two candidates are fused, creating a 38 /// new (fused) candidate which is then added back into the set to consider for 39 /// additional fusion. 40 /// 41 /// This implementation currently does not make any modifications to remove 42 /// conditions for fusion. Code transformations to make loops conform to each of 43 /// the conditions for fusion are discussed in more detail in the document 44 /// above. These can be added to the current implementation in the future. 45 //===----------------------------------------------------------------------===// 46 47 #include "llvm/Transforms/Scalar/LoopFuse.h" 48 #include "llvm/ADT/Statistic.h" 49 #include "llvm/Analysis/AssumptionCache.h" 50 #include "llvm/Analysis/DependenceAnalysis.h" 51 #include "llvm/Analysis/DomTreeUpdater.h" 52 #include "llvm/Analysis/LoopInfo.h" 53 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 54 #include "llvm/Analysis/PostDominators.h" 55 #include "llvm/Analysis/ScalarEvolution.h" 56 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 57 #include "llvm/Analysis/TargetTransformInfo.h" 58 #include "llvm/IR/Function.h" 59 #include "llvm/IR/Verifier.h" 60 #include "llvm/InitializePasses.h" 61 #include "llvm/Pass.h" 62 #include "llvm/Support/CommandLine.h" 63 #include "llvm/Support/Debug.h" 64 #include "llvm/Support/raw_ostream.h" 65 #include "llvm/Transforms/Scalar.h" 66 #include "llvm/Transforms/Utils.h" 67 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 68 #include "llvm/Transforms/Utils/CodeMoverUtils.h" 69 #include "llvm/Transforms/Utils/LoopPeel.h" 70 71 using namespace llvm; 72 73 #define DEBUG_TYPE "loop-fusion" 74 75 STATISTIC(FuseCounter, "Loops fused"); 76 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); 77 STATISTIC(InvalidPreheader, "Loop has invalid preheader"); 78 STATISTIC(InvalidHeader, "Loop has invalid header"); 79 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks"); 80 STATISTIC(InvalidExitBlock, "Loop has invalid exit block"); 81 STATISTIC(InvalidLatch, "Loop has invalid latch"); 82 STATISTIC(InvalidLoop, "Loop is invalid"); 83 STATISTIC(AddressTakenBB, "Basic block has address taken"); 84 STATISTIC(MayThrowException, "Loop may throw an exception"); 85 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); 86 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); 87 STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); 88 STATISTIC(UnknownTripCount, "Loop has unknown trip count"); 89 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); 90 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); 91 STATISTIC(NonAdjacent, "Loops are not adjacent"); 92 STATISTIC( 93 NonEmptyPreheader, 94 "Loop has a non-empty preheader with instructions that cannot be moved"); 95 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); 96 STATISTIC(NonIdenticalGuards, "Candidates have different guards"); 97 STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " 98 "instructions that cannot be moved"); 99 STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " 100 "instructions that cannot be moved"); 101 STATISTIC(NotRotated, "Candidate is not rotated"); 102 STATISTIC(OnlySecondCandidateIsGuarded, 103 "The second candidate is guarded while the first one is not"); 104 STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); 105 STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); 106 107 enum FusionDependenceAnalysisChoice { 108 FUSION_DEPENDENCE_ANALYSIS_SCEV, 109 FUSION_DEPENDENCE_ANALYSIS_DA, 110 FUSION_DEPENDENCE_ANALYSIS_ALL, 111 }; 112 113 static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( 114 "loop-fusion-dependence-analysis", 115 cl::desc("Which dependence analysis should loop fusion use?"), 116 cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", 117 "Use the scalar evolution interface"), 118 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", 119 "Use the dependence analysis interface"), 120 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", 121 "Use all available analyses")), 122 cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL)); 123 124 static cl::opt<unsigned> FusionPeelMaxCount( 125 "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, 126 cl::desc("Max number of iterations to be peeled from a loop, such that " 127 "fusion can take place")); 128 129 #ifndef NDEBUG 130 static cl::opt<bool> 131 VerboseFusionDebugging("loop-fusion-verbose-debug", 132 cl::desc("Enable verbose debugging for Loop Fusion"), 133 cl::Hidden, cl::init(false)); 134 #endif 135 136 namespace { 137 /// This class is used to represent a candidate for loop fusion. When it is 138 /// constructed, it checks the conditions for loop fusion to ensure that it 139 /// represents a valid candidate. It caches several parts of a loop that are 140 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead 141 /// of continually querying the underlying Loop to retrieve these values. It is 142 /// assumed these will not change throughout loop fusion. 143 /// 144 /// The invalidate method should be used to indicate that the FusionCandidate is 145 /// no longer a valid candidate for fusion. Similarly, the isValid() method can 146 /// be used to ensure that the FusionCandidate is still valid for fusion. 147 struct FusionCandidate { 148 /// Cache of parts of the loop used throughout loop fusion. These should not 149 /// need to change throughout the analysis and transformation. 150 /// These parts are cached to avoid repeatedly looking up in the Loop class. 151 152 /// Preheader of the loop this candidate represents 153 BasicBlock *Preheader; 154 /// Header of the loop this candidate represents 155 BasicBlock *Header; 156 /// Blocks in the loop that exit the loop 157 BasicBlock *ExitingBlock; 158 /// The successor block of this loop (where the exiting blocks go to) 159 BasicBlock *ExitBlock; 160 /// Latch of the loop 161 BasicBlock *Latch; 162 /// The loop that this fusion candidate represents 163 Loop *L; 164 /// Vector of instructions in this loop that read from memory 165 SmallVector<Instruction *, 16> MemReads; 166 /// Vector of instructions in this loop that write to memory 167 SmallVector<Instruction *, 16> MemWrites; 168 /// Are all of the members of this fusion candidate still valid 169 bool Valid; 170 /// Guard branch of the loop, if it exists 171 BranchInst *GuardBranch; 172 /// Peeling Paramaters of the Loop. 173 TTI::PeelingPreferences PP; 174 /// Can you Peel this Loop? 175 bool AbleToPeel; 176 /// Has this loop been Peeled 177 bool Peeled; 178 179 /// Dominator and PostDominator trees are needed for the 180 /// FusionCandidateCompare function, required by FusionCandidateSet to 181 /// determine where the FusionCandidate should be inserted into the set. These 182 /// are used to establish ordering of the FusionCandidates based on dominance. 183 DominatorTree &DT; 184 const PostDominatorTree *PDT; 185 186 OptimizationRemarkEmitter &ORE; 187 188 FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, 189 OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) 190 : Preheader(L->getLoopPreheader()), Header(L->getHeader()), 191 ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), 192 Latch(L->getLoopLatch()), L(L), Valid(true), 193 GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), 194 Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { 195 196 // Walk over all blocks in the loop and check for conditions that may 197 // prevent fusion. For each block, walk over all instructions and collect 198 // the memory reads and writes If any instructions that prevent fusion are 199 // found, invalidate this object and return. 200 for (BasicBlock *BB : L->blocks()) { 201 if (BB->hasAddressTaken()) { 202 invalidate(); 203 reportInvalidCandidate(AddressTakenBB); 204 return; 205 } 206 207 for (Instruction &I : *BB) { 208 if (I.mayThrow()) { 209 invalidate(); 210 reportInvalidCandidate(MayThrowException); 211 return; 212 } 213 if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { 214 if (SI->isVolatile()) { 215 invalidate(); 216 reportInvalidCandidate(ContainsVolatileAccess); 217 return; 218 } 219 } 220 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 221 if (LI->isVolatile()) { 222 invalidate(); 223 reportInvalidCandidate(ContainsVolatileAccess); 224 return; 225 } 226 } 227 if (I.mayWriteToMemory()) 228 MemWrites.push_back(&I); 229 if (I.mayReadFromMemory()) 230 MemReads.push_back(&I); 231 } 232 } 233 } 234 235 /// Check if all members of the class are valid. 236 bool isValid() const { 237 return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && 238 !L->isInvalid() && Valid; 239 } 240 241 /// Verify that all members are in sync with the Loop object. 242 void verify() const { 243 assert(isValid() && "Candidate is not valid!!"); 244 assert(!L->isInvalid() && "Loop is invalid!"); 245 assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync"); 246 assert(Header == L->getHeader() && "Header is out of sync"); 247 assert(ExitingBlock == L->getExitingBlock() && 248 "Exiting Blocks is out of sync"); 249 assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync"); 250 assert(Latch == L->getLoopLatch() && "Latch is out of sync"); 251 } 252 253 /// Get the entry block for this fusion candidate. 254 /// 255 /// If this fusion candidate represents a guarded loop, the entry block is the 256 /// loop guard block. If it represents an unguarded loop, the entry block is 257 /// the preheader of the loop. 258 BasicBlock *getEntryBlock() const { 259 if (GuardBranch) 260 return GuardBranch->getParent(); 261 else 262 return Preheader; 263 } 264 265 /// After Peeling the loop is modified quite a bit, hence all of the Blocks 266 /// need to be updated accordingly. 267 void updateAfterPeeling() { 268 Preheader = L->getLoopPreheader(); 269 Header = L->getHeader(); 270 ExitingBlock = L->getExitingBlock(); 271 ExitBlock = L->getExitBlock(); 272 Latch = L->getLoopLatch(); 273 verify(); 274 } 275 276 /// Given a guarded loop, get the successor of the guard that is not in the 277 /// loop. 278 /// 279 /// This method returns the successor of the loop guard that is not located 280 /// within the loop (i.e., the successor of the guard that is not the 281 /// preheader). 282 /// This method is only valid for guarded loops. 283 BasicBlock *getNonLoopBlock() const { 284 assert(GuardBranch && "Only valid on guarded loops."); 285 assert(GuardBranch->isConditional() && 286 "Expecting guard to be a conditional branch."); 287 if (Peeled) 288 return GuardBranch->getSuccessor(1); 289 return (GuardBranch->getSuccessor(0) == Preheader) 290 ? GuardBranch->getSuccessor(1) 291 : GuardBranch->getSuccessor(0); 292 } 293 294 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 295 LLVM_DUMP_METHOD void dump() const { 296 dbgs() << "\tGuardBranch: "; 297 if (GuardBranch) 298 dbgs() << *GuardBranch; 299 else 300 dbgs() << "nullptr"; 301 dbgs() << "\n" 302 << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" 303 << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") 304 << "\n" 305 << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" 306 << "\tExitingBB: " 307 << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" 308 << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") 309 << "\n" 310 << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" 311 << "\tEntryBlock: " 312 << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") 313 << "\n"; 314 } 315 #endif 316 317 /// Determine if a fusion candidate (representing a loop) is eligible for 318 /// fusion. Note that this only checks whether a single loop can be fused - it 319 /// does not check whether it is *legal* to fuse two loops together. 320 bool isEligibleForFusion(ScalarEvolution &SE) const { 321 if (!isValid()) { 322 LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); 323 if (!Preheader) 324 ++InvalidPreheader; 325 if (!Header) 326 ++InvalidHeader; 327 if (!ExitingBlock) 328 ++InvalidExitingBlock; 329 if (!ExitBlock) 330 ++InvalidExitBlock; 331 if (!Latch) 332 ++InvalidLatch; 333 if (L->isInvalid()) 334 ++InvalidLoop; 335 336 return false; 337 } 338 339 // Require ScalarEvolution to be able to determine a trip count. 340 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 341 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 342 << " trip count not computable!\n"); 343 return reportInvalidCandidate(UnknownTripCount); 344 } 345 346 if (!L->isLoopSimplifyForm()) { 347 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 348 << " is not in simplified form!\n"); 349 return reportInvalidCandidate(NotSimplifiedForm); 350 } 351 352 if (!L->isRotatedForm()) { 353 LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); 354 return reportInvalidCandidate(NotRotated); 355 } 356 357 return true; 358 } 359 360 private: 361 // This is only used internally for now, to clear the MemWrites and MemReads 362 // list and setting Valid to false. I can't envision other uses of this right 363 // now, since once FusionCandidates are put into the FusionCandidateSet they 364 // are immutable. Thus, any time we need to change/update a FusionCandidate, 365 // we must create a new one and insert it into the FusionCandidateSet to 366 // ensure the FusionCandidateSet remains ordered correctly. 367 void invalidate() { 368 MemWrites.clear(); 369 MemReads.clear(); 370 Valid = false; 371 } 372 373 bool reportInvalidCandidate(llvm::Statistic &Stat) const { 374 using namespace ore; 375 assert(L && Preheader && "Fusion candidate not initialized properly!"); 376 #if LLVM_ENABLE_STATS 377 ++Stat; 378 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), 379 L->getStartLoc(), Preheader) 380 << "[" << Preheader->getParent()->getName() << "]: " 381 << "Loop is not a candidate for fusion: " << Stat.getDesc()); 382 #endif 383 return false; 384 } 385 }; 386 387 struct FusionCandidateCompare { 388 /// Comparison functor to sort two Control Flow Equivalent fusion candidates 389 /// into dominance order. 390 /// If LHS dominates RHS and RHS post-dominates LHS, return true; 391 /// IF RHS dominates LHS and LHS post-dominates RHS, return false; 392 bool operator()(const FusionCandidate &LHS, 393 const FusionCandidate &RHS) const { 394 const DominatorTree *DT = &(LHS.DT); 395 396 BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); 397 BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); 398 399 // Do not save PDT to local variable as it is only used in asserts and thus 400 // will trigger an unused variable warning if building without asserts. 401 assert(DT && LHS.PDT && "Expecting valid dominator tree"); 402 403 // Do this compare first so if LHS == RHS, function returns false. 404 if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { 405 // RHS dominates LHS 406 // Verify LHS post-dominates RHS 407 assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); 408 return false; 409 } 410 411 if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { 412 // Verify RHS Postdominates LHS 413 assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); 414 return true; 415 } 416 417 // If LHS does not dominate RHS and RHS does not dominate LHS then there is 418 // no dominance relationship between the two FusionCandidates. Thus, they 419 // should not be in the same set together. 420 llvm_unreachable( 421 "No dominance relationship between these fusion candidates!"); 422 } 423 }; 424 425 using LoopVector = SmallVector<Loop *, 4>; 426 427 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance 428 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 429 // dominates FC1 and FC1 post-dominates FC0. 430 // std::set was chosen because we want a sorted data structure with stable 431 // iterators. A subsequent patch to loop fusion will enable fusing non-adjacent 432 // loops by moving intervening code around. When this intervening code contains 433 // loops, those loops will be moved also. The corresponding FusionCandidates 434 // will also need to be moved accordingly. As this is done, having stable 435 // iterators will simplify the logic. Similarly, having an efficient insert that 436 // keeps the FusionCandidateSet sorted will also simplify the implementation. 437 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; 438 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; 439 440 #if !defined(NDEBUG) 441 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 442 const FusionCandidate &FC) { 443 if (FC.isValid()) 444 OS << FC.Preheader->getName(); 445 else 446 OS << "<Invalid>"; 447 448 return OS; 449 } 450 451 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 452 const FusionCandidateSet &CandSet) { 453 for (const FusionCandidate &FC : CandSet) 454 OS << FC << '\n'; 455 456 return OS; 457 } 458 459 static void 460 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { 461 dbgs() << "Fusion Candidates: \n"; 462 for (const auto &CandidateSet : FusionCandidates) { 463 dbgs() << "*** Fusion Candidate Set ***\n"; 464 dbgs() << CandidateSet; 465 dbgs() << "****************************\n"; 466 } 467 } 468 #endif 469 470 /// Collect all loops in function at the same nest level, starting at the 471 /// outermost level. 472 /// 473 /// This data structure collects all loops at the same nest level for a 474 /// given function (specified by the LoopInfo object). It starts at the 475 /// outermost level. 476 struct LoopDepthTree { 477 using LoopsOnLevelTy = SmallVector<LoopVector, 4>; 478 using iterator = LoopsOnLevelTy::iterator; 479 using const_iterator = LoopsOnLevelTy::const_iterator; 480 481 LoopDepthTree(LoopInfo &LI) : Depth(1) { 482 if (!LI.empty()) 483 LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend())); 484 } 485 486 /// Test whether a given loop has been removed from the function, and thus is 487 /// no longer valid. 488 bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); } 489 490 /// Record that a given loop has been removed from the function and is no 491 /// longer valid. 492 void removeLoop(const Loop *L) { RemovedLoops.insert(L); } 493 494 /// Descend the tree to the next (inner) nesting level 495 void descend() { 496 LoopsOnLevelTy LoopsOnNextLevel; 497 498 for (const LoopVector &LV : *this) 499 for (Loop *L : LV) 500 if (!isRemovedLoop(L) && L->begin() != L->end()) 501 LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end())); 502 503 LoopsOnLevel = LoopsOnNextLevel; 504 RemovedLoops.clear(); 505 Depth++; 506 } 507 508 bool empty() const { return size() == 0; } 509 size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } 510 unsigned getDepth() const { return Depth; } 511 512 iterator begin() { return LoopsOnLevel.begin(); } 513 iterator end() { return LoopsOnLevel.end(); } 514 const_iterator begin() const { return LoopsOnLevel.begin(); } 515 const_iterator end() const { return LoopsOnLevel.end(); } 516 517 private: 518 /// Set of loops that have been removed from the function and are no longer 519 /// valid. 520 SmallPtrSet<const Loop *, 8> RemovedLoops; 521 522 /// Depth of the current level, starting at 1 (outermost loops). 523 unsigned Depth; 524 525 /// Vector of loops at the current depth level that have the same parent loop 526 LoopsOnLevelTy LoopsOnLevel; 527 }; 528 529 #ifndef NDEBUG 530 static void printLoopVector(const LoopVector &LV) { 531 dbgs() << "****************************\n"; 532 for (auto *L : LV) 533 printLoop(*L, dbgs()); 534 dbgs() << "****************************\n"; 535 } 536 #endif 537 538 struct LoopFuser { 539 private: 540 // Sets of control flow equivalent fusion candidates for a given nest level. 541 FusionCandidateCollection FusionCandidates; 542 543 LoopDepthTree LDT; 544 DomTreeUpdater DTU; 545 546 LoopInfo &LI; 547 DominatorTree &DT; 548 DependenceInfo &DI; 549 ScalarEvolution &SE; 550 PostDominatorTree &PDT; 551 OptimizationRemarkEmitter &ORE; 552 AssumptionCache &AC; 553 const TargetTransformInfo &TTI; 554 555 public: 556 LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, 557 ScalarEvolution &SE, PostDominatorTree &PDT, 558 OptimizationRemarkEmitter &ORE, const DataLayout &DL, 559 AssumptionCache &AC, const TargetTransformInfo &TTI) 560 : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), 561 DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} 562 563 /// This is the main entry point for loop fusion. It will traverse the 564 /// specified function and collect candidate loops to fuse, starting at the 565 /// outermost nesting level and working inwards. 566 bool fuseLoops(Function &F) { 567 #ifndef NDEBUG 568 if (VerboseFusionDebugging) { 569 LI.print(dbgs()); 570 } 571 #endif 572 573 LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() 574 << "\n"); 575 bool Changed = false; 576 577 while (!LDT.empty()) { 578 LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " 579 << LDT.getDepth() << "\n";); 580 581 for (const LoopVector &LV : LDT) { 582 assert(LV.size() > 0 && "Empty loop set was build!"); 583 584 // Skip singleton loop sets as they do not offer fusion opportunities on 585 // this level. 586 if (LV.size() == 1) 587 continue; 588 #ifndef NDEBUG 589 if (VerboseFusionDebugging) { 590 LLVM_DEBUG({ 591 dbgs() << " Visit loop set (#" << LV.size() << "):\n"; 592 printLoopVector(LV); 593 }); 594 } 595 #endif 596 597 collectFusionCandidates(LV); 598 Changed |= fuseCandidates(); 599 } 600 601 // Finished analyzing candidates at this level. 602 // Descend to the next level and clear all of the candidates currently 603 // collected. Note that it will not be possible to fuse any of the 604 // existing candidates with new candidates because the new candidates will 605 // be at a different nest level and thus not be control flow equivalent 606 // with all of the candidates collected so far. 607 LLVM_DEBUG(dbgs() << "Descend one level!\n"); 608 LDT.descend(); 609 FusionCandidates.clear(); 610 } 611 612 if (Changed) 613 LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump();); 614 615 #ifndef NDEBUG 616 assert(DT.verify()); 617 assert(PDT.verify()); 618 LI.verify(DT); 619 SE.verify(); 620 #endif 621 622 LLVM_DEBUG(dbgs() << "Loop Fusion complete\n"); 623 return Changed; 624 } 625 626 private: 627 /// Determine if two fusion candidates are control flow equivalent. 628 /// 629 /// Two fusion candidates are control flow equivalent if when one executes, 630 /// the other is guaranteed to execute. This is determined using dominators 631 /// and post-dominators: if A dominates B and B post-dominates A then A and B 632 /// are control-flow equivalent. 633 bool isControlFlowEquivalent(const FusionCandidate &FC0, 634 const FusionCandidate &FC1) const { 635 assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); 636 637 return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), 638 DT, PDT); 639 } 640 641 /// Iterate over all loops in the given loop set and identify the loops that 642 /// are eligible for fusion. Place all eligible fusion candidates into Control 643 /// Flow Equivalent sets, sorted by dominance. 644 void collectFusionCandidates(const LoopVector &LV) { 645 for (Loop *L : LV) { 646 TTI::PeelingPreferences PP = 647 gatherPeelingPreferences(L, SE, TTI, None, None); 648 FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); 649 if (!CurrCand.isEligibleForFusion(SE)) 650 continue; 651 652 // Go through each list in FusionCandidates and determine if L is control 653 // flow equivalent with the first loop in that list. If it is, append LV. 654 // If not, go to the next list. 655 // If no suitable list is found, start another list and add it to 656 // FusionCandidates. 657 bool FoundSet = false; 658 659 for (auto &CurrCandSet : FusionCandidates) { 660 if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) { 661 CurrCandSet.insert(CurrCand); 662 FoundSet = true; 663 #ifndef NDEBUG 664 if (VerboseFusionDebugging) 665 LLVM_DEBUG(dbgs() << "Adding " << CurrCand 666 << " to existing candidate set\n"); 667 #endif 668 break; 669 } 670 } 671 if (!FoundSet) { 672 // No set was found. Create a new set and add to FusionCandidates 673 #ifndef NDEBUG 674 if (VerboseFusionDebugging) 675 LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n"); 676 #endif 677 FusionCandidateSet NewCandSet; 678 NewCandSet.insert(CurrCand); 679 FusionCandidates.push_back(NewCandSet); 680 } 681 NumFusionCandidates++; 682 } 683 } 684 685 /// Determine if it is beneficial to fuse two loops. 686 /// 687 /// For now, this method simply returns true because we want to fuse as much 688 /// as possible (primarily to test the pass). This method will evolve, over 689 /// time, to add heuristics for profitability of fusion. 690 bool isBeneficialFusion(const FusionCandidate &FC0, 691 const FusionCandidate &FC1) { 692 return true; 693 } 694 695 /// Determine if two fusion candidates have the same trip count (i.e., they 696 /// execute the same number of iterations). 697 /// 698 /// This function will return a pair of values. The first is a boolean, 699 /// stating whether or not the two candidates are known at compile time to 700 /// have the same TripCount. The second is the difference in the two 701 /// TripCounts. This information can be used later to determine whether or not 702 /// peeling can be performed on either one of the candidates. 703 std::pair<bool, Optional<unsigned>> 704 haveIdenticalTripCounts(const FusionCandidate &FC0, 705 const FusionCandidate &FC1) const { 706 const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); 707 if (isa<SCEVCouldNotCompute>(TripCount0)) { 708 UncomputableTripCount++; 709 LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); 710 return {false, None}; 711 } 712 713 const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); 714 if (isa<SCEVCouldNotCompute>(TripCount1)) { 715 UncomputableTripCount++; 716 LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); 717 return {false, None}; 718 } 719 720 LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " 721 << *TripCount1 << " are " 722 << (TripCount0 == TripCount1 ? "identical" : "different") 723 << "\n"); 724 725 if (TripCount0 == TripCount1) 726 return {true, 0}; 727 728 LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " 729 "determining the difference between trip counts\n"); 730 731 // Currently only considering loops with a single exit point 732 // and a non-constant trip count. 733 const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L); 734 const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L); 735 736 // If any of the tripcounts are zero that means that loop(s) do not have 737 // a single exit or a constant tripcount. 738 if (TC0 == 0 || TC1 == 0) { 739 LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " 740 "have a constant number of iterations. Peeling " 741 "is not benefical\n"); 742 return {false, None}; 743 } 744 745 Optional<unsigned> Difference; 746 int Diff = TC0 - TC1; 747 748 if (Diff > 0) 749 Difference = Diff; 750 else { 751 LLVM_DEBUG( 752 dbgs() << "Difference is less than 0. FC1 (second loop) has more " 753 "iterations than the first one. Currently not supported\n"); 754 } 755 756 LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference 757 << "\n"); 758 759 return {false, Difference}; 760 } 761 762 void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, 763 unsigned PeelCount) { 764 assert(FC0.AbleToPeel && "Should be able to peel loop"); 765 766 LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount 767 << " iterations of the first loop. \n"); 768 769 FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true); 770 if (FC0.Peeled) { 771 LLVM_DEBUG(dbgs() << "Done Peeling\n"); 772 773 #ifndef NDEBUG 774 auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); 775 776 assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && 777 "Loops should have identical trip counts after peeling"); 778 #endif 779 780 FC0.PP.PeelCount += PeelCount; 781 782 // Peeling does not update the PDT 783 PDT.recalculate(*FC0.Preheader->getParent()); 784 785 FC0.updateAfterPeeling(); 786 787 // In this case the iterations of the loop are constant, so the first 788 // loop will execute completely (will not jump from one of 789 // the peeled blocks to the second loop). Here we are updating the 790 // branch conditions of each of the peeled blocks, such that it will 791 // branch to its successor which is not the preheader of the second loop 792 // in the case of unguarded loops, or the succesors of the exit block of 793 // the first loop otherwise. Doing this update will ensure that the entry 794 // block of the first loop dominates the entry block of the second loop. 795 BasicBlock *BB = 796 FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; 797 if (BB) { 798 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 799 SmallVector<Instruction *, 8> WorkList; 800 for (BasicBlock *Pred : predecessors(BB)) { 801 if (Pred != FC0.ExitBlock) { 802 WorkList.emplace_back(Pred->getTerminator()); 803 TreeUpdates.emplace_back( 804 DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); 805 } 806 } 807 // Cannot modify the predecessors inside the above loop as it will cause 808 // the iterators to be nullptrs, causing memory errors. 809 for (Instruction *CurrentBranch : WorkList) { 810 BasicBlock *Succ = CurrentBranch->getSuccessor(0); 811 if (Succ == BB) 812 Succ = CurrentBranch->getSuccessor(1); 813 ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ)); 814 } 815 816 DTU.applyUpdates(TreeUpdates); 817 DTU.flush(); 818 } 819 LLVM_DEBUG( 820 dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount 821 << " iterations from the first loop.\n" 822 "Both Loops have the same number of iterations now.\n"); 823 } 824 } 825 826 /// Walk each set of control flow equivalent fusion candidates and attempt to 827 /// fuse them. This does a single linear traversal of all candidates in the 828 /// set. The conditions for legal fusion are checked at this point. If a pair 829 /// of fusion candidates passes all legality checks, they are fused together 830 /// and a new fusion candidate is created and added to the FusionCandidateSet. 831 /// The original fusion candidates are then removed, as they are no longer 832 /// valid. 833 bool fuseCandidates() { 834 bool Fused = false; 835 LLVM_DEBUG(printFusionCandidates(FusionCandidates)); 836 for (auto &CandidateSet : FusionCandidates) { 837 if (CandidateSet.size() < 2) 838 continue; 839 840 LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" 841 << CandidateSet << "\n"); 842 843 for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { 844 assert(!LDT.isRemovedLoop(FC0->L) && 845 "Should not have removed loops in CandidateSet!"); 846 auto FC1 = FC0; 847 for (++FC1; FC1 != CandidateSet.end(); ++FC1) { 848 assert(!LDT.isRemovedLoop(FC1->L) && 849 "Should not have removed loops in CandidateSet!"); 850 851 LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump(); 852 dbgs() << " with\n"; FC1->dump(); dbgs() << "\n"); 853 854 FC0->verify(); 855 FC1->verify(); 856 857 // Check if the candidates have identical tripcounts (first value of 858 // pair), and if not check the difference in the tripcounts between 859 // the loops (second value of pair). The difference is not equal to 860 // None iff the loops iterate a constant number of times, and have a 861 // single exit. 862 std::pair<bool, Optional<unsigned>> IdenticalTripCountRes = 863 haveIdenticalTripCounts(*FC0, *FC1); 864 bool SameTripCount = IdenticalTripCountRes.first; 865 Optional<unsigned> TCDifference = IdenticalTripCountRes.second; 866 867 // Here we are checking that FC0 (the first loop) can be peeled, and 868 // both loops have different tripcounts. 869 if (FC0->AbleToPeel && !SameTripCount && TCDifference) { 870 if (*TCDifference > FusionPeelMaxCount) { 871 LLVM_DEBUG(dbgs() 872 << "Difference in loop trip counts: " << *TCDifference 873 << " is greater than maximum peel count specificed: " 874 << FusionPeelMaxCount << "\n"); 875 } else { 876 // Dependent on peeling being performed on the first loop, and 877 // assuming all other conditions for fusion return true. 878 SameTripCount = true; 879 } 880 } 881 882 if (!SameTripCount) { 883 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " 884 "counts. Not fusing.\n"); 885 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 886 NonEqualTripCount); 887 continue; 888 } 889 890 if (!isAdjacent(*FC0, *FC1)) { 891 LLVM_DEBUG(dbgs() 892 << "Fusion candidates are not adjacent. Not fusing.\n"); 893 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); 894 continue; 895 } 896 897 if (!FC0->GuardBranch && FC1->GuardBranch) { 898 LLVM_DEBUG(dbgs() << "The second candidate is guarded while the " 899 "first one is not. Not fusing.\n"); 900 reportLoopFusion<OptimizationRemarkMissed>( 901 *FC0, *FC1, OnlySecondCandidateIsGuarded); 902 continue; 903 } 904 905 // Ensure that FC0 and FC1 have identical guards. 906 // If one (or both) are not guarded, this check is not necessary. 907 if (FC0->GuardBranch && FC1->GuardBranch && 908 !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) { 909 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " 910 "guards. Not Fusing.\n"); 911 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 912 NonIdenticalGuards); 913 continue; 914 } 915 916 if (FC0->GuardBranch) { 917 assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); 918 919 if (!isSafeToMoveBefore(*FC0->ExitBlock, 920 *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, 921 &PDT, &DI)) { 922 LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " 923 "instructions in exit block. Not fusing.\n"); 924 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 925 NonEmptyExitBlock); 926 continue; 927 } 928 929 if (!isSafeToMoveBefore( 930 *FC1->GuardBranch->getParent(), 931 *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT, 932 &DI)) { 933 LLVM_DEBUG(dbgs() 934 << "Fusion candidate contains unsafe " 935 "instructions in guard block. Not fusing.\n"); 936 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 937 NonEmptyGuardBlock); 938 continue; 939 } 940 } 941 942 // Check the dependencies across the loops and do not fuse if it would 943 // violate them. 944 if (!dependencesAllowFusion(*FC0, *FC1)) { 945 LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); 946 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 947 InvalidDependencies); 948 continue; 949 } 950 951 // If the second loop has instructions in the pre-header, attempt to 952 // hoist them up to the first loop's pre-header or sink them into the 953 // body of the second loop. 954 SmallVector<Instruction *, 4> SafeToHoist; 955 SmallVector<Instruction *, 4> SafeToSink; 956 // At this point, this is the last remaining legality check. 957 // Which means if we can make this pre-header empty, we can fuse 958 // these loops 959 if (!isEmptyPreheader(*FC1)) { 960 LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " 961 "preheader.\n"); 962 963 // If it is not safe to hoist/sink all instructions in the 964 // pre-header, we cannot fuse these loops. 965 if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist, 966 SafeToSink)) { 967 LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " 968 "Fusion Candidate Pre-header.\n" 969 << "Not Fusing.\n"); 970 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 971 NonEmptyPreheader); 972 continue; 973 } 974 } 975 976 bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); 977 LLVM_DEBUG(dbgs() 978 << "\tFusion appears to be " 979 << (BeneficialToFuse ? "" : "un") << "profitable!\n"); 980 if (!BeneficialToFuse) { 981 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 982 FusionNotBeneficial); 983 continue; 984 } 985 // All analysis has completed and has determined that fusion is legal 986 // and profitable. At this point, start transforming the code and 987 // perform fusion. 988 989 // Execute the hoist/sink operations on preheader instructions 990 movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink); 991 992 LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " 993 << *FC1 << "\n"); 994 995 FusionCandidate FC0Copy = *FC0; 996 // Peel the loop after determining that fusion is legal. The Loops 997 // will still be safe to fuse after the peeling is performed. 998 bool Peel = TCDifference && *TCDifference > 0; 999 if (Peel) 1000 peelFusionCandidate(FC0Copy, *FC1, *TCDifference); 1001 1002 // Report fusion to the Optimization Remarks. 1003 // Note this needs to be done *before* performFusion because 1004 // performFusion will change the original loops, making it not 1005 // possible to identify them after fusion is complete. 1006 reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1, 1007 FuseCounter); 1008 1009 FusionCandidate FusedCand( 1010 performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE, 1011 FC0Copy.PP); 1012 FusedCand.verify(); 1013 assert(FusedCand.isEligibleForFusion(SE) && 1014 "Fused candidate should be eligible for fusion!"); 1015 1016 // Notify the loop-depth-tree that these loops are not valid objects 1017 LDT.removeLoop(FC1->L); 1018 1019 CandidateSet.erase(FC0); 1020 CandidateSet.erase(FC1); 1021 1022 auto InsertPos = CandidateSet.insert(FusedCand); 1023 1024 assert(InsertPos.second && 1025 "Unable to insert TargetCandidate in CandidateSet!"); 1026 1027 // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations 1028 // of the FC1 loop will attempt to fuse the new (fused) loop with the 1029 // remaining candidates in the current candidate set. 1030 FC0 = FC1 = InsertPos.first; 1031 1032 LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet 1033 << "\n"); 1034 1035 Fused = true; 1036 } 1037 } 1038 } 1039 return Fused; 1040 } 1041 1042 // Returns true if the instruction \p I can be hoisted to the end of the 1043 // preheader of \p FC0. \p SafeToHoist contains the instructions that are 1044 // known to be safe to hoist. The instructions encountered that cannot be 1045 // hoisted are in \p NotHoisting. 1046 // TODO: Move functionality into CodeMoverUtils 1047 bool canHoistInst(Instruction &I, 1048 const SmallVector<Instruction *, 4> &SafeToHoist, 1049 const SmallVector<Instruction *, 4> &NotHoisting, 1050 const FusionCandidate &FC0) const { 1051 const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor(); 1052 assert(FC0PreheaderTarget && 1053 "Expected single successor for loop preheader."); 1054 1055 for (Use &Op : I.operands()) { 1056 if (auto *OpInst = dyn_cast<Instruction>(Op)) { 1057 bool OpHoisted = is_contained(SafeToHoist, OpInst); 1058 // Check if we have already decided to hoist this operand. In this 1059 // case, it does not dominate FC0 *yet*, but will after we hoist it. 1060 if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) { 1061 return false; 1062 } 1063 } 1064 } 1065 1066 // If this isn't a memory inst, hoisting is safe 1067 if (!I.mayReadOrWriteMemory()) 1068 return true; 1069 1070 LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n"); 1071 for (Instruction *NotHoistedInst : NotHoisting) { 1072 if (auto D = DI.depends(&I, NotHoistedInst, true)) { 1073 // Dependency is not read-before-write, write-before-read or 1074 // write-before-write 1075 if (D->isFlow() || D->isAnti() || D->isOutput()) { 1076 LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " 1077 "preheader that is not being hoisted.\n"); 1078 return false; 1079 } 1080 } 1081 } 1082 1083 for (Instruction *ReadInst : FC0.MemReads) { 1084 if (auto D = DI.depends(ReadInst, &I, true)) { 1085 // Dependency is not read-before-write 1086 if (D->isAnti()) { 1087 LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n"); 1088 return false; 1089 } 1090 } 1091 } 1092 1093 for (Instruction *WriteInst : FC0.MemWrites) { 1094 if (auto D = DI.depends(WriteInst, &I, true)) { 1095 // Dependency is not write-before-read or write-before-write 1096 if (D->isFlow() || D->isOutput()) { 1097 LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n"); 1098 return false; 1099 } 1100 } 1101 } 1102 return true; 1103 } 1104 1105 // Returns true if the instruction \p I can be sunk to the top of the exit 1106 // block of \p FC1. 1107 // TODO: Move functionality into CodeMoverUtils 1108 bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { 1109 for (User *U : I.users()) { 1110 if (auto *UI{dyn_cast<Instruction>(U)}) { 1111 // Cannot sink if user in loop 1112 // If FC1 has phi users of this value, we cannot sink it into FC1. 1113 if (FC1.L->contains(UI)) { 1114 // Cannot hoist or sink this instruction. No hoisting/sinking 1115 // should take place, loops should not fuse 1116 return false; 1117 } 1118 } 1119 } 1120 1121 // If this isn't a memory inst, sinking is safe 1122 if (!I.mayReadOrWriteMemory()) 1123 return true; 1124 1125 for (Instruction *ReadInst : FC1.MemReads) { 1126 if (auto D = DI.depends(&I, ReadInst, true)) { 1127 // Dependency is not write-before-read 1128 if (D->isFlow()) { 1129 LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n"); 1130 return false; 1131 } 1132 } 1133 } 1134 1135 for (Instruction *WriteInst : FC1.MemWrites) { 1136 if (auto D = DI.depends(&I, WriteInst, true)) { 1137 // Dependency is not write-before-write or read-before-write 1138 if (D->isOutput() || D->isAnti()) { 1139 LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n"); 1140 return false; 1141 } 1142 } 1143 } 1144 1145 return true; 1146 } 1147 1148 /// Collect instructions in the \p FC1 Preheader that can be hoisted 1149 /// to the \p FC0 Preheader or sunk into the \p FC1 Body 1150 bool collectMovablePreheaderInsts( 1151 const FusionCandidate &FC0, const FusionCandidate &FC1, 1152 SmallVector<Instruction *, 4> &SafeToHoist, 1153 SmallVector<Instruction *, 4> &SafeToSink) const { 1154 BasicBlock *FC1Preheader = FC1.Preheader; 1155 // Save the instructions that are not being hoisted, so we know not to hoist 1156 // mem insts that they dominate. 1157 SmallVector<Instruction *, 4> NotHoisting; 1158 1159 for (Instruction &I : *FC1Preheader) { 1160 // Can't move a branch 1161 if (&I == FC1Preheader->getTerminator()) 1162 continue; 1163 // If the instruction has side-effects, give up. 1164 // TODO: The case of mayReadFromMemory we can handle but requires 1165 // additional work with a dependence analysis so for now we give 1166 // up on memory reads. 1167 if (I.mayThrow() || !I.willReturn()) { 1168 LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n"); 1169 return false; 1170 } 1171 1172 LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n"); 1173 1174 if (I.isAtomic() || I.isVolatile()) { 1175 LLVM_DEBUG( 1176 dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n"); 1177 return false; 1178 } 1179 1180 if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { 1181 SafeToHoist.push_back(&I); 1182 LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n"); 1183 } else { 1184 LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n"); 1185 NotHoisting.push_back(&I); 1186 1187 if (canSinkInst(I, FC1)) { 1188 SafeToSink.push_back(&I); 1189 LLVM_DEBUG(dbgs() << "\tSafe to sink.\n"); 1190 } else { 1191 LLVM_DEBUG(dbgs() << "\tCould not sink.\n"); 1192 return false; 1193 } 1194 } 1195 } 1196 LLVM_DEBUG( 1197 dbgs() << "All preheader instructions could be sunk or hoisted!\n"); 1198 return true; 1199 } 1200 1201 /// Rewrite all additive recurrences in a SCEV to use a new loop. 1202 class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { 1203 public: 1204 AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, 1205 bool UseMax = true) 1206 : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), 1207 NewL(NewL) {} 1208 1209 const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { 1210 const Loop *ExprL = Expr->getLoop(); 1211 SmallVector<const SCEV *, 2> Operands; 1212 if (ExprL == &OldL) { 1213 Operands.append(Expr->op_begin(), Expr->op_end()); 1214 return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); 1215 } 1216 1217 if (OldL.contains(ExprL)) { 1218 bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); 1219 if (!UseMax || !Pos || !Expr->isAffine()) { 1220 Valid = false; 1221 return Expr; 1222 } 1223 return visit(Expr->getStart()); 1224 } 1225 1226 for (const SCEV *Op : Expr->operands()) 1227 Operands.push_back(visit(Op)); 1228 return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); 1229 } 1230 1231 bool wasValidSCEV() const { return Valid; } 1232 1233 private: 1234 bool Valid, UseMax; 1235 const Loop &OldL, &NewL; 1236 }; 1237 1238 /// Return false if the access functions of \p I0 and \p I1 could cause 1239 /// a negative dependence. 1240 bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, 1241 Instruction &I1, bool EqualIsInvalid) { 1242 Value *Ptr0 = getLoadStorePointerOperand(&I0); 1243 Value *Ptr1 = getLoadStorePointerOperand(&I1); 1244 if (!Ptr0 || !Ptr1) 1245 return false; 1246 1247 const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); 1248 const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); 1249 #ifndef NDEBUG 1250 if (VerboseFusionDebugging) 1251 LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " 1252 << *SCEVPtr1 << "\n"); 1253 #endif 1254 AddRecLoopReplacer Rewriter(SE, L0, L1); 1255 SCEVPtr0 = Rewriter.visit(SCEVPtr0); 1256 #ifndef NDEBUG 1257 if (VerboseFusionDebugging) 1258 LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 1259 << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); 1260 #endif 1261 if (!Rewriter.wasValidSCEV()) 1262 return false; 1263 1264 // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by 1265 // L0) and the other is not. We could check if it is monotone and test 1266 // the beginning and end value instead. 1267 1268 BasicBlock *L0Header = L0.getHeader(); 1269 auto HasNonLinearDominanceRelation = [&](const SCEV *S) { 1270 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S); 1271 if (!AddRec) 1272 return false; 1273 return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && 1274 !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); 1275 }; 1276 if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) 1277 return false; 1278 1279 ICmpInst::Predicate Pred = 1280 EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; 1281 bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); 1282 #ifndef NDEBUG 1283 if (VerboseFusionDebugging) 1284 LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 1285 << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 1286 << "\n"); 1287 #endif 1288 return IsAlwaysGE; 1289 } 1290 1291 /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in 1292 /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses 1293 /// specified by @p DepChoice are used to determine this. 1294 bool dependencesAllowFusion(const FusionCandidate &FC0, 1295 const FusionCandidate &FC1, Instruction &I0, 1296 Instruction &I1, bool AnyDep, 1297 FusionDependenceAnalysisChoice DepChoice) { 1298 #ifndef NDEBUG 1299 if (VerboseFusionDebugging) { 1300 LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " 1301 << DepChoice << "\n"); 1302 } 1303 #endif 1304 switch (DepChoice) { 1305 case FUSION_DEPENDENCE_ANALYSIS_SCEV: 1306 return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); 1307 case FUSION_DEPENDENCE_ANALYSIS_DA: { 1308 auto DepResult = DI.depends(&I0, &I1, true); 1309 if (!DepResult) 1310 return true; 1311 #ifndef NDEBUG 1312 if (VerboseFusionDebugging) { 1313 LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); 1314 dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " 1315 << (DepResult->isOrdered() ? "true" : "false") 1316 << "]\n"); 1317 LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() 1318 << "\n"); 1319 } 1320 #endif 1321 1322 if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) 1323 LLVM_DEBUG( 1324 dbgs() << "TODO: Implement pred/succ dependence handling!\n"); 1325 1326 // TODO: Can we actually use the dependence info analysis here? 1327 return false; 1328 } 1329 1330 case FUSION_DEPENDENCE_ANALYSIS_ALL: 1331 return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1332 FUSION_DEPENDENCE_ANALYSIS_SCEV) || 1333 dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1334 FUSION_DEPENDENCE_ANALYSIS_DA); 1335 } 1336 1337 llvm_unreachable("Unknown fusion dependence analysis choice!"); 1338 } 1339 1340 /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. 1341 bool dependencesAllowFusion(const FusionCandidate &FC0, 1342 const FusionCandidate &FC1) { 1343 LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 1344 << "\n"); 1345 assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); 1346 assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); 1347 1348 for (Instruction *WriteL0 : FC0.MemWrites) { 1349 for (Instruction *WriteL1 : FC1.MemWrites) 1350 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1351 /* AnyDep */ false, 1352 FusionDependenceAnalysis)) { 1353 InvalidDependencies++; 1354 return false; 1355 } 1356 for (Instruction *ReadL1 : FC1.MemReads) 1357 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, 1358 /* AnyDep */ false, 1359 FusionDependenceAnalysis)) { 1360 InvalidDependencies++; 1361 return false; 1362 } 1363 } 1364 1365 for (Instruction *WriteL1 : FC1.MemWrites) { 1366 for (Instruction *WriteL0 : FC0.MemWrites) 1367 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1368 /* AnyDep */ false, 1369 FusionDependenceAnalysis)) { 1370 InvalidDependencies++; 1371 return false; 1372 } 1373 for (Instruction *ReadL0 : FC0.MemReads) 1374 if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, 1375 /* AnyDep */ false, 1376 FusionDependenceAnalysis)) { 1377 InvalidDependencies++; 1378 return false; 1379 } 1380 } 1381 1382 // Walk through all uses in FC1. For each use, find the reaching def. If the 1383 // def is located in FC0 then it is is not safe to fuse. 1384 for (BasicBlock *BB : FC1.L->blocks()) 1385 for (Instruction &I : *BB) 1386 for (auto &Op : I.operands()) 1387 if (Instruction *Def = dyn_cast<Instruction>(Op)) 1388 if (FC0.L->contains(Def->getParent())) { 1389 InvalidDependencies++; 1390 return false; 1391 } 1392 1393 return true; 1394 } 1395 1396 /// Determine if two fusion candidates are adjacent in the CFG. 1397 /// 1398 /// This method will determine if there are additional basic blocks in the CFG 1399 /// between the exit of \p FC0 and the entry of \p FC1. 1400 /// If the two candidates are guarded loops, then it checks whether the 1401 /// non-loop successor of the \p FC0 guard branch is the entry block of \p 1402 /// FC1. If not, then the loops are not adjacent. If the two candidates are 1403 /// not guarded loops, then it checks whether the exit block of \p FC0 is the 1404 /// preheader of \p FC1. 1405 bool isAdjacent(const FusionCandidate &FC0, 1406 const FusionCandidate &FC1) const { 1407 // If the successor of the guard branch is FC1, then the loops are adjacent 1408 if (FC0.GuardBranch) 1409 return FC0.getNonLoopBlock() == FC1.getEntryBlock(); 1410 else 1411 return FC0.ExitBlock == FC1.getEntryBlock(); 1412 } 1413 1414 bool isEmptyPreheader(const FusionCandidate &FC) const { 1415 return FC.Preheader->size() == 1; 1416 } 1417 1418 /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader 1419 /// and sink others into the body of \p FC1. 1420 void movePreheaderInsts(const FusionCandidate &FC0, 1421 const FusionCandidate &FC1, 1422 SmallVector<Instruction *, 4> &HoistInsts, 1423 SmallVector<Instruction *, 4> &SinkInsts) const { 1424 // All preheader instructions except the branch must be hoisted or sunk 1425 assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && 1426 "Attempting to sink and hoist preheader instructions, but not all " 1427 "the preheader instructions are accounted for."); 1428 1429 NumHoistedInsts += HoistInsts.size(); 1430 NumSunkInsts += SinkInsts.size(); 1431 1432 LLVM_DEBUG(if (VerboseFusionDebugging) { 1433 if (!HoistInsts.empty()) 1434 dbgs() << "Hoisting: \n"; 1435 for (Instruction *I : HoistInsts) 1436 dbgs() << *I << "\n"; 1437 if (!SinkInsts.empty()) 1438 dbgs() << "Sinking: \n"; 1439 for (Instruction *I : SinkInsts) 1440 dbgs() << *I << "\n"; 1441 }); 1442 1443 for (Instruction *I : HoistInsts) { 1444 assert(I->getParent() == FC1.Preheader); 1445 I->moveBefore(FC0.Preheader->getTerminator()); 1446 } 1447 // insert instructions in reverse order to maintain dominance relationship 1448 for (Instruction *I : reverse(SinkInsts)) { 1449 assert(I->getParent() == FC1.Preheader); 1450 I->moveBefore(&*FC1.ExitBlock->getFirstInsertionPt()); 1451 } 1452 } 1453 1454 /// Determine if two fusion candidates have identical guards 1455 /// 1456 /// This method will determine if two fusion candidates have the same guards. 1457 /// The guards are considered the same if: 1458 /// 1. The instructions to compute the condition used in the compare are 1459 /// identical. 1460 /// 2. The successors of the guard have the same flow into/around the loop. 1461 /// If the compare instructions are identical, then the first successor of the 1462 /// guard must go to the same place (either the preheader of the loop or the 1463 /// NonLoopBlock). In other words, the the first successor of both loops must 1464 /// both go into the loop (i.e., the preheader) or go around the loop (i.e., 1465 /// the NonLoopBlock). The same must be true for the second successor. 1466 bool haveIdenticalGuards(const FusionCandidate &FC0, 1467 const FusionCandidate &FC1) const { 1468 assert(FC0.GuardBranch && FC1.GuardBranch && 1469 "Expecting FC0 and FC1 to be guarded loops."); 1470 1471 if (auto FC0CmpInst = 1472 dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) 1473 if (auto FC1CmpInst = 1474 dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) 1475 if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) 1476 return false; 1477 1478 // The compare instructions are identical. 1479 // Now make sure the successor of the guards have the same flow into/around 1480 // the loop 1481 if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) 1482 return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); 1483 else 1484 return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); 1485 } 1486 1487 /// Modify the latch branch of FC to be unconditional since successors of the 1488 /// branch are the same. 1489 void simplifyLatchBranch(const FusionCandidate &FC) const { 1490 BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); 1491 if (FCLatchBranch) { 1492 assert(FCLatchBranch->isConditional() && 1493 FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && 1494 "Expecting the two successors of FCLatchBranch to be the same"); 1495 BranchInst *NewBranch = 1496 BranchInst::Create(FCLatchBranch->getSuccessor(0)); 1497 ReplaceInstWithInst(FCLatchBranch, NewBranch); 1498 } 1499 } 1500 1501 /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique 1502 /// successor, then merge FC0.Latch with its unique successor. 1503 void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1504 moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI); 1505 if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { 1506 MergeBlockIntoPredecessor(Succ, &DTU, &LI); 1507 DTU.flush(); 1508 } 1509 } 1510 1511 /// Fuse two fusion candidates, creating a new fused loop. 1512 /// 1513 /// This method contains the mechanics of fusing two loops, represented by \p 1514 /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 1515 /// postdominates \p FC0 (making them control flow equivalent). It also 1516 /// assumes that the other conditions for fusion have been met: adjacent, 1517 /// identical trip counts, and no negative distance dependencies exist that 1518 /// would prevent fusion. Thus, there is no checking for these conditions in 1519 /// this method. 1520 /// 1521 /// Fusion is performed by rewiring the CFG to update successor blocks of the 1522 /// components of tho loop. Specifically, the following changes are done: 1523 /// 1524 /// 1. The preheader of \p FC1 is removed as it is no longer necessary 1525 /// (because it is currently only a single statement block). 1526 /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. 1527 /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. 1528 /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. 1529 /// 1530 /// All of these modifications are done with dominator tree updates, thus 1531 /// keeping the dominator (and post dominator) information up-to-date. 1532 /// 1533 /// This can be improved in the future by actually merging blocks during 1534 /// fusion. For example, the preheader of \p FC1 can be merged with the 1535 /// preheader of \p FC0. This would allow loops with more than a single 1536 /// statement in the preheader to be fused. Similarly, the latch blocks of the 1537 /// two loops could also be fused into a single block. This will require 1538 /// analysis to prove it is safe to move the contents of the block past 1539 /// existing code, which currently has not been implemented. 1540 Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1541 assert(FC0.isValid() && FC1.isValid() && 1542 "Expecting valid fusion candidates"); 1543 1544 LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); 1545 dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); 1546 1547 // Move instructions from the preheader of FC1 to the end of the preheader 1548 // of FC0. 1549 moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI); 1550 1551 // Fusing guarded loops is handled slightly differently than non-guarded 1552 // loops and has been broken out into a separate method instead of trying to 1553 // intersperse the logic within a single method. 1554 if (FC0.GuardBranch) 1555 return fuseGuardedLoops(FC0, FC1); 1556 1557 assert(FC1.Preheader == 1558 (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); 1559 assert(FC1.Preheader->size() == 1 && 1560 FC1.Preheader->getSingleSuccessor() == FC1.Header); 1561 1562 // Remember the phi nodes originally in the header of FC0 in order to rewire 1563 // them later. However, this is only necessary if the new loop carried 1564 // values might not dominate the exiting branch. While we do not generally 1565 // test if this is the case but simply insert intermediate phi nodes, we 1566 // need to make sure these intermediate phi nodes have different 1567 // predecessors. To this end, we filter the special case where the exiting 1568 // block is the latch block of the first loop. Nothing needs to be done 1569 // anyway as all loop carried values dominate the latch and thereby also the 1570 // exiting branch. 1571 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1572 if (FC0.ExitingBlock != FC0.Latch) 1573 for (PHINode &PHI : FC0.Header->phis()) 1574 OriginalFC0PHIs.push_back(&PHI); 1575 1576 // Replace incoming blocks for header PHIs first. 1577 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1578 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1579 1580 // Then modify the control flow and update DT and PDT. 1581 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1582 1583 // The old exiting block of the first loop (FC0) has to jump to the header 1584 // of the second as we need to execute the code in the second header block 1585 // regardless of the trip count. That is, if the trip count is 0, so the 1586 // back edge is never taken, we still have to execute both loop headers, 1587 // especially (but not only!) if the second is a do-while style loop. 1588 // However, doing so might invalidate the phi nodes of the first loop as 1589 // the new values do only need to dominate their latch and not the exiting 1590 // predicate. To remedy this potential problem we always introduce phi 1591 // nodes in the header of the second loop later that select the loop carried 1592 // value, if the second header was reached through an old latch of the 1593 // first, or undef otherwise. This is sound as exiting the first implies the 1594 // second will exit too, __without__ taking the back-edge. [Their 1595 // trip-counts are equal after all. 1596 // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go 1597 // to FC1.Header? I think this is basically what the three sequences are 1598 // trying to accomplish; however, doing this directly in the CFG may mean 1599 // the DT/PDT becomes invalid 1600 if (!FC0.Peeled) { 1601 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, 1602 FC1.Header); 1603 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1604 DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); 1605 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1606 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1607 } else { 1608 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1609 DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); 1610 1611 // Remove the ExitBlock of the first Loop (also not needed) 1612 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1613 FC1.Header); 1614 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1615 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1616 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1617 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1618 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1619 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1620 } 1621 1622 // The pre-header of L1 is not necessary anymore. 1623 assert(pred_empty(FC1.Preheader)); 1624 FC1.Preheader->getTerminator()->eraseFromParent(); 1625 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1626 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1627 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1628 1629 // Moves the phi nodes from the second to the first loops header block. 1630 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1631 if (SE.isSCEVable(PHI->getType())) 1632 SE.forgetValue(PHI); 1633 if (PHI->hasNUsesOrMore(1)) 1634 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1635 else 1636 PHI->eraseFromParent(); 1637 } 1638 1639 // Introduce new phi nodes in the second loop header to ensure 1640 // exiting the first and jumping to the header of the second does not break 1641 // the SSA property of the phis originally in the first loop. See also the 1642 // comment above. 1643 Instruction *L1HeaderIP = &FC1.Header->front(); 1644 for (PHINode *LCPHI : OriginalFC0PHIs) { 1645 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1646 assert(L1LatchBBIdx >= 0 && 1647 "Expected loop carried value to be rewired at this point!"); 1648 1649 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1650 1651 PHINode *L1HeaderPHI = PHINode::Create( 1652 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1653 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1654 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1655 FC0.ExitingBlock); 1656 1657 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1658 } 1659 1660 // Replace latch terminator destinations. 1661 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1662 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1663 1664 // Modify the latch branch of FC0 to be unconditional as both successors of 1665 // the branch are the same. 1666 simplifyLatchBranch(FC0); 1667 1668 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1669 // performed the updates above. 1670 if (FC0.Latch != FC0.ExitingBlock) 1671 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1672 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1673 1674 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1675 FC0.Latch, FC0.Header)); 1676 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1677 FC1.Latch, FC0.Header)); 1678 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1679 FC1.Latch, FC1.Header)); 1680 1681 // Update DT/PDT 1682 DTU.applyUpdates(TreeUpdates); 1683 1684 LI.removeBlock(FC1.Preheader); 1685 DTU.deleteBB(FC1.Preheader); 1686 if (FC0.Peeled) { 1687 LI.removeBlock(FC0.ExitBlock); 1688 DTU.deleteBB(FC0.ExitBlock); 1689 } 1690 1691 DTU.flush(); 1692 1693 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1694 // and rebuild the information in subsequent passes of fusion? 1695 // Note: Need to forget the loops before merging the loop latches, as 1696 // mergeLatch may remove the only block in FC1. 1697 SE.forgetLoop(FC1.L); 1698 SE.forgetLoop(FC0.L); 1699 SE.forgetLoopDispositions(); 1700 1701 // Move instructions from FC0.Latch to FC1.Latch. 1702 // Note: mergeLatch requires an updated DT. 1703 mergeLatch(FC0, FC1); 1704 1705 // Merge the loops. 1706 SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 1707 for (BasicBlock *BB : Blocks) { 1708 FC0.L->addBlockEntry(BB); 1709 FC1.L->removeBlockFromLoop(BB); 1710 if (LI.getLoopFor(BB) != FC1.L) 1711 continue; 1712 LI.changeLoopFor(BB, FC0.L); 1713 } 1714 while (!FC1.L->isInnermost()) { 1715 const auto &ChildLoopIt = FC1.L->begin(); 1716 Loop *ChildLoop = *ChildLoopIt; 1717 FC1.L->removeChildLoop(ChildLoopIt); 1718 FC0.L->addChildLoop(ChildLoop); 1719 } 1720 1721 // Delete the now empty loop L1. 1722 LI.erase(FC1.L); 1723 1724 #ifndef NDEBUG 1725 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 1726 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1727 assert(PDT.verify()); 1728 LI.verify(DT); 1729 SE.verify(); 1730 #endif 1731 1732 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 1733 1734 return FC0.L; 1735 } 1736 1737 /// Report details on loop fusion opportunities. 1738 /// 1739 /// This template function can be used to report both successful and missed 1740 /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should 1741 /// be one of: 1742 /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful 1743 /// given two valid fusion candidates. 1744 /// - OptimizationRemark to report successful fusion of two fusion 1745 /// candidates. 1746 /// The remarks will be printed using the form: 1747 /// <path/filename>:<line number>:<column number>: [<function name>]: 1748 /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> 1749 template <typename RemarkKind> 1750 void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, 1751 llvm::Statistic &Stat) { 1752 assert(FC0.Preheader && FC1.Preheader && 1753 "Expecting valid fusion candidates"); 1754 using namespace ore; 1755 #if LLVM_ENABLE_STATS 1756 ++Stat; 1757 ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), 1758 FC0.Preheader) 1759 << "[" << FC0.Preheader->getParent()->getName() 1760 << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) 1761 << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) 1762 << ": " << Stat.getDesc()); 1763 #endif 1764 } 1765 1766 /// Fuse two guarded fusion candidates, creating a new fused loop. 1767 /// 1768 /// Fusing guarded loops is handled much the same way as fusing non-guarded 1769 /// loops. The rewiring of the CFG is slightly different though, because of 1770 /// the presence of the guards around the loops and the exit blocks after the 1771 /// loop body. As such, the new loop is rewired as follows: 1772 /// 1. Keep the guard branch from FC0 and use the non-loop block target 1773 /// from the FC1 guard branch. 1774 /// 2. Remove the exit block from FC0 (this exit block should be empty 1775 /// right now). 1776 /// 3. Remove the guard branch for FC1 1777 /// 4. Remove the preheader for FC1. 1778 /// The exit block successor for the latch of FC0 is updated to be the header 1779 /// of FC1 and the non-exit block successor of the latch of FC1 is updated to 1780 /// be the header of FC0, thus creating the fused loop. 1781 Loop *fuseGuardedLoops(const FusionCandidate &FC0, 1782 const FusionCandidate &FC1) { 1783 assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); 1784 1785 BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); 1786 BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); 1787 BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); 1788 BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); 1789 BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); 1790 1791 // Move instructions from the exit block of FC0 to the beginning of the exit 1792 // block of FC1, in the case that the FC0 loop has not been peeled. In the 1793 // case that FC0 loop is peeled, then move the instructions of the successor 1794 // of the FC0 Exit block to the beginning of the exit block of FC1. 1795 moveInstructionsToTheBeginning( 1796 (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock, 1797 DT, PDT, DI); 1798 1799 // Move instructions from the guard block of FC1 to the end of the guard 1800 // block of FC0. 1801 moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI); 1802 1803 assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); 1804 1805 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1806 1807 //////////////////////////////////////////////////////////////////////////// 1808 // Update the Loop Guard 1809 //////////////////////////////////////////////////////////////////////////// 1810 // The guard for FC0 is updated to guard both FC0 and FC1. This is done by 1811 // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. 1812 // Thus, one path from the guard goes to the preheader for FC0 (and thus 1813 // executes the new fused loop) and the other path goes to the NonLoopBlock 1814 // for FC1 (where FC1 guard would have gone if FC1 was not executed). 1815 FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock); 1816 FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); 1817 1818 BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; 1819 BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header); 1820 1821 // The guard of FC1 is not necessary anymore. 1822 FC1.GuardBranch->eraseFromParent(); 1823 new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); 1824 1825 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1826 DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); 1827 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1828 DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); 1829 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1830 DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); 1831 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1832 DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); 1833 1834 if (FC0.Peeled) { 1835 // Remove the Block after the ExitBlock of FC0 1836 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1837 DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); 1838 FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); 1839 new UnreachableInst(FC0ExitBlockSuccessor->getContext(), 1840 FC0ExitBlockSuccessor); 1841 } 1842 1843 assert(pred_empty(FC1GuardBlock) && 1844 "Expecting guard block to have no predecessors"); 1845 assert(succ_empty(FC1GuardBlock) && 1846 "Expecting guard block to have no successors"); 1847 1848 // Remember the phi nodes originally in the header of FC0 in order to rewire 1849 // them later. However, this is only necessary if the new loop carried 1850 // values might not dominate the exiting branch. While we do not generally 1851 // test if this is the case but simply insert intermediate phi nodes, we 1852 // need to make sure these intermediate phi nodes have different 1853 // predecessors. To this end, we filter the special case where the exiting 1854 // block is the latch block of the first loop. Nothing needs to be done 1855 // anyway as all loop carried values dominate the latch and thereby also the 1856 // exiting branch. 1857 // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch 1858 // (because the loops are rotated. Thus, nothing will ever be added to 1859 // OriginalFC0PHIs. 1860 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1861 if (FC0.ExitingBlock != FC0.Latch) 1862 for (PHINode &PHI : FC0.Header->phis()) 1863 OriginalFC0PHIs.push_back(&PHI); 1864 1865 assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); 1866 1867 // Replace incoming blocks for header PHIs first. 1868 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1869 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1870 1871 // The old exiting block of the first loop (FC0) has to jump to the header 1872 // of the second as we need to execute the code in the second header block 1873 // regardless of the trip count. That is, if the trip count is 0, so the 1874 // back edge is never taken, we still have to execute both loop headers, 1875 // especially (but not only!) if the second is a do-while style loop. 1876 // However, doing so might invalidate the phi nodes of the first loop as 1877 // the new values do only need to dominate their latch and not the exiting 1878 // predicate. To remedy this potential problem we always introduce phi 1879 // nodes in the header of the second loop later that select the loop carried 1880 // value, if the second header was reached through an old latch of the 1881 // first, or undef otherwise. This is sound as exiting the first implies the 1882 // second will exit too, __without__ taking the back-edge (their 1883 // trip-counts are equal after all). 1884 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1885 FC1.Header); 1886 1887 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1888 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1889 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1890 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1891 1892 // Remove FC0 Exit Block 1893 // The exit block for FC0 is no longer needed since control will flow 1894 // directly to the header of FC1. Since it is an empty block, it can be 1895 // removed at this point. 1896 // TODO: In the future, we can handle non-empty exit blocks my merging any 1897 // instructions from FC0 exit block into FC1 exit block prior to removing 1898 // the block. 1899 assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty"); 1900 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1901 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1902 1903 // Remove FC1 Preheader 1904 // The pre-header of L1 is not necessary anymore. 1905 assert(pred_empty(FC1.Preheader)); 1906 FC1.Preheader->getTerminator()->eraseFromParent(); 1907 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1908 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1909 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1910 1911 // Moves the phi nodes from the second to the first loops header block. 1912 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1913 if (SE.isSCEVable(PHI->getType())) 1914 SE.forgetValue(PHI); 1915 if (PHI->hasNUsesOrMore(1)) 1916 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1917 else 1918 PHI->eraseFromParent(); 1919 } 1920 1921 // Introduce new phi nodes in the second loop header to ensure 1922 // exiting the first and jumping to the header of the second does not break 1923 // the SSA property of the phis originally in the first loop. See also the 1924 // comment above. 1925 Instruction *L1HeaderIP = &FC1.Header->front(); 1926 for (PHINode *LCPHI : OriginalFC0PHIs) { 1927 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1928 assert(L1LatchBBIdx >= 0 && 1929 "Expected loop carried value to be rewired at this point!"); 1930 1931 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1932 1933 PHINode *L1HeaderPHI = PHINode::Create( 1934 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1935 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1936 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1937 FC0.ExitingBlock); 1938 1939 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1940 } 1941 1942 // Update the latches 1943 1944 // Replace latch terminator destinations. 1945 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1946 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1947 1948 // Modify the latch branch of FC0 to be unconditional as both successors of 1949 // the branch are the same. 1950 simplifyLatchBranch(FC0); 1951 1952 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1953 // performed the updates above. 1954 if (FC0.Latch != FC0.ExitingBlock) 1955 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1956 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1957 1958 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1959 FC0.Latch, FC0.Header)); 1960 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1961 FC1.Latch, FC0.Header)); 1962 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1963 FC1.Latch, FC1.Header)); 1964 1965 // All done 1966 // Apply the updates to the Dominator Tree and cleanup. 1967 1968 assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!"); 1969 assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!"); 1970 1971 // Update DT/PDT 1972 DTU.applyUpdates(TreeUpdates); 1973 1974 LI.removeBlock(FC1GuardBlock); 1975 LI.removeBlock(FC1.Preheader); 1976 LI.removeBlock(FC0.ExitBlock); 1977 if (FC0.Peeled) { 1978 LI.removeBlock(FC0ExitBlockSuccessor); 1979 DTU.deleteBB(FC0ExitBlockSuccessor); 1980 } 1981 DTU.deleteBB(FC1GuardBlock); 1982 DTU.deleteBB(FC1.Preheader); 1983 DTU.deleteBB(FC0.ExitBlock); 1984 DTU.flush(); 1985 1986 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1987 // and rebuild the information in subsequent passes of fusion? 1988 // Note: Need to forget the loops before merging the loop latches, as 1989 // mergeLatch may remove the only block in FC1. 1990 SE.forgetLoop(FC1.L); 1991 SE.forgetLoop(FC0.L); 1992 SE.forgetLoopDispositions(); 1993 1994 // Move instructions from FC0.Latch to FC1.Latch. 1995 // Note: mergeLatch requires an updated DT. 1996 mergeLatch(FC0, FC1); 1997 1998 // Merge the loops. 1999 SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 2000 for (BasicBlock *BB : Blocks) { 2001 FC0.L->addBlockEntry(BB); 2002 FC1.L->removeBlockFromLoop(BB); 2003 if (LI.getLoopFor(BB) != FC1.L) 2004 continue; 2005 LI.changeLoopFor(BB, FC0.L); 2006 } 2007 while (!FC1.L->isInnermost()) { 2008 const auto &ChildLoopIt = FC1.L->begin(); 2009 Loop *ChildLoop = *ChildLoopIt; 2010 FC1.L->removeChildLoop(ChildLoopIt); 2011 FC0.L->addChildLoop(ChildLoop); 2012 } 2013 2014 // Delete the now empty loop L1. 2015 LI.erase(FC1.L); 2016 2017 #ifndef NDEBUG 2018 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 2019 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 2020 assert(PDT.verify()); 2021 LI.verify(DT); 2022 SE.verify(); 2023 #endif 2024 2025 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 2026 2027 return FC0.L; 2028 } 2029 }; 2030 2031 struct LoopFuseLegacy : public FunctionPass { 2032 2033 static char ID; 2034 2035 LoopFuseLegacy() : FunctionPass(ID) { 2036 initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry()); 2037 } 2038 2039 void getAnalysisUsage(AnalysisUsage &AU) const override { 2040 AU.addRequiredID(LoopSimplifyID); 2041 AU.addRequired<ScalarEvolutionWrapperPass>(); 2042 AU.addRequired<LoopInfoWrapperPass>(); 2043 AU.addRequired<DominatorTreeWrapperPass>(); 2044 AU.addRequired<PostDominatorTreeWrapperPass>(); 2045 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2046 AU.addRequired<DependenceAnalysisWrapperPass>(); 2047 AU.addRequired<AssumptionCacheTracker>(); 2048 AU.addRequired<TargetTransformInfoWrapperPass>(); 2049 2050 AU.addPreserved<ScalarEvolutionWrapperPass>(); 2051 AU.addPreserved<LoopInfoWrapperPass>(); 2052 AU.addPreserved<DominatorTreeWrapperPass>(); 2053 AU.addPreserved<PostDominatorTreeWrapperPass>(); 2054 } 2055 2056 bool runOnFunction(Function &F) override { 2057 if (skipFunction(F)) 2058 return false; 2059 2060 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2061 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2062 auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); 2063 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2064 auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); 2065 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2066 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2067 const TargetTransformInfo &TTI = 2068 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2069 const DataLayout &DL = F.getParent()->getDataLayout(); 2070 2071 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 2072 return LF.fuseLoops(F); 2073 } 2074 }; 2075 } // namespace 2076 2077 PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { 2078 auto &LI = AM.getResult<LoopAnalysis>(F); 2079 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 2080 auto &DI = AM.getResult<DependenceAnalysis>(F); 2081 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 2082 auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); 2083 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 2084 auto &AC = AM.getResult<AssumptionAnalysis>(F); 2085 const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); 2086 const DataLayout &DL = F.getParent()->getDataLayout(); 2087 2088 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 2089 bool Changed = LF.fuseLoops(F); 2090 if (!Changed) 2091 return PreservedAnalyses::all(); 2092 2093 PreservedAnalyses PA; 2094 PA.preserve<DominatorTreeAnalysis>(); 2095 PA.preserve<PostDominatorTreeAnalysis>(); 2096 PA.preserve<ScalarEvolutionAnalysis>(); 2097 PA.preserve<LoopAnalysis>(); 2098 return PA; 2099 } 2100 2101 char LoopFuseLegacy::ID = 0; 2102 2103 INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, 2104 false) 2105 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) 2106 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 2107 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 2108 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) 2109 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 2110 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 2111 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 2112 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 2113 INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) 2114 2115 FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); } 2116