10b57cec5SDimitry Andric //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric /// 90b57cec5SDimitry Andric /// \file 100b57cec5SDimitry Andric /// This file implements the loop fusion pass. 110b57cec5SDimitry Andric /// The implementation is largely based on the following document: 120b57cec5SDimitry Andric /// 130b57cec5SDimitry Andric /// Code Transformations to Augment the Scope of Loop Fusion in a 140b57cec5SDimitry Andric /// Production Compiler 150b57cec5SDimitry Andric /// Christopher Mark Barton 160b57cec5SDimitry Andric /// MSc Thesis 170b57cec5SDimitry Andric /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf 180b57cec5SDimitry Andric /// 190b57cec5SDimitry Andric /// The general approach taken is to collect sets of control flow equivalent 200b57cec5SDimitry Andric /// loops and test whether they can be fused. The necessary conditions for 210b57cec5SDimitry Andric /// fusion are: 220b57cec5SDimitry Andric /// 1. The loops must be adjacent (there cannot be any statements between 230b57cec5SDimitry Andric /// the two loops). 240b57cec5SDimitry Andric /// 2. The loops must be conforming (they must execute the same number of 250b57cec5SDimitry Andric /// iterations). 260b57cec5SDimitry Andric /// 3. The loops must be control flow equivalent (if one loop executes, the 270b57cec5SDimitry Andric /// other is guaranteed to execute). 280b57cec5SDimitry Andric /// 4. There cannot be any negative distance dependencies between the loops. 290b57cec5SDimitry Andric /// If all of these conditions are satisfied, it is safe to fuse the loops. 300b57cec5SDimitry Andric /// 310b57cec5SDimitry Andric /// This implementation creates FusionCandidates that represent the loop and the 320b57cec5SDimitry Andric /// necessary information needed by fusion. It then operates on the fusion 330b57cec5SDimitry Andric /// candidates, first confirming that the candidate is eligible for fusion. The 340b57cec5SDimitry Andric /// candidates are then collected into control flow equivalent sets, sorted in 350b57cec5SDimitry Andric /// dominance order. Each set of control flow equivalent candidates is then 360b57cec5SDimitry Andric /// traversed, attempting to fuse pairs of candidates in the set. If all 370b57cec5SDimitry Andric /// requirements for fusion are met, the two candidates are fused, creating a 380b57cec5SDimitry Andric /// new (fused) candidate which is then added back into the set to consider for 390b57cec5SDimitry Andric /// additional fusion. 400b57cec5SDimitry Andric /// 410b57cec5SDimitry Andric /// This implementation currently does not make any modifications to remove 420b57cec5SDimitry Andric /// conditions for fusion. Code transformations to make loops conform to each of 430b57cec5SDimitry Andric /// the conditions for fusion are discussed in more detail in the document 440b57cec5SDimitry Andric /// above. These can be added to the current implementation in the future. 450b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 460b57cec5SDimitry Andric 470b57cec5SDimitry Andric #include "llvm/Transforms/Scalar/LoopFuse.h" 480b57cec5SDimitry Andric #include "llvm/ADT/Statistic.h" 49e8d8bef9SDimitry Andric #include "llvm/Analysis/AssumptionCache.h" 500b57cec5SDimitry Andric #include "llvm/Analysis/DependenceAnalysis.h" 510b57cec5SDimitry Andric #include "llvm/Analysis/DomTreeUpdater.h" 520b57cec5SDimitry Andric #include "llvm/Analysis/LoopInfo.h" 530b57cec5SDimitry Andric #include "llvm/Analysis/OptimizationRemarkEmitter.h" 540b57cec5SDimitry Andric #include "llvm/Analysis/PostDominators.h" 550b57cec5SDimitry Andric #include "llvm/Analysis/ScalarEvolution.h" 560b57cec5SDimitry Andric #include "llvm/Analysis/ScalarEvolutionExpressions.h" 57e8d8bef9SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h" 580b57cec5SDimitry Andric #include "llvm/IR/Function.h" 590b57cec5SDimitry Andric #include "llvm/IR/Verifier.h" 60480093f4SDimitry Andric #include "llvm/Support/CommandLine.h" 610b57cec5SDimitry Andric #include "llvm/Support/Debug.h" 620b57cec5SDimitry Andric #include "llvm/Support/raw_ostream.h" 630b57cec5SDimitry Andric #include "llvm/Transforms/Utils.h" 640b57cec5SDimitry Andric #include "llvm/Transforms/Utils/BasicBlockUtils.h" 65480093f4SDimitry Andric #include "llvm/Transforms/Utils/CodeMoverUtils.h" 66e8d8bef9SDimitry Andric #include "llvm/Transforms/Utils/LoopPeel.h" 67bdd1243dSDimitry Andric #include "llvm/Transforms/Utils/LoopSimplify.h" 680b57cec5SDimitry Andric 690b57cec5SDimitry Andric using namespace llvm; 700b57cec5SDimitry Andric 710b57cec5SDimitry Andric #define DEBUG_TYPE "loop-fusion" 720b57cec5SDimitry Andric 738bcb0991SDimitry Andric STATISTIC(FuseCounter, "Loops fused"); 740b57cec5SDimitry Andric STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); 750b57cec5SDimitry Andric STATISTIC(InvalidPreheader, "Loop has invalid preheader"); 760b57cec5SDimitry Andric STATISTIC(InvalidHeader, "Loop has invalid header"); 770b57cec5SDimitry Andric STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks"); 780b57cec5SDimitry Andric STATISTIC(InvalidExitBlock, "Loop has invalid exit block"); 790b57cec5SDimitry Andric STATISTIC(InvalidLatch, "Loop has invalid latch"); 800b57cec5SDimitry Andric STATISTIC(InvalidLoop, "Loop is invalid"); 810b57cec5SDimitry Andric STATISTIC(AddressTakenBB, "Basic block has address taken"); 820b57cec5SDimitry Andric STATISTIC(MayThrowException, "Loop may throw an exception"); 830b57cec5SDimitry Andric STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); 840b57cec5SDimitry Andric STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); 850b57cec5SDimitry Andric STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); 868bcb0991SDimitry Andric STATISTIC(UnknownTripCount, "Loop has unknown trip count"); 870b57cec5SDimitry Andric STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); 888bcb0991SDimitry Andric STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); 898bcb0991SDimitry Andric STATISTIC(NonAdjacent, "Loops are not adjacent"); 905ffd83dbSDimitry Andric STATISTIC( 915ffd83dbSDimitry Andric NonEmptyPreheader, 925ffd83dbSDimitry Andric "Loop has a non-empty preheader with instructions that cannot be moved"); 938bcb0991SDimitry Andric STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); 948bcb0991SDimitry Andric STATISTIC(NonIdenticalGuards, "Candidates have different guards"); 955ffd83dbSDimitry Andric STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " 965ffd83dbSDimitry Andric "instructions that cannot be moved"); 975ffd83dbSDimitry Andric STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " 985ffd83dbSDimitry Andric "instructions that cannot be moved"); 99480093f4SDimitry Andric STATISTIC(NotRotated, "Candidate is not rotated"); 100fe6060f1SDimitry Andric STATISTIC(OnlySecondCandidateIsGuarded, 101fe6060f1SDimitry Andric "The second candidate is guarded while the first one is not"); 102bdd1243dSDimitry Andric STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); 103bdd1243dSDimitry Andric STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); 1040b57cec5SDimitry Andric 1050b57cec5SDimitry Andric enum FusionDependenceAnalysisChoice { 1060b57cec5SDimitry Andric FUSION_DEPENDENCE_ANALYSIS_SCEV, 1070b57cec5SDimitry Andric FUSION_DEPENDENCE_ANALYSIS_DA, 1080b57cec5SDimitry Andric FUSION_DEPENDENCE_ANALYSIS_ALL, 1090b57cec5SDimitry Andric }; 1100b57cec5SDimitry Andric 1110b57cec5SDimitry Andric static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( 1120b57cec5SDimitry Andric "loop-fusion-dependence-analysis", 1130b57cec5SDimitry Andric cl::desc("Which dependence analysis should loop fusion use?"), 1140b57cec5SDimitry Andric cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", 1150b57cec5SDimitry Andric "Use the scalar evolution interface"), 1160b57cec5SDimitry Andric clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", 1170b57cec5SDimitry Andric "Use the dependence analysis interface"), 1180b57cec5SDimitry Andric clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", 1190b57cec5SDimitry Andric "Use all available analyses")), 12081ad6265SDimitry Andric cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL)); 1210b57cec5SDimitry Andric 122e8d8bef9SDimitry Andric static cl::opt<unsigned> FusionPeelMaxCount( 123e8d8bef9SDimitry Andric "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, 124e8d8bef9SDimitry Andric cl::desc("Max number of iterations to be peeled from a loop, such that " 125e8d8bef9SDimitry Andric "fusion can take place")); 126e8d8bef9SDimitry Andric 1270b57cec5SDimitry Andric #ifndef NDEBUG 1280b57cec5SDimitry Andric static cl::opt<bool> 1290b57cec5SDimitry Andric VerboseFusionDebugging("loop-fusion-verbose-debug", 1300b57cec5SDimitry Andric cl::desc("Enable verbose debugging for Loop Fusion"), 13181ad6265SDimitry Andric cl::Hidden, cl::init(false)); 1320b57cec5SDimitry Andric #endif 1330b57cec5SDimitry Andric 1348bcb0991SDimitry Andric namespace { 1350b57cec5SDimitry Andric /// This class is used to represent a candidate for loop fusion. When it is 1360b57cec5SDimitry Andric /// constructed, it checks the conditions for loop fusion to ensure that it 1370b57cec5SDimitry Andric /// represents a valid candidate. It caches several parts of a loop that are 1380b57cec5SDimitry Andric /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead 1390b57cec5SDimitry Andric /// of continually querying the underlying Loop to retrieve these values. It is 1400b57cec5SDimitry Andric /// assumed these will not change throughout loop fusion. 1410b57cec5SDimitry Andric /// 1420b57cec5SDimitry Andric /// The invalidate method should be used to indicate that the FusionCandidate is 1430b57cec5SDimitry Andric /// no longer a valid candidate for fusion. Similarly, the isValid() method can 1440b57cec5SDimitry Andric /// be used to ensure that the FusionCandidate is still valid for fusion. 1450b57cec5SDimitry Andric struct FusionCandidate { 1460b57cec5SDimitry Andric /// Cache of parts of the loop used throughout loop fusion. These should not 1470b57cec5SDimitry Andric /// need to change throughout the analysis and transformation. 1480b57cec5SDimitry Andric /// These parts are cached to avoid repeatedly looking up in the Loop class. 1490b57cec5SDimitry Andric 1500b57cec5SDimitry Andric /// Preheader of the loop this candidate represents 1510b57cec5SDimitry Andric BasicBlock *Preheader; 1520b57cec5SDimitry Andric /// Header of the loop this candidate represents 1530b57cec5SDimitry Andric BasicBlock *Header; 1540b57cec5SDimitry Andric /// Blocks in the loop that exit the loop 1550b57cec5SDimitry Andric BasicBlock *ExitingBlock; 1560b57cec5SDimitry Andric /// The successor block of this loop (where the exiting blocks go to) 1570b57cec5SDimitry Andric BasicBlock *ExitBlock; 1580b57cec5SDimitry Andric /// Latch of the loop 1590b57cec5SDimitry Andric BasicBlock *Latch; 1600b57cec5SDimitry Andric /// The loop that this fusion candidate represents 1610b57cec5SDimitry Andric Loop *L; 1620b57cec5SDimitry Andric /// Vector of instructions in this loop that read from memory 1630b57cec5SDimitry Andric SmallVector<Instruction *, 16> MemReads; 1640b57cec5SDimitry Andric /// Vector of instructions in this loop that write to memory 1650b57cec5SDimitry Andric SmallVector<Instruction *, 16> MemWrites; 1660b57cec5SDimitry Andric /// Are all of the members of this fusion candidate still valid 1670b57cec5SDimitry Andric bool Valid; 1688bcb0991SDimitry Andric /// Guard branch of the loop, if it exists 1698bcb0991SDimitry Andric BranchInst *GuardBranch; 170e8d8bef9SDimitry Andric /// Peeling Paramaters of the Loop. 171e8d8bef9SDimitry Andric TTI::PeelingPreferences PP; 172e8d8bef9SDimitry Andric /// Can you Peel this Loop? 173e8d8bef9SDimitry Andric bool AbleToPeel; 174e8d8bef9SDimitry Andric /// Has this loop been Peeled 175e8d8bef9SDimitry Andric bool Peeled; 1760b57cec5SDimitry Andric 1770b57cec5SDimitry Andric /// Dominator and PostDominator trees are needed for the 1780b57cec5SDimitry Andric /// FusionCandidateCompare function, required by FusionCandidateSet to 1790b57cec5SDimitry Andric /// determine where the FusionCandidate should be inserted into the set. These 1800b57cec5SDimitry Andric /// are used to establish ordering of the FusionCandidates based on dominance. 18181ad6265SDimitry Andric DominatorTree &DT; 1820b57cec5SDimitry Andric const PostDominatorTree *PDT; 1830b57cec5SDimitry Andric 1848bcb0991SDimitry Andric OptimizationRemarkEmitter &ORE; 1858bcb0991SDimitry Andric 186bdd1243dSDimitry Andric FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, 187bdd1243dSDimitry Andric OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) 1880b57cec5SDimitry Andric : Preheader(L->getLoopPreheader()), Header(L->getHeader()), 1890b57cec5SDimitry Andric ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), 190480093f4SDimitry Andric Latch(L->getLoopLatch()), L(L), Valid(true), 191e8d8bef9SDimitry Andric GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), 192e8d8bef9SDimitry Andric Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { 1930b57cec5SDimitry Andric 1940b57cec5SDimitry Andric // Walk over all blocks in the loop and check for conditions that may 1950b57cec5SDimitry Andric // prevent fusion. For each block, walk over all instructions and collect 1960b57cec5SDimitry Andric // the memory reads and writes If any instructions that prevent fusion are 1970b57cec5SDimitry Andric // found, invalidate this object and return. 1980b57cec5SDimitry Andric for (BasicBlock *BB : L->blocks()) { 1990b57cec5SDimitry Andric if (BB->hasAddressTaken()) { 2000b57cec5SDimitry Andric invalidate(); 2018bcb0991SDimitry Andric reportInvalidCandidate(AddressTakenBB); 2020b57cec5SDimitry Andric return; 2030b57cec5SDimitry Andric } 2040b57cec5SDimitry Andric 2050b57cec5SDimitry Andric for (Instruction &I : *BB) { 2060b57cec5SDimitry Andric if (I.mayThrow()) { 2070b57cec5SDimitry Andric invalidate(); 2088bcb0991SDimitry Andric reportInvalidCandidate(MayThrowException); 2090b57cec5SDimitry Andric return; 2100b57cec5SDimitry Andric } 2110b57cec5SDimitry Andric if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { 2120b57cec5SDimitry Andric if (SI->isVolatile()) { 2130b57cec5SDimitry Andric invalidate(); 2148bcb0991SDimitry Andric reportInvalidCandidate(ContainsVolatileAccess); 2150b57cec5SDimitry Andric return; 2160b57cec5SDimitry Andric } 2170b57cec5SDimitry Andric } 2180b57cec5SDimitry Andric if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 2190b57cec5SDimitry Andric if (LI->isVolatile()) { 2200b57cec5SDimitry Andric invalidate(); 2218bcb0991SDimitry Andric reportInvalidCandidate(ContainsVolatileAccess); 2220b57cec5SDimitry Andric return; 2230b57cec5SDimitry Andric } 2240b57cec5SDimitry Andric } 2250b57cec5SDimitry Andric if (I.mayWriteToMemory()) 2260b57cec5SDimitry Andric MemWrites.push_back(&I); 2270b57cec5SDimitry Andric if (I.mayReadFromMemory()) 2280b57cec5SDimitry Andric MemReads.push_back(&I); 2290b57cec5SDimitry Andric } 2300b57cec5SDimitry Andric } 2310b57cec5SDimitry Andric } 2320b57cec5SDimitry Andric 2330b57cec5SDimitry Andric /// Check if all members of the class are valid. 2340b57cec5SDimitry Andric bool isValid() const { 2350b57cec5SDimitry Andric return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && 2360b57cec5SDimitry Andric !L->isInvalid() && Valid; 2370b57cec5SDimitry Andric } 2380b57cec5SDimitry Andric 2390b57cec5SDimitry Andric /// Verify that all members are in sync with the Loop object. 2400b57cec5SDimitry Andric void verify() const { 2410b57cec5SDimitry Andric assert(isValid() && "Candidate is not valid!!"); 2420b57cec5SDimitry Andric assert(!L->isInvalid() && "Loop is invalid!"); 2430b57cec5SDimitry Andric assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync"); 2440b57cec5SDimitry Andric assert(Header == L->getHeader() && "Header is out of sync"); 2450b57cec5SDimitry Andric assert(ExitingBlock == L->getExitingBlock() && 2460b57cec5SDimitry Andric "Exiting Blocks is out of sync"); 2470b57cec5SDimitry Andric assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync"); 2480b57cec5SDimitry Andric assert(Latch == L->getLoopLatch() && "Latch is out of sync"); 2490b57cec5SDimitry Andric } 2500b57cec5SDimitry Andric 2518bcb0991SDimitry Andric /// Get the entry block for this fusion candidate. 2528bcb0991SDimitry Andric /// 2538bcb0991SDimitry Andric /// If this fusion candidate represents a guarded loop, the entry block is the 2548bcb0991SDimitry Andric /// loop guard block. If it represents an unguarded loop, the entry block is 2558bcb0991SDimitry Andric /// the preheader of the loop. 2568bcb0991SDimitry Andric BasicBlock *getEntryBlock() const { 2578bcb0991SDimitry Andric if (GuardBranch) 2588bcb0991SDimitry Andric return GuardBranch->getParent(); 2598bcb0991SDimitry Andric else 2608bcb0991SDimitry Andric return Preheader; 2618bcb0991SDimitry Andric } 2628bcb0991SDimitry Andric 263e8d8bef9SDimitry Andric /// After Peeling the loop is modified quite a bit, hence all of the Blocks 264e8d8bef9SDimitry Andric /// need to be updated accordingly. 265e8d8bef9SDimitry Andric void updateAfterPeeling() { 266e8d8bef9SDimitry Andric Preheader = L->getLoopPreheader(); 267e8d8bef9SDimitry Andric Header = L->getHeader(); 268e8d8bef9SDimitry Andric ExitingBlock = L->getExitingBlock(); 269e8d8bef9SDimitry Andric ExitBlock = L->getExitBlock(); 270e8d8bef9SDimitry Andric Latch = L->getLoopLatch(); 271e8d8bef9SDimitry Andric verify(); 272e8d8bef9SDimitry Andric } 273e8d8bef9SDimitry Andric 2748bcb0991SDimitry Andric /// Given a guarded loop, get the successor of the guard that is not in the 2758bcb0991SDimitry Andric /// loop. 2768bcb0991SDimitry Andric /// 2778bcb0991SDimitry Andric /// This method returns the successor of the loop guard that is not located 2788bcb0991SDimitry Andric /// within the loop (i.e., the successor of the guard that is not the 2798bcb0991SDimitry Andric /// preheader). 2808bcb0991SDimitry Andric /// This method is only valid for guarded loops. 2818bcb0991SDimitry Andric BasicBlock *getNonLoopBlock() const { 2828bcb0991SDimitry Andric assert(GuardBranch && "Only valid on guarded loops."); 2838bcb0991SDimitry Andric assert(GuardBranch->isConditional() && 2848bcb0991SDimitry Andric "Expecting guard to be a conditional branch."); 285e8d8bef9SDimitry Andric if (Peeled) 286e8d8bef9SDimitry Andric return GuardBranch->getSuccessor(1); 2878bcb0991SDimitry Andric return (GuardBranch->getSuccessor(0) == Preheader) 2888bcb0991SDimitry Andric ? GuardBranch->getSuccessor(1) 2898bcb0991SDimitry Andric : GuardBranch->getSuccessor(0); 2908bcb0991SDimitry Andric } 2918bcb0991SDimitry Andric 2920b57cec5SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2930b57cec5SDimitry Andric LLVM_DUMP_METHOD void dump() const { 294480093f4SDimitry Andric dbgs() << "\tGuardBranch: "; 295480093f4SDimitry Andric if (GuardBranch) 296480093f4SDimitry Andric dbgs() << *GuardBranch; 297480093f4SDimitry Andric else 298480093f4SDimitry Andric dbgs() << "nullptr"; 299480093f4SDimitry Andric dbgs() << "\n" 3008bcb0991SDimitry Andric << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" 3018bcb0991SDimitry Andric << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") 3020b57cec5SDimitry Andric << "\n" 3030b57cec5SDimitry Andric << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" 3040b57cec5SDimitry Andric << "\tExitingBB: " 3050b57cec5SDimitry Andric << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" 3060b57cec5SDimitry Andric << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") 3070b57cec5SDimitry Andric << "\n" 3088bcb0991SDimitry Andric << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" 3098bcb0991SDimitry Andric << "\tEntryBlock: " 3108bcb0991SDimitry Andric << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") 3118bcb0991SDimitry Andric << "\n"; 3120b57cec5SDimitry Andric } 3130b57cec5SDimitry Andric #endif 3140b57cec5SDimitry Andric 3158bcb0991SDimitry Andric /// Determine if a fusion candidate (representing a loop) is eligible for 3168bcb0991SDimitry Andric /// fusion. Note that this only checks whether a single loop can be fused - it 3178bcb0991SDimitry Andric /// does not check whether it is *legal* to fuse two loops together. 3188bcb0991SDimitry Andric bool isEligibleForFusion(ScalarEvolution &SE) const { 3198bcb0991SDimitry Andric if (!isValid()) { 3208bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); 3218bcb0991SDimitry Andric if (!Preheader) 3228bcb0991SDimitry Andric ++InvalidPreheader; 3238bcb0991SDimitry Andric if (!Header) 3248bcb0991SDimitry Andric ++InvalidHeader; 3258bcb0991SDimitry Andric if (!ExitingBlock) 3268bcb0991SDimitry Andric ++InvalidExitingBlock; 3278bcb0991SDimitry Andric if (!ExitBlock) 3288bcb0991SDimitry Andric ++InvalidExitBlock; 3298bcb0991SDimitry Andric if (!Latch) 3308bcb0991SDimitry Andric ++InvalidLatch; 3318bcb0991SDimitry Andric if (L->isInvalid()) 3328bcb0991SDimitry Andric ++InvalidLoop; 3338bcb0991SDimitry Andric 3348bcb0991SDimitry Andric return false; 3358bcb0991SDimitry Andric } 3368bcb0991SDimitry Andric 3378bcb0991SDimitry Andric // Require ScalarEvolution to be able to determine a trip count. 3388bcb0991SDimitry Andric if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 3398bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "Loop " << L->getName() 3408bcb0991SDimitry Andric << " trip count not computable!\n"); 3418bcb0991SDimitry Andric return reportInvalidCandidate(UnknownTripCount); 3428bcb0991SDimitry Andric } 3438bcb0991SDimitry Andric 3448bcb0991SDimitry Andric if (!L->isLoopSimplifyForm()) { 3458bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "Loop " << L->getName() 3468bcb0991SDimitry Andric << " is not in simplified form!\n"); 3478bcb0991SDimitry Andric return reportInvalidCandidate(NotSimplifiedForm); 3488bcb0991SDimitry Andric } 3498bcb0991SDimitry Andric 350480093f4SDimitry Andric if (!L->isRotatedForm()) { 351480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); 352480093f4SDimitry Andric return reportInvalidCandidate(NotRotated); 353480093f4SDimitry Andric } 354480093f4SDimitry Andric 3558bcb0991SDimitry Andric return true; 3568bcb0991SDimitry Andric } 3578bcb0991SDimitry Andric 3580b57cec5SDimitry Andric private: 3590b57cec5SDimitry Andric // This is only used internally for now, to clear the MemWrites and MemReads 3600b57cec5SDimitry Andric // list and setting Valid to false. I can't envision other uses of this right 3610b57cec5SDimitry Andric // now, since once FusionCandidates are put into the FusionCandidateSet they 3620b57cec5SDimitry Andric // are immutable. Thus, any time we need to change/update a FusionCandidate, 3630b57cec5SDimitry Andric // we must create a new one and insert it into the FusionCandidateSet to 3640b57cec5SDimitry Andric // ensure the FusionCandidateSet remains ordered correctly. 3650b57cec5SDimitry Andric void invalidate() { 3660b57cec5SDimitry Andric MemWrites.clear(); 3670b57cec5SDimitry Andric MemReads.clear(); 3680b57cec5SDimitry Andric Valid = false; 3690b57cec5SDimitry Andric } 3700b57cec5SDimitry Andric 3718bcb0991SDimitry Andric bool reportInvalidCandidate(llvm::Statistic &Stat) const { 3728bcb0991SDimitry Andric using namespace ore; 3738bcb0991SDimitry Andric assert(L && Preheader && "Fusion candidate not initialized properly!"); 374fe6060f1SDimitry Andric #if LLVM_ENABLE_STATS 3758bcb0991SDimitry Andric ++Stat; 3768bcb0991SDimitry Andric ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), 3778bcb0991SDimitry Andric L->getStartLoc(), Preheader) 3788bcb0991SDimitry Andric << "[" << Preheader->getParent()->getName() << "]: " 3798bcb0991SDimitry Andric << "Loop is not a candidate for fusion: " << Stat.getDesc()); 380fe6060f1SDimitry Andric #endif 3818bcb0991SDimitry Andric return false; 3820b57cec5SDimitry Andric } 3838bcb0991SDimitry Andric }; 3840b57cec5SDimitry Andric 3850b57cec5SDimitry Andric struct FusionCandidateCompare { 3860b57cec5SDimitry Andric /// Comparison functor to sort two Control Flow Equivalent fusion candidates 3870b57cec5SDimitry Andric /// into dominance order. 3880b57cec5SDimitry Andric /// If LHS dominates RHS and RHS post-dominates LHS, return true; 389bdd1243dSDimitry Andric /// If RHS dominates LHS and LHS post-dominates RHS, return false; 390bdd1243dSDimitry Andric /// If both LHS and RHS are not dominating each other then, non-strictly 391bdd1243dSDimitry Andric /// post dominate check will decide the order of candidates. If RHS 392bdd1243dSDimitry Andric /// non-strictly post dominates LHS then, return true. If LHS non-strictly 393bdd1243dSDimitry Andric /// post dominates RHS then, return false. If both are non-strictly post 394bdd1243dSDimitry Andric /// dominate each other then, level in the post dominator tree will decide 395bdd1243dSDimitry Andric /// the order of candidates. 3960b57cec5SDimitry Andric bool operator()(const FusionCandidate &LHS, 3970b57cec5SDimitry Andric const FusionCandidate &RHS) const { 39881ad6265SDimitry Andric const DominatorTree *DT = &(LHS.DT); 3990b57cec5SDimitry Andric 4008bcb0991SDimitry Andric BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); 4018bcb0991SDimitry Andric BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); 4028bcb0991SDimitry Andric 4030b57cec5SDimitry Andric // Do not save PDT to local variable as it is only used in asserts and thus 4040b57cec5SDimitry Andric // will trigger an unused variable warning if building without asserts. 4050b57cec5SDimitry Andric assert(DT && LHS.PDT && "Expecting valid dominator tree"); 4060b57cec5SDimitry Andric 4070b57cec5SDimitry Andric // Do this compare first so if LHS == RHS, function returns false. 4088bcb0991SDimitry Andric if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { 4090b57cec5SDimitry Andric // RHS dominates LHS 4100b57cec5SDimitry Andric // Verify LHS post-dominates RHS 4118bcb0991SDimitry Andric assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); 4120b57cec5SDimitry Andric return false; 4130b57cec5SDimitry Andric } 4140b57cec5SDimitry Andric 4158bcb0991SDimitry Andric if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { 4160b57cec5SDimitry Andric // Verify RHS Postdominates LHS 4178bcb0991SDimitry Andric assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); 4180b57cec5SDimitry Andric return true; 4190b57cec5SDimitry Andric } 4200b57cec5SDimitry Andric 421bdd1243dSDimitry Andric // If two FusionCandidates are in the same level of dominator tree, 422bdd1243dSDimitry Andric // they will not dominate each other, but may still be control flow 423bdd1243dSDimitry Andric // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate() 424bdd1243dSDimitry Andric // function is needed. 425bdd1243dSDimitry Andric bool WrongOrder = 426bdd1243dSDimitry Andric nonStrictlyPostDominate(LHSEntryBlock, RHSEntryBlock, DT, LHS.PDT); 427bdd1243dSDimitry Andric bool RightOrder = 428bdd1243dSDimitry Andric nonStrictlyPostDominate(RHSEntryBlock, LHSEntryBlock, DT, LHS.PDT); 429bdd1243dSDimitry Andric if (WrongOrder && RightOrder) { 430bdd1243dSDimitry Andric // If common predecessor of LHS and RHS post dominates both 431bdd1243dSDimitry Andric // FusionCandidates then, Order of FusionCandidate can be 432bdd1243dSDimitry Andric // identified by its level in post dominator tree. 433bdd1243dSDimitry Andric DomTreeNode *LNode = LHS.PDT->getNode(LHSEntryBlock); 434bdd1243dSDimitry Andric DomTreeNode *RNode = LHS.PDT->getNode(RHSEntryBlock); 435bdd1243dSDimitry Andric return LNode->getLevel() > RNode->getLevel(); 436bdd1243dSDimitry Andric } else if (WrongOrder) 437bdd1243dSDimitry Andric return false; 438bdd1243dSDimitry Andric else if (RightOrder) 439bdd1243dSDimitry Andric return true; 440bdd1243dSDimitry Andric 441bdd1243dSDimitry Andric // If LHS does not non-strict Postdominate RHS and RHS does not non-strict 442bdd1243dSDimitry Andric // Postdominate LHS then, there is no dominance relationship between the 443bdd1243dSDimitry Andric // two FusionCandidates. Thus, they should not be in the same set together. 4440b57cec5SDimitry Andric llvm_unreachable( 4450b57cec5SDimitry Andric "No dominance relationship between these fusion candidates!"); 4460b57cec5SDimitry Andric } 4470b57cec5SDimitry Andric }; 4480b57cec5SDimitry Andric 4490b57cec5SDimitry Andric using LoopVector = SmallVector<Loop *, 4>; 4500b57cec5SDimitry Andric 4510b57cec5SDimitry Andric // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance 4520b57cec5SDimitry Andric // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 4530b57cec5SDimitry Andric // dominates FC1 and FC1 post-dominates FC0. 4540b57cec5SDimitry Andric // std::set was chosen because we want a sorted data structure with stable 455bdd1243dSDimitry Andric // iterators. A subsequent patch to loop fusion will enable fusing non-adjacent 4560b57cec5SDimitry Andric // loops by moving intervening code around. When this intervening code contains 4570b57cec5SDimitry Andric // loops, those loops will be moved also. The corresponding FusionCandidates 4580b57cec5SDimitry Andric // will also need to be moved accordingly. As this is done, having stable 4590b57cec5SDimitry Andric // iterators will simplify the logic. Similarly, having an efficient insert that 4600b57cec5SDimitry Andric // keeps the FusionCandidateSet sorted will also simplify the implementation. 4610b57cec5SDimitry Andric using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; 4620b57cec5SDimitry Andric using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; 4630b57cec5SDimitry Andric 4648bcb0991SDimitry Andric #if !defined(NDEBUG) 4658bcb0991SDimitry Andric static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 4668bcb0991SDimitry Andric const FusionCandidate &FC) { 4678bcb0991SDimitry Andric if (FC.isValid()) 4688bcb0991SDimitry Andric OS << FC.Preheader->getName(); 4698bcb0991SDimitry Andric else 4708bcb0991SDimitry Andric OS << "<Invalid>"; 4710b57cec5SDimitry Andric 4720b57cec5SDimitry Andric return OS; 4730b57cec5SDimitry Andric } 4740b57cec5SDimitry Andric 4758bcb0991SDimitry Andric static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 4768bcb0991SDimitry Andric const FusionCandidateSet &CandSet) { 4778bcb0991SDimitry Andric for (const FusionCandidate &FC : CandSet) 4788bcb0991SDimitry Andric OS << FC << '\n'; 4798bcb0991SDimitry Andric 4808bcb0991SDimitry Andric return OS; 4818bcb0991SDimitry Andric } 4828bcb0991SDimitry Andric 4830b57cec5SDimitry Andric static void 4840b57cec5SDimitry Andric printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { 4850b57cec5SDimitry Andric dbgs() << "Fusion Candidates: \n"; 4860b57cec5SDimitry Andric for (const auto &CandidateSet : FusionCandidates) { 4870b57cec5SDimitry Andric dbgs() << "*** Fusion Candidate Set ***\n"; 4880b57cec5SDimitry Andric dbgs() << CandidateSet; 4890b57cec5SDimitry Andric dbgs() << "****************************\n"; 4900b57cec5SDimitry Andric } 4910b57cec5SDimitry Andric } 4920b57cec5SDimitry Andric #endif 4930b57cec5SDimitry Andric 4940b57cec5SDimitry Andric /// Collect all loops in function at the same nest level, starting at the 4950b57cec5SDimitry Andric /// outermost level. 4960b57cec5SDimitry Andric /// 4970b57cec5SDimitry Andric /// This data structure collects all loops at the same nest level for a 4980b57cec5SDimitry Andric /// given function (specified by the LoopInfo object). It starts at the 4990b57cec5SDimitry Andric /// outermost level. 5000b57cec5SDimitry Andric struct LoopDepthTree { 5010b57cec5SDimitry Andric using LoopsOnLevelTy = SmallVector<LoopVector, 4>; 5020b57cec5SDimitry Andric using iterator = LoopsOnLevelTy::iterator; 5030b57cec5SDimitry Andric using const_iterator = LoopsOnLevelTy::const_iterator; 5040b57cec5SDimitry Andric 5050b57cec5SDimitry Andric LoopDepthTree(LoopInfo &LI) : Depth(1) { 5060b57cec5SDimitry Andric if (!LI.empty()) 5070b57cec5SDimitry Andric LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend())); 5080b57cec5SDimitry Andric } 5090b57cec5SDimitry Andric 5100b57cec5SDimitry Andric /// Test whether a given loop has been removed from the function, and thus is 5110b57cec5SDimitry Andric /// no longer valid. 5120b57cec5SDimitry Andric bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); } 5130b57cec5SDimitry Andric 5140b57cec5SDimitry Andric /// Record that a given loop has been removed from the function and is no 5150b57cec5SDimitry Andric /// longer valid. 5160b57cec5SDimitry Andric void removeLoop(const Loop *L) { RemovedLoops.insert(L); } 5170b57cec5SDimitry Andric 5180b57cec5SDimitry Andric /// Descend the tree to the next (inner) nesting level 5190b57cec5SDimitry Andric void descend() { 5200b57cec5SDimitry Andric LoopsOnLevelTy LoopsOnNextLevel; 5210b57cec5SDimitry Andric 5220b57cec5SDimitry Andric for (const LoopVector &LV : *this) 5230b57cec5SDimitry Andric for (Loop *L : LV) 5240b57cec5SDimitry Andric if (!isRemovedLoop(L) && L->begin() != L->end()) 5250b57cec5SDimitry Andric LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end())); 5260b57cec5SDimitry Andric 5270b57cec5SDimitry Andric LoopsOnLevel = LoopsOnNextLevel; 5280b57cec5SDimitry Andric RemovedLoops.clear(); 5290b57cec5SDimitry Andric Depth++; 5300b57cec5SDimitry Andric } 5310b57cec5SDimitry Andric 5320b57cec5SDimitry Andric bool empty() const { return size() == 0; } 5330b57cec5SDimitry Andric size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } 5340b57cec5SDimitry Andric unsigned getDepth() const { return Depth; } 5350b57cec5SDimitry Andric 5360b57cec5SDimitry Andric iterator begin() { return LoopsOnLevel.begin(); } 5370b57cec5SDimitry Andric iterator end() { return LoopsOnLevel.end(); } 5380b57cec5SDimitry Andric const_iterator begin() const { return LoopsOnLevel.begin(); } 5390b57cec5SDimitry Andric const_iterator end() const { return LoopsOnLevel.end(); } 5400b57cec5SDimitry Andric 5410b57cec5SDimitry Andric private: 5420b57cec5SDimitry Andric /// Set of loops that have been removed from the function and are no longer 5430b57cec5SDimitry Andric /// valid. 5440b57cec5SDimitry Andric SmallPtrSet<const Loop *, 8> RemovedLoops; 5450b57cec5SDimitry Andric 5460b57cec5SDimitry Andric /// Depth of the current level, starting at 1 (outermost loops). 5470b57cec5SDimitry Andric unsigned Depth; 5480b57cec5SDimitry Andric 5490b57cec5SDimitry Andric /// Vector of loops at the current depth level that have the same parent loop 5500b57cec5SDimitry Andric LoopsOnLevelTy LoopsOnLevel; 5510b57cec5SDimitry Andric }; 5520b57cec5SDimitry Andric 5530b57cec5SDimitry Andric #ifndef NDEBUG 5540b57cec5SDimitry Andric static void printLoopVector(const LoopVector &LV) { 5550b57cec5SDimitry Andric dbgs() << "****************************\n"; 556bdd1243dSDimitry Andric for (auto *L : LV) 5570b57cec5SDimitry Andric printLoop(*L, dbgs()); 5580b57cec5SDimitry Andric dbgs() << "****************************\n"; 5590b57cec5SDimitry Andric } 5600b57cec5SDimitry Andric #endif 5610b57cec5SDimitry Andric 5620b57cec5SDimitry Andric struct LoopFuser { 5630b57cec5SDimitry Andric private: 5640b57cec5SDimitry Andric // Sets of control flow equivalent fusion candidates for a given nest level. 5650b57cec5SDimitry Andric FusionCandidateCollection FusionCandidates; 5660b57cec5SDimitry Andric 5670b57cec5SDimitry Andric LoopDepthTree LDT; 5680b57cec5SDimitry Andric DomTreeUpdater DTU; 5690b57cec5SDimitry Andric 5700b57cec5SDimitry Andric LoopInfo &LI; 5710b57cec5SDimitry Andric DominatorTree &DT; 5720b57cec5SDimitry Andric DependenceInfo &DI; 5730b57cec5SDimitry Andric ScalarEvolution &SE; 5740b57cec5SDimitry Andric PostDominatorTree &PDT; 5750b57cec5SDimitry Andric OptimizationRemarkEmitter &ORE; 576e8d8bef9SDimitry Andric AssumptionCache &AC; 577e8d8bef9SDimitry Andric const TargetTransformInfo &TTI; 5780b57cec5SDimitry Andric 5790b57cec5SDimitry Andric public: 5800b57cec5SDimitry Andric LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, 5810b57cec5SDimitry Andric ScalarEvolution &SE, PostDominatorTree &PDT, 582e8d8bef9SDimitry Andric OptimizationRemarkEmitter &ORE, const DataLayout &DL, 583e8d8bef9SDimitry Andric AssumptionCache &AC, const TargetTransformInfo &TTI) 5840b57cec5SDimitry Andric : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), 585e8d8bef9SDimitry Andric DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} 5860b57cec5SDimitry Andric 5870b57cec5SDimitry Andric /// This is the main entry point for loop fusion. It will traverse the 5880b57cec5SDimitry Andric /// specified function and collect candidate loops to fuse, starting at the 5890b57cec5SDimitry Andric /// outermost nesting level and working inwards. 5900b57cec5SDimitry Andric bool fuseLoops(Function &F) { 5910b57cec5SDimitry Andric #ifndef NDEBUG 5920b57cec5SDimitry Andric if (VerboseFusionDebugging) { 5930b57cec5SDimitry Andric LI.print(dbgs()); 5940b57cec5SDimitry Andric } 5950b57cec5SDimitry Andric #endif 5960b57cec5SDimitry Andric 5970b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() 5980b57cec5SDimitry Andric << "\n"); 5990b57cec5SDimitry Andric bool Changed = false; 6000b57cec5SDimitry Andric 6010b57cec5SDimitry Andric while (!LDT.empty()) { 6020b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " 6030b57cec5SDimitry Andric << LDT.getDepth() << "\n";); 6040b57cec5SDimitry Andric 6050b57cec5SDimitry Andric for (const LoopVector &LV : LDT) { 6060b57cec5SDimitry Andric assert(LV.size() > 0 && "Empty loop set was build!"); 6070b57cec5SDimitry Andric 6080b57cec5SDimitry Andric // Skip singleton loop sets as they do not offer fusion opportunities on 6090b57cec5SDimitry Andric // this level. 6100b57cec5SDimitry Andric if (LV.size() == 1) 6110b57cec5SDimitry Andric continue; 6120b57cec5SDimitry Andric #ifndef NDEBUG 6130b57cec5SDimitry Andric if (VerboseFusionDebugging) { 6140b57cec5SDimitry Andric LLVM_DEBUG({ 6150b57cec5SDimitry Andric dbgs() << " Visit loop set (#" << LV.size() << "):\n"; 6160b57cec5SDimitry Andric printLoopVector(LV); 6170b57cec5SDimitry Andric }); 6180b57cec5SDimitry Andric } 6190b57cec5SDimitry Andric #endif 6200b57cec5SDimitry Andric 6210b57cec5SDimitry Andric collectFusionCandidates(LV); 6220b57cec5SDimitry Andric Changed |= fuseCandidates(); 6230b57cec5SDimitry Andric } 6240b57cec5SDimitry Andric 6250b57cec5SDimitry Andric // Finished analyzing candidates at this level. 6260b57cec5SDimitry Andric // Descend to the next level and clear all of the candidates currently 6270b57cec5SDimitry Andric // collected. Note that it will not be possible to fuse any of the 6280b57cec5SDimitry Andric // existing candidates with new candidates because the new candidates will 6290b57cec5SDimitry Andric // be at a different nest level and thus not be control flow equivalent 6300b57cec5SDimitry Andric // with all of the candidates collected so far. 6310b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Descend one level!\n"); 6320b57cec5SDimitry Andric LDT.descend(); 6330b57cec5SDimitry Andric FusionCandidates.clear(); 6340b57cec5SDimitry Andric } 6350b57cec5SDimitry Andric 6360b57cec5SDimitry Andric if (Changed) 6370b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump();); 6380b57cec5SDimitry Andric 6390b57cec5SDimitry Andric #ifndef NDEBUG 6400b57cec5SDimitry Andric assert(DT.verify()); 6410b57cec5SDimitry Andric assert(PDT.verify()); 6420b57cec5SDimitry Andric LI.verify(DT); 6430b57cec5SDimitry Andric SE.verify(); 6440b57cec5SDimitry Andric #endif 6450b57cec5SDimitry Andric 6460b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Loop Fusion complete\n"); 6470b57cec5SDimitry Andric return Changed; 6480b57cec5SDimitry Andric } 6490b57cec5SDimitry Andric 6500b57cec5SDimitry Andric private: 6510b57cec5SDimitry Andric /// Determine if two fusion candidates are control flow equivalent. 6520b57cec5SDimitry Andric /// 6530b57cec5SDimitry Andric /// Two fusion candidates are control flow equivalent if when one executes, 6540b57cec5SDimitry Andric /// the other is guaranteed to execute. This is determined using dominators 6550b57cec5SDimitry Andric /// and post-dominators: if A dominates B and B post-dominates A then A and B 6560b57cec5SDimitry Andric /// are control-flow equivalent. 6570b57cec5SDimitry Andric bool isControlFlowEquivalent(const FusionCandidate &FC0, 6580b57cec5SDimitry Andric const FusionCandidate &FC1) const { 6590b57cec5SDimitry Andric assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); 6600b57cec5SDimitry Andric 661480093f4SDimitry Andric return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), 662480093f4SDimitry Andric DT, PDT); 6630b57cec5SDimitry Andric } 6640b57cec5SDimitry Andric 6650b57cec5SDimitry Andric /// Iterate over all loops in the given loop set and identify the loops that 6660b57cec5SDimitry Andric /// are eligible for fusion. Place all eligible fusion candidates into Control 6670b57cec5SDimitry Andric /// Flow Equivalent sets, sorted by dominance. 6680b57cec5SDimitry Andric void collectFusionCandidates(const LoopVector &LV) { 6690b57cec5SDimitry Andric for (Loop *L : LV) { 670e8d8bef9SDimitry Andric TTI::PeelingPreferences PP = 671bdd1243dSDimitry Andric gatherPeelingPreferences(L, SE, TTI, std::nullopt, std::nullopt); 67281ad6265SDimitry Andric FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); 6738bcb0991SDimitry Andric if (!CurrCand.isEligibleForFusion(SE)) 6740b57cec5SDimitry Andric continue; 6750b57cec5SDimitry Andric 6760b57cec5SDimitry Andric // Go through each list in FusionCandidates and determine if L is control 6770b57cec5SDimitry Andric // flow equivalent with the first loop in that list. If it is, append LV. 6780b57cec5SDimitry Andric // If not, go to the next list. 6790b57cec5SDimitry Andric // If no suitable list is found, start another list and add it to 6800b57cec5SDimitry Andric // FusionCandidates. 6810b57cec5SDimitry Andric bool FoundSet = false; 6820b57cec5SDimitry Andric 6830b57cec5SDimitry Andric for (auto &CurrCandSet : FusionCandidates) { 6840b57cec5SDimitry Andric if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) { 6850b57cec5SDimitry Andric CurrCandSet.insert(CurrCand); 6860b57cec5SDimitry Andric FoundSet = true; 6870b57cec5SDimitry Andric #ifndef NDEBUG 6880b57cec5SDimitry Andric if (VerboseFusionDebugging) 6890b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Adding " << CurrCand 6900b57cec5SDimitry Andric << " to existing candidate set\n"); 6910b57cec5SDimitry Andric #endif 6920b57cec5SDimitry Andric break; 6930b57cec5SDimitry Andric } 6940b57cec5SDimitry Andric } 6950b57cec5SDimitry Andric if (!FoundSet) { 6960b57cec5SDimitry Andric // No set was found. Create a new set and add to FusionCandidates 6970b57cec5SDimitry Andric #ifndef NDEBUG 6980b57cec5SDimitry Andric if (VerboseFusionDebugging) 6990b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n"); 7000b57cec5SDimitry Andric #endif 7010b57cec5SDimitry Andric FusionCandidateSet NewCandSet; 7020b57cec5SDimitry Andric NewCandSet.insert(CurrCand); 7030b57cec5SDimitry Andric FusionCandidates.push_back(NewCandSet); 7040b57cec5SDimitry Andric } 7050b57cec5SDimitry Andric NumFusionCandidates++; 7060b57cec5SDimitry Andric } 7070b57cec5SDimitry Andric } 7080b57cec5SDimitry Andric 7090b57cec5SDimitry Andric /// Determine if it is beneficial to fuse two loops. 7100b57cec5SDimitry Andric /// 7110b57cec5SDimitry Andric /// For now, this method simply returns true because we want to fuse as much 7120b57cec5SDimitry Andric /// as possible (primarily to test the pass). This method will evolve, over 7130b57cec5SDimitry Andric /// time, to add heuristics for profitability of fusion. 7140b57cec5SDimitry Andric bool isBeneficialFusion(const FusionCandidate &FC0, 7150b57cec5SDimitry Andric const FusionCandidate &FC1) { 7160b57cec5SDimitry Andric return true; 7170b57cec5SDimitry Andric } 7180b57cec5SDimitry Andric 7190b57cec5SDimitry Andric /// Determine if two fusion candidates have the same trip count (i.e., they 7200b57cec5SDimitry Andric /// execute the same number of iterations). 7210b57cec5SDimitry Andric /// 722e8d8bef9SDimitry Andric /// This function will return a pair of values. The first is a boolean, 723e8d8bef9SDimitry Andric /// stating whether or not the two candidates are known at compile time to 724e8d8bef9SDimitry Andric /// have the same TripCount. The second is the difference in the two 725e8d8bef9SDimitry Andric /// TripCounts. This information can be used later to determine whether or not 726bdd1243dSDimitry Andric /// peeling can be performed on either one of the candidates. 727bdd1243dSDimitry Andric std::pair<bool, std::optional<unsigned>> 728e8d8bef9SDimitry Andric haveIdenticalTripCounts(const FusionCandidate &FC0, 7290b57cec5SDimitry Andric const FusionCandidate &FC1) const { 7300b57cec5SDimitry Andric const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); 7310b57cec5SDimitry Andric if (isa<SCEVCouldNotCompute>(TripCount0)) { 7320b57cec5SDimitry Andric UncomputableTripCount++; 7330b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); 734bdd1243dSDimitry Andric return {false, std::nullopt}; 7350b57cec5SDimitry Andric } 7360b57cec5SDimitry Andric 7370b57cec5SDimitry Andric const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); 7380b57cec5SDimitry Andric if (isa<SCEVCouldNotCompute>(TripCount1)) { 7390b57cec5SDimitry Andric UncomputableTripCount++; 7400b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); 741bdd1243dSDimitry Andric return {false, std::nullopt}; 7420b57cec5SDimitry Andric } 743e8d8bef9SDimitry Andric 7440b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " 7450b57cec5SDimitry Andric << *TripCount1 << " are " 7460b57cec5SDimitry Andric << (TripCount0 == TripCount1 ? "identical" : "different") 7470b57cec5SDimitry Andric << "\n"); 7480b57cec5SDimitry Andric 749e8d8bef9SDimitry Andric if (TripCount0 == TripCount1) 750e8d8bef9SDimitry Andric return {true, 0}; 751e8d8bef9SDimitry Andric 752e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " 753e8d8bef9SDimitry Andric "determining the difference between trip counts\n"); 754e8d8bef9SDimitry Andric 755e8d8bef9SDimitry Andric // Currently only considering loops with a single exit point 756e8d8bef9SDimitry Andric // and a non-constant trip count. 757e8d8bef9SDimitry Andric const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L); 758e8d8bef9SDimitry Andric const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L); 759e8d8bef9SDimitry Andric 760e8d8bef9SDimitry Andric // If any of the tripcounts are zero that means that loop(s) do not have 761e8d8bef9SDimitry Andric // a single exit or a constant tripcount. 762e8d8bef9SDimitry Andric if (TC0 == 0 || TC1 == 0) { 763e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " 764e8d8bef9SDimitry Andric "have a constant number of iterations. Peeling " 765e8d8bef9SDimitry Andric "is not benefical\n"); 766bdd1243dSDimitry Andric return {false, std::nullopt}; 767e8d8bef9SDimitry Andric } 768e8d8bef9SDimitry Andric 769bdd1243dSDimitry Andric std::optional<unsigned> Difference; 770e8d8bef9SDimitry Andric int Diff = TC0 - TC1; 771e8d8bef9SDimitry Andric 772e8d8bef9SDimitry Andric if (Diff > 0) 773e8d8bef9SDimitry Andric Difference = Diff; 774e8d8bef9SDimitry Andric else { 775e8d8bef9SDimitry Andric LLVM_DEBUG( 776e8d8bef9SDimitry Andric dbgs() << "Difference is less than 0. FC1 (second loop) has more " 777e8d8bef9SDimitry Andric "iterations than the first one. Currently not supported\n"); 778e8d8bef9SDimitry Andric } 779e8d8bef9SDimitry Andric 780e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference 781e8d8bef9SDimitry Andric << "\n"); 782e8d8bef9SDimitry Andric 783e8d8bef9SDimitry Andric return {false, Difference}; 784e8d8bef9SDimitry Andric } 785e8d8bef9SDimitry Andric 786e8d8bef9SDimitry Andric void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, 787e8d8bef9SDimitry Andric unsigned PeelCount) { 788e8d8bef9SDimitry Andric assert(FC0.AbleToPeel && "Should be able to peel loop"); 789e8d8bef9SDimitry Andric 790e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount 791e8d8bef9SDimitry Andric << " iterations of the first loop. \n"); 792e8d8bef9SDimitry Andric 793bdd1243dSDimitry Andric ValueToValueMapTy VMap; 794bdd1243dSDimitry Andric FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true, VMap); 795e8d8bef9SDimitry Andric if (FC0.Peeled) { 796e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Done Peeling\n"); 797e8d8bef9SDimitry Andric 798e8d8bef9SDimitry Andric #ifndef NDEBUG 799e8d8bef9SDimitry Andric auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); 800e8d8bef9SDimitry Andric 801e8d8bef9SDimitry Andric assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && 802e8d8bef9SDimitry Andric "Loops should have identical trip counts after peeling"); 803e8d8bef9SDimitry Andric #endif 804e8d8bef9SDimitry Andric 805e8d8bef9SDimitry Andric FC0.PP.PeelCount += PeelCount; 806e8d8bef9SDimitry Andric 807e8d8bef9SDimitry Andric // Peeling does not update the PDT 808e8d8bef9SDimitry Andric PDT.recalculate(*FC0.Preheader->getParent()); 809e8d8bef9SDimitry Andric 810e8d8bef9SDimitry Andric FC0.updateAfterPeeling(); 811e8d8bef9SDimitry Andric 812e8d8bef9SDimitry Andric // In this case the iterations of the loop are constant, so the first 813e8d8bef9SDimitry Andric // loop will execute completely (will not jump from one of 814e8d8bef9SDimitry Andric // the peeled blocks to the second loop). Here we are updating the 815e8d8bef9SDimitry Andric // branch conditions of each of the peeled blocks, such that it will 816e8d8bef9SDimitry Andric // branch to its successor which is not the preheader of the second loop 817e8d8bef9SDimitry Andric // in the case of unguarded loops, or the succesors of the exit block of 818e8d8bef9SDimitry Andric // the first loop otherwise. Doing this update will ensure that the entry 819e8d8bef9SDimitry Andric // block of the first loop dominates the entry block of the second loop. 820e8d8bef9SDimitry Andric BasicBlock *BB = 821e8d8bef9SDimitry Andric FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; 822e8d8bef9SDimitry Andric if (BB) { 823e8d8bef9SDimitry Andric SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 824e8d8bef9SDimitry Andric SmallVector<Instruction *, 8> WorkList; 825e8d8bef9SDimitry Andric for (BasicBlock *Pred : predecessors(BB)) { 826e8d8bef9SDimitry Andric if (Pred != FC0.ExitBlock) { 827e8d8bef9SDimitry Andric WorkList.emplace_back(Pred->getTerminator()); 828e8d8bef9SDimitry Andric TreeUpdates.emplace_back( 829e8d8bef9SDimitry Andric DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); 830e8d8bef9SDimitry Andric } 831e8d8bef9SDimitry Andric } 832e8d8bef9SDimitry Andric // Cannot modify the predecessors inside the above loop as it will cause 833e8d8bef9SDimitry Andric // the iterators to be nullptrs, causing memory errors. 834e8d8bef9SDimitry Andric for (Instruction *CurrentBranch : WorkList) { 835e8d8bef9SDimitry Andric BasicBlock *Succ = CurrentBranch->getSuccessor(0); 836e8d8bef9SDimitry Andric if (Succ == BB) 837e8d8bef9SDimitry Andric Succ = CurrentBranch->getSuccessor(1); 838e8d8bef9SDimitry Andric ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ)); 839e8d8bef9SDimitry Andric } 840e8d8bef9SDimitry Andric 841e8d8bef9SDimitry Andric DTU.applyUpdates(TreeUpdates); 842e8d8bef9SDimitry Andric DTU.flush(); 843e8d8bef9SDimitry Andric } 844e8d8bef9SDimitry Andric LLVM_DEBUG( 845e8d8bef9SDimitry Andric dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount 846e8d8bef9SDimitry Andric << " iterations from the first loop.\n" 847e8d8bef9SDimitry Andric "Both Loops have the same number of iterations now.\n"); 848e8d8bef9SDimitry Andric } 8490b57cec5SDimitry Andric } 8500b57cec5SDimitry Andric 8510b57cec5SDimitry Andric /// Walk each set of control flow equivalent fusion candidates and attempt to 8520b57cec5SDimitry Andric /// fuse them. This does a single linear traversal of all candidates in the 8530b57cec5SDimitry Andric /// set. The conditions for legal fusion are checked at this point. If a pair 8540b57cec5SDimitry Andric /// of fusion candidates passes all legality checks, they are fused together 8550b57cec5SDimitry Andric /// and a new fusion candidate is created and added to the FusionCandidateSet. 8560b57cec5SDimitry Andric /// The original fusion candidates are then removed, as they are no longer 8570b57cec5SDimitry Andric /// valid. 8580b57cec5SDimitry Andric bool fuseCandidates() { 8590b57cec5SDimitry Andric bool Fused = false; 8600b57cec5SDimitry Andric LLVM_DEBUG(printFusionCandidates(FusionCandidates)); 8610b57cec5SDimitry Andric for (auto &CandidateSet : FusionCandidates) { 8620b57cec5SDimitry Andric if (CandidateSet.size() < 2) 8630b57cec5SDimitry Andric continue; 8640b57cec5SDimitry Andric 8650b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" 8660b57cec5SDimitry Andric << CandidateSet << "\n"); 8670b57cec5SDimitry Andric 8680b57cec5SDimitry Andric for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { 8690b57cec5SDimitry Andric assert(!LDT.isRemovedLoop(FC0->L) && 8700b57cec5SDimitry Andric "Should not have removed loops in CandidateSet!"); 8710b57cec5SDimitry Andric auto FC1 = FC0; 8720b57cec5SDimitry Andric for (++FC1; FC1 != CandidateSet.end(); ++FC1) { 8730b57cec5SDimitry Andric assert(!LDT.isRemovedLoop(FC1->L) && 8740b57cec5SDimitry Andric "Should not have removed loops in CandidateSet!"); 8750b57cec5SDimitry Andric 8760b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump(); 8770b57cec5SDimitry Andric dbgs() << " with\n"; FC1->dump(); dbgs() << "\n"); 8780b57cec5SDimitry Andric 8790b57cec5SDimitry Andric FC0->verify(); 8800b57cec5SDimitry Andric FC1->verify(); 8810b57cec5SDimitry Andric 882e8d8bef9SDimitry Andric // Check if the candidates have identical tripcounts (first value of 883e8d8bef9SDimitry Andric // pair), and if not check the difference in the tripcounts between 884e8d8bef9SDimitry Andric // the loops (second value of pair). The difference is not equal to 885bdd1243dSDimitry Andric // std::nullopt iff the loops iterate a constant number of times, and 886bdd1243dSDimitry Andric // have a single exit. 887bdd1243dSDimitry Andric std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes = 888e8d8bef9SDimitry Andric haveIdenticalTripCounts(*FC0, *FC1); 889e8d8bef9SDimitry Andric bool SameTripCount = IdenticalTripCountRes.first; 890bdd1243dSDimitry Andric std::optional<unsigned> TCDifference = IdenticalTripCountRes.second; 891e8d8bef9SDimitry Andric 892e8d8bef9SDimitry Andric // Here we are checking that FC0 (the first loop) can be peeled, and 893e8d8bef9SDimitry Andric // both loops have different tripcounts. 894e8d8bef9SDimitry Andric if (FC0->AbleToPeel && !SameTripCount && TCDifference) { 895e8d8bef9SDimitry Andric if (*TCDifference > FusionPeelMaxCount) { 896e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() 897e8d8bef9SDimitry Andric << "Difference in loop trip counts: " << *TCDifference 898e8d8bef9SDimitry Andric << " is greater than maximum peel count specificed: " 899e8d8bef9SDimitry Andric << FusionPeelMaxCount << "\n"); 900e8d8bef9SDimitry Andric } else { 901e8d8bef9SDimitry Andric // Dependent on peeling being performed on the first loop, and 902e8d8bef9SDimitry Andric // assuming all other conditions for fusion return true. 903e8d8bef9SDimitry Andric SameTripCount = true; 904e8d8bef9SDimitry Andric } 905e8d8bef9SDimitry Andric } 906e8d8bef9SDimitry Andric 907e8d8bef9SDimitry Andric if (!SameTripCount) { 9080b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " 9090b57cec5SDimitry Andric "counts. Not fusing.\n"); 9108bcb0991SDimitry Andric reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 9118bcb0991SDimitry Andric NonEqualTripCount); 9120b57cec5SDimitry Andric continue; 9130b57cec5SDimitry Andric } 9140b57cec5SDimitry Andric 9150b57cec5SDimitry Andric if (!isAdjacent(*FC0, *FC1)) { 9160b57cec5SDimitry Andric LLVM_DEBUG(dbgs() 9170b57cec5SDimitry Andric << "Fusion candidates are not adjacent. Not fusing.\n"); 9188bcb0991SDimitry Andric reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); 9190b57cec5SDimitry Andric continue; 9200b57cec5SDimitry Andric } 9210b57cec5SDimitry Andric 922bdd1243dSDimitry Andric if ((!FC0->GuardBranch && FC1->GuardBranch) || 923bdd1243dSDimitry Andric (FC0->GuardBranch && !FC1->GuardBranch)) { 924bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the " 925bdd1243dSDimitry Andric "another one is not. Not fusing.\n"); 926fe6060f1SDimitry Andric reportLoopFusion<OptimizationRemarkMissed>( 927fe6060f1SDimitry Andric *FC0, *FC1, OnlySecondCandidateIsGuarded); 928fe6060f1SDimitry Andric continue; 929fe6060f1SDimitry Andric } 930fe6060f1SDimitry Andric 9318bcb0991SDimitry Andric // Ensure that FC0 and FC1 have identical guards. 9328bcb0991SDimitry Andric // If one (or both) are not guarded, this check is not necessary. 9338bcb0991SDimitry Andric if (FC0->GuardBranch && FC1->GuardBranch && 934e8d8bef9SDimitry Andric !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) { 9358bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " 9368bcb0991SDimitry Andric "guards. Not Fusing.\n"); 9378bcb0991SDimitry Andric reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 9388bcb0991SDimitry Andric NonIdenticalGuards); 9398bcb0991SDimitry Andric continue; 9408bcb0991SDimitry Andric } 9418bcb0991SDimitry Andric 9425ffd83dbSDimitry Andric if (FC0->GuardBranch) { 9435ffd83dbSDimitry Andric assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); 9445ffd83dbSDimitry Andric 9455ffd83dbSDimitry Andric if (!isSafeToMoveBefore(*FC0->ExitBlock, 9465ffd83dbSDimitry Andric *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, 9475ffd83dbSDimitry Andric &PDT, &DI)) { 9485ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " 9495ffd83dbSDimitry Andric "instructions in exit block. Not fusing.\n"); 9508bcb0991SDimitry Andric reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 9518bcb0991SDimitry Andric NonEmptyExitBlock); 9528bcb0991SDimitry Andric continue; 9538bcb0991SDimitry Andric } 9548bcb0991SDimitry Andric 9555ffd83dbSDimitry Andric if (!isSafeToMoveBefore( 9565ffd83dbSDimitry Andric *FC1->GuardBranch->getParent(), 9575ffd83dbSDimitry Andric *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT, 9585ffd83dbSDimitry Andric &DI)) { 9595ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() 9605ffd83dbSDimitry Andric << "Fusion candidate contains unsafe " 9615ffd83dbSDimitry Andric "instructions in guard block. Not fusing.\n"); 9628bcb0991SDimitry Andric reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 9638bcb0991SDimitry Andric NonEmptyGuardBlock); 9648bcb0991SDimitry Andric continue; 9658bcb0991SDimitry Andric } 9665ffd83dbSDimitry Andric } 9678bcb0991SDimitry Andric 9688bcb0991SDimitry Andric // Check the dependencies across the loops and do not fuse if it would 9698bcb0991SDimitry Andric // violate them. 9700b57cec5SDimitry Andric if (!dependencesAllowFusion(*FC0, *FC1)) { 9710b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); 9728bcb0991SDimitry Andric reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 9738bcb0991SDimitry Andric InvalidDependencies); 9740b57cec5SDimitry Andric continue; 9750b57cec5SDimitry Andric } 9760b57cec5SDimitry Andric 977bdd1243dSDimitry Andric // If the second loop has instructions in the pre-header, attempt to 978bdd1243dSDimitry Andric // hoist them up to the first loop's pre-header or sink them into the 979bdd1243dSDimitry Andric // body of the second loop. 980bdd1243dSDimitry Andric SmallVector<Instruction *, 4> SafeToHoist; 981bdd1243dSDimitry Andric SmallVector<Instruction *, 4> SafeToSink; 982bdd1243dSDimitry Andric // At this point, this is the last remaining legality check. 983bdd1243dSDimitry Andric // Which means if we can make this pre-header empty, we can fuse 984bdd1243dSDimitry Andric // these loops 985bdd1243dSDimitry Andric if (!isEmptyPreheader(*FC1)) { 986bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " 987bdd1243dSDimitry Andric "preheader.\n"); 988bdd1243dSDimitry Andric 989bdd1243dSDimitry Andric // If it is not safe to hoist/sink all instructions in the 990bdd1243dSDimitry Andric // pre-header, we cannot fuse these loops. 991bdd1243dSDimitry Andric if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist, 992bdd1243dSDimitry Andric SafeToSink)) { 993bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " 994bdd1243dSDimitry Andric "Fusion Candidate Pre-header.\n" 995bdd1243dSDimitry Andric << "Not Fusing.\n"); 996bdd1243dSDimitry Andric reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 997bdd1243dSDimitry Andric NonEmptyPreheader); 998bdd1243dSDimitry Andric continue; 999bdd1243dSDimitry Andric } 1000bdd1243dSDimitry Andric } 1001bdd1243dSDimitry Andric 10020b57cec5SDimitry Andric bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); 10030b57cec5SDimitry Andric LLVM_DEBUG(dbgs() 10040b57cec5SDimitry Andric << "\tFusion appears to be " 10050b57cec5SDimitry Andric << (BeneficialToFuse ? "" : "un") << "profitable!\n"); 10068bcb0991SDimitry Andric if (!BeneficialToFuse) { 10078bcb0991SDimitry Andric reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 10088bcb0991SDimitry Andric FusionNotBeneficial); 10090b57cec5SDimitry Andric continue; 10108bcb0991SDimitry Andric } 10110b57cec5SDimitry Andric // All analysis has completed and has determined that fusion is legal 10120b57cec5SDimitry Andric // and profitable. At this point, start transforming the code and 10130b57cec5SDimitry Andric // perform fusion. 10140b57cec5SDimitry Andric 1015bdd1243dSDimitry Andric // Execute the hoist/sink operations on preheader instructions 1016bdd1243dSDimitry Andric movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink); 1017bdd1243dSDimitry Andric 10180b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " 10190b57cec5SDimitry Andric << *FC1 << "\n"); 10200b57cec5SDimitry Andric 1021e8d8bef9SDimitry Andric FusionCandidate FC0Copy = *FC0; 1022e8d8bef9SDimitry Andric // Peel the loop after determining that fusion is legal. The Loops 1023e8d8bef9SDimitry Andric // will still be safe to fuse after the peeling is performed. 1024e8d8bef9SDimitry Andric bool Peel = TCDifference && *TCDifference > 0; 1025e8d8bef9SDimitry Andric if (Peel) 1026e8d8bef9SDimitry Andric peelFusionCandidate(FC0Copy, *FC1, *TCDifference); 1027e8d8bef9SDimitry Andric 10280b57cec5SDimitry Andric // Report fusion to the Optimization Remarks. 10290b57cec5SDimitry Andric // Note this needs to be done *before* performFusion because 10300b57cec5SDimitry Andric // performFusion will change the original loops, making it not 10310b57cec5SDimitry Andric // possible to identify them after fusion is complete. 1032e8d8bef9SDimitry Andric reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1, 1033e8d8bef9SDimitry Andric FuseCounter); 10340b57cec5SDimitry Andric 1035e8d8bef9SDimitry Andric FusionCandidate FusedCand( 103681ad6265SDimitry Andric performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE, 1037e8d8bef9SDimitry Andric FC0Copy.PP); 10380b57cec5SDimitry Andric FusedCand.verify(); 10398bcb0991SDimitry Andric assert(FusedCand.isEligibleForFusion(SE) && 10400b57cec5SDimitry Andric "Fused candidate should be eligible for fusion!"); 10410b57cec5SDimitry Andric 10420b57cec5SDimitry Andric // Notify the loop-depth-tree that these loops are not valid objects 10430b57cec5SDimitry Andric LDT.removeLoop(FC1->L); 10440b57cec5SDimitry Andric 10450b57cec5SDimitry Andric CandidateSet.erase(FC0); 10460b57cec5SDimitry Andric CandidateSet.erase(FC1); 10470b57cec5SDimitry Andric 10480b57cec5SDimitry Andric auto InsertPos = CandidateSet.insert(FusedCand); 10490b57cec5SDimitry Andric 10500b57cec5SDimitry Andric assert(InsertPos.second && 10510b57cec5SDimitry Andric "Unable to insert TargetCandidate in CandidateSet!"); 10520b57cec5SDimitry Andric 10530b57cec5SDimitry Andric // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations 10540b57cec5SDimitry Andric // of the FC1 loop will attempt to fuse the new (fused) loop with the 10550b57cec5SDimitry Andric // remaining candidates in the current candidate set. 10560b57cec5SDimitry Andric FC0 = FC1 = InsertPos.first; 10570b57cec5SDimitry Andric 10580b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet 10590b57cec5SDimitry Andric << "\n"); 10600b57cec5SDimitry Andric 10610b57cec5SDimitry Andric Fused = true; 10620b57cec5SDimitry Andric } 10630b57cec5SDimitry Andric } 10640b57cec5SDimitry Andric } 10650b57cec5SDimitry Andric return Fused; 10660b57cec5SDimitry Andric } 10670b57cec5SDimitry Andric 1068bdd1243dSDimitry Andric // Returns true if the instruction \p I can be hoisted to the end of the 1069bdd1243dSDimitry Andric // preheader of \p FC0. \p SafeToHoist contains the instructions that are 1070bdd1243dSDimitry Andric // known to be safe to hoist. The instructions encountered that cannot be 1071bdd1243dSDimitry Andric // hoisted are in \p NotHoisting. 1072bdd1243dSDimitry Andric // TODO: Move functionality into CodeMoverUtils 1073bdd1243dSDimitry Andric bool canHoistInst(Instruction &I, 1074bdd1243dSDimitry Andric const SmallVector<Instruction *, 4> &SafeToHoist, 1075bdd1243dSDimitry Andric const SmallVector<Instruction *, 4> &NotHoisting, 1076bdd1243dSDimitry Andric const FusionCandidate &FC0) const { 1077bdd1243dSDimitry Andric const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor(); 1078bdd1243dSDimitry Andric assert(FC0PreheaderTarget && 1079bdd1243dSDimitry Andric "Expected single successor for loop preheader."); 1080bdd1243dSDimitry Andric 1081bdd1243dSDimitry Andric for (Use &Op : I.operands()) { 1082bdd1243dSDimitry Andric if (auto *OpInst = dyn_cast<Instruction>(Op)) { 1083bdd1243dSDimitry Andric bool OpHoisted = is_contained(SafeToHoist, OpInst); 1084bdd1243dSDimitry Andric // Check if we have already decided to hoist this operand. In this 1085bdd1243dSDimitry Andric // case, it does not dominate FC0 *yet*, but will after we hoist it. 1086bdd1243dSDimitry Andric if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) { 1087bdd1243dSDimitry Andric return false; 1088bdd1243dSDimitry Andric } 1089bdd1243dSDimitry Andric } 1090bdd1243dSDimitry Andric } 1091bdd1243dSDimitry Andric 1092bdd1243dSDimitry Andric // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs 1093bdd1243dSDimitry Andric // cannot be hoisted and should be sunk to the exit of the fused loop. 1094bdd1243dSDimitry Andric if (isa<PHINode>(I)) 1095bdd1243dSDimitry Andric return false; 1096bdd1243dSDimitry Andric 1097bdd1243dSDimitry Andric // If this isn't a memory inst, hoisting is safe 1098bdd1243dSDimitry Andric if (!I.mayReadOrWriteMemory()) 1099bdd1243dSDimitry Andric return true; 1100bdd1243dSDimitry Andric 1101bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n"); 1102bdd1243dSDimitry Andric for (Instruction *NotHoistedInst : NotHoisting) { 1103bdd1243dSDimitry Andric if (auto D = DI.depends(&I, NotHoistedInst, true)) { 1104bdd1243dSDimitry Andric // Dependency is not read-before-write, write-before-read or 1105bdd1243dSDimitry Andric // write-before-write 1106bdd1243dSDimitry Andric if (D->isFlow() || D->isAnti() || D->isOutput()) { 1107bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " 1108bdd1243dSDimitry Andric "preheader that is not being hoisted.\n"); 1109bdd1243dSDimitry Andric return false; 1110bdd1243dSDimitry Andric } 1111bdd1243dSDimitry Andric } 1112bdd1243dSDimitry Andric } 1113bdd1243dSDimitry Andric 1114bdd1243dSDimitry Andric for (Instruction *ReadInst : FC0.MemReads) { 1115bdd1243dSDimitry Andric if (auto D = DI.depends(ReadInst, &I, true)) { 1116bdd1243dSDimitry Andric // Dependency is not read-before-write 1117bdd1243dSDimitry Andric if (D->isAnti()) { 1118bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n"); 1119bdd1243dSDimitry Andric return false; 1120bdd1243dSDimitry Andric } 1121bdd1243dSDimitry Andric } 1122bdd1243dSDimitry Andric } 1123bdd1243dSDimitry Andric 1124bdd1243dSDimitry Andric for (Instruction *WriteInst : FC0.MemWrites) { 1125bdd1243dSDimitry Andric if (auto D = DI.depends(WriteInst, &I, true)) { 1126bdd1243dSDimitry Andric // Dependency is not write-before-read or write-before-write 1127bdd1243dSDimitry Andric if (D->isFlow() || D->isOutput()) { 1128bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n"); 1129bdd1243dSDimitry Andric return false; 1130bdd1243dSDimitry Andric } 1131bdd1243dSDimitry Andric } 1132bdd1243dSDimitry Andric } 1133bdd1243dSDimitry Andric return true; 1134bdd1243dSDimitry Andric } 1135bdd1243dSDimitry Andric 1136bdd1243dSDimitry Andric // Returns true if the instruction \p I can be sunk to the top of the exit 1137bdd1243dSDimitry Andric // block of \p FC1. 1138bdd1243dSDimitry Andric // TODO: Move functionality into CodeMoverUtils 1139bdd1243dSDimitry Andric bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { 1140bdd1243dSDimitry Andric for (User *U : I.users()) { 1141bdd1243dSDimitry Andric if (auto *UI{dyn_cast<Instruction>(U)}) { 1142bdd1243dSDimitry Andric // Cannot sink if user in loop 1143bdd1243dSDimitry Andric // If FC1 has phi users of this value, we cannot sink it into FC1. 1144bdd1243dSDimitry Andric if (FC1.L->contains(UI)) { 1145bdd1243dSDimitry Andric // Cannot hoist or sink this instruction. No hoisting/sinking 1146bdd1243dSDimitry Andric // should take place, loops should not fuse 1147bdd1243dSDimitry Andric return false; 1148bdd1243dSDimitry Andric } 1149bdd1243dSDimitry Andric } 1150bdd1243dSDimitry Andric } 1151bdd1243dSDimitry Andric 1152bdd1243dSDimitry Andric // If this isn't a memory inst, sinking is safe 1153bdd1243dSDimitry Andric if (!I.mayReadOrWriteMemory()) 1154bdd1243dSDimitry Andric return true; 1155bdd1243dSDimitry Andric 1156bdd1243dSDimitry Andric for (Instruction *ReadInst : FC1.MemReads) { 1157bdd1243dSDimitry Andric if (auto D = DI.depends(&I, ReadInst, true)) { 1158bdd1243dSDimitry Andric // Dependency is not write-before-read 1159bdd1243dSDimitry Andric if (D->isFlow()) { 1160bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n"); 1161bdd1243dSDimitry Andric return false; 1162bdd1243dSDimitry Andric } 1163bdd1243dSDimitry Andric } 1164bdd1243dSDimitry Andric } 1165bdd1243dSDimitry Andric 1166bdd1243dSDimitry Andric for (Instruction *WriteInst : FC1.MemWrites) { 1167bdd1243dSDimitry Andric if (auto D = DI.depends(&I, WriteInst, true)) { 1168bdd1243dSDimitry Andric // Dependency is not write-before-write or read-before-write 1169bdd1243dSDimitry Andric if (D->isOutput() || D->isAnti()) { 1170bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n"); 1171bdd1243dSDimitry Andric return false; 1172bdd1243dSDimitry Andric } 1173bdd1243dSDimitry Andric } 1174bdd1243dSDimitry Andric } 1175bdd1243dSDimitry Andric 1176bdd1243dSDimitry Andric return true; 1177bdd1243dSDimitry Andric } 1178bdd1243dSDimitry Andric 1179bdd1243dSDimitry Andric /// Collect instructions in the \p FC1 Preheader that can be hoisted 1180bdd1243dSDimitry Andric /// to the \p FC0 Preheader or sunk into the \p FC1 Body 1181bdd1243dSDimitry Andric bool collectMovablePreheaderInsts( 1182bdd1243dSDimitry Andric const FusionCandidate &FC0, const FusionCandidate &FC1, 1183bdd1243dSDimitry Andric SmallVector<Instruction *, 4> &SafeToHoist, 1184bdd1243dSDimitry Andric SmallVector<Instruction *, 4> &SafeToSink) const { 1185bdd1243dSDimitry Andric BasicBlock *FC1Preheader = FC1.Preheader; 1186bdd1243dSDimitry Andric // Save the instructions that are not being hoisted, so we know not to hoist 1187bdd1243dSDimitry Andric // mem insts that they dominate. 1188bdd1243dSDimitry Andric SmallVector<Instruction *, 4> NotHoisting; 1189bdd1243dSDimitry Andric 1190bdd1243dSDimitry Andric for (Instruction &I : *FC1Preheader) { 1191bdd1243dSDimitry Andric // Can't move a branch 1192bdd1243dSDimitry Andric if (&I == FC1Preheader->getTerminator()) 1193bdd1243dSDimitry Andric continue; 1194bdd1243dSDimitry Andric // If the instruction has side-effects, give up. 1195bdd1243dSDimitry Andric // TODO: The case of mayReadFromMemory we can handle but requires 1196bdd1243dSDimitry Andric // additional work with a dependence analysis so for now we give 1197bdd1243dSDimitry Andric // up on memory reads. 1198bdd1243dSDimitry Andric if (I.mayThrow() || !I.willReturn()) { 1199bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n"); 1200bdd1243dSDimitry Andric return false; 1201bdd1243dSDimitry Andric } 1202bdd1243dSDimitry Andric 1203bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n"); 1204bdd1243dSDimitry Andric 1205bdd1243dSDimitry Andric if (I.isAtomic() || I.isVolatile()) { 1206bdd1243dSDimitry Andric LLVM_DEBUG( 1207bdd1243dSDimitry Andric dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n"); 1208bdd1243dSDimitry Andric return false; 1209bdd1243dSDimitry Andric } 1210bdd1243dSDimitry Andric 1211bdd1243dSDimitry Andric if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { 1212bdd1243dSDimitry Andric SafeToHoist.push_back(&I); 1213bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n"); 1214bdd1243dSDimitry Andric } else { 1215bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n"); 1216bdd1243dSDimitry Andric NotHoisting.push_back(&I); 1217bdd1243dSDimitry Andric 1218bdd1243dSDimitry Andric if (canSinkInst(I, FC1)) { 1219bdd1243dSDimitry Andric SafeToSink.push_back(&I); 1220bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "\tSafe to sink.\n"); 1221bdd1243dSDimitry Andric } else { 1222bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "\tCould not sink.\n"); 1223bdd1243dSDimitry Andric return false; 1224bdd1243dSDimitry Andric } 1225bdd1243dSDimitry Andric } 1226bdd1243dSDimitry Andric } 1227bdd1243dSDimitry Andric LLVM_DEBUG( 1228bdd1243dSDimitry Andric dbgs() << "All preheader instructions could be sunk or hoisted!\n"); 1229bdd1243dSDimitry Andric return true; 1230bdd1243dSDimitry Andric } 1231bdd1243dSDimitry Andric 12320b57cec5SDimitry Andric /// Rewrite all additive recurrences in a SCEV to use a new loop. 12330b57cec5SDimitry Andric class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { 12340b57cec5SDimitry Andric public: 12350b57cec5SDimitry Andric AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, 12360b57cec5SDimitry Andric bool UseMax = true) 12370b57cec5SDimitry Andric : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), 12380b57cec5SDimitry Andric NewL(NewL) {} 12390b57cec5SDimitry Andric 12400b57cec5SDimitry Andric const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { 12410b57cec5SDimitry Andric const Loop *ExprL = Expr->getLoop(); 12420b57cec5SDimitry Andric SmallVector<const SCEV *, 2> Operands; 12430b57cec5SDimitry Andric if (ExprL == &OldL) { 1244bdd1243dSDimitry Andric append_range(Operands, Expr->operands()); 12450b57cec5SDimitry Andric return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); 12460b57cec5SDimitry Andric } 12470b57cec5SDimitry Andric 12480b57cec5SDimitry Andric if (OldL.contains(ExprL)) { 12490b57cec5SDimitry Andric bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); 12500b57cec5SDimitry Andric if (!UseMax || !Pos || !Expr->isAffine()) { 12510b57cec5SDimitry Andric Valid = false; 12520b57cec5SDimitry Andric return Expr; 12530b57cec5SDimitry Andric } 12540b57cec5SDimitry Andric return visit(Expr->getStart()); 12550b57cec5SDimitry Andric } 12560b57cec5SDimitry Andric 12570b57cec5SDimitry Andric for (const SCEV *Op : Expr->operands()) 12580b57cec5SDimitry Andric Operands.push_back(visit(Op)); 12590b57cec5SDimitry Andric return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); 12600b57cec5SDimitry Andric } 12610b57cec5SDimitry Andric 12620b57cec5SDimitry Andric bool wasValidSCEV() const { return Valid; } 12630b57cec5SDimitry Andric 12640b57cec5SDimitry Andric private: 12650b57cec5SDimitry Andric bool Valid, UseMax; 12660b57cec5SDimitry Andric const Loop &OldL, &NewL; 12670b57cec5SDimitry Andric }; 12680b57cec5SDimitry Andric 12690b57cec5SDimitry Andric /// Return false if the access functions of \p I0 and \p I1 could cause 12700b57cec5SDimitry Andric /// a negative dependence. 12710b57cec5SDimitry Andric bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, 12720b57cec5SDimitry Andric Instruction &I1, bool EqualIsInvalid) { 12730b57cec5SDimitry Andric Value *Ptr0 = getLoadStorePointerOperand(&I0); 12740b57cec5SDimitry Andric Value *Ptr1 = getLoadStorePointerOperand(&I1); 12750b57cec5SDimitry Andric if (!Ptr0 || !Ptr1) 12760b57cec5SDimitry Andric return false; 12770b57cec5SDimitry Andric 12780b57cec5SDimitry Andric const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); 12790b57cec5SDimitry Andric const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); 12800b57cec5SDimitry Andric #ifndef NDEBUG 12810b57cec5SDimitry Andric if (VerboseFusionDebugging) 12820b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " 12830b57cec5SDimitry Andric << *SCEVPtr1 << "\n"); 12840b57cec5SDimitry Andric #endif 12850b57cec5SDimitry Andric AddRecLoopReplacer Rewriter(SE, L0, L1); 12860b57cec5SDimitry Andric SCEVPtr0 = Rewriter.visit(SCEVPtr0); 12870b57cec5SDimitry Andric #ifndef NDEBUG 12880b57cec5SDimitry Andric if (VerboseFusionDebugging) 12890b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 12900b57cec5SDimitry Andric << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); 12910b57cec5SDimitry Andric #endif 12920b57cec5SDimitry Andric if (!Rewriter.wasValidSCEV()) 12930b57cec5SDimitry Andric return false; 12940b57cec5SDimitry Andric 12950b57cec5SDimitry Andric // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by 12960b57cec5SDimitry Andric // L0) and the other is not. We could check if it is monotone and test 12970b57cec5SDimitry Andric // the beginning and end value instead. 12980b57cec5SDimitry Andric 12990b57cec5SDimitry Andric BasicBlock *L0Header = L0.getHeader(); 13000b57cec5SDimitry Andric auto HasNonLinearDominanceRelation = [&](const SCEV *S) { 13010b57cec5SDimitry Andric const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S); 13020b57cec5SDimitry Andric if (!AddRec) 13030b57cec5SDimitry Andric return false; 13040b57cec5SDimitry Andric return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && 13050b57cec5SDimitry Andric !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); 13060b57cec5SDimitry Andric }; 13070b57cec5SDimitry Andric if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) 13080b57cec5SDimitry Andric return false; 13090b57cec5SDimitry Andric 13100b57cec5SDimitry Andric ICmpInst::Predicate Pred = 13110b57cec5SDimitry Andric EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; 13120b57cec5SDimitry Andric bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); 13130b57cec5SDimitry Andric #ifndef NDEBUG 13140b57cec5SDimitry Andric if (VerboseFusionDebugging) 13150b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 13160b57cec5SDimitry Andric << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 13170b57cec5SDimitry Andric << "\n"); 13180b57cec5SDimitry Andric #endif 13190b57cec5SDimitry Andric return IsAlwaysGE; 13200b57cec5SDimitry Andric } 13210b57cec5SDimitry Andric 13220b57cec5SDimitry Andric /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in 13230b57cec5SDimitry Andric /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses 13240b57cec5SDimitry Andric /// specified by @p DepChoice are used to determine this. 13250b57cec5SDimitry Andric bool dependencesAllowFusion(const FusionCandidate &FC0, 13260b57cec5SDimitry Andric const FusionCandidate &FC1, Instruction &I0, 13270b57cec5SDimitry Andric Instruction &I1, bool AnyDep, 13280b57cec5SDimitry Andric FusionDependenceAnalysisChoice DepChoice) { 13290b57cec5SDimitry Andric #ifndef NDEBUG 13300b57cec5SDimitry Andric if (VerboseFusionDebugging) { 13310b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " 13320b57cec5SDimitry Andric << DepChoice << "\n"); 13330b57cec5SDimitry Andric } 13340b57cec5SDimitry Andric #endif 13350b57cec5SDimitry Andric switch (DepChoice) { 13360b57cec5SDimitry Andric case FUSION_DEPENDENCE_ANALYSIS_SCEV: 13370b57cec5SDimitry Andric return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); 13380b57cec5SDimitry Andric case FUSION_DEPENDENCE_ANALYSIS_DA: { 13390b57cec5SDimitry Andric auto DepResult = DI.depends(&I0, &I1, true); 13400b57cec5SDimitry Andric if (!DepResult) 13410b57cec5SDimitry Andric return true; 13420b57cec5SDimitry Andric #ifndef NDEBUG 13430b57cec5SDimitry Andric if (VerboseFusionDebugging) { 13440b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); 13450b57cec5SDimitry Andric dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " 13460b57cec5SDimitry Andric << (DepResult->isOrdered() ? "true" : "false") 13470b57cec5SDimitry Andric << "]\n"); 13480b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() 13490b57cec5SDimitry Andric << "\n"); 13500b57cec5SDimitry Andric } 13510b57cec5SDimitry Andric #endif 13520b57cec5SDimitry Andric 13530b57cec5SDimitry Andric if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) 13540b57cec5SDimitry Andric LLVM_DEBUG( 13550b57cec5SDimitry Andric dbgs() << "TODO: Implement pred/succ dependence handling!\n"); 13560b57cec5SDimitry Andric 13570b57cec5SDimitry Andric // TODO: Can we actually use the dependence info analysis here? 13580b57cec5SDimitry Andric return false; 13590b57cec5SDimitry Andric } 13600b57cec5SDimitry Andric 13610b57cec5SDimitry Andric case FUSION_DEPENDENCE_ANALYSIS_ALL: 13620b57cec5SDimitry Andric return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 13630b57cec5SDimitry Andric FUSION_DEPENDENCE_ANALYSIS_SCEV) || 13640b57cec5SDimitry Andric dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 13650b57cec5SDimitry Andric FUSION_DEPENDENCE_ANALYSIS_DA); 13660b57cec5SDimitry Andric } 13670b57cec5SDimitry Andric 13680b57cec5SDimitry Andric llvm_unreachable("Unknown fusion dependence analysis choice!"); 13690b57cec5SDimitry Andric } 13700b57cec5SDimitry Andric 13710b57cec5SDimitry Andric /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. 13720b57cec5SDimitry Andric bool dependencesAllowFusion(const FusionCandidate &FC0, 13730b57cec5SDimitry Andric const FusionCandidate &FC1) { 13740b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 13750b57cec5SDimitry Andric << "\n"); 13760b57cec5SDimitry Andric assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); 13778bcb0991SDimitry Andric assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); 13780b57cec5SDimitry Andric 13790b57cec5SDimitry Andric for (Instruction *WriteL0 : FC0.MemWrites) { 13800b57cec5SDimitry Andric for (Instruction *WriteL1 : FC1.MemWrites) 13810b57cec5SDimitry Andric if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 13820b57cec5SDimitry Andric /* AnyDep */ false, 13830b57cec5SDimitry Andric FusionDependenceAnalysis)) { 13840b57cec5SDimitry Andric InvalidDependencies++; 13850b57cec5SDimitry Andric return false; 13860b57cec5SDimitry Andric } 13870b57cec5SDimitry Andric for (Instruction *ReadL1 : FC1.MemReads) 13880b57cec5SDimitry Andric if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, 13890b57cec5SDimitry Andric /* AnyDep */ false, 13900b57cec5SDimitry Andric FusionDependenceAnalysis)) { 13910b57cec5SDimitry Andric InvalidDependencies++; 13920b57cec5SDimitry Andric return false; 13930b57cec5SDimitry Andric } 13940b57cec5SDimitry Andric } 13950b57cec5SDimitry Andric 13960b57cec5SDimitry Andric for (Instruction *WriteL1 : FC1.MemWrites) { 13970b57cec5SDimitry Andric for (Instruction *WriteL0 : FC0.MemWrites) 13980b57cec5SDimitry Andric if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 13990b57cec5SDimitry Andric /* AnyDep */ false, 14000b57cec5SDimitry Andric FusionDependenceAnalysis)) { 14010b57cec5SDimitry Andric InvalidDependencies++; 14020b57cec5SDimitry Andric return false; 14030b57cec5SDimitry Andric } 14040b57cec5SDimitry Andric for (Instruction *ReadL0 : FC0.MemReads) 14050b57cec5SDimitry Andric if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, 14060b57cec5SDimitry Andric /* AnyDep */ false, 14070b57cec5SDimitry Andric FusionDependenceAnalysis)) { 14080b57cec5SDimitry Andric InvalidDependencies++; 14090b57cec5SDimitry Andric return false; 14100b57cec5SDimitry Andric } 14110b57cec5SDimitry Andric } 14120b57cec5SDimitry Andric 14130b57cec5SDimitry Andric // Walk through all uses in FC1. For each use, find the reaching def. If the 14145f757f3fSDimitry Andric // def is located in FC0 then it is not safe to fuse. 14150b57cec5SDimitry Andric for (BasicBlock *BB : FC1.L->blocks()) 14160b57cec5SDimitry Andric for (Instruction &I : *BB) 14170b57cec5SDimitry Andric for (auto &Op : I.operands()) 14180b57cec5SDimitry Andric if (Instruction *Def = dyn_cast<Instruction>(Op)) 14190b57cec5SDimitry Andric if (FC0.L->contains(Def->getParent())) { 14200b57cec5SDimitry Andric InvalidDependencies++; 14210b57cec5SDimitry Andric return false; 14220b57cec5SDimitry Andric } 14230b57cec5SDimitry Andric 14240b57cec5SDimitry Andric return true; 14250b57cec5SDimitry Andric } 14260b57cec5SDimitry Andric 14278bcb0991SDimitry Andric /// Determine if two fusion candidates are adjacent in the CFG. 14288bcb0991SDimitry Andric /// 14298bcb0991SDimitry Andric /// This method will determine if there are additional basic blocks in the CFG 14308bcb0991SDimitry Andric /// between the exit of \p FC0 and the entry of \p FC1. 14318bcb0991SDimitry Andric /// If the two candidates are guarded loops, then it checks whether the 14328bcb0991SDimitry Andric /// non-loop successor of the \p FC0 guard branch is the entry block of \p 14338bcb0991SDimitry Andric /// FC1. If not, then the loops are not adjacent. If the two candidates are 14348bcb0991SDimitry Andric /// not guarded loops, then it checks whether the exit block of \p FC0 is the 14358bcb0991SDimitry Andric /// preheader of \p FC1. 14360b57cec5SDimitry Andric bool isAdjacent(const FusionCandidate &FC0, 14370b57cec5SDimitry Andric const FusionCandidate &FC1) const { 14388bcb0991SDimitry Andric // If the successor of the guard branch is FC1, then the loops are adjacent 14398bcb0991SDimitry Andric if (FC0.GuardBranch) 14408bcb0991SDimitry Andric return FC0.getNonLoopBlock() == FC1.getEntryBlock(); 14418bcb0991SDimitry Andric else 14428bcb0991SDimitry Andric return FC0.ExitBlock == FC1.getEntryBlock(); 14438bcb0991SDimitry Andric } 14448bcb0991SDimitry Andric 1445bdd1243dSDimitry Andric bool isEmptyPreheader(const FusionCandidate &FC) const { 1446bdd1243dSDimitry Andric return FC.Preheader->size() == 1; 1447bdd1243dSDimitry Andric } 1448bdd1243dSDimitry Andric 1449bdd1243dSDimitry Andric /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader 1450bdd1243dSDimitry Andric /// and sink others into the body of \p FC1. 1451bdd1243dSDimitry Andric void movePreheaderInsts(const FusionCandidate &FC0, 1452bdd1243dSDimitry Andric const FusionCandidate &FC1, 1453bdd1243dSDimitry Andric SmallVector<Instruction *, 4> &HoistInsts, 1454bdd1243dSDimitry Andric SmallVector<Instruction *, 4> &SinkInsts) const { 1455bdd1243dSDimitry Andric // All preheader instructions except the branch must be hoisted or sunk 1456bdd1243dSDimitry Andric assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && 1457bdd1243dSDimitry Andric "Attempting to sink and hoist preheader instructions, but not all " 1458bdd1243dSDimitry Andric "the preheader instructions are accounted for."); 1459bdd1243dSDimitry Andric 1460bdd1243dSDimitry Andric NumHoistedInsts += HoistInsts.size(); 1461bdd1243dSDimitry Andric NumSunkInsts += SinkInsts.size(); 1462bdd1243dSDimitry Andric 1463bdd1243dSDimitry Andric LLVM_DEBUG(if (VerboseFusionDebugging) { 1464bdd1243dSDimitry Andric if (!HoistInsts.empty()) 1465bdd1243dSDimitry Andric dbgs() << "Hoisting: \n"; 1466bdd1243dSDimitry Andric for (Instruction *I : HoistInsts) 1467bdd1243dSDimitry Andric dbgs() << *I << "\n"; 1468bdd1243dSDimitry Andric if (!SinkInsts.empty()) 1469bdd1243dSDimitry Andric dbgs() << "Sinking: \n"; 1470bdd1243dSDimitry Andric for (Instruction *I : SinkInsts) 1471bdd1243dSDimitry Andric dbgs() << *I << "\n"; 1472bdd1243dSDimitry Andric }); 1473bdd1243dSDimitry Andric 1474bdd1243dSDimitry Andric for (Instruction *I : HoistInsts) { 1475bdd1243dSDimitry Andric assert(I->getParent() == FC1.Preheader); 14765f757f3fSDimitry Andric I->moveBefore(*FC0.Preheader, 14775f757f3fSDimitry Andric FC0.Preheader->getTerminator()->getIterator()); 1478bdd1243dSDimitry Andric } 1479bdd1243dSDimitry Andric // insert instructions in reverse order to maintain dominance relationship 1480bdd1243dSDimitry Andric for (Instruction *I : reverse(SinkInsts)) { 1481bdd1243dSDimitry Andric assert(I->getParent() == FC1.Preheader); 14825f757f3fSDimitry Andric I->moveBefore(*FC1.ExitBlock, FC1.ExitBlock->getFirstInsertionPt()); 1483bdd1243dSDimitry Andric } 1484bdd1243dSDimitry Andric } 1485bdd1243dSDimitry Andric 14868bcb0991SDimitry Andric /// Determine if two fusion candidates have identical guards 14878bcb0991SDimitry Andric /// 14888bcb0991SDimitry Andric /// This method will determine if two fusion candidates have the same guards. 14898bcb0991SDimitry Andric /// The guards are considered the same if: 14908bcb0991SDimitry Andric /// 1. The instructions to compute the condition used in the compare are 14918bcb0991SDimitry Andric /// identical. 14928bcb0991SDimitry Andric /// 2. The successors of the guard have the same flow into/around the loop. 14938bcb0991SDimitry Andric /// If the compare instructions are identical, then the first successor of the 14948bcb0991SDimitry Andric /// guard must go to the same place (either the preheader of the loop or the 14955f757f3fSDimitry Andric /// NonLoopBlock). In other words, the first successor of both loops must 14968bcb0991SDimitry Andric /// both go into the loop (i.e., the preheader) or go around the loop (i.e., 14978bcb0991SDimitry Andric /// the NonLoopBlock). The same must be true for the second successor. 14988bcb0991SDimitry Andric bool haveIdenticalGuards(const FusionCandidate &FC0, 14998bcb0991SDimitry Andric const FusionCandidate &FC1) const { 15008bcb0991SDimitry Andric assert(FC0.GuardBranch && FC1.GuardBranch && 15018bcb0991SDimitry Andric "Expecting FC0 and FC1 to be guarded loops."); 15028bcb0991SDimitry Andric 15038bcb0991SDimitry Andric if (auto FC0CmpInst = 15048bcb0991SDimitry Andric dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) 15058bcb0991SDimitry Andric if (auto FC1CmpInst = 15068bcb0991SDimitry Andric dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) 15078bcb0991SDimitry Andric if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) 15088bcb0991SDimitry Andric return false; 15098bcb0991SDimitry Andric 15108bcb0991SDimitry Andric // The compare instructions are identical. 15118bcb0991SDimitry Andric // Now make sure the successor of the guards have the same flow into/around 15128bcb0991SDimitry Andric // the loop 15138bcb0991SDimitry Andric if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) 15148bcb0991SDimitry Andric return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); 15158bcb0991SDimitry Andric else 15168bcb0991SDimitry Andric return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); 15178bcb0991SDimitry Andric } 15188bcb0991SDimitry Andric 1519e8d8bef9SDimitry Andric /// Modify the latch branch of FC to be unconditional since successors of the 1520e8d8bef9SDimitry Andric /// branch are the same. 1521480093f4SDimitry Andric void simplifyLatchBranch(const FusionCandidate &FC) const { 1522480093f4SDimitry Andric BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); 1523480093f4SDimitry Andric if (FCLatchBranch) { 1524480093f4SDimitry Andric assert(FCLatchBranch->isConditional() && 1525480093f4SDimitry Andric FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && 1526480093f4SDimitry Andric "Expecting the two successors of FCLatchBranch to be the same"); 1527e8d8bef9SDimitry Andric BranchInst *NewBranch = 1528e8d8bef9SDimitry Andric BranchInst::Create(FCLatchBranch->getSuccessor(0)); 1529e8d8bef9SDimitry Andric ReplaceInstWithInst(FCLatchBranch, NewBranch); 1530480093f4SDimitry Andric } 1531480093f4SDimitry Andric } 1532480093f4SDimitry Andric 1533480093f4SDimitry Andric /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique 1534480093f4SDimitry Andric /// successor, then merge FC0.Latch with its unique successor. 1535480093f4SDimitry Andric void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { 15365ffd83dbSDimitry Andric moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI); 1537480093f4SDimitry Andric if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { 1538480093f4SDimitry Andric MergeBlockIntoPredecessor(Succ, &DTU, &LI); 1539480093f4SDimitry Andric DTU.flush(); 1540480093f4SDimitry Andric } 1541480093f4SDimitry Andric } 1542480093f4SDimitry Andric 15430b57cec5SDimitry Andric /// Fuse two fusion candidates, creating a new fused loop. 15440b57cec5SDimitry Andric /// 15450b57cec5SDimitry Andric /// This method contains the mechanics of fusing two loops, represented by \p 15460b57cec5SDimitry Andric /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 15470b57cec5SDimitry Andric /// postdominates \p FC0 (making them control flow equivalent). It also 15480b57cec5SDimitry Andric /// assumes that the other conditions for fusion have been met: adjacent, 15490b57cec5SDimitry Andric /// identical trip counts, and no negative distance dependencies exist that 15500b57cec5SDimitry Andric /// would prevent fusion. Thus, there is no checking for these conditions in 15510b57cec5SDimitry Andric /// this method. 15520b57cec5SDimitry Andric /// 15530b57cec5SDimitry Andric /// Fusion is performed by rewiring the CFG to update successor blocks of the 15540b57cec5SDimitry Andric /// components of tho loop. Specifically, the following changes are done: 15550b57cec5SDimitry Andric /// 15560b57cec5SDimitry Andric /// 1. The preheader of \p FC1 is removed as it is no longer necessary 15570b57cec5SDimitry Andric /// (because it is currently only a single statement block). 15580b57cec5SDimitry Andric /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. 15590b57cec5SDimitry Andric /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. 15600b57cec5SDimitry Andric /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. 15610b57cec5SDimitry Andric /// 15620b57cec5SDimitry Andric /// All of these modifications are done with dominator tree updates, thus 15630b57cec5SDimitry Andric /// keeping the dominator (and post dominator) information up-to-date. 15640b57cec5SDimitry Andric /// 15650b57cec5SDimitry Andric /// This can be improved in the future by actually merging blocks during 15660b57cec5SDimitry Andric /// fusion. For example, the preheader of \p FC1 can be merged with the 15670b57cec5SDimitry Andric /// preheader of \p FC0. This would allow loops with more than a single 15680b57cec5SDimitry Andric /// statement in the preheader to be fused. Similarly, the latch blocks of the 15690b57cec5SDimitry Andric /// two loops could also be fused into a single block. This will require 15700b57cec5SDimitry Andric /// analysis to prove it is safe to move the contents of the block past 15710b57cec5SDimitry Andric /// existing code, which currently has not been implemented. 15720b57cec5SDimitry Andric Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { 15730b57cec5SDimitry Andric assert(FC0.isValid() && FC1.isValid() && 15740b57cec5SDimitry Andric "Expecting valid fusion candidates"); 15750b57cec5SDimitry Andric 15760b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); 15770b57cec5SDimitry Andric dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); 15780b57cec5SDimitry Andric 15795ffd83dbSDimitry Andric // Move instructions from the preheader of FC1 to the end of the preheader 15805ffd83dbSDimitry Andric // of FC0. 15815ffd83dbSDimitry Andric moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI); 15825ffd83dbSDimitry Andric 15838bcb0991SDimitry Andric // Fusing guarded loops is handled slightly differently than non-guarded 15848bcb0991SDimitry Andric // loops and has been broken out into a separate method instead of trying to 15858bcb0991SDimitry Andric // intersperse the logic within a single method. 15868bcb0991SDimitry Andric if (FC0.GuardBranch) 15878bcb0991SDimitry Andric return fuseGuardedLoops(FC0, FC1); 15888bcb0991SDimitry Andric 1589e8d8bef9SDimitry Andric assert(FC1.Preheader == 1590e8d8bef9SDimitry Andric (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); 15910b57cec5SDimitry Andric assert(FC1.Preheader->size() == 1 && 15920b57cec5SDimitry Andric FC1.Preheader->getSingleSuccessor() == FC1.Header); 15930b57cec5SDimitry Andric 15940b57cec5SDimitry Andric // Remember the phi nodes originally in the header of FC0 in order to rewire 15950b57cec5SDimitry Andric // them later. However, this is only necessary if the new loop carried 15960b57cec5SDimitry Andric // values might not dominate the exiting branch. While we do not generally 15970b57cec5SDimitry Andric // test if this is the case but simply insert intermediate phi nodes, we 15980b57cec5SDimitry Andric // need to make sure these intermediate phi nodes have different 15990b57cec5SDimitry Andric // predecessors. To this end, we filter the special case where the exiting 16000b57cec5SDimitry Andric // block is the latch block of the first loop. Nothing needs to be done 16010b57cec5SDimitry Andric // anyway as all loop carried values dominate the latch and thereby also the 16020b57cec5SDimitry Andric // exiting branch. 16030b57cec5SDimitry Andric SmallVector<PHINode *, 8> OriginalFC0PHIs; 16040b57cec5SDimitry Andric if (FC0.ExitingBlock != FC0.Latch) 16050b57cec5SDimitry Andric for (PHINode &PHI : FC0.Header->phis()) 16060b57cec5SDimitry Andric OriginalFC0PHIs.push_back(&PHI); 16070b57cec5SDimitry Andric 16080b57cec5SDimitry Andric // Replace incoming blocks for header PHIs first. 16090b57cec5SDimitry Andric FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 16100b57cec5SDimitry Andric FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 16110b57cec5SDimitry Andric 16120b57cec5SDimitry Andric // Then modify the control flow and update DT and PDT. 16130b57cec5SDimitry Andric SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 16140b57cec5SDimitry Andric 16150b57cec5SDimitry Andric // The old exiting block of the first loop (FC0) has to jump to the header 16160b57cec5SDimitry Andric // of the second as we need to execute the code in the second header block 16170b57cec5SDimitry Andric // regardless of the trip count. That is, if the trip count is 0, so the 16180b57cec5SDimitry Andric // back edge is never taken, we still have to execute both loop headers, 16190b57cec5SDimitry Andric // especially (but not only!) if the second is a do-while style loop. 16200b57cec5SDimitry Andric // However, doing so might invalidate the phi nodes of the first loop as 16210b57cec5SDimitry Andric // the new values do only need to dominate their latch and not the exiting 16220b57cec5SDimitry Andric // predicate. To remedy this potential problem we always introduce phi 16230b57cec5SDimitry Andric // nodes in the header of the second loop later that select the loop carried 16240b57cec5SDimitry Andric // value, if the second header was reached through an old latch of the 16250b57cec5SDimitry Andric // first, or undef otherwise. This is sound as exiting the first implies the 16260b57cec5SDimitry Andric // second will exit too, __without__ taking the back-edge. [Their 16270b57cec5SDimitry Andric // trip-counts are equal after all. 16285f757f3fSDimitry Andric // KB: Would this sequence be simpler to just make FC0.ExitingBlock go 16290b57cec5SDimitry Andric // to FC1.Header? I think this is basically what the three sequences are 16300b57cec5SDimitry Andric // trying to accomplish; however, doing this directly in the CFG may mean 16310b57cec5SDimitry Andric // the DT/PDT becomes invalid 1632e8d8bef9SDimitry Andric if (!FC0.Peeled) { 16330b57cec5SDimitry Andric FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, 16340b57cec5SDimitry Andric FC1.Header); 16350b57cec5SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 16360b57cec5SDimitry Andric DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); 16370b57cec5SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 16380b57cec5SDimitry Andric DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1639e8d8bef9SDimitry Andric } else { 1640e8d8bef9SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 1641e8d8bef9SDimitry Andric DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); 1642e8d8bef9SDimitry Andric 1643e8d8bef9SDimitry Andric // Remove the ExitBlock of the first Loop (also not needed) 1644e8d8bef9SDimitry Andric FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1645e8d8bef9SDimitry Andric FC1.Header); 1646e8d8bef9SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 1647e8d8bef9SDimitry Andric DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1648e8d8bef9SDimitry Andric FC0.ExitBlock->getTerminator()->eraseFromParent(); 1649e8d8bef9SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 1650e8d8bef9SDimitry Andric DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1651e8d8bef9SDimitry Andric new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1652e8d8bef9SDimitry Andric } 16530b57cec5SDimitry Andric 16540b57cec5SDimitry Andric // The pre-header of L1 is not necessary anymore. 1655e8d8bef9SDimitry Andric assert(pred_empty(FC1.Preheader)); 16560b57cec5SDimitry Andric FC1.Preheader->getTerminator()->eraseFromParent(); 16570b57cec5SDimitry Andric new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 16580b57cec5SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 16590b57cec5SDimitry Andric DominatorTree::Delete, FC1.Preheader, FC1.Header)); 16600b57cec5SDimitry Andric 16610b57cec5SDimitry Andric // Moves the phi nodes from the second to the first loops header block. 16620b57cec5SDimitry Andric while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 16630b57cec5SDimitry Andric if (SE.isSCEVable(PHI->getType())) 16640b57cec5SDimitry Andric SE.forgetValue(PHI); 16650b57cec5SDimitry Andric if (PHI->hasNUsesOrMore(1)) 16660b57cec5SDimitry Andric PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 16670b57cec5SDimitry Andric else 16680b57cec5SDimitry Andric PHI->eraseFromParent(); 16690b57cec5SDimitry Andric } 16700b57cec5SDimitry Andric 16710b57cec5SDimitry Andric // Introduce new phi nodes in the second loop header to ensure 16720b57cec5SDimitry Andric // exiting the first and jumping to the header of the second does not break 16730b57cec5SDimitry Andric // the SSA property of the phis originally in the first loop. See also the 16740b57cec5SDimitry Andric // comment above. 16755f757f3fSDimitry Andric BasicBlock::iterator L1HeaderIP = FC1.Header->begin(); 16760b57cec5SDimitry Andric for (PHINode *LCPHI : OriginalFC0PHIs) { 16770b57cec5SDimitry Andric int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 16780b57cec5SDimitry Andric assert(L1LatchBBIdx >= 0 && 16790b57cec5SDimitry Andric "Expected loop carried value to be rewired at this point!"); 16800b57cec5SDimitry Andric 16810b57cec5SDimitry Andric Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 16820b57cec5SDimitry Andric 16835f757f3fSDimitry Andric PHINode *L1HeaderPHI = 16845f757f3fSDimitry Andric PHINode::Create(LCV->getType(), 2, LCPHI->getName() + ".afterFC0"); 16855f757f3fSDimitry Andric L1HeaderPHI->insertBefore(L1HeaderIP); 16860b57cec5SDimitry Andric L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1687*0fca6ea1SDimitry Andric L1HeaderPHI->addIncoming(PoisonValue::get(LCV->getType()), 16880b57cec5SDimitry Andric FC0.ExitingBlock); 16890b57cec5SDimitry Andric 16900b57cec5SDimitry Andric LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 16910b57cec5SDimitry Andric } 16920b57cec5SDimitry Andric 16930b57cec5SDimitry Andric // Replace latch terminator destinations. 16940b57cec5SDimitry Andric FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 16950b57cec5SDimitry Andric FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 16960b57cec5SDimitry Andric 1697e8d8bef9SDimitry Andric // Modify the latch branch of FC0 to be unconditional as both successors of 1698480093f4SDimitry Andric // the branch are the same. 1699480093f4SDimitry Andric simplifyLatchBranch(FC0); 1700480093f4SDimitry Andric 17010b57cec5SDimitry Andric // If FC0.Latch and FC0.ExitingBlock are the same then we have already 17020b57cec5SDimitry Andric // performed the updates above. 17030b57cec5SDimitry Andric if (FC0.Latch != FC0.ExitingBlock) 17040b57cec5SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 17050b57cec5SDimitry Andric DominatorTree::Insert, FC0.Latch, FC1.Header)); 17060b57cec5SDimitry Andric 17070b57cec5SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 17080b57cec5SDimitry Andric FC0.Latch, FC0.Header)); 17090b57cec5SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 17100b57cec5SDimitry Andric FC1.Latch, FC0.Header)); 17110b57cec5SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 17120b57cec5SDimitry Andric FC1.Latch, FC1.Header)); 17130b57cec5SDimitry Andric 17140b57cec5SDimitry Andric // Update DT/PDT 17150b57cec5SDimitry Andric DTU.applyUpdates(TreeUpdates); 17160b57cec5SDimitry Andric 17170b57cec5SDimitry Andric LI.removeBlock(FC1.Preheader); 17180b57cec5SDimitry Andric DTU.deleteBB(FC1.Preheader); 1719e8d8bef9SDimitry Andric if (FC0.Peeled) { 1720e8d8bef9SDimitry Andric LI.removeBlock(FC0.ExitBlock); 1721e8d8bef9SDimitry Andric DTU.deleteBB(FC0.ExitBlock); 1722e8d8bef9SDimitry Andric } 1723e8d8bef9SDimitry Andric 17240b57cec5SDimitry Andric DTU.flush(); 17250b57cec5SDimitry Andric 17260b57cec5SDimitry Andric // Is there a way to keep SE up-to-date so we don't need to forget the loops 17270b57cec5SDimitry Andric // and rebuild the information in subsequent passes of fusion? 1728480093f4SDimitry Andric // Note: Need to forget the loops before merging the loop latches, as 1729480093f4SDimitry Andric // mergeLatch may remove the only block in FC1. 17300b57cec5SDimitry Andric SE.forgetLoop(FC1.L); 17310b57cec5SDimitry Andric SE.forgetLoop(FC0.L); 1732bdd1243dSDimitry Andric SE.forgetLoopDispositions(); 17330b57cec5SDimitry Andric 1734480093f4SDimitry Andric // Move instructions from FC0.Latch to FC1.Latch. 1735480093f4SDimitry Andric // Note: mergeLatch requires an updated DT. 1736480093f4SDimitry Andric mergeLatch(FC0, FC1); 1737480093f4SDimitry Andric 17380b57cec5SDimitry Andric // Merge the loops. 1739e8d8bef9SDimitry Andric SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 17400b57cec5SDimitry Andric for (BasicBlock *BB : Blocks) { 17410b57cec5SDimitry Andric FC0.L->addBlockEntry(BB); 17420b57cec5SDimitry Andric FC1.L->removeBlockFromLoop(BB); 17430b57cec5SDimitry Andric if (LI.getLoopFor(BB) != FC1.L) 17440b57cec5SDimitry Andric continue; 17450b57cec5SDimitry Andric LI.changeLoopFor(BB, FC0.L); 17460b57cec5SDimitry Andric } 1747e8d8bef9SDimitry Andric while (!FC1.L->isInnermost()) { 17480b57cec5SDimitry Andric const auto &ChildLoopIt = FC1.L->begin(); 17490b57cec5SDimitry Andric Loop *ChildLoop = *ChildLoopIt; 17500b57cec5SDimitry Andric FC1.L->removeChildLoop(ChildLoopIt); 17510b57cec5SDimitry Andric FC0.L->addChildLoop(ChildLoop); 17520b57cec5SDimitry Andric } 17530b57cec5SDimitry Andric 17540b57cec5SDimitry Andric // Delete the now empty loop L1. 17550b57cec5SDimitry Andric LI.erase(FC1.L); 17560b57cec5SDimitry Andric 17570b57cec5SDimitry Andric #ifndef NDEBUG 17580b57cec5SDimitry Andric assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 17590b57cec5SDimitry Andric assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 17600b57cec5SDimitry Andric assert(PDT.verify()); 17610b57cec5SDimitry Andric LI.verify(DT); 17620b57cec5SDimitry Andric SE.verify(); 17630b57cec5SDimitry Andric #endif 17640b57cec5SDimitry Andric 17658bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "Fusion done:\n"); 17668bcb0991SDimitry Andric 17678bcb0991SDimitry Andric return FC0.L; 17688bcb0991SDimitry Andric } 17698bcb0991SDimitry Andric 17708bcb0991SDimitry Andric /// Report details on loop fusion opportunities. 17718bcb0991SDimitry Andric /// 17728bcb0991SDimitry Andric /// This template function can be used to report both successful and missed 17738bcb0991SDimitry Andric /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should 17748bcb0991SDimitry Andric /// be one of: 17758bcb0991SDimitry Andric /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful 17768bcb0991SDimitry Andric /// given two valid fusion candidates. 17778bcb0991SDimitry Andric /// - OptimizationRemark to report successful fusion of two fusion 17788bcb0991SDimitry Andric /// candidates. 17798bcb0991SDimitry Andric /// The remarks will be printed using the form: 17808bcb0991SDimitry Andric /// <path/filename>:<line number>:<column number>: [<function name>]: 17818bcb0991SDimitry Andric /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> 17828bcb0991SDimitry Andric template <typename RemarkKind> 17838bcb0991SDimitry Andric void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, 17848bcb0991SDimitry Andric llvm::Statistic &Stat) { 17858bcb0991SDimitry Andric assert(FC0.Preheader && FC1.Preheader && 17868bcb0991SDimitry Andric "Expecting valid fusion candidates"); 17878bcb0991SDimitry Andric using namespace ore; 1788fe6060f1SDimitry Andric #if LLVM_ENABLE_STATS 17898bcb0991SDimitry Andric ++Stat; 17908bcb0991SDimitry Andric ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), 17918bcb0991SDimitry Andric FC0.Preheader) 17928bcb0991SDimitry Andric << "[" << FC0.Preheader->getParent()->getName() 17938bcb0991SDimitry Andric << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) 17948bcb0991SDimitry Andric << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) 17958bcb0991SDimitry Andric << ": " << Stat.getDesc()); 1796fe6060f1SDimitry Andric #endif 17978bcb0991SDimitry Andric } 17988bcb0991SDimitry Andric 17998bcb0991SDimitry Andric /// Fuse two guarded fusion candidates, creating a new fused loop. 18008bcb0991SDimitry Andric /// 18018bcb0991SDimitry Andric /// Fusing guarded loops is handled much the same way as fusing non-guarded 18028bcb0991SDimitry Andric /// loops. The rewiring of the CFG is slightly different though, because of 18038bcb0991SDimitry Andric /// the presence of the guards around the loops and the exit blocks after the 18048bcb0991SDimitry Andric /// loop body. As such, the new loop is rewired as follows: 18058bcb0991SDimitry Andric /// 1. Keep the guard branch from FC0 and use the non-loop block target 18068bcb0991SDimitry Andric /// from the FC1 guard branch. 18078bcb0991SDimitry Andric /// 2. Remove the exit block from FC0 (this exit block should be empty 18088bcb0991SDimitry Andric /// right now). 18098bcb0991SDimitry Andric /// 3. Remove the guard branch for FC1 18108bcb0991SDimitry Andric /// 4. Remove the preheader for FC1. 18118bcb0991SDimitry Andric /// The exit block successor for the latch of FC0 is updated to be the header 18128bcb0991SDimitry Andric /// of FC1 and the non-exit block successor of the latch of FC1 is updated to 18138bcb0991SDimitry Andric /// be the header of FC0, thus creating the fused loop. 18148bcb0991SDimitry Andric Loop *fuseGuardedLoops(const FusionCandidate &FC0, 18158bcb0991SDimitry Andric const FusionCandidate &FC1) { 18168bcb0991SDimitry Andric assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); 18178bcb0991SDimitry Andric 18188bcb0991SDimitry Andric BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); 18198bcb0991SDimitry Andric BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); 18208bcb0991SDimitry Andric BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); 18218bcb0991SDimitry Andric BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); 1822e8d8bef9SDimitry Andric BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); 18238bcb0991SDimitry Andric 18245ffd83dbSDimitry Andric // Move instructions from the exit block of FC0 to the beginning of the exit 1825e8d8bef9SDimitry Andric // block of FC1, in the case that the FC0 loop has not been peeled. In the 1826e8d8bef9SDimitry Andric // case that FC0 loop is peeled, then move the instructions of the successor 1827e8d8bef9SDimitry Andric // of the FC0 Exit block to the beginning of the exit block of FC1. 1828e8d8bef9SDimitry Andric moveInstructionsToTheBeginning( 1829e8d8bef9SDimitry Andric (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock, 1830e8d8bef9SDimitry Andric DT, PDT, DI); 18315ffd83dbSDimitry Andric 18325ffd83dbSDimitry Andric // Move instructions from the guard block of FC1 to the end of the guard 18335ffd83dbSDimitry Andric // block of FC0. 18345ffd83dbSDimitry Andric moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI); 18355ffd83dbSDimitry Andric 18368bcb0991SDimitry Andric assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); 18378bcb0991SDimitry Andric 18388bcb0991SDimitry Andric SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 18398bcb0991SDimitry Andric 18408bcb0991SDimitry Andric //////////////////////////////////////////////////////////////////////////// 18418bcb0991SDimitry Andric // Update the Loop Guard 18428bcb0991SDimitry Andric //////////////////////////////////////////////////////////////////////////// 18438bcb0991SDimitry Andric // The guard for FC0 is updated to guard both FC0 and FC1. This is done by 18448bcb0991SDimitry Andric // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. 18458bcb0991SDimitry Andric // Thus, one path from the guard goes to the preheader for FC0 (and thus 18468bcb0991SDimitry Andric // executes the new fused loop) and the other path goes to the NonLoopBlock 18478bcb0991SDimitry Andric // for FC1 (where FC1 guard would have gone if FC1 was not executed). 18485ffd83dbSDimitry Andric FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock); 18498bcb0991SDimitry Andric FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); 1850e8d8bef9SDimitry Andric 1851e8d8bef9SDimitry Andric BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; 1852e8d8bef9SDimitry Andric BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header); 18538bcb0991SDimitry Andric 18548bcb0991SDimitry Andric // The guard of FC1 is not necessary anymore. 18558bcb0991SDimitry Andric FC1.GuardBranch->eraseFromParent(); 18568bcb0991SDimitry Andric new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); 18578bcb0991SDimitry Andric 18588bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 18598bcb0991SDimitry Andric DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); 18608bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 18618bcb0991SDimitry Andric DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); 18628bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 18638bcb0991SDimitry Andric DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); 18648bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 18658bcb0991SDimitry Andric DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); 18668bcb0991SDimitry Andric 1867e8d8bef9SDimitry Andric if (FC0.Peeled) { 1868e8d8bef9SDimitry Andric // Remove the Block after the ExitBlock of FC0 1869e8d8bef9SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 1870e8d8bef9SDimitry Andric DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); 1871e8d8bef9SDimitry Andric FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); 1872e8d8bef9SDimitry Andric new UnreachableInst(FC0ExitBlockSuccessor->getContext(), 1873e8d8bef9SDimitry Andric FC0ExitBlockSuccessor); 1874e8d8bef9SDimitry Andric } 1875e8d8bef9SDimitry Andric 1876e8d8bef9SDimitry Andric assert(pred_empty(FC1GuardBlock) && 18778bcb0991SDimitry Andric "Expecting guard block to have no predecessors"); 1878e8d8bef9SDimitry Andric assert(succ_empty(FC1GuardBlock) && 18798bcb0991SDimitry Andric "Expecting guard block to have no successors"); 18808bcb0991SDimitry Andric 18818bcb0991SDimitry Andric // Remember the phi nodes originally in the header of FC0 in order to rewire 18828bcb0991SDimitry Andric // them later. However, this is only necessary if the new loop carried 18838bcb0991SDimitry Andric // values might not dominate the exiting branch. While we do not generally 18848bcb0991SDimitry Andric // test if this is the case but simply insert intermediate phi nodes, we 18858bcb0991SDimitry Andric // need to make sure these intermediate phi nodes have different 18868bcb0991SDimitry Andric // predecessors. To this end, we filter the special case where the exiting 18878bcb0991SDimitry Andric // block is the latch block of the first loop. Nothing needs to be done 18888bcb0991SDimitry Andric // anyway as all loop carried values dominate the latch and thereby also the 18898bcb0991SDimitry Andric // exiting branch. 18908bcb0991SDimitry Andric // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch 18918bcb0991SDimitry Andric // (because the loops are rotated. Thus, nothing will ever be added to 18928bcb0991SDimitry Andric // OriginalFC0PHIs. 18938bcb0991SDimitry Andric SmallVector<PHINode *, 8> OriginalFC0PHIs; 18948bcb0991SDimitry Andric if (FC0.ExitingBlock != FC0.Latch) 18958bcb0991SDimitry Andric for (PHINode &PHI : FC0.Header->phis()) 18968bcb0991SDimitry Andric OriginalFC0PHIs.push_back(&PHI); 18978bcb0991SDimitry Andric 18988bcb0991SDimitry Andric assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); 18998bcb0991SDimitry Andric 19008bcb0991SDimitry Andric // Replace incoming blocks for header PHIs first. 19018bcb0991SDimitry Andric FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 19028bcb0991SDimitry Andric FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 19038bcb0991SDimitry Andric 19048bcb0991SDimitry Andric // The old exiting block of the first loop (FC0) has to jump to the header 19058bcb0991SDimitry Andric // of the second as we need to execute the code in the second header block 19068bcb0991SDimitry Andric // regardless of the trip count. That is, if the trip count is 0, so the 19078bcb0991SDimitry Andric // back edge is never taken, we still have to execute both loop headers, 19088bcb0991SDimitry Andric // especially (but not only!) if the second is a do-while style loop. 19098bcb0991SDimitry Andric // However, doing so might invalidate the phi nodes of the first loop as 19108bcb0991SDimitry Andric // the new values do only need to dominate their latch and not the exiting 19118bcb0991SDimitry Andric // predicate. To remedy this potential problem we always introduce phi 19128bcb0991SDimitry Andric // nodes in the header of the second loop later that select the loop carried 19138bcb0991SDimitry Andric // value, if the second header was reached through an old latch of the 19148bcb0991SDimitry Andric // first, or undef otherwise. This is sound as exiting the first implies the 19158bcb0991SDimitry Andric // second will exit too, __without__ taking the back-edge (their 19168bcb0991SDimitry Andric // trip-counts are equal after all). 19178bcb0991SDimitry Andric FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 19188bcb0991SDimitry Andric FC1.Header); 19198bcb0991SDimitry Andric 19208bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 19218bcb0991SDimitry Andric DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 19228bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 19238bcb0991SDimitry Andric DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 19248bcb0991SDimitry Andric 19258bcb0991SDimitry Andric // Remove FC0 Exit Block 19268bcb0991SDimitry Andric // The exit block for FC0 is no longer needed since control will flow 19278bcb0991SDimitry Andric // directly to the header of FC1. Since it is an empty block, it can be 19288bcb0991SDimitry Andric // removed at this point. 19298bcb0991SDimitry Andric // TODO: In the future, we can handle non-empty exit blocks my merging any 19308bcb0991SDimitry Andric // instructions from FC0 exit block into FC1 exit block prior to removing 19318bcb0991SDimitry Andric // the block. 1932e8d8bef9SDimitry Andric assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty"); 19338bcb0991SDimitry Andric FC0.ExitBlock->getTerminator()->eraseFromParent(); 19348bcb0991SDimitry Andric new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 19358bcb0991SDimitry Andric 19368bcb0991SDimitry Andric // Remove FC1 Preheader 19378bcb0991SDimitry Andric // The pre-header of L1 is not necessary anymore. 1938e8d8bef9SDimitry Andric assert(pred_empty(FC1.Preheader)); 19398bcb0991SDimitry Andric FC1.Preheader->getTerminator()->eraseFromParent(); 19408bcb0991SDimitry Andric new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 19418bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 19428bcb0991SDimitry Andric DominatorTree::Delete, FC1.Preheader, FC1.Header)); 19438bcb0991SDimitry Andric 19448bcb0991SDimitry Andric // Moves the phi nodes from the second to the first loops header block. 19458bcb0991SDimitry Andric while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 19468bcb0991SDimitry Andric if (SE.isSCEVable(PHI->getType())) 19478bcb0991SDimitry Andric SE.forgetValue(PHI); 19488bcb0991SDimitry Andric if (PHI->hasNUsesOrMore(1)) 19498bcb0991SDimitry Andric PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 19508bcb0991SDimitry Andric else 19518bcb0991SDimitry Andric PHI->eraseFromParent(); 19528bcb0991SDimitry Andric } 19538bcb0991SDimitry Andric 19548bcb0991SDimitry Andric // Introduce new phi nodes in the second loop header to ensure 19558bcb0991SDimitry Andric // exiting the first and jumping to the header of the second does not break 19568bcb0991SDimitry Andric // the SSA property of the phis originally in the first loop. See also the 19578bcb0991SDimitry Andric // comment above. 19585f757f3fSDimitry Andric BasicBlock::iterator L1HeaderIP = FC1.Header->begin(); 19598bcb0991SDimitry Andric for (PHINode *LCPHI : OriginalFC0PHIs) { 19608bcb0991SDimitry Andric int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 19618bcb0991SDimitry Andric assert(L1LatchBBIdx >= 0 && 19628bcb0991SDimitry Andric "Expected loop carried value to be rewired at this point!"); 19638bcb0991SDimitry Andric 19648bcb0991SDimitry Andric Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 19658bcb0991SDimitry Andric 19665f757f3fSDimitry Andric PHINode *L1HeaderPHI = 19675f757f3fSDimitry Andric PHINode::Create(LCV->getType(), 2, LCPHI->getName() + ".afterFC0"); 19685f757f3fSDimitry Andric L1HeaderPHI->insertBefore(L1HeaderIP); 19698bcb0991SDimitry Andric L1HeaderPHI->addIncoming(LCV, FC0.Latch); 19708bcb0991SDimitry Andric L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 19718bcb0991SDimitry Andric FC0.ExitingBlock); 19728bcb0991SDimitry Andric 19738bcb0991SDimitry Andric LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 19748bcb0991SDimitry Andric } 19758bcb0991SDimitry Andric 19768bcb0991SDimitry Andric // Update the latches 19778bcb0991SDimitry Andric 19788bcb0991SDimitry Andric // Replace latch terminator destinations. 19798bcb0991SDimitry Andric FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 19808bcb0991SDimitry Andric FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 19818bcb0991SDimitry Andric 1982e8d8bef9SDimitry Andric // Modify the latch branch of FC0 to be unconditional as both successors of 1983480093f4SDimitry Andric // the branch are the same. 1984480093f4SDimitry Andric simplifyLatchBranch(FC0); 1985480093f4SDimitry Andric 19868bcb0991SDimitry Andric // If FC0.Latch and FC0.ExitingBlock are the same then we have already 19878bcb0991SDimitry Andric // performed the updates above. 19888bcb0991SDimitry Andric if (FC0.Latch != FC0.ExitingBlock) 19898bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType( 19908bcb0991SDimitry Andric DominatorTree::Insert, FC0.Latch, FC1.Header)); 19918bcb0991SDimitry Andric 19928bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 19938bcb0991SDimitry Andric FC0.Latch, FC0.Header)); 19948bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 19958bcb0991SDimitry Andric FC1.Latch, FC0.Header)); 19968bcb0991SDimitry Andric TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 19978bcb0991SDimitry Andric FC1.Latch, FC1.Header)); 19988bcb0991SDimitry Andric 19998bcb0991SDimitry Andric // All done 20008bcb0991SDimitry Andric // Apply the updates to the Dominator Tree and cleanup. 20018bcb0991SDimitry Andric 2002e8d8bef9SDimitry Andric assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!"); 2003e8d8bef9SDimitry Andric assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!"); 20048bcb0991SDimitry Andric 20058bcb0991SDimitry Andric // Update DT/PDT 20068bcb0991SDimitry Andric DTU.applyUpdates(TreeUpdates); 20078bcb0991SDimitry Andric 20085ffd83dbSDimitry Andric LI.removeBlock(FC1GuardBlock); 20098bcb0991SDimitry Andric LI.removeBlock(FC1.Preheader); 20105ffd83dbSDimitry Andric LI.removeBlock(FC0.ExitBlock); 2011e8d8bef9SDimitry Andric if (FC0.Peeled) { 2012e8d8bef9SDimitry Andric LI.removeBlock(FC0ExitBlockSuccessor); 2013e8d8bef9SDimitry Andric DTU.deleteBB(FC0ExitBlockSuccessor); 2014e8d8bef9SDimitry Andric } 20155ffd83dbSDimitry Andric DTU.deleteBB(FC1GuardBlock); 20168bcb0991SDimitry Andric DTU.deleteBB(FC1.Preheader); 20178bcb0991SDimitry Andric DTU.deleteBB(FC0.ExitBlock); 20188bcb0991SDimitry Andric DTU.flush(); 20198bcb0991SDimitry Andric 20208bcb0991SDimitry Andric // Is there a way to keep SE up-to-date so we don't need to forget the loops 20218bcb0991SDimitry Andric // and rebuild the information in subsequent passes of fusion? 2022480093f4SDimitry Andric // Note: Need to forget the loops before merging the loop latches, as 2023480093f4SDimitry Andric // mergeLatch may remove the only block in FC1. 20248bcb0991SDimitry Andric SE.forgetLoop(FC1.L); 20258bcb0991SDimitry Andric SE.forgetLoop(FC0.L); 2026bdd1243dSDimitry Andric SE.forgetLoopDispositions(); 20278bcb0991SDimitry Andric 2028480093f4SDimitry Andric // Move instructions from FC0.Latch to FC1.Latch. 2029480093f4SDimitry Andric // Note: mergeLatch requires an updated DT. 2030480093f4SDimitry Andric mergeLatch(FC0, FC1); 2031480093f4SDimitry Andric 20328bcb0991SDimitry Andric // Merge the loops. 2033e8d8bef9SDimitry Andric SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 20348bcb0991SDimitry Andric for (BasicBlock *BB : Blocks) { 20358bcb0991SDimitry Andric FC0.L->addBlockEntry(BB); 20368bcb0991SDimitry Andric FC1.L->removeBlockFromLoop(BB); 20378bcb0991SDimitry Andric if (LI.getLoopFor(BB) != FC1.L) 20388bcb0991SDimitry Andric continue; 20398bcb0991SDimitry Andric LI.changeLoopFor(BB, FC0.L); 20408bcb0991SDimitry Andric } 2041e8d8bef9SDimitry Andric while (!FC1.L->isInnermost()) { 20428bcb0991SDimitry Andric const auto &ChildLoopIt = FC1.L->begin(); 20438bcb0991SDimitry Andric Loop *ChildLoop = *ChildLoopIt; 20448bcb0991SDimitry Andric FC1.L->removeChildLoop(ChildLoopIt); 20458bcb0991SDimitry Andric FC0.L->addChildLoop(ChildLoop); 20468bcb0991SDimitry Andric } 20478bcb0991SDimitry Andric 20488bcb0991SDimitry Andric // Delete the now empty loop L1. 20498bcb0991SDimitry Andric LI.erase(FC1.L); 20508bcb0991SDimitry Andric 20518bcb0991SDimitry Andric #ifndef NDEBUG 20528bcb0991SDimitry Andric assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 20538bcb0991SDimitry Andric assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 20548bcb0991SDimitry Andric assert(PDT.verify()); 20558bcb0991SDimitry Andric LI.verify(DT); 20568bcb0991SDimitry Andric SE.verify(); 20578bcb0991SDimitry Andric #endif 20580b57cec5SDimitry Andric 20590b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Fusion done:\n"); 20600b57cec5SDimitry Andric 20610b57cec5SDimitry Andric return FC0.L; 20620b57cec5SDimitry Andric } 20630b57cec5SDimitry Andric }; 20648bcb0991SDimitry Andric } // namespace 20650b57cec5SDimitry Andric 20660b57cec5SDimitry Andric PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { 20670b57cec5SDimitry Andric auto &LI = AM.getResult<LoopAnalysis>(F); 20680b57cec5SDimitry Andric auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 20690b57cec5SDimitry Andric auto &DI = AM.getResult<DependenceAnalysis>(F); 20700b57cec5SDimitry Andric auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 20710b57cec5SDimitry Andric auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); 20720b57cec5SDimitry Andric auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 2073e8d8bef9SDimitry Andric auto &AC = AM.getResult<AssumptionAnalysis>(F); 2074e8d8bef9SDimitry Andric const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); 2075*0fca6ea1SDimitry Andric const DataLayout &DL = F.getDataLayout(); 2076e8d8bef9SDimitry Andric 2077bdd1243dSDimitry Andric // Ensure loops are in simplifed form which is a pre-requisite for loop fusion 2078bdd1243dSDimitry Andric // pass. Added only for new PM since the legacy PM has already added 2079bdd1243dSDimitry Andric // LoopSimplify pass as a dependency. 2080bdd1243dSDimitry Andric bool Changed = false; 2081bdd1243dSDimitry Andric for (auto &L : LI) { 2082bdd1243dSDimitry Andric Changed |= 2083bdd1243dSDimitry Andric simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); 2084bdd1243dSDimitry Andric } 2085bdd1243dSDimitry Andric if (Changed) 2086bdd1243dSDimitry Andric PDT.recalculate(F); 2087bdd1243dSDimitry Andric 2088e8d8bef9SDimitry Andric LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 2089bdd1243dSDimitry Andric Changed |= LF.fuseLoops(F); 20900b57cec5SDimitry Andric if (!Changed) 20910b57cec5SDimitry Andric return PreservedAnalyses::all(); 20920b57cec5SDimitry Andric 20930b57cec5SDimitry Andric PreservedAnalyses PA; 20940b57cec5SDimitry Andric PA.preserve<DominatorTreeAnalysis>(); 20950b57cec5SDimitry Andric PA.preserve<PostDominatorTreeAnalysis>(); 20960b57cec5SDimitry Andric PA.preserve<ScalarEvolutionAnalysis>(); 20970b57cec5SDimitry Andric PA.preserve<LoopAnalysis>(); 20980b57cec5SDimitry Andric return PA; 20990b57cec5SDimitry Andric } 2100