xref: /openbsd-src/gnu/llvm/llvm/lib/CodeGen/MachineBlockPlacement.cpp (revision 097a140d792de8b2bbe59ad827d39eabf9b4280a)
109467b48Spatrick //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
209467b48Spatrick //
309467b48Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
409467b48Spatrick // See https://llvm.org/LICENSE.txt for license information.
509467b48Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
609467b48Spatrick //
709467b48Spatrick //===----------------------------------------------------------------------===//
809467b48Spatrick //
909467b48Spatrick // This file implements basic block placement transformations using the CFG
1009467b48Spatrick // structure and branch probability estimates.
1109467b48Spatrick //
1209467b48Spatrick // The pass strives to preserve the structure of the CFG (that is, retain
1309467b48Spatrick // a topological ordering of basic blocks) in the absence of a *strong* signal
1409467b48Spatrick // to the contrary from probabilities. However, within the CFG structure, it
1509467b48Spatrick // attempts to choose an ordering which favors placing more likely sequences of
1609467b48Spatrick // blocks adjacent to each other.
1709467b48Spatrick //
1809467b48Spatrick // The algorithm works from the inner-most loop within a function outward, and
1909467b48Spatrick // at each stage walks through the basic blocks, trying to coalesce them into
2009467b48Spatrick // sequential chains where allowed by the CFG (or demanded by heavy
2109467b48Spatrick // probabilities). Finally, it walks the blocks in topological order, and the
2209467b48Spatrick // first time it reaches a chain of basic blocks, it schedules them in the
2309467b48Spatrick // function in-order.
2409467b48Spatrick //
2509467b48Spatrick //===----------------------------------------------------------------------===//
2609467b48Spatrick 
2709467b48Spatrick #include "BranchFolding.h"
2809467b48Spatrick #include "llvm/ADT/ArrayRef.h"
2909467b48Spatrick #include "llvm/ADT/DenseMap.h"
3009467b48Spatrick #include "llvm/ADT/STLExtras.h"
3109467b48Spatrick #include "llvm/ADT/SetVector.h"
3209467b48Spatrick #include "llvm/ADT/SmallPtrSet.h"
3309467b48Spatrick #include "llvm/ADT/SmallVector.h"
3409467b48Spatrick #include "llvm/ADT/Statistic.h"
3509467b48Spatrick #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
3609467b48Spatrick #include "llvm/Analysis/ProfileSummaryInfo.h"
3709467b48Spatrick #include "llvm/CodeGen/MachineBasicBlock.h"
3809467b48Spatrick #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
3909467b48Spatrick #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
4009467b48Spatrick #include "llvm/CodeGen/MachineFunction.h"
4109467b48Spatrick #include "llvm/CodeGen/MachineFunctionPass.h"
4209467b48Spatrick #include "llvm/CodeGen/MachineLoopInfo.h"
4309467b48Spatrick #include "llvm/CodeGen/MachineModuleInfo.h"
4409467b48Spatrick #include "llvm/CodeGen/MachinePostDominators.h"
4509467b48Spatrick #include "llvm/CodeGen/MachineSizeOpts.h"
4609467b48Spatrick #include "llvm/CodeGen/TailDuplicator.h"
4709467b48Spatrick #include "llvm/CodeGen/TargetInstrInfo.h"
4809467b48Spatrick #include "llvm/CodeGen/TargetLowering.h"
4909467b48Spatrick #include "llvm/CodeGen/TargetPassConfig.h"
5009467b48Spatrick #include "llvm/CodeGen/TargetSubtargetInfo.h"
5109467b48Spatrick #include "llvm/IR/DebugLoc.h"
5209467b48Spatrick #include "llvm/IR/Function.h"
5309467b48Spatrick #include "llvm/InitializePasses.h"
5409467b48Spatrick #include "llvm/Pass.h"
5509467b48Spatrick #include "llvm/Support/Allocator.h"
5609467b48Spatrick #include "llvm/Support/BlockFrequency.h"
5709467b48Spatrick #include "llvm/Support/BranchProbability.h"
5809467b48Spatrick #include "llvm/Support/CodeGen.h"
5909467b48Spatrick #include "llvm/Support/CommandLine.h"
6009467b48Spatrick #include "llvm/Support/Compiler.h"
6109467b48Spatrick #include "llvm/Support/Debug.h"
6209467b48Spatrick #include "llvm/Support/raw_ostream.h"
6309467b48Spatrick #include "llvm/Target/TargetMachine.h"
6409467b48Spatrick #include <algorithm>
6509467b48Spatrick #include <cassert>
6609467b48Spatrick #include <cstdint>
6709467b48Spatrick #include <iterator>
6809467b48Spatrick #include <memory>
6909467b48Spatrick #include <string>
7009467b48Spatrick #include <tuple>
7109467b48Spatrick #include <utility>
7209467b48Spatrick #include <vector>
7309467b48Spatrick 
7409467b48Spatrick using namespace llvm;
7509467b48Spatrick 
7609467b48Spatrick #define DEBUG_TYPE "block-placement"
7709467b48Spatrick 
7809467b48Spatrick STATISTIC(NumCondBranches, "Number of conditional branches");
7909467b48Spatrick STATISTIC(NumUncondBranches, "Number of unconditional branches");
8009467b48Spatrick STATISTIC(CondBranchTakenFreq,
8109467b48Spatrick           "Potential frequency of taking conditional branches");
8209467b48Spatrick STATISTIC(UncondBranchTakenFreq,
8309467b48Spatrick           "Potential frequency of taking unconditional branches");
8409467b48Spatrick 
8509467b48Spatrick static cl::opt<unsigned> AlignAllBlock(
8609467b48Spatrick     "align-all-blocks",
8709467b48Spatrick     cl::desc("Force the alignment of all blocks in the function in log2 format "
8809467b48Spatrick              "(e.g 4 means align on 16B boundaries)."),
8909467b48Spatrick     cl::init(0), cl::Hidden);
9009467b48Spatrick 
9109467b48Spatrick static cl::opt<unsigned> AlignAllNonFallThruBlocks(
9209467b48Spatrick     "align-all-nofallthru-blocks",
9309467b48Spatrick     cl::desc("Force the alignment of all blocks that have no fall-through "
9409467b48Spatrick              "predecessors (i.e. don't add nops that are executed). In log2 "
9509467b48Spatrick              "format (e.g 4 means align on 16B boundaries)."),
9609467b48Spatrick     cl::init(0), cl::Hidden);
9709467b48Spatrick 
9809467b48Spatrick // FIXME: Find a good default for this flag and remove the flag.
9909467b48Spatrick static cl::opt<unsigned> ExitBlockBias(
10009467b48Spatrick     "block-placement-exit-block-bias",
10109467b48Spatrick     cl::desc("Block frequency percentage a loop exit block needs "
10209467b48Spatrick              "over the original exit to be considered the new exit."),
10309467b48Spatrick     cl::init(0), cl::Hidden);
10409467b48Spatrick 
10509467b48Spatrick // Definition:
10609467b48Spatrick // - Outlining: placement of a basic block outside the chain or hot path.
10709467b48Spatrick 
10809467b48Spatrick static cl::opt<unsigned> LoopToColdBlockRatio(
10909467b48Spatrick     "loop-to-cold-block-ratio",
11009467b48Spatrick     cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
11109467b48Spatrick              "(frequency of block) is greater than this ratio"),
11209467b48Spatrick     cl::init(5), cl::Hidden);
11309467b48Spatrick 
11409467b48Spatrick static cl::opt<bool> ForceLoopColdBlock(
11509467b48Spatrick     "force-loop-cold-block",
11609467b48Spatrick     cl::desc("Force outlining cold blocks from loops."),
11709467b48Spatrick     cl::init(false), cl::Hidden);
11809467b48Spatrick 
11909467b48Spatrick static cl::opt<bool>
12009467b48Spatrick     PreciseRotationCost("precise-rotation-cost",
12109467b48Spatrick                         cl::desc("Model the cost of loop rotation more "
12209467b48Spatrick                                  "precisely by using profile data."),
12309467b48Spatrick                         cl::init(false), cl::Hidden);
12409467b48Spatrick 
12509467b48Spatrick static cl::opt<bool>
12609467b48Spatrick     ForcePreciseRotationCost("force-precise-rotation-cost",
12709467b48Spatrick                              cl::desc("Force the use of precise cost "
12809467b48Spatrick                                       "loop rotation strategy."),
12909467b48Spatrick                              cl::init(false), cl::Hidden);
13009467b48Spatrick 
13109467b48Spatrick static cl::opt<unsigned> MisfetchCost(
13209467b48Spatrick     "misfetch-cost",
13309467b48Spatrick     cl::desc("Cost that models the probabilistic risk of an instruction "
13409467b48Spatrick              "misfetch due to a jump comparing to falling through, whose cost "
13509467b48Spatrick              "is zero."),
13609467b48Spatrick     cl::init(1), cl::Hidden);
13709467b48Spatrick 
13809467b48Spatrick static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
13909467b48Spatrick                                       cl::desc("Cost of jump instructions."),
14009467b48Spatrick                                       cl::init(1), cl::Hidden);
14109467b48Spatrick static cl::opt<bool>
14209467b48Spatrick TailDupPlacement("tail-dup-placement",
14309467b48Spatrick               cl::desc("Perform tail duplication during placement. "
14409467b48Spatrick                        "Creates more fallthrough opportunites in "
14509467b48Spatrick                        "outline branches."),
14609467b48Spatrick               cl::init(true), cl::Hidden);
14709467b48Spatrick 
14809467b48Spatrick static cl::opt<bool>
14909467b48Spatrick BranchFoldPlacement("branch-fold-placement",
15009467b48Spatrick               cl::desc("Perform branch folding during placement. "
15109467b48Spatrick                        "Reduces code size."),
15209467b48Spatrick               cl::init(true), cl::Hidden);
15309467b48Spatrick 
15409467b48Spatrick // Heuristic for tail duplication.
15509467b48Spatrick static cl::opt<unsigned> TailDupPlacementThreshold(
15609467b48Spatrick     "tail-dup-placement-threshold",
15709467b48Spatrick     cl::desc("Instruction cutoff for tail duplication during layout. "
15809467b48Spatrick              "Tail merging during layout is forced to have a threshold "
15909467b48Spatrick              "that won't conflict."), cl::init(2),
16009467b48Spatrick     cl::Hidden);
16109467b48Spatrick 
16209467b48Spatrick // Heuristic for aggressive tail duplication.
16309467b48Spatrick static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
16409467b48Spatrick     "tail-dup-placement-aggressive-threshold",
16509467b48Spatrick     cl::desc("Instruction cutoff for aggressive tail duplication during "
16609467b48Spatrick              "layout. Used at -O3. Tail merging during layout is forced to "
16709467b48Spatrick              "have a threshold that won't conflict."), cl::init(4),
16809467b48Spatrick     cl::Hidden);
16909467b48Spatrick 
17009467b48Spatrick // Heuristic for tail duplication.
17109467b48Spatrick static cl::opt<unsigned> TailDupPlacementPenalty(
17209467b48Spatrick     "tail-dup-placement-penalty",
17309467b48Spatrick     cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
17409467b48Spatrick              "Copying can increase fallthrough, but it also increases icache "
17509467b48Spatrick              "pressure. This parameter controls the penalty to account for that. "
17609467b48Spatrick              "Percent as integer."),
17709467b48Spatrick     cl::init(2),
17809467b48Spatrick     cl::Hidden);
17909467b48Spatrick 
18009467b48Spatrick // Heuristic for triangle chains.
18109467b48Spatrick static cl::opt<unsigned> TriangleChainCount(
18209467b48Spatrick     "triangle-chain-count",
18309467b48Spatrick     cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
18409467b48Spatrick              "triangle tail duplication heuristic to kick in. 0 to disable."),
18509467b48Spatrick     cl::init(2),
18609467b48Spatrick     cl::Hidden);
18709467b48Spatrick 
18809467b48Spatrick extern cl::opt<unsigned> StaticLikelyProb;
18909467b48Spatrick extern cl::opt<unsigned> ProfileLikelyProb;
19009467b48Spatrick 
19109467b48Spatrick // Internal option used to control BFI display only after MBP pass.
19209467b48Spatrick // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
19309467b48Spatrick // -view-block-layout-with-bfi=
19409467b48Spatrick extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
19509467b48Spatrick 
19609467b48Spatrick // Command line option to specify the name of the function for CFG dump
19709467b48Spatrick // Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name=
19809467b48Spatrick extern cl::opt<std::string> ViewBlockFreqFuncName;
19909467b48Spatrick 
20009467b48Spatrick namespace {
20109467b48Spatrick 
20209467b48Spatrick class BlockChain;
20309467b48Spatrick 
20409467b48Spatrick /// Type for our function-wide basic block -> block chain mapping.
20509467b48Spatrick using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
20609467b48Spatrick 
20709467b48Spatrick /// A chain of blocks which will be laid out contiguously.
20809467b48Spatrick ///
20909467b48Spatrick /// This is the datastructure representing a chain of consecutive blocks that
21009467b48Spatrick /// are profitable to layout together in order to maximize fallthrough
21109467b48Spatrick /// probabilities and code locality. We also can use a block chain to represent
21209467b48Spatrick /// a sequence of basic blocks which have some external (correctness)
21309467b48Spatrick /// requirement for sequential layout.
21409467b48Spatrick ///
21509467b48Spatrick /// Chains can be built around a single basic block and can be merged to grow
21609467b48Spatrick /// them. They participate in a block-to-chain mapping, which is updated
21709467b48Spatrick /// automatically as chains are merged together.
21809467b48Spatrick class BlockChain {
21909467b48Spatrick   /// The sequence of blocks belonging to this chain.
22009467b48Spatrick   ///
22109467b48Spatrick   /// This is the sequence of blocks for a particular chain. These will be laid
22209467b48Spatrick   /// out in-order within the function.
22309467b48Spatrick   SmallVector<MachineBasicBlock *, 4> Blocks;
22409467b48Spatrick 
22509467b48Spatrick   /// A handle to the function-wide basic block to block chain mapping.
22609467b48Spatrick   ///
22709467b48Spatrick   /// This is retained in each block chain to simplify the computation of child
22809467b48Spatrick   /// block chains for SCC-formation and iteration. We store the edges to child
22909467b48Spatrick   /// basic blocks, and map them back to their associated chains using this
23009467b48Spatrick   /// structure.
23109467b48Spatrick   BlockToChainMapType &BlockToChain;
23209467b48Spatrick 
23309467b48Spatrick public:
23409467b48Spatrick   /// Construct a new BlockChain.
23509467b48Spatrick   ///
23609467b48Spatrick   /// This builds a new block chain representing a single basic block in the
23709467b48Spatrick   /// function. It also registers itself as the chain that block participates
23809467b48Spatrick   /// in with the BlockToChain mapping.
23909467b48Spatrick   BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
24009467b48Spatrick       : Blocks(1, BB), BlockToChain(BlockToChain) {
24109467b48Spatrick     assert(BB && "Cannot create a chain with a null basic block");
24209467b48Spatrick     BlockToChain[BB] = this;
24309467b48Spatrick   }
24409467b48Spatrick 
24509467b48Spatrick   /// Iterator over blocks within the chain.
24609467b48Spatrick   using iterator = SmallVectorImpl<MachineBasicBlock *>::iterator;
24709467b48Spatrick   using const_iterator = SmallVectorImpl<MachineBasicBlock *>::const_iterator;
24809467b48Spatrick 
24909467b48Spatrick   /// Beginning of blocks within the chain.
25009467b48Spatrick   iterator begin() { return Blocks.begin(); }
25109467b48Spatrick   const_iterator begin() const { return Blocks.begin(); }
25209467b48Spatrick 
25309467b48Spatrick   /// End of blocks within the chain.
25409467b48Spatrick   iterator end() { return Blocks.end(); }
25509467b48Spatrick   const_iterator end() const { return Blocks.end(); }
25609467b48Spatrick 
25709467b48Spatrick   bool remove(MachineBasicBlock* BB) {
25809467b48Spatrick     for(iterator i = begin(); i != end(); ++i) {
25909467b48Spatrick       if (*i == BB) {
26009467b48Spatrick         Blocks.erase(i);
26109467b48Spatrick         return true;
26209467b48Spatrick       }
26309467b48Spatrick     }
26409467b48Spatrick     return false;
26509467b48Spatrick   }
26609467b48Spatrick 
26709467b48Spatrick   /// Merge a block chain into this one.
26809467b48Spatrick   ///
26909467b48Spatrick   /// This routine merges a block chain into this one. It takes care of forming
27009467b48Spatrick   /// a contiguous sequence of basic blocks, updating the edge list, and
27109467b48Spatrick   /// updating the block -> chain mapping. It does not free or tear down the
27209467b48Spatrick   /// old chain, but the old chain's block list is no longer valid.
27309467b48Spatrick   void merge(MachineBasicBlock *BB, BlockChain *Chain) {
27409467b48Spatrick     assert(BB && "Can't merge a null block.");
27509467b48Spatrick     assert(!Blocks.empty() && "Can't merge into an empty chain.");
27609467b48Spatrick 
27709467b48Spatrick     // Fast path in case we don't have a chain already.
27809467b48Spatrick     if (!Chain) {
27909467b48Spatrick       assert(!BlockToChain[BB] &&
28009467b48Spatrick              "Passed chain is null, but BB has entry in BlockToChain.");
28109467b48Spatrick       Blocks.push_back(BB);
28209467b48Spatrick       BlockToChain[BB] = this;
28309467b48Spatrick       return;
28409467b48Spatrick     }
28509467b48Spatrick 
28609467b48Spatrick     assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
28709467b48Spatrick     assert(Chain->begin() != Chain->end());
28809467b48Spatrick 
28909467b48Spatrick     // Update the incoming blocks to point to this chain, and add them to the
29009467b48Spatrick     // chain structure.
29109467b48Spatrick     for (MachineBasicBlock *ChainBB : *Chain) {
29209467b48Spatrick       Blocks.push_back(ChainBB);
29309467b48Spatrick       assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
29409467b48Spatrick       BlockToChain[ChainBB] = this;
29509467b48Spatrick     }
29609467b48Spatrick   }
29709467b48Spatrick 
29809467b48Spatrick #ifndef NDEBUG
29909467b48Spatrick   /// Dump the blocks in this chain.
30009467b48Spatrick   LLVM_DUMP_METHOD void dump() {
30109467b48Spatrick     for (MachineBasicBlock *MBB : *this)
30209467b48Spatrick       MBB->dump();
30309467b48Spatrick   }
30409467b48Spatrick #endif // NDEBUG
30509467b48Spatrick 
30609467b48Spatrick   /// Count of predecessors of any block within the chain which have not
30709467b48Spatrick   /// yet been scheduled.  In general, we will delay scheduling this chain
30809467b48Spatrick   /// until those predecessors are scheduled (or we find a sufficiently good
30909467b48Spatrick   /// reason to override this heuristic.)  Note that when forming loop chains,
31009467b48Spatrick   /// blocks outside the loop are ignored and treated as if they were already
31109467b48Spatrick   /// scheduled.
31209467b48Spatrick   ///
31309467b48Spatrick   /// Note: This field is reinitialized multiple times - once for each loop,
31409467b48Spatrick   /// and then once for the function as a whole.
31509467b48Spatrick   unsigned UnscheduledPredecessors = 0;
31609467b48Spatrick };
31709467b48Spatrick 
31809467b48Spatrick class MachineBlockPlacement : public MachineFunctionPass {
31909467b48Spatrick   /// A type for a block filter set.
32009467b48Spatrick   using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
32109467b48Spatrick 
32209467b48Spatrick   /// Pair struct containing basic block and taildup profitability
32309467b48Spatrick   struct BlockAndTailDupResult {
32409467b48Spatrick     MachineBasicBlock *BB;
32509467b48Spatrick     bool ShouldTailDup;
32609467b48Spatrick   };
32709467b48Spatrick 
32809467b48Spatrick   /// Triple struct containing edge weight and the edge.
32909467b48Spatrick   struct WeightedEdge {
33009467b48Spatrick     BlockFrequency Weight;
33109467b48Spatrick     MachineBasicBlock *Src;
33209467b48Spatrick     MachineBasicBlock *Dest;
33309467b48Spatrick   };
33409467b48Spatrick 
33509467b48Spatrick   /// work lists of blocks that are ready to be laid out
33609467b48Spatrick   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
33709467b48Spatrick   SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
33809467b48Spatrick 
33909467b48Spatrick   /// Edges that have already been computed as optimal.
34009467b48Spatrick   DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
34109467b48Spatrick 
34209467b48Spatrick   /// Machine Function
34309467b48Spatrick   MachineFunction *F;
34409467b48Spatrick 
34509467b48Spatrick   /// A handle to the branch probability pass.
34609467b48Spatrick   const MachineBranchProbabilityInfo *MBPI;
34709467b48Spatrick 
34809467b48Spatrick   /// A handle to the function-wide block frequency pass.
349*097a140dSpatrick   std::unique_ptr<MBFIWrapper> MBFI;
35009467b48Spatrick 
35109467b48Spatrick   /// A handle to the loop info.
35209467b48Spatrick   MachineLoopInfo *MLI;
35309467b48Spatrick 
35409467b48Spatrick   /// Preferred loop exit.
35509467b48Spatrick   /// Member variable for convenience. It may be removed by duplication deep
35609467b48Spatrick   /// in the call stack.
35709467b48Spatrick   MachineBasicBlock *PreferredLoopExit;
35809467b48Spatrick 
35909467b48Spatrick   /// A handle to the target's instruction info.
36009467b48Spatrick   const TargetInstrInfo *TII;
36109467b48Spatrick 
36209467b48Spatrick   /// A handle to the target's lowering info.
36309467b48Spatrick   const TargetLoweringBase *TLI;
36409467b48Spatrick 
36509467b48Spatrick   /// A handle to the post dominator tree.
36609467b48Spatrick   MachinePostDominatorTree *MPDT;
36709467b48Spatrick 
36809467b48Spatrick   ProfileSummaryInfo *PSI;
36909467b48Spatrick 
37009467b48Spatrick   /// Duplicator used to duplicate tails during placement.
37109467b48Spatrick   ///
37209467b48Spatrick   /// Placement decisions can open up new tail duplication opportunities, but
37309467b48Spatrick   /// since tail duplication affects placement decisions of later blocks, it
37409467b48Spatrick   /// must be done inline.
37509467b48Spatrick   TailDuplicator TailDup;
37609467b48Spatrick 
377*097a140dSpatrick   /// Partial tail duplication threshold.
378*097a140dSpatrick   BlockFrequency DupThreshold;
379*097a140dSpatrick 
38009467b48Spatrick   /// Allocator and owner of BlockChain structures.
38109467b48Spatrick   ///
38209467b48Spatrick   /// We build BlockChains lazily while processing the loop structure of
38309467b48Spatrick   /// a function. To reduce malloc traffic, we allocate them using this
38409467b48Spatrick   /// slab-like allocator, and destroy them after the pass completes. An
38509467b48Spatrick   /// important guarantee is that this allocator produces stable pointers to
38609467b48Spatrick   /// the chains.
38709467b48Spatrick   SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
38809467b48Spatrick 
38909467b48Spatrick   /// Function wide BasicBlock to BlockChain mapping.
39009467b48Spatrick   ///
39109467b48Spatrick   /// This mapping allows efficiently moving from any given basic block to the
39209467b48Spatrick   /// BlockChain it participates in, if any. We use it to, among other things,
39309467b48Spatrick   /// allow implicitly defining edges between chains as the existing edges
39409467b48Spatrick   /// between basic blocks.
39509467b48Spatrick   DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
39609467b48Spatrick 
39709467b48Spatrick #ifndef NDEBUG
39809467b48Spatrick   /// The set of basic blocks that have terminators that cannot be fully
39909467b48Spatrick   /// analyzed.  These basic blocks cannot be re-ordered safely by
40009467b48Spatrick   /// MachineBlockPlacement, and we must preserve physical layout of these
40109467b48Spatrick   /// blocks and their successors through the pass.
40209467b48Spatrick   SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
40309467b48Spatrick #endif
40409467b48Spatrick 
405*097a140dSpatrick   /// Scale the DupThreshold according to basic block size.
406*097a140dSpatrick   BlockFrequency scaleThreshold(MachineBasicBlock *BB);
407*097a140dSpatrick   void initDupThreshold();
408*097a140dSpatrick 
40909467b48Spatrick   /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
41009467b48Spatrick   /// if the count goes to 0, add them to the appropriate work list.
41109467b48Spatrick   void markChainSuccessors(
41209467b48Spatrick       const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
41309467b48Spatrick       const BlockFilterSet *BlockFilter = nullptr);
41409467b48Spatrick 
41509467b48Spatrick   /// Decrease the UnscheduledPredecessors count for a single block, and
41609467b48Spatrick   /// if the count goes to 0, add them to the appropriate work list.
41709467b48Spatrick   void markBlockSuccessors(
41809467b48Spatrick       const BlockChain &Chain, const MachineBasicBlock *BB,
41909467b48Spatrick       const MachineBasicBlock *LoopHeaderBB,
42009467b48Spatrick       const BlockFilterSet *BlockFilter = nullptr);
42109467b48Spatrick 
42209467b48Spatrick   BranchProbability
42309467b48Spatrick   collectViableSuccessors(
42409467b48Spatrick       const MachineBasicBlock *BB, const BlockChain &Chain,
42509467b48Spatrick       const BlockFilterSet *BlockFilter,
42609467b48Spatrick       SmallVector<MachineBasicBlock *, 4> &Successors);
42709467b48Spatrick   bool shouldPredBlockBeOutlined(
42809467b48Spatrick       const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
42909467b48Spatrick       const BlockChain &Chain, const BlockFilterSet *BlockFilter,
43009467b48Spatrick       BranchProbability SuccProb, BranchProbability HotProb);
431*097a140dSpatrick   bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
432*097a140dSpatrick                        BlockFilterSet *BlockFilter);
433*097a140dSpatrick   void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
434*097a140dSpatrick                                MachineBasicBlock *BB,
435*097a140dSpatrick                                BlockFilterSet *BlockFilter);
43609467b48Spatrick   bool repeatedlyTailDuplicateBlock(
43709467b48Spatrick       MachineBasicBlock *BB, MachineBasicBlock *&LPred,
43809467b48Spatrick       const MachineBasicBlock *LoopHeaderBB,
43909467b48Spatrick       BlockChain &Chain, BlockFilterSet *BlockFilter,
44009467b48Spatrick       MachineFunction::iterator &PrevUnplacedBlockIt);
44109467b48Spatrick   bool maybeTailDuplicateBlock(
44209467b48Spatrick       MachineBasicBlock *BB, MachineBasicBlock *LPred,
44309467b48Spatrick       BlockChain &Chain, BlockFilterSet *BlockFilter,
44409467b48Spatrick       MachineFunction::iterator &PrevUnplacedBlockIt,
44509467b48Spatrick       bool &DuplicatedToLPred);
44609467b48Spatrick   bool hasBetterLayoutPredecessor(
44709467b48Spatrick       const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
44809467b48Spatrick       const BlockChain &SuccChain, BranchProbability SuccProb,
44909467b48Spatrick       BranchProbability RealSuccProb, const BlockChain &Chain,
45009467b48Spatrick       const BlockFilterSet *BlockFilter);
45109467b48Spatrick   BlockAndTailDupResult selectBestSuccessor(
45209467b48Spatrick       const MachineBasicBlock *BB, const BlockChain &Chain,
45309467b48Spatrick       const BlockFilterSet *BlockFilter);
45409467b48Spatrick   MachineBasicBlock *selectBestCandidateBlock(
45509467b48Spatrick       const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
45609467b48Spatrick   MachineBasicBlock *getFirstUnplacedBlock(
45709467b48Spatrick       const BlockChain &PlacedChain,
45809467b48Spatrick       MachineFunction::iterator &PrevUnplacedBlockIt,
45909467b48Spatrick       const BlockFilterSet *BlockFilter);
46009467b48Spatrick 
46109467b48Spatrick   /// Add a basic block to the work list if it is appropriate.
46209467b48Spatrick   ///
46309467b48Spatrick   /// If the optional parameter BlockFilter is provided, only MBB
46409467b48Spatrick   /// present in the set will be added to the worklist. If nullptr
46509467b48Spatrick   /// is provided, no filtering occurs.
46609467b48Spatrick   void fillWorkLists(const MachineBasicBlock *MBB,
46709467b48Spatrick                      SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
46809467b48Spatrick                      const BlockFilterSet *BlockFilter);
46909467b48Spatrick 
47009467b48Spatrick   void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
47109467b48Spatrick                   BlockFilterSet *BlockFilter = nullptr);
47209467b48Spatrick   bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
47309467b48Spatrick                                const MachineBasicBlock *OldTop);
47409467b48Spatrick   bool hasViableTopFallthrough(const MachineBasicBlock *Top,
47509467b48Spatrick                                const BlockFilterSet &LoopBlockSet);
47609467b48Spatrick   BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
47709467b48Spatrick                                     const BlockFilterSet &LoopBlockSet);
47809467b48Spatrick   BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
47909467b48Spatrick                                   const MachineBasicBlock *OldTop,
48009467b48Spatrick                                   const MachineBasicBlock *ExitBB,
48109467b48Spatrick                                   const BlockFilterSet &LoopBlockSet);
48209467b48Spatrick   MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
48309467b48Spatrick       const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
48409467b48Spatrick   MachineBasicBlock *findBestLoopTop(
48509467b48Spatrick       const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
48609467b48Spatrick   MachineBasicBlock *findBestLoopExit(
48709467b48Spatrick       const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
48809467b48Spatrick       BlockFrequency &ExitFreq);
48909467b48Spatrick   BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
49009467b48Spatrick   void buildLoopChains(const MachineLoop &L);
49109467b48Spatrick   void rotateLoop(
49209467b48Spatrick       BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
49309467b48Spatrick       BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
49409467b48Spatrick   void rotateLoopWithProfile(
49509467b48Spatrick       BlockChain &LoopChain, const MachineLoop &L,
49609467b48Spatrick       const BlockFilterSet &LoopBlockSet);
49709467b48Spatrick   void buildCFGChains();
49809467b48Spatrick   void optimizeBranches();
49909467b48Spatrick   void alignBlocks();
50009467b48Spatrick   /// Returns true if a block should be tail-duplicated to increase fallthrough
50109467b48Spatrick   /// opportunities.
50209467b48Spatrick   bool shouldTailDuplicate(MachineBasicBlock *BB);
50309467b48Spatrick   /// Check the edge frequencies to see if tail duplication will increase
50409467b48Spatrick   /// fallthroughs.
50509467b48Spatrick   bool isProfitableToTailDup(
50609467b48Spatrick     const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
50709467b48Spatrick     BranchProbability QProb,
50809467b48Spatrick     const BlockChain &Chain, const BlockFilterSet *BlockFilter);
50909467b48Spatrick 
51009467b48Spatrick   /// Check for a trellis layout.
51109467b48Spatrick   bool isTrellis(const MachineBasicBlock *BB,
51209467b48Spatrick                  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
51309467b48Spatrick                  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
51409467b48Spatrick 
51509467b48Spatrick   /// Get the best successor given a trellis layout.
51609467b48Spatrick   BlockAndTailDupResult getBestTrellisSuccessor(
51709467b48Spatrick       const MachineBasicBlock *BB,
51809467b48Spatrick       const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
51909467b48Spatrick       BranchProbability AdjustedSumProb, const BlockChain &Chain,
52009467b48Spatrick       const BlockFilterSet *BlockFilter);
52109467b48Spatrick 
52209467b48Spatrick   /// Get the best pair of non-conflicting edges.
52309467b48Spatrick   static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
52409467b48Spatrick       const MachineBasicBlock *BB,
52509467b48Spatrick       MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges);
52609467b48Spatrick 
52709467b48Spatrick   /// Returns true if a block can tail duplicate into all unplaced
52809467b48Spatrick   /// predecessors. Filters based on loop.
52909467b48Spatrick   bool canTailDuplicateUnplacedPreds(
53009467b48Spatrick       const MachineBasicBlock *BB, MachineBasicBlock *Succ,
53109467b48Spatrick       const BlockChain &Chain, const BlockFilterSet *BlockFilter);
53209467b48Spatrick 
53309467b48Spatrick   /// Find chains of triangles to tail-duplicate where a global analysis works,
53409467b48Spatrick   /// but a local analysis would not find them.
53509467b48Spatrick   void precomputeTriangleChains();
53609467b48Spatrick 
53709467b48Spatrick public:
53809467b48Spatrick   static char ID; // Pass identification, replacement for typeid
53909467b48Spatrick 
54009467b48Spatrick   MachineBlockPlacement() : MachineFunctionPass(ID) {
54109467b48Spatrick     initializeMachineBlockPlacementPass(*PassRegistry::getPassRegistry());
54209467b48Spatrick   }
54309467b48Spatrick 
54409467b48Spatrick   bool runOnMachineFunction(MachineFunction &F) override;
54509467b48Spatrick 
54609467b48Spatrick   bool allowTailDupPlacement() const {
54709467b48Spatrick     assert(F);
54809467b48Spatrick     return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
54909467b48Spatrick   }
55009467b48Spatrick 
55109467b48Spatrick   void getAnalysisUsage(AnalysisUsage &AU) const override {
55209467b48Spatrick     AU.addRequired<MachineBranchProbabilityInfo>();
55309467b48Spatrick     AU.addRequired<MachineBlockFrequencyInfo>();
55409467b48Spatrick     if (TailDupPlacement)
55509467b48Spatrick       AU.addRequired<MachinePostDominatorTree>();
55609467b48Spatrick     AU.addRequired<MachineLoopInfo>();
55709467b48Spatrick     AU.addRequired<ProfileSummaryInfoWrapperPass>();
55809467b48Spatrick     AU.addRequired<TargetPassConfig>();
55909467b48Spatrick     MachineFunctionPass::getAnalysisUsage(AU);
56009467b48Spatrick   }
56109467b48Spatrick };
56209467b48Spatrick 
56309467b48Spatrick } // end anonymous namespace
56409467b48Spatrick 
56509467b48Spatrick char MachineBlockPlacement::ID = 0;
56609467b48Spatrick 
56709467b48Spatrick char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
56809467b48Spatrick 
56909467b48Spatrick INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
57009467b48Spatrick                       "Branch Probability Basic Block Placement", false, false)
57109467b48Spatrick INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
57209467b48Spatrick INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
57309467b48Spatrick INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
57409467b48Spatrick INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
57509467b48Spatrick INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
57609467b48Spatrick INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
57709467b48Spatrick                     "Branch Probability Basic Block Placement", false, false)
57809467b48Spatrick 
57909467b48Spatrick #ifndef NDEBUG
58009467b48Spatrick /// Helper to print the name of a MBB.
58109467b48Spatrick ///
58209467b48Spatrick /// Only used by debug logging.
58309467b48Spatrick static std::string getBlockName(const MachineBasicBlock *BB) {
58409467b48Spatrick   std::string Result;
58509467b48Spatrick   raw_string_ostream OS(Result);
58609467b48Spatrick   OS << printMBBReference(*BB);
58709467b48Spatrick   OS << " ('" << BB->getName() << "')";
58809467b48Spatrick   OS.flush();
58909467b48Spatrick   return Result;
59009467b48Spatrick }
59109467b48Spatrick #endif
59209467b48Spatrick 
59309467b48Spatrick /// Mark a chain's successors as having one fewer preds.
59409467b48Spatrick ///
59509467b48Spatrick /// When a chain is being merged into the "placed" chain, this routine will
59609467b48Spatrick /// quickly walk the successors of each block in the chain and mark them as
59709467b48Spatrick /// having one fewer active predecessor. It also adds any successors of this
59809467b48Spatrick /// chain which reach the zero-predecessor state to the appropriate worklist.
59909467b48Spatrick void MachineBlockPlacement::markChainSuccessors(
60009467b48Spatrick     const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
60109467b48Spatrick     const BlockFilterSet *BlockFilter) {
60209467b48Spatrick   // Walk all the blocks in this chain, marking their successors as having
60309467b48Spatrick   // a predecessor placed.
60409467b48Spatrick   for (MachineBasicBlock *MBB : Chain) {
60509467b48Spatrick     markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
60609467b48Spatrick   }
60709467b48Spatrick }
60809467b48Spatrick 
60909467b48Spatrick /// Mark a single block's successors as having one fewer preds.
61009467b48Spatrick ///
61109467b48Spatrick /// Under normal circumstances, this is only called by markChainSuccessors,
61209467b48Spatrick /// but if a block that was to be placed is completely tail-duplicated away,
61309467b48Spatrick /// and was duplicated into the chain end, we need to redo markBlockSuccessors
61409467b48Spatrick /// for just that block.
61509467b48Spatrick void MachineBlockPlacement::markBlockSuccessors(
61609467b48Spatrick     const BlockChain &Chain, const MachineBasicBlock *MBB,
61709467b48Spatrick     const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
61809467b48Spatrick   // Add any successors for which this is the only un-placed in-loop
61909467b48Spatrick   // predecessor to the worklist as a viable candidate for CFG-neutral
62009467b48Spatrick   // placement. No subsequent placement of this block will violate the CFG
62109467b48Spatrick   // shape, so we get to use heuristics to choose a favorable placement.
62209467b48Spatrick   for (MachineBasicBlock *Succ : MBB->successors()) {
62309467b48Spatrick     if (BlockFilter && !BlockFilter->count(Succ))
62409467b48Spatrick       continue;
62509467b48Spatrick     BlockChain &SuccChain = *BlockToChain[Succ];
62609467b48Spatrick     // Disregard edges within a fixed chain, or edges to the loop header.
62709467b48Spatrick     if (&Chain == &SuccChain || Succ == LoopHeaderBB)
62809467b48Spatrick       continue;
62909467b48Spatrick 
63009467b48Spatrick     // This is a cross-chain edge that is within the loop, so decrement the
63109467b48Spatrick     // loop predecessor count of the destination chain.
63209467b48Spatrick     if (SuccChain.UnscheduledPredecessors == 0 ||
63309467b48Spatrick         --SuccChain.UnscheduledPredecessors > 0)
63409467b48Spatrick       continue;
63509467b48Spatrick 
63609467b48Spatrick     auto *NewBB = *SuccChain.begin();
63709467b48Spatrick     if (NewBB->isEHPad())
63809467b48Spatrick       EHPadWorkList.push_back(NewBB);
63909467b48Spatrick     else
64009467b48Spatrick       BlockWorkList.push_back(NewBB);
64109467b48Spatrick   }
64209467b48Spatrick }
64309467b48Spatrick 
64409467b48Spatrick /// This helper function collects the set of successors of block
64509467b48Spatrick /// \p BB that are allowed to be its layout successors, and return
64609467b48Spatrick /// the total branch probability of edges from \p BB to those
64709467b48Spatrick /// blocks.
64809467b48Spatrick BranchProbability MachineBlockPlacement::collectViableSuccessors(
64909467b48Spatrick     const MachineBasicBlock *BB, const BlockChain &Chain,
65009467b48Spatrick     const BlockFilterSet *BlockFilter,
65109467b48Spatrick     SmallVector<MachineBasicBlock *, 4> &Successors) {
65209467b48Spatrick   // Adjust edge probabilities by excluding edges pointing to blocks that is
65309467b48Spatrick   // either not in BlockFilter or is already in the current chain. Consider the
65409467b48Spatrick   // following CFG:
65509467b48Spatrick   //
65609467b48Spatrick   //     --->A
65709467b48Spatrick   //     |  / \
65809467b48Spatrick   //     | B   C
65909467b48Spatrick   //     |  \ / \
66009467b48Spatrick   //     ----D   E
66109467b48Spatrick   //
66209467b48Spatrick   // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
66309467b48Spatrick   // A->C is chosen as a fall-through, D won't be selected as a successor of C
66409467b48Spatrick   // due to CFG constraint (the probability of C->D is not greater than
66509467b48Spatrick   // HotProb to break topo-order). If we exclude E that is not in BlockFilter
66609467b48Spatrick   // when calculating the probability of C->D, D will be selected and we
66709467b48Spatrick   // will get A C D B as the layout of this loop.
66809467b48Spatrick   auto AdjustedSumProb = BranchProbability::getOne();
66909467b48Spatrick   for (MachineBasicBlock *Succ : BB->successors()) {
67009467b48Spatrick     bool SkipSucc = false;
67109467b48Spatrick     if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
67209467b48Spatrick       SkipSucc = true;
67309467b48Spatrick     } else {
67409467b48Spatrick       BlockChain *SuccChain = BlockToChain[Succ];
67509467b48Spatrick       if (SuccChain == &Chain) {
67609467b48Spatrick         SkipSucc = true;
67709467b48Spatrick       } else if (Succ != *SuccChain->begin()) {
67809467b48Spatrick         LLVM_DEBUG(dbgs() << "    " << getBlockName(Succ)
67909467b48Spatrick                           << " -> Mid chain!\n");
68009467b48Spatrick         continue;
68109467b48Spatrick       }
68209467b48Spatrick     }
68309467b48Spatrick     if (SkipSucc)
68409467b48Spatrick       AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
68509467b48Spatrick     else
68609467b48Spatrick       Successors.push_back(Succ);
68709467b48Spatrick   }
68809467b48Spatrick 
68909467b48Spatrick   return AdjustedSumProb;
69009467b48Spatrick }
69109467b48Spatrick 
69209467b48Spatrick /// The helper function returns the branch probability that is adjusted
69309467b48Spatrick /// or normalized over the new total \p AdjustedSumProb.
69409467b48Spatrick static BranchProbability
69509467b48Spatrick getAdjustedProbability(BranchProbability OrigProb,
69609467b48Spatrick                        BranchProbability AdjustedSumProb) {
69709467b48Spatrick   BranchProbability SuccProb;
69809467b48Spatrick   uint32_t SuccProbN = OrigProb.getNumerator();
69909467b48Spatrick   uint32_t SuccProbD = AdjustedSumProb.getNumerator();
70009467b48Spatrick   if (SuccProbN >= SuccProbD)
70109467b48Spatrick     SuccProb = BranchProbability::getOne();
70209467b48Spatrick   else
70309467b48Spatrick     SuccProb = BranchProbability(SuccProbN, SuccProbD);
70409467b48Spatrick 
70509467b48Spatrick   return SuccProb;
70609467b48Spatrick }
70709467b48Spatrick 
70809467b48Spatrick /// Check if \p BB has exactly the successors in \p Successors.
70909467b48Spatrick static bool
71009467b48Spatrick hasSameSuccessors(MachineBasicBlock &BB,
71109467b48Spatrick                   SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
71209467b48Spatrick   if (BB.succ_size() != Successors.size())
71309467b48Spatrick     return false;
71409467b48Spatrick   // We don't want to count self-loops
71509467b48Spatrick   if (Successors.count(&BB))
71609467b48Spatrick     return false;
71709467b48Spatrick   for (MachineBasicBlock *Succ : BB.successors())
71809467b48Spatrick     if (!Successors.count(Succ))
71909467b48Spatrick       return false;
72009467b48Spatrick   return true;
72109467b48Spatrick }
72209467b48Spatrick 
72309467b48Spatrick /// Check if a block should be tail duplicated to increase fallthrough
72409467b48Spatrick /// opportunities.
72509467b48Spatrick /// \p BB Block to check.
72609467b48Spatrick bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
72709467b48Spatrick   // Blocks with single successors don't create additional fallthrough
72809467b48Spatrick   // opportunities. Don't duplicate them. TODO: When conditional exits are
72909467b48Spatrick   // analyzable, allow them to be duplicated.
73009467b48Spatrick   bool IsSimple = TailDup.isSimpleBB(BB);
73109467b48Spatrick 
73209467b48Spatrick   if (BB->succ_size() == 1)
73309467b48Spatrick     return false;
73409467b48Spatrick   return TailDup.shouldTailDuplicate(IsSimple, *BB);
73509467b48Spatrick }
73609467b48Spatrick 
73709467b48Spatrick /// Compare 2 BlockFrequency's with a small penalty for \p A.
73809467b48Spatrick /// In order to be conservative, we apply a X% penalty to account for
73909467b48Spatrick /// increased icache pressure and static heuristics. For small frequencies
74009467b48Spatrick /// we use only the numerators to improve accuracy. For simplicity, we assume the
74109467b48Spatrick /// penalty is less than 100%
74209467b48Spatrick /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
74309467b48Spatrick static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
74409467b48Spatrick                             uint64_t EntryFreq) {
74509467b48Spatrick   BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
74609467b48Spatrick   BlockFrequency Gain = A - B;
74709467b48Spatrick   return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
74809467b48Spatrick }
74909467b48Spatrick 
75009467b48Spatrick /// Check the edge frequencies to see if tail duplication will increase
75109467b48Spatrick /// fallthroughs. It only makes sense to call this function when
75209467b48Spatrick /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
75309467b48Spatrick /// always locally profitable if we would have picked \p Succ without
75409467b48Spatrick /// considering duplication.
75509467b48Spatrick bool MachineBlockPlacement::isProfitableToTailDup(
75609467b48Spatrick     const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
75709467b48Spatrick     BranchProbability QProb,
75809467b48Spatrick     const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
75909467b48Spatrick   // We need to do a probability calculation to make sure this is profitable.
76009467b48Spatrick   // First: does succ have a successor that post-dominates? This affects the
76109467b48Spatrick   // calculation. The 2 relevant cases are:
76209467b48Spatrick   //    BB         BB
76309467b48Spatrick   //    | \Qout    | \Qout
76409467b48Spatrick   //   P|  C       |P C
76509467b48Spatrick   //    =   C'     =   C'
76609467b48Spatrick   //    |  /Qin    |  /Qin
76709467b48Spatrick   //    | /        | /
76809467b48Spatrick   //    Succ       Succ
76909467b48Spatrick   //    / \        | \  V
77009467b48Spatrick   //  U/   =V      |U \
77109467b48Spatrick   //  /     \      =   D
77209467b48Spatrick   //  D      E     |  /
77309467b48Spatrick   //               | /
77409467b48Spatrick   //               |/
77509467b48Spatrick   //               PDom
77609467b48Spatrick   //  '=' : Branch taken for that CFG edge
77709467b48Spatrick   // In the second case, Placing Succ while duplicating it into C prevents the
77809467b48Spatrick   // fallthrough of Succ into either D or PDom, because they now have C as an
77909467b48Spatrick   // unplaced predecessor
78009467b48Spatrick 
78109467b48Spatrick   // Start by figuring out which case we fall into
78209467b48Spatrick   MachineBasicBlock *PDom = nullptr;
78309467b48Spatrick   SmallVector<MachineBasicBlock *, 4> SuccSuccs;
78409467b48Spatrick   // Only scan the relevant successors
78509467b48Spatrick   auto AdjustedSuccSumProb =
78609467b48Spatrick       collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
78709467b48Spatrick   BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
78809467b48Spatrick   auto BBFreq = MBFI->getBlockFreq(BB);
78909467b48Spatrick   auto SuccFreq = MBFI->getBlockFreq(Succ);
79009467b48Spatrick   BlockFrequency P = BBFreq * PProb;
79109467b48Spatrick   BlockFrequency Qout = BBFreq * QProb;
79209467b48Spatrick   uint64_t EntryFreq = MBFI->getEntryFreq();
79309467b48Spatrick   // If there are no more successors, it is profitable to copy, as it strictly
79409467b48Spatrick   // increases fallthrough.
79509467b48Spatrick   if (SuccSuccs.size() == 0)
79609467b48Spatrick     return greaterWithBias(P, Qout, EntryFreq);
79709467b48Spatrick 
79809467b48Spatrick   auto BestSuccSucc = BranchProbability::getZero();
79909467b48Spatrick   // Find the PDom or the best Succ if no PDom exists.
80009467b48Spatrick   for (MachineBasicBlock *SuccSucc : SuccSuccs) {
80109467b48Spatrick     auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
80209467b48Spatrick     if (Prob > BestSuccSucc)
80309467b48Spatrick       BestSuccSucc = Prob;
80409467b48Spatrick     if (PDom == nullptr)
80509467b48Spatrick       if (MPDT->dominates(SuccSucc, Succ)) {
80609467b48Spatrick         PDom = SuccSucc;
80709467b48Spatrick         break;
80809467b48Spatrick       }
80909467b48Spatrick   }
81009467b48Spatrick   // For the comparisons, we need to know Succ's best incoming edge that isn't
81109467b48Spatrick   // from BB.
81209467b48Spatrick   auto SuccBestPred = BlockFrequency(0);
81309467b48Spatrick   for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
81409467b48Spatrick     if (SuccPred == Succ || SuccPred == BB
81509467b48Spatrick         || BlockToChain[SuccPred] == &Chain
81609467b48Spatrick         || (BlockFilter && !BlockFilter->count(SuccPred)))
81709467b48Spatrick       continue;
81809467b48Spatrick     auto Freq = MBFI->getBlockFreq(SuccPred)
81909467b48Spatrick         * MBPI->getEdgeProbability(SuccPred, Succ);
82009467b48Spatrick     if (Freq > SuccBestPred)
82109467b48Spatrick       SuccBestPred = Freq;
82209467b48Spatrick   }
82309467b48Spatrick   // Qin is Succ's best unplaced incoming edge that isn't BB
82409467b48Spatrick   BlockFrequency Qin = SuccBestPred;
82509467b48Spatrick   // If it doesn't have a post-dominating successor, here is the calculation:
82609467b48Spatrick   //    BB        BB
82709467b48Spatrick   //    | \Qout   |  \
82809467b48Spatrick   //   P|  C      |   =
82909467b48Spatrick   //    =   C'    |    C
83009467b48Spatrick   //    |  /Qin   |     |
83109467b48Spatrick   //    | /       |     C' (+Succ)
83209467b48Spatrick   //    Succ      Succ /|
83309467b48Spatrick   //    / \       |  \/ |
83409467b48Spatrick   //  U/   =V     |  == |
83509467b48Spatrick   //  /     \     | /  \|
83609467b48Spatrick   //  D      E    D     E
83709467b48Spatrick   //  '=' : Branch taken for that CFG edge
83809467b48Spatrick   //  Cost in the first case is: P + V
83909467b48Spatrick   //  For this calculation, we always assume P > Qout. If Qout > P
84009467b48Spatrick   //  The result of this function will be ignored at the caller.
84109467b48Spatrick   //  Let F = SuccFreq - Qin
84209467b48Spatrick   //  Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
84309467b48Spatrick 
84409467b48Spatrick   if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
84509467b48Spatrick     BranchProbability UProb = BestSuccSucc;
84609467b48Spatrick     BranchProbability VProb = AdjustedSuccSumProb - UProb;
84709467b48Spatrick     BlockFrequency F = SuccFreq - Qin;
84809467b48Spatrick     BlockFrequency V = SuccFreq * VProb;
84909467b48Spatrick     BlockFrequency QinU = std::min(Qin, F) * UProb;
85009467b48Spatrick     BlockFrequency BaseCost = P + V;
85109467b48Spatrick     BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
85209467b48Spatrick     return greaterWithBias(BaseCost, DupCost, EntryFreq);
85309467b48Spatrick   }
85409467b48Spatrick   BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
85509467b48Spatrick   BranchProbability VProb = AdjustedSuccSumProb - UProb;
85609467b48Spatrick   BlockFrequency U = SuccFreq * UProb;
85709467b48Spatrick   BlockFrequency V = SuccFreq * VProb;
85809467b48Spatrick   BlockFrequency F = SuccFreq - Qin;
85909467b48Spatrick   // If there is a post-dominating successor, here is the calculation:
86009467b48Spatrick   // BB         BB                 BB          BB
86109467b48Spatrick   // | \Qout    |   \               | \Qout     |  \
86209467b48Spatrick   // |P C       |    =              |P C        |   =
86309467b48Spatrick   // =   C'     |P    C             =   C'      |P   C
86409467b48Spatrick   // |  /Qin    |      |            |  /Qin     |     |
86509467b48Spatrick   // | /        |      C' (+Succ)   | /         |     C' (+Succ)
86609467b48Spatrick   // Succ       Succ  /|            Succ        Succ /|
86709467b48Spatrick   // | \  V     |   \/ |            | \  V      |  \/ |
86809467b48Spatrick   // |U \       |U  /\ =?           |U =        |U /\ |
86909467b48Spatrick   // =   D      = =  =?|            |   D       | =  =|
87009467b48Spatrick   // |  /       |/     D            |  /        |/    D
87109467b48Spatrick   // | /        |     /             | =         |    /
87209467b48Spatrick   // |/         |    /              |/          |   =
87309467b48Spatrick   // Dom         Dom                Dom         Dom
87409467b48Spatrick   //  '=' : Branch taken for that CFG edge
87509467b48Spatrick   // The cost for taken branches in the first case is P + U
87609467b48Spatrick   // Let F = SuccFreq - Qin
87709467b48Spatrick   // The cost in the second case (assuming independence), given the layout:
87809467b48Spatrick   // BB, Succ, (C+Succ), D, Dom or the layout:
87909467b48Spatrick   // BB, Succ, D, Dom, (C+Succ)
88009467b48Spatrick   // is Qout + max(F, Qin) * U + min(F, Qin)
88109467b48Spatrick   // compare P + U vs Qout + P * U + Qin.
88209467b48Spatrick   //
88309467b48Spatrick   // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
88409467b48Spatrick   //
88509467b48Spatrick   // For the 3rd case, the cost is P + 2 * V
88609467b48Spatrick   // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
88709467b48Spatrick   // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
88809467b48Spatrick   if (UProb > AdjustedSuccSumProb / 2 &&
88909467b48Spatrick       !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
89009467b48Spatrick                                   Chain, BlockFilter))
89109467b48Spatrick     // Cases 3 & 4
89209467b48Spatrick     return greaterWithBias(
89309467b48Spatrick         (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
89409467b48Spatrick         EntryFreq);
89509467b48Spatrick   // Cases 1 & 2
89609467b48Spatrick   return greaterWithBias((P + U),
89709467b48Spatrick                          (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
89809467b48Spatrick                           std::max(Qin, F) * UProb),
89909467b48Spatrick                          EntryFreq);
90009467b48Spatrick }
90109467b48Spatrick 
90209467b48Spatrick /// Check for a trellis layout. \p BB is the upper part of a trellis if its
90309467b48Spatrick /// successors form the lower part of a trellis. A successor set S forms the
90409467b48Spatrick /// lower part of a trellis if all of the predecessors of S are either in S or
90509467b48Spatrick /// have all of S as successors. We ignore trellises where BB doesn't have 2
90609467b48Spatrick /// successors because for fewer than 2, it's trivial, and for 3 or greater they
90709467b48Spatrick /// are very uncommon and complex to compute optimally. Allowing edges within S
90809467b48Spatrick /// is not strictly a trellis, but the same algorithm works, so we allow it.
90909467b48Spatrick bool MachineBlockPlacement::isTrellis(
91009467b48Spatrick     const MachineBasicBlock *BB,
91109467b48Spatrick     const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
91209467b48Spatrick     const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
91309467b48Spatrick   // Technically BB could form a trellis with branching factor higher than 2.
91409467b48Spatrick   // But that's extremely uncommon.
91509467b48Spatrick   if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
91609467b48Spatrick     return false;
91709467b48Spatrick 
91809467b48Spatrick   SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
91909467b48Spatrick                                                        BB->succ_end());
92009467b48Spatrick   // To avoid reviewing the same predecessors twice.
92109467b48Spatrick   SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
92209467b48Spatrick 
92309467b48Spatrick   for (MachineBasicBlock *Succ : ViableSuccs) {
92409467b48Spatrick     int PredCount = 0;
92509467b48Spatrick     for (auto SuccPred : Succ->predecessors()) {
92609467b48Spatrick       // Allow triangle successors, but don't count them.
92709467b48Spatrick       if (Successors.count(SuccPred)) {
92809467b48Spatrick         // Make sure that it is actually a triangle.
92909467b48Spatrick         for (MachineBasicBlock *CheckSucc : SuccPred->successors())
93009467b48Spatrick           if (!Successors.count(CheckSucc))
93109467b48Spatrick             return false;
93209467b48Spatrick         continue;
93309467b48Spatrick       }
93409467b48Spatrick       const BlockChain *PredChain = BlockToChain[SuccPred];
93509467b48Spatrick       if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
93609467b48Spatrick           PredChain == &Chain || PredChain == BlockToChain[Succ])
93709467b48Spatrick         continue;
93809467b48Spatrick       ++PredCount;
93909467b48Spatrick       // Perform the successor check only once.
94009467b48Spatrick       if (!SeenPreds.insert(SuccPred).second)
94109467b48Spatrick         continue;
94209467b48Spatrick       if (!hasSameSuccessors(*SuccPred, Successors))
94309467b48Spatrick         return false;
94409467b48Spatrick     }
94509467b48Spatrick     // If one of the successors has only BB as a predecessor, it is not a
94609467b48Spatrick     // trellis.
94709467b48Spatrick     if (PredCount < 1)
94809467b48Spatrick       return false;
94909467b48Spatrick   }
95009467b48Spatrick   return true;
95109467b48Spatrick }
95209467b48Spatrick 
95309467b48Spatrick /// Pick the highest total weight pair of edges that can both be laid out.
95409467b48Spatrick /// The edges in \p Edges[0] are assumed to have a different destination than
95509467b48Spatrick /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
95609467b48Spatrick /// the individual highest weight edges to the 2 different destinations, or in
95709467b48Spatrick /// case of a conflict, one of them should be replaced with a 2nd best edge.
95809467b48Spatrick std::pair<MachineBlockPlacement::WeightedEdge,
95909467b48Spatrick           MachineBlockPlacement::WeightedEdge>
96009467b48Spatrick MachineBlockPlacement::getBestNonConflictingEdges(
96109467b48Spatrick     const MachineBasicBlock *BB,
96209467b48Spatrick     MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>>
96309467b48Spatrick         Edges) {
96409467b48Spatrick   // Sort the edges, and then for each successor, find the best incoming
96509467b48Spatrick   // predecessor. If the best incoming predecessors aren't the same,
96609467b48Spatrick   // then that is clearly the best layout. If there is a conflict, one of the
96709467b48Spatrick   // successors will have to fallthrough from the second best predecessor. We
96809467b48Spatrick   // compare which combination is better overall.
96909467b48Spatrick 
97009467b48Spatrick   // Sort for highest frequency.
97109467b48Spatrick   auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
97209467b48Spatrick 
97309467b48Spatrick   llvm::stable_sort(Edges[0], Cmp);
97409467b48Spatrick   llvm::stable_sort(Edges[1], Cmp);
97509467b48Spatrick   auto BestA = Edges[0].begin();
97609467b48Spatrick   auto BestB = Edges[1].begin();
97709467b48Spatrick   // Arrange for the correct answer to be in BestA and BestB
97809467b48Spatrick   // If the 2 best edges don't conflict, the answer is already there.
97909467b48Spatrick   if (BestA->Src == BestB->Src) {
98009467b48Spatrick     // Compare the total fallthrough of (Best + Second Best) for both pairs
98109467b48Spatrick     auto SecondBestA = std::next(BestA);
98209467b48Spatrick     auto SecondBestB = std::next(BestB);
98309467b48Spatrick     BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
98409467b48Spatrick     BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
98509467b48Spatrick     if (BestAScore < BestBScore)
98609467b48Spatrick       BestA = SecondBestA;
98709467b48Spatrick     else
98809467b48Spatrick       BestB = SecondBestB;
98909467b48Spatrick   }
99009467b48Spatrick   // Arrange for the BB edge to be in BestA if it exists.
99109467b48Spatrick   if (BestB->Src == BB)
99209467b48Spatrick     std::swap(BestA, BestB);
99309467b48Spatrick   return std::make_pair(*BestA, *BestB);
99409467b48Spatrick }
99509467b48Spatrick 
99609467b48Spatrick /// Get the best successor from \p BB based on \p BB being part of a trellis.
99709467b48Spatrick /// We only handle trellises with 2 successors, so the algorithm is
99809467b48Spatrick /// straightforward: Find the best pair of edges that don't conflict. We find
99909467b48Spatrick /// the best incoming edge for each successor in the trellis. If those conflict,
100009467b48Spatrick /// we consider which of them should be replaced with the second best.
100109467b48Spatrick /// Upon return the two best edges will be in \p BestEdges. If one of the edges
100209467b48Spatrick /// comes from \p BB, it will be in \p BestEdges[0]
100309467b48Spatrick MachineBlockPlacement::BlockAndTailDupResult
100409467b48Spatrick MachineBlockPlacement::getBestTrellisSuccessor(
100509467b48Spatrick     const MachineBasicBlock *BB,
100609467b48Spatrick     const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
100709467b48Spatrick     BranchProbability AdjustedSumProb, const BlockChain &Chain,
100809467b48Spatrick     const BlockFilterSet *BlockFilter) {
100909467b48Spatrick 
101009467b48Spatrick   BlockAndTailDupResult Result = {nullptr, false};
101109467b48Spatrick   SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
101209467b48Spatrick                                                        BB->succ_end());
101309467b48Spatrick 
101409467b48Spatrick   // We assume size 2 because it's common. For general n, we would have to do
101509467b48Spatrick   // the Hungarian algorithm, but it's not worth the complexity because more
101609467b48Spatrick   // than 2 successors is fairly uncommon, and a trellis even more so.
101709467b48Spatrick   if (Successors.size() != 2 || ViableSuccs.size() != 2)
101809467b48Spatrick     return Result;
101909467b48Spatrick 
102009467b48Spatrick   // Collect the edge frequencies of all edges that form the trellis.
102109467b48Spatrick   SmallVector<WeightedEdge, 8> Edges[2];
102209467b48Spatrick   int SuccIndex = 0;
102309467b48Spatrick   for (auto Succ : ViableSuccs) {
102409467b48Spatrick     for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
102509467b48Spatrick       // Skip any placed predecessors that are not BB
102609467b48Spatrick       if (SuccPred != BB)
102709467b48Spatrick         if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
102809467b48Spatrick             BlockToChain[SuccPred] == &Chain ||
102909467b48Spatrick             BlockToChain[SuccPred] == BlockToChain[Succ])
103009467b48Spatrick           continue;
103109467b48Spatrick       BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
103209467b48Spatrick                                 MBPI->getEdgeProbability(SuccPred, Succ);
103309467b48Spatrick       Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
103409467b48Spatrick     }
103509467b48Spatrick     ++SuccIndex;
103609467b48Spatrick   }
103709467b48Spatrick 
103809467b48Spatrick   // Pick the best combination of 2 edges from all the edges in the trellis.
103909467b48Spatrick   WeightedEdge BestA, BestB;
104009467b48Spatrick   std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
104109467b48Spatrick 
104209467b48Spatrick   if (BestA.Src != BB) {
104309467b48Spatrick     // If we have a trellis, and BB doesn't have the best fallthrough edges,
104409467b48Spatrick     // we shouldn't choose any successor. We've already looked and there's a
104509467b48Spatrick     // better fallthrough edge for all the successors.
104609467b48Spatrick     LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
104709467b48Spatrick     return Result;
104809467b48Spatrick   }
104909467b48Spatrick 
105009467b48Spatrick   // Did we pick the triangle edge? If tail-duplication is profitable, do
105109467b48Spatrick   // that instead. Otherwise merge the triangle edge now while we know it is
105209467b48Spatrick   // optimal.
105309467b48Spatrick   if (BestA.Dest == BestB.Src) {
105409467b48Spatrick     // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
105509467b48Spatrick     // would be better.
105609467b48Spatrick     MachineBasicBlock *Succ1 = BestA.Dest;
105709467b48Spatrick     MachineBasicBlock *Succ2 = BestB.Dest;
105809467b48Spatrick     // Check to see if tail-duplication would be profitable.
105909467b48Spatrick     if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
106009467b48Spatrick         canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
106109467b48Spatrick         isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
106209467b48Spatrick                               Chain, BlockFilter)) {
106309467b48Spatrick       LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
106409467b48Spatrick                      MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
106509467b48Spatrick                  dbgs() << "    Selected: " << getBlockName(Succ2)
106609467b48Spatrick                         << ", probability: " << Succ2Prob
106709467b48Spatrick                         << " (Tail Duplicate)\n");
106809467b48Spatrick       Result.BB = Succ2;
106909467b48Spatrick       Result.ShouldTailDup = true;
107009467b48Spatrick       return Result;
107109467b48Spatrick     }
107209467b48Spatrick   }
107309467b48Spatrick   // We have already computed the optimal edge for the other side of the
107409467b48Spatrick   // trellis.
107509467b48Spatrick   ComputedEdges[BestB.Src] = { BestB.Dest, false };
107609467b48Spatrick 
107709467b48Spatrick   auto TrellisSucc = BestA.Dest;
107809467b48Spatrick   LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
107909467b48Spatrick                  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
108009467b48Spatrick              dbgs() << "    Selected: " << getBlockName(TrellisSucc)
108109467b48Spatrick                     << ", probability: " << SuccProb << " (Trellis)\n");
108209467b48Spatrick   Result.BB = TrellisSucc;
108309467b48Spatrick   return Result;
108409467b48Spatrick }
108509467b48Spatrick 
108609467b48Spatrick /// When the option allowTailDupPlacement() is on, this method checks if the
108709467b48Spatrick /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
108809467b48Spatrick /// into all of its unplaced, unfiltered predecessors, that are not BB.
108909467b48Spatrick bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
109009467b48Spatrick     const MachineBasicBlock *BB, MachineBasicBlock *Succ,
109109467b48Spatrick     const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
109209467b48Spatrick   if (!shouldTailDuplicate(Succ))
109309467b48Spatrick     return false;
109409467b48Spatrick 
109509467b48Spatrick   // The result of canTailDuplicate.
109609467b48Spatrick   bool Duplicate = true;
109709467b48Spatrick   // Number of possible duplication.
109809467b48Spatrick   unsigned int NumDup = 0;
109909467b48Spatrick 
110009467b48Spatrick   // For CFG checking.
110109467b48Spatrick   SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
110209467b48Spatrick                                                        BB->succ_end());
110309467b48Spatrick   for (MachineBasicBlock *Pred : Succ->predecessors()) {
110409467b48Spatrick     // Make sure all unplaced and unfiltered predecessors can be
110509467b48Spatrick     // tail-duplicated into.
110609467b48Spatrick     // Skip any blocks that are already placed or not in this loop.
110709467b48Spatrick     if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
110809467b48Spatrick         || BlockToChain[Pred] == &Chain)
110909467b48Spatrick       continue;
111009467b48Spatrick     if (!TailDup.canTailDuplicate(Succ, Pred)) {
111109467b48Spatrick       if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
111209467b48Spatrick         // This will result in a trellis after tail duplication, so we don't
111309467b48Spatrick         // need to copy Succ into this predecessor. In the presence
111409467b48Spatrick         // of a trellis tail duplication can continue to be profitable.
111509467b48Spatrick         // For example:
111609467b48Spatrick         // A            A
111709467b48Spatrick         // |\           |\
111809467b48Spatrick         // | \          | \
111909467b48Spatrick         // |  C         |  C+BB
112009467b48Spatrick         // | /          |  |
112109467b48Spatrick         // |/           |  |
112209467b48Spatrick         // BB    =>     BB |
112309467b48Spatrick         // |\           |\/|
112409467b48Spatrick         // | \          |/\|
112509467b48Spatrick         // |  D         |  D
112609467b48Spatrick         // | /          | /
112709467b48Spatrick         // |/           |/
112809467b48Spatrick         // Succ         Succ
112909467b48Spatrick         //
113009467b48Spatrick         // After BB was duplicated into C, the layout looks like the one on the
113109467b48Spatrick         // right. BB and C now have the same successors. When considering
113209467b48Spatrick         // whether Succ can be duplicated into all its unplaced predecessors, we
113309467b48Spatrick         // ignore C.
113409467b48Spatrick         // We can do this because C already has a profitable fallthrough, namely
113509467b48Spatrick         // D. TODO(iteratee): ignore sufficiently cold predecessors for
113609467b48Spatrick         // duplication and for this test.
113709467b48Spatrick         //
113809467b48Spatrick         // This allows trellises to be laid out in 2 separate chains
113909467b48Spatrick         // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
114009467b48Spatrick         // because it allows the creation of 2 fallthrough paths with links
114109467b48Spatrick         // between them, and we correctly identify the best layout for these
114209467b48Spatrick         // CFGs. We want to extend trellises that the user created in addition
114309467b48Spatrick         // to trellises created by tail-duplication, so we just look for the
114409467b48Spatrick         // CFG.
114509467b48Spatrick         continue;
114609467b48Spatrick       Duplicate = false;
114709467b48Spatrick       continue;
114809467b48Spatrick     }
114909467b48Spatrick     NumDup++;
115009467b48Spatrick   }
115109467b48Spatrick 
115209467b48Spatrick   // No possible duplication in current filter set.
115309467b48Spatrick   if (NumDup == 0)
115409467b48Spatrick     return false;
115509467b48Spatrick 
1156*097a140dSpatrick   // If profile information is available, findDuplicateCandidates can do more
1157*097a140dSpatrick   // precise benefit analysis.
1158*097a140dSpatrick   if (F->getFunction().hasProfileData())
1159*097a140dSpatrick     return true;
1160*097a140dSpatrick 
116109467b48Spatrick   // This is mainly for function exit BB.
116209467b48Spatrick   // The integrated tail duplication is really designed for increasing
116309467b48Spatrick   // fallthrough from predecessors from Succ to its successors. We may need
116409467b48Spatrick   // other machanism to handle different cases.
116509467b48Spatrick   if (Succ->succ_size() == 0)
116609467b48Spatrick     return true;
116709467b48Spatrick 
116809467b48Spatrick   // Plus the already placed predecessor.
116909467b48Spatrick   NumDup++;
117009467b48Spatrick 
117109467b48Spatrick   // If the duplication candidate has more unplaced predecessors than
117209467b48Spatrick   // successors, the extra duplication can't bring more fallthrough.
117309467b48Spatrick   //
117409467b48Spatrick   //     Pred1 Pred2 Pred3
117509467b48Spatrick   //         \   |   /
117609467b48Spatrick   //          \  |  /
117709467b48Spatrick   //           \ | /
117809467b48Spatrick   //            Dup
117909467b48Spatrick   //            / \
118009467b48Spatrick   //           /   \
118109467b48Spatrick   //       Succ1  Succ2
118209467b48Spatrick   //
118309467b48Spatrick   // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
118409467b48Spatrick   // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
118509467b48Spatrick   // but the duplication into Pred3 can't increase fallthrough.
118609467b48Spatrick   //
118709467b48Spatrick   // A small number of extra duplication may not hurt too much. We need a better
118809467b48Spatrick   // heuristic to handle it.
118909467b48Spatrick   if ((NumDup > Succ->succ_size()) || !Duplicate)
119009467b48Spatrick     return false;
119109467b48Spatrick 
119209467b48Spatrick   return true;
119309467b48Spatrick }
119409467b48Spatrick 
119509467b48Spatrick /// Find chains of triangles where we believe it would be profitable to
119609467b48Spatrick /// tail-duplicate them all, but a local analysis would not find them.
119709467b48Spatrick /// There are 3 ways this can be profitable:
119809467b48Spatrick /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
119909467b48Spatrick ///    longer chains)
120009467b48Spatrick /// 2) The chains are statically correlated. Branch probabilities have a very
120109467b48Spatrick ///    U-shaped distribution.
120209467b48Spatrick ///    [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
120309467b48Spatrick ///    If the branches in a chain are likely to be from the same side of the
120409467b48Spatrick ///    distribution as their predecessor, but are independent at runtime, this
120509467b48Spatrick ///    transformation is profitable. (Because the cost of being wrong is a small
120609467b48Spatrick ///    fixed cost, unlike the standard triangle layout where the cost of being
120709467b48Spatrick ///    wrong scales with the # of triangles.)
120809467b48Spatrick /// 3) The chains are dynamically correlated. If the probability that a previous
120909467b48Spatrick ///    branch was taken positively influences whether the next branch will be
121009467b48Spatrick ///    taken
121109467b48Spatrick /// We believe that 2 and 3 are common enough to justify the small margin in 1.
121209467b48Spatrick void MachineBlockPlacement::precomputeTriangleChains() {
121309467b48Spatrick   struct TriangleChain {
121409467b48Spatrick     std::vector<MachineBasicBlock *> Edges;
121509467b48Spatrick 
121609467b48Spatrick     TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
121709467b48Spatrick         : Edges({src, dst}) {}
121809467b48Spatrick 
121909467b48Spatrick     void append(MachineBasicBlock *dst) {
122009467b48Spatrick       assert(getKey()->isSuccessor(dst) &&
122109467b48Spatrick              "Attempting to append a block that is not a successor.");
122209467b48Spatrick       Edges.push_back(dst);
122309467b48Spatrick     }
122409467b48Spatrick 
122509467b48Spatrick     unsigned count() const { return Edges.size() - 1; }
122609467b48Spatrick 
122709467b48Spatrick     MachineBasicBlock *getKey() const {
122809467b48Spatrick       return Edges.back();
122909467b48Spatrick     }
123009467b48Spatrick   };
123109467b48Spatrick 
123209467b48Spatrick   if (TriangleChainCount == 0)
123309467b48Spatrick     return;
123409467b48Spatrick 
123509467b48Spatrick   LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
123609467b48Spatrick   // Map from last block to the chain that contains it. This allows us to extend
123709467b48Spatrick   // chains as we find new triangles.
123809467b48Spatrick   DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
123909467b48Spatrick   for (MachineBasicBlock &BB : *F) {
124009467b48Spatrick     // If BB doesn't have 2 successors, it doesn't start a triangle.
124109467b48Spatrick     if (BB.succ_size() != 2)
124209467b48Spatrick       continue;
124309467b48Spatrick     MachineBasicBlock *PDom = nullptr;
124409467b48Spatrick     for (MachineBasicBlock *Succ : BB.successors()) {
124509467b48Spatrick       if (!MPDT->dominates(Succ, &BB))
124609467b48Spatrick         continue;
124709467b48Spatrick       PDom = Succ;
124809467b48Spatrick       break;
124909467b48Spatrick     }
125009467b48Spatrick     // If BB doesn't have a post-dominating successor, it doesn't form a
125109467b48Spatrick     // triangle.
125209467b48Spatrick     if (PDom == nullptr)
125309467b48Spatrick       continue;
125409467b48Spatrick     // If PDom has a hint that it is low probability, skip this triangle.
125509467b48Spatrick     if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
125609467b48Spatrick       continue;
125709467b48Spatrick     // If PDom isn't eligible for duplication, this isn't the kind of triangle
125809467b48Spatrick     // we're looking for.
125909467b48Spatrick     if (!shouldTailDuplicate(PDom))
126009467b48Spatrick       continue;
126109467b48Spatrick     bool CanTailDuplicate = true;
126209467b48Spatrick     // If PDom can't tail-duplicate into it's non-BB predecessors, then this
126309467b48Spatrick     // isn't the kind of triangle we're looking for.
126409467b48Spatrick     for (MachineBasicBlock* Pred : PDom->predecessors()) {
126509467b48Spatrick       if (Pred == &BB)
126609467b48Spatrick         continue;
126709467b48Spatrick       if (!TailDup.canTailDuplicate(PDom, Pred)) {
126809467b48Spatrick         CanTailDuplicate = false;
126909467b48Spatrick         break;
127009467b48Spatrick       }
127109467b48Spatrick     }
127209467b48Spatrick     // If we can't tail-duplicate PDom to its predecessors, then skip this
127309467b48Spatrick     // triangle.
127409467b48Spatrick     if (!CanTailDuplicate)
127509467b48Spatrick       continue;
127609467b48Spatrick 
127709467b48Spatrick     // Now we have an interesting triangle. Insert it if it's not part of an
127809467b48Spatrick     // existing chain.
127909467b48Spatrick     // Note: This cannot be replaced with a call insert() or emplace() because
128009467b48Spatrick     // the find key is BB, but the insert/emplace key is PDom.
128109467b48Spatrick     auto Found = TriangleChainMap.find(&BB);
128209467b48Spatrick     // If it is, remove the chain from the map, grow it, and put it back in the
128309467b48Spatrick     // map with the end as the new key.
128409467b48Spatrick     if (Found != TriangleChainMap.end()) {
128509467b48Spatrick       TriangleChain Chain = std::move(Found->second);
128609467b48Spatrick       TriangleChainMap.erase(Found);
128709467b48Spatrick       Chain.append(PDom);
128809467b48Spatrick       TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
128909467b48Spatrick     } else {
129009467b48Spatrick       auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
129109467b48Spatrick       assert(InsertResult.second && "Block seen twice.");
129209467b48Spatrick       (void)InsertResult;
129309467b48Spatrick     }
129409467b48Spatrick   }
129509467b48Spatrick 
129609467b48Spatrick   // Iterating over a DenseMap is safe here, because the only thing in the body
129709467b48Spatrick   // of the loop is inserting into another DenseMap (ComputedEdges).
129809467b48Spatrick   // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
129909467b48Spatrick   for (auto &ChainPair : TriangleChainMap) {
130009467b48Spatrick     TriangleChain &Chain = ChainPair.second;
130109467b48Spatrick     // Benchmarking has shown that due to branch correlation duplicating 2 or
130209467b48Spatrick     // more triangles is profitable, despite the calculations assuming
130309467b48Spatrick     // independence.
130409467b48Spatrick     if (Chain.count() < TriangleChainCount)
130509467b48Spatrick       continue;
130609467b48Spatrick     MachineBasicBlock *dst = Chain.Edges.back();
130709467b48Spatrick     Chain.Edges.pop_back();
130809467b48Spatrick     for (MachineBasicBlock *src : reverse(Chain.Edges)) {
130909467b48Spatrick       LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
131009467b48Spatrick                         << getBlockName(dst)
131109467b48Spatrick                         << " as pre-computed based on triangles.\n");
131209467b48Spatrick 
131309467b48Spatrick       auto InsertResult = ComputedEdges.insert({src, {dst, true}});
131409467b48Spatrick       assert(InsertResult.second && "Block seen twice.");
131509467b48Spatrick       (void)InsertResult;
131609467b48Spatrick 
131709467b48Spatrick       dst = src;
131809467b48Spatrick     }
131909467b48Spatrick   }
132009467b48Spatrick }
132109467b48Spatrick 
132209467b48Spatrick // When profile is not present, return the StaticLikelyProb.
132309467b48Spatrick // When profile is available, we need to handle the triangle-shape CFG.
132409467b48Spatrick static BranchProbability getLayoutSuccessorProbThreshold(
132509467b48Spatrick       const MachineBasicBlock *BB) {
132609467b48Spatrick   if (!BB->getParent()->getFunction().hasProfileData())
132709467b48Spatrick     return BranchProbability(StaticLikelyProb, 100);
132809467b48Spatrick   if (BB->succ_size() == 2) {
132909467b48Spatrick     const MachineBasicBlock *Succ1 = *BB->succ_begin();
133009467b48Spatrick     const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
133109467b48Spatrick     if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
133209467b48Spatrick       /* See case 1 below for the cost analysis. For BB->Succ to
133309467b48Spatrick        * be taken with smaller cost, the following needs to hold:
133409467b48Spatrick        *   Prob(BB->Succ) > 2 * Prob(BB->Pred)
133509467b48Spatrick        *   So the threshold T in the calculation below
133609467b48Spatrick        *   (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
133709467b48Spatrick        *   So T / (1 - T) = 2, Yielding T = 2/3
133809467b48Spatrick        * Also adding user specified branch bias, we have
133909467b48Spatrick        *   T = (2/3)*(ProfileLikelyProb/50)
134009467b48Spatrick        *     = (2*ProfileLikelyProb)/150)
134109467b48Spatrick        */
134209467b48Spatrick       return BranchProbability(2 * ProfileLikelyProb, 150);
134309467b48Spatrick     }
134409467b48Spatrick   }
134509467b48Spatrick   return BranchProbability(ProfileLikelyProb, 100);
134609467b48Spatrick }
134709467b48Spatrick 
134809467b48Spatrick /// Checks to see if the layout candidate block \p Succ has a better layout
134909467b48Spatrick /// predecessor than \c BB. If yes, returns true.
135009467b48Spatrick /// \p SuccProb: The probability adjusted for only remaining blocks.
135109467b48Spatrick ///   Only used for logging
135209467b48Spatrick /// \p RealSuccProb: The un-adjusted probability.
135309467b48Spatrick /// \p Chain: The chain that BB belongs to and Succ is being considered for.
135409467b48Spatrick /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
135509467b48Spatrick ///    considered
135609467b48Spatrick bool MachineBlockPlacement::hasBetterLayoutPredecessor(
135709467b48Spatrick     const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
135809467b48Spatrick     const BlockChain &SuccChain, BranchProbability SuccProb,
135909467b48Spatrick     BranchProbability RealSuccProb, const BlockChain &Chain,
136009467b48Spatrick     const BlockFilterSet *BlockFilter) {
136109467b48Spatrick 
136209467b48Spatrick   // There isn't a better layout when there are no unscheduled predecessors.
136309467b48Spatrick   if (SuccChain.UnscheduledPredecessors == 0)
136409467b48Spatrick     return false;
136509467b48Spatrick 
136609467b48Spatrick   // There are two basic scenarios here:
136709467b48Spatrick   // -------------------------------------
136809467b48Spatrick   // Case 1: triangular shape CFG (if-then):
136909467b48Spatrick   //     BB
137009467b48Spatrick   //     | \
137109467b48Spatrick   //     |  \
137209467b48Spatrick   //     |   Pred
137309467b48Spatrick   //     |   /
137409467b48Spatrick   //     Succ
137509467b48Spatrick   // In this case, we are evaluating whether to select edge -> Succ, e.g.
137609467b48Spatrick   // set Succ as the layout successor of BB. Picking Succ as BB's
137709467b48Spatrick   // successor breaks the CFG constraints (FIXME: define these constraints).
137809467b48Spatrick   // With this layout, Pred BB
137909467b48Spatrick   // is forced to be outlined, so the overall cost will be cost of the
138009467b48Spatrick   // branch taken from BB to Pred, plus the cost of back taken branch
138109467b48Spatrick   // from Pred to Succ, as well as the additional cost associated
138209467b48Spatrick   // with the needed unconditional jump instruction from Pred To Succ.
138309467b48Spatrick 
138409467b48Spatrick   // The cost of the topological order layout is the taken branch cost
138509467b48Spatrick   // from BB to Succ, so to make BB->Succ a viable candidate, the following
138609467b48Spatrick   // must hold:
138709467b48Spatrick   //     2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
138809467b48Spatrick   //      < freq(BB->Succ) *  taken_branch_cost.
138909467b48Spatrick   // Ignoring unconditional jump cost, we get
139009467b48Spatrick   //    freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
139109467b48Spatrick   //    prob(BB->Succ) > 2 * prob(BB->Pred)
139209467b48Spatrick   //
139309467b48Spatrick   // When real profile data is available, we can precisely compute the
139409467b48Spatrick   // probability threshold that is needed for edge BB->Succ to be considered.
139509467b48Spatrick   // Without profile data, the heuristic requires the branch bias to be
139609467b48Spatrick   // a lot larger to make sure the signal is very strong (e.g. 80% default).
139709467b48Spatrick   // -----------------------------------------------------------------
139809467b48Spatrick   // Case 2: diamond like CFG (if-then-else):
139909467b48Spatrick   //     S
140009467b48Spatrick   //    / \
140109467b48Spatrick   //   |   \
140209467b48Spatrick   //  BB    Pred
140309467b48Spatrick   //   \    /
140409467b48Spatrick   //    Succ
140509467b48Spatrick   //    ..
140609467b48Spatrick   //
140709467b48Spatrick   // The current block is BB and edge BB->Succ is now being evaluated.
140809467b48Spatrick   // Note that edge S->BB was previously already selected because
140909467b48Spatrick   // prob(S->BB) > prob(S->Pred).
141009467b48Spatrick   // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
141109467b48Spatrick   // choose Pred, we will have a topological ordering as shown on the left
141209467b48Spatrick   // in the picture below. If we choose Succ, we have the solution as shown
141309467b48Spatrick   // on the right:
141409467b48Spatrick   //
141509467b48Spatrick   //   topo-order:
141609467b48Spatrick   //
141709467b48Spatrick   //       S-----                             ---S
141809467b48Spatrick   //       |    |                             |  |
141909467b48Spatrick   //    ---BB   |                             |  BB
142009467b48Spatrick   //    |       |                             |  |
142109467b48Spatrick   //    |  Pred--                             |  Succ--
142209467b48Spatrick   //    |  |                                  |       |
142309467b48Spatrick   //    ---Succ                               ---Pred--
142409467b48Spatrick   //
142509467b48Spatrick   // cost = freq(S->Pred) + freq(BB->Succ)    cost = 2 * freq (S->Pred)
142609467b48Spatrick   //      = freq(S->Pred) + freq(S->BB)
142709467b48Spatrick   //
142809467b48Spatrick   // If we have profile data (i.e, branch probabilities can be trusted), the
142909467b48Spatrick   // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
143009467b48Spatrick   // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
143109467b48Spatrick   // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
143209467b48Spatrick   // means the cost of topological order is greater.
143309467b48Spatrick   // When profile data is not available, however, we need to be more
143409467b48Spatrick   // conservative. If the branch prediction is wrong, breaking the topo-order
143509467b48Spatrick   // will actually yield a layout with large cost. For this reason, we need
143609467b48Spatrick   // strong biased branch at block S with Prob(S->BB) in order to select
143709467b48Spatrick   // BB->Succ. This is equivalent to looking the CFG backward with backward
143809467b48Spatrick   // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
143909467b48Spatrick   // profile data).
144009467b48Spatrick   // --------------------------------------------------------------------------
144109467b48Spatrick   // Case 3: forked diamond
144209467b48Spatrick   //       S
144309467b48Spatrick   //      / \
144409467b48Spatrick   //     /   \
144509467b48Spatrick   //   BB    Pred
144609467b48Spatrick   //   | \   / |
144709467b48Spatrick   //   |  \ /  |
144809467b48Spatrick   //   |   X   |
144909467b48Spatrick   //   |  / \  |
145009467b48Spatrick   //   | /   \ |
145109467b48Spatrick   //   S1     S2
145209467b48Spatrick   //
145309467b48Spatrick   // The current block is BB and edge BB->S1 is now being evaluated.
145409467b48Spatrick   // As above S->BB was already selected because
145509467b48Spatrick   // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
145609467b48Spatrick   //
145709467b48Spatrick   // topo-order:
145809467b48Spatrick   //
145909467b48Spatrick   //     S-------|                     ---S
146009467b48Spatrick   //     |       |                     |  |
146109467b48Spatrick   //  ---BB      |                     |  BB
146209467b48Spatrick   //  |          |                     |  |
146309467b48Spatrick   //  |  Pred----|                     |  S1----
146409467b48Spatrick   //  |  |                             |       |
146509467b48Spatrick   //  --(S1 or S2)                     ---Pred--
146609467b48Spatrick   //                                        |
146709467b48Spatrick   //                                       S2
146809467b48Spatrick   //
146909467b48Spatrick   // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
147009467b48Spatrick   //    + min(freq(Pred->S1), freq(Pred->S2))
147109467b48Spatrick   // Non-topo-order cost:
147209467b48Spatrick   // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
147309467b48Spatrick   // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
147409467b48Spatrick   // is 0. Then the non topo layout is better when
147509467b48Spatrick   // freq(S->Pred) < freq(BB->S1).
147609467b48Spatrick   // This is exactly what is checked below.
147709467b48Spatrick   // Note there are other shapes that apply (Pred may not be a single block,
147809467b48Spatrick   // but they all fit this general pattern.)
147909467b48Spatrick   BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
148009467b48Spatrick 
148109467b48Spatrick   // Make sure that a hot successor doesn't have a globally more
148209467b48Spatrick   // important predecessor.
148309467b48Spatrick   BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
148409467b48Spatrick   bool BadCFGConflict = false;
148509467b48Spatrick 
148609467b48Spatrick   for (MachineBasicBlock *Pred : Succ->predecessors()) {
148709467b48Spatrick     BlockChain *PredChain = BlockToChain[Pred];
148809467b48Spatrick     if (Pred == Succ || PredChain == &SuccChain ||
148909467b48Spatrick         (BlockFilter && !BlockFilter->count(Pred)) ||
149009467b48Spatrick         PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
149109467b48Spatrick         // This check is redundant except for look ahead. This function is
149209467b48Spatrick         // called for lookahead by isProfitableToTailDup when BB hasn't been
149309467b48Spatrick         // placed yet.
149409467b48Spatrick         (Pred == BB))
149509467b48Spatrick       continue;
149609467b48Spatrick     // Do backward checking.
149709467b48Spatrick     // For all cases above, we need a backward checking to filter out edges that
149809467b48Spatrick     // are not 'strongly' biased.
149909467b48Spatrick     // BB  Pred
150009467b48Spatrick     //  \ /
150109467b48Spatrick     //  Succ
150209467b48Spatrick     // We select edge BB->Succ if
150309467b48Spatrick     //      freq(BB->Succ) > freq(Succ) * HotProb
150409467b48Spatrick     //      i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
150509467b48Spatrick     //      HotProb
150609467b48Spatrick     //      i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
150709467b48Spatrick     // Case 1 is covered too, because the first equation reduces to:
150809467b48Spatrick     // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
150909467b48Spatrick     BlockFrequency PredEdgeFreq =
151009467b48Spatrick         MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
151109467b48Spatrick     if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
151209467b48Spatrick       BadCFGConflict = true;
151309467b48Spatrick       break;
151409467b48Spatrick     }
151509467b48Spatrick   }
151609467b48Spatrick 
151709467b48Spatrick   if (BadCFGConflict) {
151809467b48Spatrick     LLVM_DEBUG(dbgs() << "    Not a candidate: " << getBlockName(Succ) << " -> "
151909467b48Spatrick                       << SuccProb << " (prob) (non-cold CFG conflict)\n");
152009467b48Spatrick     return true;
152109467b48Spatrick   }
152209467b48Spatrick 
152309467b48Spatrick   return false;
152409467b48Spatrick }
152509467b48Spatrick 
152609467b48Spatrick /// Select the best successor for a block.
152709467b48Spatrick ///
152809467b48Spatrick /// This looks across all successors of a particular block and attempts to
152909467b48Spatrick /// select the "best" one to be the layout successor. It only considers direct
153009467b48Spatrick /// successors which also pass the block filter. It will attempt to avoid
153109467b48Spatrick /// breaking CFG structure, but cave and break such structures in the case of
153209467b48Spatrick /// very hot successor edges.
153309467b48Spatrick ///
153409467b48Spatrick /// \returns The best successor block found, or null if none are viable, along
153509467b48Spatrick /// with a boolean indicating if tail duplication is necessary.
153609467b48Spatrick MachineBlockPlacement::BlockAndTailDupResult
153709467b48Spatrick MachineBlockPlacement::selectBestSuccessor(
153809467b48Spatrick     const MachineBasicBlock *BB, const BlockChain &Chain,
153909467b48Spatrick     const BlockFilterSet *BlockFilter) {
154009467b48Spatrick   const BranchProbability HotProb(StaticLikelyProb, 100);
154109467b48Spatrick 
154209467b48Spatrick   BlockAndTailDupResult BestSucc = { nullptr, false };
154309467b48Spatrick   auto BestProb = BranchProbability::getZero();
154409467b48Spatrick 
154509467b48Spatrick   SmallVector<MachineBasicBlock *, 4> Successors;
154609467b48Spatrick   auto AdjustedSumProb =
154709467b48Spatrick       collectViableSuccessors(BB, Chain, BlockFilter, Successors);
154809467b48Spatrick 
154909467b48Spatrick   LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
155009467b48Spatrick                     << "\n");
155109467b48Spatrick 
155209467b48Spatrick   // if we already precomputed the best successor for BB, return that if still
155309467b48Spatrick   // applicable.
155409467b48Spatrick   auto FoundEdge = ComputedEdges.find(BB);
155509467b48Spatrick   if (FoundEdge != ComputedEdges.end()) {
155609467b48Spatrick     MachineBasicBlock *Succ = FoundEdge->second.BB;
155709467b48Spatrick     ComputedEdges.erase(FoundEdge);
155809467b48Spatrick     BlockChain *SuccChain = BlockToChain[Succ];
155909467b48Spatrick     if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
156009467b48Spatrick         SuccChain != &Chain && Succ == *SuccChain->begin())
156109467b48Spatrick       return FoundEdge->second;
156209467b48Spatrick   }
156309467b48Spatrick 
156409467b48Spatrick   // if BB is part of a trellis, Use the trellis to determine the optimal
156509467b48Spatrick   // fallthrough edges
156609467b48Spatrick   if (isTrellis(BB, Successors, Chain, BlockFilter))
156709467b48Spatrick     return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
156809467b48Spatrick                                    BlockFilter);
156909467b48Spatrick 
157009467b48Spatrick   // For blocks with CFG violations, we may be able to lay them out anyway with
157109467b48Spatrick   // tail-duplication. We keep this vector so we can perform the probability
157209467b48Spatrick   // calculations the minimum number of times.
1573*097a140dSpatrick   SmallVector<std::pair<BranchProbability, MachineBasicBlock *>, 4>
157409467b48Spatrick       DupCandidates;
157509467b48Spatrick   for (MachineBasicBlock *Succ : Successors) {
157609467b48Spatrick     auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
157709467b48Spatrick     BranchProbability SuccProb =
157809467b48Spatrick         getAdjustedProbability(RealSuccProb, AdjustedSumProb);
157909467b48Spatrick 
158009467b48Spatrick     BlockChain &SuccChain = *BlockToChain[Succ];
158109467b48Spatrick     // Skip the edge \c BB->Succ if block \c Succ has a better layout
158209467b48Spatrick     // predecessor that yields lower global cost.
158309467b48Spatrick     if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
158409467b48Spatrick                                    Chain, BlockFilter)) {
158509467b48Spatrick       // If tail duplication would make Succ profitable, place it.
158609467b48Spatrick       if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1587*097a140dSpatrick         DupCandidates.emplace_back(SuccProb, Succ);
158809467b48Spatrick       continue;
158909467b48Spatrick     }
159009467b48Spatrick 
159109467b48Spatrick     LLVM_DEBUG(
159209467b48Spatrick         dbgs() << "    Candidate: " << getBlockName(Succ)
159309467b48Spatrick                << ", probability: " << SuccProb
159409467b48Spatrick                << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
159509467b48Spatrick                << "\n");
159609467b48Spatrick 
159709467b48Spatrick     if (BestSucc.BB && BestProb >= SuccProb) {
159809467b48Spatrick       LLVM_DEBUG(dbgs() << "    Not the best candidate, continuing\n");
159909467b48Spatrick       continue;
160009467b48Spatrick     }
160109467b48Spatrick 
160209467b48Spatrick     LLVM_DEBUG(dbgs() << "    Setting it as best candidate\n");
160309467b48Spatrick     BestSucc.BB = Succ;
160409467b48Spatrick     BestProb = SuccProb;
160509467b48Spatrick   }
160609467b48Spatrick   // Handle the tail duplication candidates in order of decreasing probability.
160709467b48Spatrick   // Stop at the first one that is profitable. Also stop if they are less
160809467b48Spatrick   // profitable than BestSucc. Position is important because we preserve it and
160909467b48Spatrick   // prefer first best match. Here we aren't comparing in order, so we capture
161009467b48Spatrick   // the position instead.
161109467b48Spatrick   llvm::stable_sort(DupCandidates,
161209467b48Spatrick                     [](std::tuple<BranchProbability, MachineBasicBlock *> L,
161309467b48Spatrick                        std::tuple<BranchProbability, MachineBasicBlock *> R) {
161409467b48Spatrick                       return std::get<0>(L) > std::get<0>(R);
161509467b48Spatrick                     });
161609467b48Spatrick   for (auto &Tup : DupCandidates) {
161709467b48Spatrick     BranchProbability DupProb;
161809467b48Spatrick     MachineBasicBlock *Succ;
161909467b48Spatrick     std::tie(DupProb, Succ) = Tup;
162009467b48Spatrick     if (DupProb < BestProb)
162109467b48Spatrick       break;
162209467b48Spatrick     if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
162309467b48Spatrick         && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
162409467b48Spatrick       LLVM_DEBUG(dbgs() << "    Candidate: " << getBlockName(Succ)
162509467b48Spatrick                         << ", probability: " << DupProb
162609467b48Spatrick                         << " (Tail Duplicate)\n");
162709467b48Spatrick       BestSucc.BB = Succ;
162809467b48Spatrick       BestSucc.ShouldTailDup = true;
162909467b48Spatrick       break;
163009467b48Spatrick     }
163109467b48Spatrick   }
163209467b48Spatrick 
163309467b48Spatrick   if (BestSucc.BB)
163409467b48Spatrick     LLVM_DEBUG(dbgs() << "    Selected: " << getBlockName(BestSucc.BB) << "\n");
163509467b48Spatrick 
163609467b48Spatrick   return BestSucc;
163709467b48Spatrick }
163809467b48Spatrick 
163909467b48Spatrick /// Select the best block from a worklist.
164009467b48Spatrick ///
164109467b48Spatrick /// This looks through the provided worklist as a list of candidate basic
164209467b48Spatrick /// blocks and select the most profitable one to place. The definition of
164309467b48Spatrick /// profitable only really makes sense in the context of a loop. This returns
164409467b48Spatrick /// the most frequently visited block in the worklist, which in the case of
164509467b48Spatrick /// a loop, is the one most desirable to be physically close to the rest of the
164609467b48Spatrick /// loop body in order to improve i-cache behavior.
164709467b48Spatrick ///
164809467b48Spatrick /// \returns The best block found, or null if none are viable.
164909467b48Spatrick MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
165009467b48Spatrick     const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
165109467b48Spatrick   // Once we need to walk the worklist looking for a candidate, cleanup the
165209467b48Spatrick   // worklist of already placed entries.
165309467b48Spatrick   // FIXME: If this shows up on profiles, it could be folded (at the cost of
165409467b48Spatrick   // some code complexity) into the loop below.
165509467b48Spatrick   WorkList.erase(llvm::remove_if(WorkList,
165609467b48Spatrick                                  [&](MachineBasicBlock *BB) {
165709467b48Spatrick                                    return BlockToChain.lookup(BB) == &Chain;
165809467b48Spatrick                                  }),
165909467b48Spatrick                  WorkList.end());
166009467b48Spatrick 
166109467b48Spatrick   if (WorkList.empty())
166209467b48Spatrick     return nullptr;
166309467b48Spatrick 
166409467b48Spatrick   bool IsEHPad = WorkList[0]->isEHPad();
166509467b48Spatrick 
166609467b48Spatrick   MachineBasicBlock *BestBlock = nullptr;
166709467b48Spatrick   BlockFrequency BestFreq;
166809467b48Spatrick   for (MachineBasicBlock *MBB : WorkList) {
166909467b48Spatrick     assert(MBB->isEHPad() == IsEHPad &&
167009467b48Spatrick            "EHPad mismatch between block and work list.");
167109467b48Spatrick 
167209467b48Spatrick     BlockChain &SuccChain = *BlockToChain[MBB];
167309467b48Spatrick     if (&SuccChain == &Chain)
167409467b48Spatrick       continue;
167509467b48Spatrick 
167609467b48Spatrick     assert(SuccChain.UnscheduledPredecessors == 0 &&
167709467b48Spatrick            "Found CFG-violating block");
167809467b48Spatrick 
167909467b48Spatrick     BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
168009467b48Spatrick     LLVM_DEBUG(dbgs() << "    " << getBlockName(MBB) << " -> ";
168109467b48Spatrick                MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
168209467b48Spatrick 
168309467b48Spatrick     // For ehpad, we layout the least probable first as to avoid jumping back
168409467b48Spatrick     // from least probable landingpads to more probable ones.
168509467b48Spatrick     //
168609467b48Spatrick     // FIXME: Using probability is probably (!) not the best way to achieve
168709467b48Spatrick     // this. We should probably have a more principled approach to layout
168809467b48Spatrick     // cleanup code.
168909467b48Spatrick     //
169009467b48Spatrick     // The goal is to get:
169109467b48Spatrick     //
169209467b48Spatrick     //                 +--------------------------+
169309467b48Spatrick     //                 |                          V
169409467b48Spatrick     // InnerLp -> InnerCleanup    OuterLp -> OuterCleanup -> Resume
169509467b48Spatrick     //
169609467b48Spatrick     // Rather than:
169709467b48Spatrick     //
169809467b48Spatrick     //                 +-------------------------------------+
169909467b48Spatrick     //                 V                                     |
170009467b48Spatrick     // OuterLp -> OuterCleanup -> Resume     InnerLp -> InnerCleanup
170109467b48Spatrick     if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
170209467b48Spatrick       continue;
170309467b48Spatrick 
170409467b48Spatrick     BestBlock = MBB;
170509467b48Spatrick     BestFreq = CandidateFreq;
170609467b48Spatrick   }
170709467b48Spatrick 
170809467b48Spatrick   return BestBlock;
170909467b48Spatrick }
171009467b48Spatrick 
171109467b48Spatrick /// Retrieve the first unplaced basic block.
171209467b48Spatrick ///
171309467b48Spatrick /// This routine is called when we are unable to use the CFG to walk through
171409467b48Spatrick /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
171509467b48Spatrick /// We walk through the function's blocks in order, starting from the
171609467b48Spatrick /// LastUnplacedBlockIt. We update this iterator on each call to avoid
171709467b48Spatrick /// re-scanning the entire sequence on repeated calls to this routine.
171809467b48Spatrick MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
171909467b48Spatrick     const BlockChain &PlacedChain,
172009467b48Spatrick     MachineFunction::iterator &PrevUnplacedBlockIt,
172109467b48Spatrick     const BlockFilterSet *BlockFilter) {
172209467b48Spatrick   for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
172309467b48Spatrick        ++I) {
172409467b48Spatrick     if (BlockFilter && !BlockFilter->count(&*I))
172509467b48Spatrick       continue;
172609467b48Spatrick     if (BlockToChain[&*I] != &PlacedChain) {
172709467b48Spatrick       PrevUnplacedBlockIt = I;
172809467b48Spatrick       // Now select the head of the chain to which the unplaced block belongs
172909467b48Spatrick       // as the block to place. This will force the entire chain to be placed,
173009467b48Spatrick       // and satisfies the requirements of merging chains.
173109467b48Spatrick       return *BlockToChain[&*I]->begin();
173209467b48Spatrick     }
173309467b48Spatrick   }
173409467b48Spatrick   return nullptr;
173509467b48Spatrick }
173609467b48Spatrick 
173709467b48Spatrick void MachineBlockPlacement::fillWorkLists(
173809467b48Spatrick     const MachineBasicBlock *MBB,
173909467b48Spatrick     SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
174009467b48Spatrick     const BlockFilterSet *BlockFilter = nullptr) {
174109467b48Spatrick   BlockChain &Chain = *BlockToChain[MBB];
174209467b48Spatrick   if (!UpdatedPreds.insert(&Chain).second)
174309467b48Spatrick     return;
174409467b48Spatrick 
174509467b48Spatrick   assert(
174609467b48Spatrick       Chain.UnscheduledPredecessors == 0 &&
174709467b48Spatrick       "Attempting to place block with unscheduled predecessors in worklist.");
174809467b48Spatrick   for (MachineBasicBlock *ChainBB : Chain) {
174909467b48Spatrick     assert(BlockToChain[ChainBB] == &Chain &&
175009467b48Spatrick            "Block in chain doesn't match BlockToChain map.");
175109467b48Spatrick     for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
175209467b48Spatrick       if (BlockFilter && !BlockFilter->count(Pred))
175309467b48Spatrick         continue;
175409467b48Spatrick       if (BlockToChain[Pred] == &Chain)
175509467b48Spatrick         continue;
175609467b48Spatrick       ++Chain.UnscheduledPredecessors;
175709467b48Spatrick     }
175809467b48Spatrick   }
175909467b48Spatrick 
176009467b48Spatrick   if (Chain.UnscheduledPredecessors != 0)
176109467b48Spatrick     return;
176209467b48Spatrick 
176309467b48Spatrick   MachineBasicBlock *BB = *Chain.begin();
176409467b48Spatrick   if (BB->isEHPad())
176509467b48Spatrick     EHPadWorkList.push_back(BB);
176609467b48Spatrick   else
176709467b48Spatrick     BlockWorkList.push_back(BB);
176809467b48Spatrick }
176909467b48Spatrick 
177009467b48Spatrick void MachineBlockPlacement::buildChain(
177109467b48Spatrick     const MachineBasicBlock *HeadBB, BlockChain &Chain,
177209467b48Spatrick     BlockFilterSet *BlockFilter) {
177309467b48Spatrick   assert(HeadBB && "BB must not be null.\n");
177409467b48Spatrick   assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
177509467b48Spatrick   MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
177609467b48Spatrick 
177709467b48Spatrick   const MachineBasicBlock *LoopHeaderBB = HeadBB;
177809467b48Spatrick   markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
177909467b48Spatrick   MachineBasicBlock *BB = *std::prev(Chain.end());
178009467b48Spatrick   while (true) {
178109467b48Spatrick     assert(BB && "null block found at end of chain in loop.");
178209467b48Spatrick     assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
178309467b48Spatrick     assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
178409467b48Spatrick 
178509467b48Spatrick 
178609467b48Spatrick     // Look for the best viable successor if there is one to place immediately
178709467b48Spatrick     // after this block.
178809467b48Spatrick     auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
178909467b48Spatrick     MachineBasicBlock* BestSucc = Result.BB;
179009467b48Spatrick     bool ShouldTailDup = Result.ShouldTailDup;
179109467b48Spatrick     if (allowTailDupPlacement())
179209467b48Spatrick       ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc,
179309467b48Spatrick                                                                   Chain,
179409467b48Spatrick                                                                   BlockFilter));
179509467b48Spatrick 
179609467b48Spatrick     // If an immediate successor isn't available, look for the best viable
179709467b48Spatrick     // block among those we've identified as not violating the loop's CFG at
179809467b48Spatrick     // this point. This won't be a fallthrough, but it will increase locality.
179909467b48Spatrick     if (!BestSucc)
180009467b48Spatrick       BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
180109467b48Spatrick     if (!BestSucc)
180209467b48Spatrick       BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
180309467b48Spatrick 
180409467b48Spatrick     if (!BestSucc) {
180509467b48Spatrick       BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
180609467b48Spatrick       if (!BestSucc)
180709467b48Spatrick         break;
180809467b48Spatrick 
180909467b48Spatrick       LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
181009467b48Spatrick                            "layout successor until the CFG reduces\n");
181109467b48Spatrick     }
181209467b48Spatrick 
181309467b48Spatrick     // Placement may have changed tail duplication opportunities.
181409467b48Spatrick     // Check for that now.
181509467b48Spatrick     if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1816*097a140dSpatrick       repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1817*097a140dSpatrick                                        BlockFilter, PrevUnplacedBlockIt);
1818*097a140dSpatrick       // If the chosen successor was duplicated into BB, don't bother laying
1819*097a140dSpatrick       // it out, just go round the loop again with BB as the chain end.
1820*097a140dSpatrick       if (!BB->isSuccessor(BestSucc))
182109467b48Spatrick         continue;
182209467b48Spatrick     }
182309467b48Spatrick 
182409467b48Spatrick     // Place this block, updating the datastructures to reflect its placement.
182509467b48Spatrick     BlockChain &SuccChain = *BlockToChain[BestSucc];
182609467b48Spatrick     // Zero out UnscheduledPredecessors for the successor we're about to merge in case
182709467b48Spatrick     // we selected a successor that didn't fit naturally into the CFG.
182809467b48Spatrick     SuccChain.UnscheduledPredecessors = 0;
182909467b48Spatrick     LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
183009467b48Spatrick                       << getBlockName(BestSucc) << "\n");
183109467b48Spatrick     markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
183209467b48Spatrick     Chain.merge(BestSucc, &SuccChain);
183309467b48Spatrick     BB = *std::prev(Chain.end());
183409467b48Spatrick   }
183509467b48Spatrick 
183609467b48Spatrick   LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
183709467b48Spatrick                     << getBlockName(*Chain.begin()) << "\n");
183809467b48Spatrick }
183909467b48Spatrick 
184009467b48Spatrick // If bottom of block BB has only one successor OldTop, in most cases it is
184109467b48Spatrick // profitable to move it before OldTop, except the following case:
184209467b48Spatrick //
184309467b48Spatrick //     -->OldTop<-
184409467b48Spatrick //     |    .    |
184509467b48Spatrick //     |    .    |
184609467b48Spatrick //     |    .    |
184709467b48Spatrick //     ---Pred   |
184809467b48Spatrick //          |    |
184909467b48Spatrick //         BB-----
185009467b48Spatrick //
185109467b48Spatrick // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
185209467b48Spatrick // layout the other successor below it, so it can't reduce taken branch.
185309467b48Spatrick // In this case we keep its original layout.
185409467b48Spatrick bool
185509467b48Spatrick MachineBlockPlacement::canMoveBottomBlockToTop(
185609467b48Spatrick     const MachineBasicBlock *BottomBlock,
185709467b48Spatrick     const MachineBasicBlock *OldTop) {
185809467b48Spatrick   if (BottomBlock->pred_size() != 1)
185909467b48Spatrick     return true;
186009467b48Spatrick   MachineBasicBlock *Pred = *BottomBlock->pred_begin();
186109467b48Spatrick   if (Pred->succ_size() != 2)
186209467b48Spatrick     return true;
186309467b48Spatrick 
186409467b48Spatrick   MachineBasicBlock *OtherBB = *Pred->succ_begin();
186509467b48Spatrick   if (OtherBB == BottomBlock)
186609467b48Spatrick     OtherBB = *Pred->succ_rbegin();
186709467b48Spatrick   if (OtherBB == OldTop)
186809467b48Spatrick     return false;
186909467b48Spatrick 
187009467b48Spatrick   return true;
187109467b48Spatrick }
187209467b48Spatrick 
187309467b48Spatrick // Find out the possible fall through frequence to the top of a loop.
187409467b48Spatrick BlockFrequency
187509467b48Spatrick MachineBlockPlacement::TopFallThroughFreq(
187609467b48Spatrick     const MachineBasicBlock *Top,
187709467b48Spatrick     const BlockFilterSet &LoopBlockSet) {
187809467b48Spatrick   BlockFrequency MaxFreq = 0;
187909467b48Spatrick   for (MachineBasicBlock *Pred : Top->predecessors()) {
188009467b48Spatrick     BlockChain *PredChain = BlockToChain[Pred];
188109467b48Spatrick     if (!LoopBlockSet.count(Pred) &&
188209467b48Spatrick         (!PredChain || Pred == *std::prev(PredChain->end()))) {
188309467b48Spatrick       // Found a Pred block can be placed before Top.
188409467b48Spatrick       // Check if Top is the best successor of Pred.
188509467b48Spatrick       auto TopProb = MBPI->getEdgeProbability(Pred, Top);
188609467b48Spatrick       bool TopOK = true;
188709467b48Spatrick       for (MachineBasicBlock *Succ : Pred->successors()) {
188809467b48Spatrick         auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
188909467b48Spatrick         BlockChain *SuccChain = BlockToChain[Succ];
189009467b48Spatrick         // Check if Succ can be placed after Pred.
189109467b48Spatrick         // Succ should not be in any chain, or it is the head of some chain.
189209467b48Spatrick         if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
189309467b48Spatrick             (!SuccChain || Succ == *SuccChain->begin())) {
189409467b48Spatrick           TopOK = false;
189509467b48Spatrick           break;
189609467b48Spatrick         }
189709467b48Spatrick       }
189809467b48Spatrick       if (TopOK) {
189909467b48Spatrick         BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
190009467b48Spatrick                                   MBPI->getEdgeProbability(Pred, Top);
190109467b48Spatrick         if (EdgeFreq > MaxFreq)
190209467b48Spatrick           MaxFreq = EdgeFreq;
190309467b48Spatrick       }
190409467b48Spatrick     }
190509467b48Spatrick   }
190609467b48Spatrick   return MaxFreq;
190709467b48Spatrick }
190809467b48Spatrick 
190909467b48Spatrick // Compute the fall through gains when move NewTop before OldTop.
191009467b48Spatrick //
191109467b48Spatrick // In following diagram, edges marked as "-" are reduced fallthrough, edges
191209467b48Spatrick // marked as "+" are increased fallthrough, this function computes
191309467b48Spatrick //
191409467b48Spatrick //      SUM(increased fallthrough) - SUM(decreased fallthrough)
191509467b48Spatrick //
191609467b48Spatrick //              |
191709467b48Spatrick //              | -
191809467b48Spatrick //              V
191909467b48Spatrick //        --->OldTop
192009467b48Spatrick //        |     .
192109467b48Spatrick //        |     .
192209467b48Spatrick //       +|     .    +
192309467b48Spatrick //        |   Pred --->
192409467b48Spatrick //        |     |-
192509467b48Spatrick //        |     V
192609467b48Spatrick //        --- NewTop <---
192709467b48Spatrick //              |-
192809467b48Spatrick //              V
192909467b48Spatrick //
193009467b48Spatrick BlockFrequency
193109467b48Spatrick MachineBlockPlacement::FallThroughGains(
193209467b48Spatrick     const MachineBasicBlock *NewTop,
193309467b48Spatrick     const MachineBasicBlock *OldTop,
193409467b48Spatrick     const MachineBasicBlock *ExitBB,
193509467b48Spatrick     const BlockFilterSet &LoopBlockSet) {
193609467b48Spatrick   BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
193709467b48Spatrick   BlockFrequency FallThrough2Exit = 0;
193809467b48Spatrick   if (ExitBB)
193909467b48Spatrick     FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
194009467b48Spatrick         MBPI->getEdgeProbability(NewTop, ExitBB);
194109467b48Spatrick   BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
194209467b48Spatrick       MBPI->getEdgeProbability(NewTop, OldTop);
194309467b48Spatrick 
194409467b48Spatrick   // Find the best Pred of NewTop.
194509467b48Spatrick    MachineBasicBlock *BestPred = nullptr;
194609467b48Spatrick    BlockFrequency FallThroughFromPred = 0;
194709467b48Spatrick    for (MachineBasicBlock *Pred : NewTop->predecessors()) {
194809467b48Spatrick      if (!LoopBlockSet.count(Pred))
194909467b48Spatrick        continue;
195009467b48Spatrick      BlockChain *PredChain = BlockToChain[Pred];
195109467b48Spatrick      if (!PredChain || Pred == *std::prev(PredChain->end())) {
195209467b48Spatrick        BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
195309467b48Spatrick            MBPI->getEdgeProbability(Pred, NewTop);
195409467b48Spatrick        if (EdgeFreq > FallThroughFromPred) {
195509467b48Spatrick          FallThroughFromPred = EdgeFreq;
195609467b48Spatrick          BestPred = Pred;
195709467b48Spatrick        }
195809467b48Spatrick      }
195909467b48Spatrick    }
196009467b48Spatrick 
196109467b48Spatrick    // If NewTop is not placed after Pred, another successor can be placed
196209467b48Spatrick    // after Pred.
196309467b48Spatrick    BlockFrequency NewFreq = 0;
196409467b48Spatrick    if (BestPred) {
196509467b48Spatrick      for (MachineBasicBlock *Succ : BestPred->successors()) {
196609467b48Spatrick        if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
196709467b48Spatrick          continue;
196809467b48Spatrick        if (ComputedEdges.find(Succ) != ComputedEdges.end())
196909467b48Spatrick          continue;
197009467b48Spatrick        BlockChain *SuccChain = BlockToChain[Succ];
197109467b48Spatrick        if ((SuccChain && (Succ != *SuccChain->begin())) ||
197209467b48Spatrick            (SuccChain == BlockToChain[BestPred]))
197309467b48Spatrick          continue;
197409467b48Spatrick        BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
197509467b48Spatrick            MBPI->getEdgeProbability(BestPred, Succ);
197609467b48Spatrick        if (EdgeFreq > NewFreq)
197709467b48Spatrick          NewFreq = EdgeFreq;
197809467b48Spatrick      }
197909467b48Spatrick      BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
198009467b48Spatrick          MBPI->getEdgeProbability(BestPred, NewTop);
198109467b48Spatrick      if (NewFreq > OrigEdgeFreq) {
198209467b48Spatrick        // If NewTop is not the best successor of Pred, then Pred doesn't
198309467b48Spatrick        // fallthrough to NewTop. So there is no FallThroughFromPred and
198409467b48Spatrick        // NewFreq.
198509467b48Spatrick        NewFreq = 0;
198609467b48Spatrick        FallThroughFromPred = 0;
198709467b48Spatrick      }
198809467b48Spatrick    }
198909467b48Spatrick 
199009467b48Spatrick    BlockFrequency Result = 0;
199109467b48Spatrick    BlockFrequency Gains = BackEdgeFreq + NewFreq;
199209467b48Spatrick    BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
199309467b48Spatrick        FallThroughFromPred;
199409467b48Spatrick    if (Gains > Lost)
199509467b48Spatrick      Result = Gains - Lost;
199609467b48Spatrick    return Result;
199709467b48Spatrick }
199809467b48Spatrick 
199909467b48Spatrick /// Helper function of findBestLoopTop. Find the best loop top block
200009467b48Spatrick /// from predecessors of old top.
200109467b48Spatrick ///
200209467b48Spatrick /// Look for a block which is strictly better than the old top for laying
200309467b48Spatrick /// out before the old top of the loop. This looks for only two patterns:
200409467b48Spatrick ///
200509467b48Spatrick ///     1. a block has only one successor, the old loop top
200609467b48Spatrick ///
200709467b48Spatrick ///        Because such a block will always result in an unconditional jump,
200809467b48Spatrick ///        rotating it in front of the old top is always profitable.
200909467b48Spatrick ///
201009467b48Spatrick ///     2. a block has two successors, one is old top, another is exit
201109467b48Spatrick ///        and it has more than one predecessors
201209467b48Spatrick ///
201309467b48Spatrick ///        If it is below one of its predecessors P, only P can fall through to
201409467b48Spatrick ///        it, all other predecessors need a jump to it, and another conditional
201509467b48Spatrick ///        jump to loop header. If it is moved before loop header, all its
201609467b48Spatrick ///        predecessors jump to it, then fall through to loop header. So all its
201709467b48Spatrick ///        predecessors except P can reduce one taken branch.
201809467b48Spatrick ///        At the same time, move it before old top increases the taken branch
201909467b48Spatrick ///        to loop exit block, so the reduced taken branch will be compared with
202009467b48Spatrick ///        the increased taken branch to the loop exit block.
202109467b48Spatrick MachineBasicBlock *
202209467b48Spatrick MachineBlockPlacement::findBestLoopTopHelper(
202309467b48Spatrick     MachineBasicBlock *OldTop,
202409467b48Spatrick     const MachineLoop &L,
202509467b48Spatrick     const BlockFilterSet &LoopBlockSet) {
202609467b48Spatrick   // Check that the header hasn't been fused with a preheader block due to
202709467b48Spatrick   // crazy branches. If it has, we need to start with the header at the top to
202809467b48Spatrick   // prevent pulling the preheader into the loop body.
202909467b48Spatrick   BlockChain &HeaderChain = *BlockToChain[OldTop];
203009467b48Spatrick   if (!LoopBlockSet.count(*HeaderChain.begin()))
203109467b48Spatrick     return OldTop;
203209467b48Spatrick 
203309467b48Spatrick   LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
203409467b48Spatrick                     << "\n");
203509467b48Spatrick 
203609467b48Spatrick   BlockFrequency BestGains = 0;
203709467b48Spatrick   MachineBasicBlock *BestPred = nullptr;
203809467b48Spatrick   for (MachineBasicBlock *Pred : OldTop->predecessors()) {
203909467b48Spatrick     if (!LoopBlockSet.count(Pred))
204009467b48Spatrick       continue;
204109467b48Spatrick     if (Pred == L.getHeader())
204209467b48Spatrick       continue;
204309467b48Spatrick     LLVM_DEBUG(dbgs() << "   old top pred: " << getBlockName(Pred) << ", has "
204409467b48Spatrick                       << Pred->succ_size() << " successors, ";
204509467b48Spatrick                MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
204609467b48Spatrick     if (Pred->succ_size() > 2)
204709467b48Spatrick       continue;
204809467b48Spatrick 
204909467b48Spatrick     MachineBasicBlock *OtherBB = nullptr;
205009467b48Spatrick     if (Pred->succ_size() == 2) {
205109467b48Spatrick       OtherBB = *Pred->succ_begin();
205209467b48Spatrick       if (OtherBB == OldTop)
205309467b48Spatrick         OtherBB = *Pred->succ_rbegin();
205409467b48Spatrick     }
205509467b48Spatrick 
205609467b48Spatrick     if (!canMoveBottomBlockToTop(Pred, OldTop))
205709467b48Spatrick       continue;
205809467b48Spatrick 
205909467b48Spatrick     BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
206009467b48Spatrick                                             LoopBlockSet);
206109467b48Spatrick     if ((Gains > 0) && (Gains > BestGains ||
206209467b48Spatrick         ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
206309467b48Spatrick       BestPred = Pred;
206409467b48Spatrick       BestGains = Gains;
206509467b48Spatrick     }
206609467b48Spatrick   }
206709467b48Spatrick 
206809467b48Spatrick   // If no direct predecessor is fine, just use the loop header.
206909467b48Spatrick   if (!BestPred) {
207009467b48Spatrick     LLVM_DEBUG(dbgs() << "    final top unchanged\n");
207109467b48Spatrick     return OldTop;
207209467b48Spatrick   }
207309467b48Spatrick 
207409467b48Spatrick   // Walk backwards through any straight line of predecessors.
207509467b48Spatrick   while (BestPred->pred_size() == 1 &&
207609467b48Spatrick          (*BestPred->pred_begin())->succ_size() == 1 &&
207709467b48Spatrick          *BestPred->pred_begin() != L.getHeader())
207809467b48Spatrick     BestPred = *BestPred->pred_begin();
207909467b48Spatrick 
208009467b48Spatrick   LLVM_DEBUG(dbgs() << "    final top: " << getBlockName(BestPred) << "\n");
208109467b48Spatrick   return BestPred;
208209467b48Spatrick }
208309467b48Spatrick 
208409467b48Spatrick /// Find the best loop top block for layout.
208509467b48Spatrick ///
208609467b48Spatrick /// This function iteratively calls findBestLoopTopHelper, until no new better
208709467b48Spatrick /// BB can be found.
208809467b48Spatrick MachineBasicBlock *
208909467b48Spatrick MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
209009467b48Spatrick                                        const BlockFilterSet &LoopBlockSet) {
209109467b48Spatrick   // Placing the latch block before the header may introduce an extra branch
209209467b48Spatrick   // that skips this block the first time the loop is executed, which we want
209309467b48Spatrick   // to avoid when optimising for size.
209409467b48Spatrick   // FIXME: in theory there is a case that does not introduce a new branch,
209509467b48Spatrick   // i.e. when the layout predecessor does not fallthrough to the loop header.
209609467b48Spatrick   // In practice this never happens though: there always seems to be a preheader
209709467b48Spatrick   // that can fallthrough and that is also placed before the header.
209809467b48Spatrick   bool OptForSize = F->getFunction().hasOptSize() ||
2099*097a140dSpatrick                     llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
210009467b48Spatrick   if (OptForSize)
210109467b48Spatrick     return L.getHeader();
210209467b48Spatrick 
210309467b48Spatrick   MachineBasicBlock *OldTop = nullptr;
210409467b48Spatrick   MachineBasicBlock *NewTop = L.getHeader();
210509467b48Spatrick   while (NewTop != OldTop) {
210609467b48Spatrick     OldTop = NewTop;
210709467b48Spatrick     NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
210809467b48Spatrick     if (NewTop != OldTop)
210909467b48Spatrick       ComputedEdges[NewTop] = { OldTop, false };
211009467b48Spatrick   }
211109467b48Spatrick   return NewTop;
211209467b48Spatrick }
211309467b48Spatrick 
211409467b48Spatrick /// Find the best loop exiting block for layout.
211509467b48Spatrick ///
211609467b48Spatrick /// This routine implements the logic to analyze the loop looking for the best
211709467b48Spatrick /// block to layout at the top of the loop. Typically this is done to maximize
211809467b48Spatrick /// fallthrough opportunities.
211909467b48Spatrick MachineBasicBlock *
212009467b48Spatrick MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
212109467b48Spatrick                                         const BlockFilterSet &LoopBlockSet,
212209467b48Spatrick                                         BlockFrequency &ExitFreq) {
212309467b48Spatrick   // We don't want to layout the loop linearly in all cases. If the loop header
212409467b48Spatrick   // is just a normal basic block in the loop, we want to look for what block
212509467b48Spatrick   // within the loop is the best one to layout at the top. However, if the loop
212609467b48Spatrick   // header has be pre-merged into a chain due to predecessors not having
212709467b48Spatrick   // analyzable branches, *and* the predecessor it is merged with is *not* part
212809467b48Spatrick   // of the loop, rotating the header into the middle of the loop will create
212909467b48Spatrick   // a non-contiguous range of blocks which is Very Bad. So start with the
213009467b48Spatrick   // header and only rotate if safe.
213109467b48Spatrick   BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
213209467b48Spatrick   if (!LoopBlockSet.count(*HeaderChain.begin()))
213309467b48Spatrick     return nullptr;
213409467b48Spatrick 
213509467b48Spatrick   BlockFrequency BestExitEdgeFreq;
213609467b48Spatrick   unsigned BestExitLoopDepth = 0;
213709467b48Spatrick   MachineBasicBlock *ExitingBB = nullptr;
213809467b48Spatrick   // If there are exits to outer loops, loop rotation can severely limit
213909467b48Spatrick   // fallthrough opportunities unless it selects such an exit. Keep a set of
214009467b48Spatrick   // blocks where rotating to exit with that block will reach an outer loop.
214109467b48Spatrick   SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
214209467b48Spatrick 
214309467b48Spatrick   LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
214409467b48Spatrick                     << getBlockName(L.getHeader()) << "\n");
214509467b48Spatrick   for (MachineBasicBlock *MBB : L.getBlocks()) {
214609467b48Spatrick     BlockChain &Chain = *BlockToChain[MBB];
214709467b48Spatrick     // Ensure that this block is at the end of a chain; otherwise it could be
214809467b48Spatrick     // mid-way through an inner loop or a successor of an unanalyzable branch.
214909467b48Spatrick     if (MBB != *std::prev(Chain.end()))
215009467b48Spatrick       continue;
215109467b48Spatrick 
215209467b48Spatrick     // Now walk the successors. We need to establish whether this has a viable
215309467b48Spatrick     // exiting successor and whether it has a viable non-exiting successor.
215409467b48Spatrick     // We store the old exiting state and restore it if a viable looping
215509467b48Spatrick     // successor isn't found.
215609467b48Spatrick     MachineBasicBlock *OldExitingBB = ExitingBB;
215709467b48Spatrick     BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
215809467b48Spatrick     bool HasLoopingSucc = false;
215909467b48Spatrick     for (MachineBasicBlock *Succ : MBB->successors()) {
216009467b48Spatrick       if (Succ->isEHPad())
216109467b48Spatrick         continue;
216209467b48Spatrick       if (Succ == MBB)
216309467b48Spatrick         continue;
216409467b48Spatrick       BlockChain &SuccChain = *BlockToChain[Succ];
216509467b48Spatrick       // Don't split chains, either this chain or the successor's chain.
216609467b48Spatrick       if (&Chain == &SuccChain) {
216709467b48Spatrick         LLVM_DEBUG(dbgs() << "    exiting: " << getBlockName(MBB) << " -> "
216809467b48Spatrick                           << getBlockName(Succ) << " (chain conflict)\n");
216909467b48Spatrick         continue;
217009467b48Spatrick       }
217109467b48Spatrick 
217209467b48Spatrick       auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
217309467b48Spatrick       if (LoopBlockSet.count(Succ)) {
217409467b48Spatrick         LLVM_DEBUG(dbgs() << "    looping: " << getBlockName(MBB) << " -> "
217509467b48Spatrick                           << getBlockName(Succ) << " (" << SuccProb << ")\n");
217609467b48Spatrick         HasLoopingSucc = true;
217709467b48Spatrick         continue;
217809467b48Spatrick       }
217909467b48Spatrick 
218009467b48Spatrick       unsigned SuccLoopDepth = 0;
218109467b48Spatrick       if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
218209467b48Spatrick         SuccLoopDepth = ExitLoop->getLoopDepth();
218309467b48Spatrick         if (ExitLoop->contains(&L))
218409467b48Spatrick           BlocksExitingToOuterLoop.insert(MBB);
218509467b48Spatrick       }
218609467b48Spatrick 
218709467b48Spatrick       BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
218809467b48Spatrick       LLVM_DEBUG(dbgs() << "    exiting: " << getBlockName(MBB) << " -> "
218909467b48Spatrick                         << getBlockName(Succ) << " [L:" << SuccLoopDepth
219009467b48Spatrick                         << "] (";
219109467b48Spatrick                  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
219209467b48Spatrick       // Note that we bias this toward an existing layout successor to retain
219309467b48Spatrick       // incoming order in the absence of better information. The exit must have
219409467b48Spatrick       // a frequency higher than the current exit before we consider breaking
219509467b48Spatrick       // the layout.
219609467b48Spatrick       BranchProbability Bias(100 - ExitBlockBias, 100);
219709467b48Spatrick       if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
219809467b48Spatrick           ExitEdgeFreq > BestExitEdgeFreq ||
219909467b48Spatrick           (MBB->isLayoutSuccessor(Succ) &&
220009467b48Spatrick            !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
220109467b48Spatrick         BestExitEdgeFreq = ExitEdgeFreq;
220209467b48Spatrick         ExitingBB = MBB;
220309467b48Spatrick       }
220409467b48Spatrick     }
220509467b48Spatrick 
220609467b48Spatrick     if (!HasLoopingSucc) {
220709467b48Spatrick       // Restore the old exiting state, no viable looping successor was found.
220809467b48Spatrick       ExitingBB = OldExitingBB;
220909467b48Spatrick       BestExitEdgeFreq = OldBestExitEdgeFreq;
221009467b48Spatrick     }
221109467b48Spatrick   }
221209467b48Spatrick   // Without a candidate exiting block or with only a single block in the
221309467b48Spatrick   // loop, just use the loop header to layout the loop.
221409467b48Spatrick   if (!ExitingBB) {
221509467b48Spatrick     LLVM_DEBUG(
221609467b48Spatrick         dbgs() << "    No other candidate exit blocks, using loop header\n");
221709467b48Spatrick     return nullptr;
221809467b48Spatrick   }
221909467b48Spatrick   if (L.getNumBlocks() == 1) {
222009467b48Spatrick     LLVM_DEBUG(dbgs() << "    Loop has 1 block, using loop header as exit\n");
222109467b48Spatrick     return nullptr;
222209467b48Spatrick   }
222309467b48Spatrick 
222409467b48Spatrick   // Also, if we have exit blocks which lead to outer loops but didn't select
222509467b48Spatrick   // one of them as the exiting block we are rotating toward, disable loop
222609467b48Spatrick   // rotation altogether.
222709467b48Spatrick   if (!BlocksExitingToOuterLoop.empty() &&
222809467b48Spatrick       !BlocksExitingToOuterLoop.count(ExitingBB))
222909467b48Spatrick     return nullptr;
223009467b48Spatrick 
223109467b48Spatrick   LLVM_DEBUG(dbgs() << "  Best exiting block: " << getBlockName(ExitingBB)
223209467b48Spatrick                     << "\n");
223309467b48Spatrick   ExitFreq = BestExitEdgeFreq;
223409467b48Spatrick   return ExitingBB;
223509467b48Spatrick }
223609467b48Spatrick 
223709467b48Spatrick /// Check if there is a fallthrough to loop header Top.
223809467b48Spatrick ///
223909467b48Spatrick ///   1. Look for a Pred that can be layout before Top.
224009467b48Spatrick ///   2. Check if Top is the most possible successor of Pred.
224109467b48Spatrick bool
224209467b48Spatrick MachineBlockPlacement::hasViableTopFallthrough(
224309467b48Spatrick     const MachineBasicBlock *Top,
224409467b48Spatrick     const BlockFilterSet &LoopBlockSet) {
224509467b48Spatrick   for (MachineBasicBlock *Pred : Top->predecessors()) {
224609467b48Spatrick     BlockChain *PredChain = BlockToChain[Pred];
224709467b48Spatrick     if (!LoopBlockSet.count(Pred) &&
224809467b48Spatrick         (!PredChain || Pred == *std::prev(PredChain->end()))) {
224909467b48Spatrick       // Found a Pred block can be placed before Top.
225009467b48Spatrick       // Check if Top is the best successor of Pred.
225109467b48Spatrick       auto TopProb = MBPI->getEdgeProbability(Pred, Top);
225209467b48Spatrick       bool TopOK = true;
225309467b48Spatrick       for (MachineBasicBlock *Succ : Pred->successors()) {
225409467b48Spatrick         auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
225509467b48Spatrick         BlockChain *SuccChain = BlockToChain[Succ];
225609467b48Spatrick         // Check if Succ can be placed after Pred.
225709467b48Spatrick         // Succ should not be in any chain, or it is the head of some chain.
225809467b48Spatrick         if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
225909467b48Spatrick           TopOK = false;
226009467b48Spatrick           break;
226109467b48Spatrick         }
226209467b48Spatrick       }
226309467b48Spatrick       if (TopOK)
226409467b48Spatrick         return true;
226509467b48Spatrick     }
226609467b48Spatrick   }
226709467b48Spatrick   return false;
226809467b48Spatrick }
226909467b48Spatrick 
227009467b48Spatrick /// Attempt to rotate an exiting block to the bottom of the loop.
227109467b48Spatrick ///
227209467b48Spatrick /// Once we have built a chain, try to rotate it to line up the hot exit block
227309467b48Spatrick /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
227409467b48Spatrick /// branches. For example, if the loop has fallthrough into its header and out
227509467b48Spatrick /// of its bottom already, don't rotate it.
227609467b48Spatrick void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
227709467b48Spatrick                                        const MachineBasicBlock *ExitingBB,
227809467b48Spatrick                                        BlockFrequency ExitFreq,
227909467b48Spatrick                                        const BlockFilterSet &LoopBlockSet) {
228009467b48Spatrick   if (!ExitingBB)
228109467b48Spatrick     return;
228209467b48Spatrick 
228309467b48Spatrick   MachineBasicBlock *Top = *LoopChain.begin();
228409467b48Spatrick   MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
228509467b48Spatrick 
228609467b48Spatrick   // If ExitingBB is already the last one in a chain then nothing to do.
228709467b48Spatrick   if (Bottom == ExitingBB)
228809467b48Spatrick     return;
228909467b48Spatrick 
229009467b48Spatrick   bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
229109467b48Spatrick 
229209467b48Spatrick   // If the header has viable fallthrough, check whether the current loop
229309467b48Spatrick   // bottom is a viable exiting block. If so, bail out as rotating will
229409467b48Spatrick   // introduce an unnecessary branch.
229509467b48Spatrick   if (ViableTopFallthrough) {
229609467b48Spatrick     for (MachineBasicBlock *Succ : Bottom->successors()) {
229709467b48Spatrick       BlockChain *SuccChain = BlockToChain[Succ];
229809467b48Spatrick       if (!LoopBlockSet.count(Succ) &&
229909467b48Spatrick           (!SuccChain || Succ == *SuccChain->begin()))
230009467b48Spatrick         return;
230109467b48Spatrick     }
230209467b48Spatrick 
230309467b48Spatrick     // Rotate will destroy the top fallthrough, we need to ensure the new exit
230409467b48Spatrick     // frequency is larger than top fallthrough.
230509467b48Spatrick     BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
230609467b48Spatrick     if (FallThrough2Top >= ExitFreq)
230709467b48Spatrick       return;
230809467b48Spatrick   }
230909467b48Spatrick 
231009467b48Spatrick   BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
231109467b48Spatrick   if (ExitIt == LoopChain.end())
231209467b48Spatrick     return;
231309467b48Spatrick 
231409467b48Spatrick   // Rotating a loop exit to the bottom when there is a fallthrough to top
231509467b48Spatrick   // trades the entry fallthrough for an exit fallthrough.
231609467b48Spatrick   // If there is no bottom->top edge, but the chosen exit block does have
231709467b48Spatrick   // a fallthrough, we break that fallthrough for nothing in return.
231809467b48Spatrick 
231909467b48Spatrick   // Let's consider an example. We have a built chain of basic blocks
232009467b48Spatrick   // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
232109467b48Spatrick   // By doing a rotation we get
232209467b48Spatrick   // Bk+1, ..., Bn, B1, ..., Bk
232309467b48Spatrick   // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
232409467b48Spatrick   // If we had a fallthrough Bk -> Bk+1 it is broken now.
232509467b48Spatrick   // It might be compensated by fallthrough Bn -> B1.
232609467b48Spatrick   // So we have a condition to avoid creation of extra branch by loop rotation.
232709467b48Spatrick   // All below must be true to avoid loop rotation:
232809467b48Spatrick   //   If there is a fallthrough to top (B1)
232909467b48Spatrick   //   There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
233009467b48Spatrick   //   There is no fallthrough from bottom (Bn) to top (B1).
233109467b48Spatrick   // Please note that there is no exit fallthrough from Bn because we checked it
233209467b48Spatrick   // above.
233309467b48Spatrick   if (ViableTopFallthrough) {
233409467b48Spatrick     assert(std::next(ExitIt) != LoopChain.end() &&
233509467b48Spatrick            "Exit should not be last BB");
233609467b48Spatrick     MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
233709467b48Spatrick     if (ExitingBB->isSuccessor(NextBlockInChain))
233809467b48Spatrick       if (!Bottom->isSuccessor(Top))
233909467b48Spatrick         return;
234009467b48Spatrick   }
234109467b48Spatrick 
234209467b48Spatrick   LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
234309467b48Spatrick                     << " at bottom\n");
234409467b48Spatrick   std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
234509467b48Spatrick }
234609467b48Spatrick 
234709467b48Spatrick /// Attempt to rotate a loop based on profile data to reduce branch cost.
234809467b48Spatrick ///
234909467b48Spatrick /// With profile data, we can determine the cost in terms of missed fall through
235009467b48Spatrick /// opportunities when rotating a loop chain and select the best rotation.
235109467b48Spatrick /// Basically, there are three kinds of cost to consider for each rotation:
235209467b48Spatrick ///    1. The possibly missed fall through edge (if it exists) from BB out of
235309467b48Spatrick ///    the loop to the loop header.
235409467b48Spatrick ///    2. The possibly missed fall through edges (if they exist) from the loop
235509467b48Spatrick ///    exits to BB out of the loop.
235609467b48Spatrick ///    3. The missed fall through edge (if it exists) from the last BB to the
235709467b48Spatrick ///    first BB in the loop chain.
235809467b48Spatrick ///  Therefore, the cost for a given rotation is the sum of costs listed above.
235909467b48Spatrick ///  We select the best rotation with the smallest cost.
236009467b48Spatrick void MachineBlockPlacement::rotateLoopWithProfile(
236109467b48Spatrick     BlockChain &LoopChain, const MachineLoop &L,
236209467b48Spatrick     const BlockFilterSet &LoopBlockSet) {
236309467b48Spatrick   auto RotationPos = LoopChain.end();
236409467b48Spatrick 
236509467b48Spatrick   BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
236609467b48Spatrick 
236709467b48Spatrick   // A utility lambda that scales up a block frequency by dividing it by a
236809467b48Spatrick   // branch probability which is the reciprocal of the scale.
236909467b48Spatrick   auto ScaleBlockFrequency = [](BlockFrequency Freq,
237009467b48Spatrick                                 unsigned Scale) -> BlockFrequency {
237109467b48Spatrick     if (Scale == 0)
237209467b48Spatrick       return 0;
237309467b48Spatrick     // Use operator / between BlockFrequency and BranchProbability to implement
237409467b48Spatrick     // saturating multiplication.
237509467b48Spatrick     return Freq / BranchProbability(1, Scale);
237609467b48Spatrick   };
237709467b48Spatrick 
237809467b48Spatrick   // Compute the cost of the missed fall-through edge to the loop header if the
237909467b48Spatrick   // chain head is not the loop header. As we only consider natural loops with
238009467b48Spatrick   // single header, this computation can be done only once.
238109467b48Spatrick   BlockFrequency HeaderFallThroughCost(0);
238209467b48Spatrick   MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
238309467b48Spatrick   for (auto *Pred : ChainHeaderBB->predecessors()) {
238409467b48Spatrick     BlockChain *PredChain = BlockToChain[Pred];
238509467b48Spatrick     if (!LoopBlockSet.count(Pred) &&
238609467b48Spatrick         (!PredChain || Pred == *std::prev(PredChain->end()))) {
238709467b48Spatrick       auto EdgeFreq = MBFI->getBlockFreq(Pred) *
238809467b48Spatrick           MBPI->getEdgeProbability(Pred, ChainHeaderBB);
238909467b48Spatrick       auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
239009467b48Spatrick       // If the predecessor has only an unconditional jump to the header, we
239109467b48Spatrick       // need to consider the cost of this jump.
239209467b48Spatrick       if (Pred->succ_size() == 1)
239309467b48Spatrick         FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
239409467b48Spatrick       HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
239509467b48Spatrick     }
239609467b48Spatrick   }
239709467b48Spatrick 
239809467b48Spatrick   // Here we collect all exit blocks in the loop, and for each exit we find out
239909467b48Spatrick   // its hottest exit edge. For each loop rotation, we define the loop exit cost
240009467b48Spatrick   // as the sum of frequencies of exit edges we collect here, excluding the exit
240109467b48Spatrick   // edge from the tail of the loop chain.
240209467b48Spatrick   SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq;
240309467b48Spatrick   for (auto BB : LoopChain) {
240409467b48Spatrick     auto LargestExitEdgeProb = BranchProbability::getZero();
240509467b48Spatrick     for (auto *Succ : BB->successors()) {
240609467b48Spatrick       BlockChain *SuccChain = BlockToChain[Succ];
240709467b48Spatrick       if (!LoopBlockSet.count(Succ) &&
240809467b48Spatrick           (!SuccChain || Succ == *SuccChain->begin())) {
240909467b48Spatrick         auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
241009467b48Spatrick         LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
241109467b48Spatrick       }
241209467b48Spatrick     }
241309467b48Spatrick     if (LargestExitEdgeProb > BranchProbability::getZero()) {
241409467b48Spatrick       auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
241509467b48Spatrick       ExitsWithFreq.emplace_back(BB, ExitFreq);
241609467b48Spatrick     }
241709467b48Spatrick   }
241809467b48Spatrick 
241909467b48Spatrick   // In this loop we iterate every block in the loop chain and calculate the
242009467b48Spatrick   // cost assuming the block is the head of the loop chain. When the loop ends,
242109467b48Spatrick   // we should have found the best candidate as the loop chain's head.
242209467b48Spatrick   for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
242309467b48Spatrick             EndIter = LoopChain.end();
242409467b48Spatrick        Iter != EndIter; Iter++, TailIter++) {
242509467b48Spatrick     // TailIter is used to track the tail of the loop chain if the block we are
242609467b48Spatrick     // checking (pointed by Iter) is the head of the chain.
242709467b48Spatrick     if (TailIter == LoopChain.end())
242809467b48Spatrick       TailIter = LoopChain.begin();
242909467b48Spatrick 
243009467b48Spatrick     auto TailBB = *TailIter;
243109467b48Spatrick 
243209467b48Spatrick     // Calculate the cost by putting this BB to the top.
243309467b48Spatrick     BlockFrequency Cost = 0;
243409467b48Spatrick 
243509467b48Spatrick     // If the current BB is the loop header, we need to take into account the
243609467b48Spatrick     // cost of the missed fall through edge from outside of the loop to the
243709467b48Spatrick     // header.
243809467b48Spatrick     if (Iter != LoopChain.begin())
243909467b48Spatrick       Cost += HeaderFallThroughCost;
244009467b48Spatrick 
244109467b48Spatrick     // Collect the loop exit cost by summing up frequencies of all exit edges
244209467b48Spatrick     // except the one from the chain tail.
244309467b48Spatrick     for (auto &ExitWithFreq : ExitsWithFreq)
244409467b48Spatrick       if (TailBB != ExitWithFreq.first)
244509467b48Spatrick         Cost += ExitWithFreq.second;
244609467b48Spatrick 
244709467b48Spatrick     // The cost of breaking the once fall-through edge from the tail to the top
244809467b48Spatrick     // of the loop chain. Here we need to consider three cases:
244909467b48Spatrick     // 1. If the tail node has only one successor, then we will get an
245009467b48Spatrick     //    additional jmp instruction. So the cost here is (MisfetchCost +
245109467b48Spatrick     //    JumpInstCost) * tail node frequency.
245209467b48Spatrick     // 2. If the tail node has two successors, then we may still get an
245309467b48Spatrick     //    additional jmp instruction if the layout successor after the loop
245409467b48Spatrick     //    chain is not its CFG successor. Note that the more frequently executed
245509467b48Spatrick     //    jmp instruction will be put ahead of the other one. Assume the
245609467b48Spatrick     //    frequency of those two branches are x and y, where x is the frequency
245709467b48Spatrick     //    of the edge to the chain head, then the cost will be
245809467b48Spatrick     //    (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
245909467b48Spatrick     // 3. If the tail node has more than two successors (this rarely happens),
246009467b48Spatrick     //    we won't consider any additional cost.
246109467b48Spatrick     if (TailBB->isSuccessor(*Iter)) {
246209467b48Spatrick       auto TailBBFreq = MBFI->getBlockFreq(TailBB);
246309467b48Spatrick       if (TailBB->succ_size() == 1)
246409467b48Spatrick         Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
246509467b48Spatrick                                     MisfetchCost + JumpInstCost);
246609467b48Spatrick       else if (TailBB->succ_size() == 2) {
246709467b48Spatrick         auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
246809467b48Spatrick         auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
246909467b48Spatrick         auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
247009467b48Spatrick                                   ? TailBBFreq * TailToHeadProb.getCompl()
247109467b48Spatrick                                   : TailToHeadFreq;
247209467b48Spatrick         Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
247309467b48Spatrick                 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
247409467b48Spatrick       }
247509467b48Spatrick     }
247609467b48Spatrick 
247709467b48Spatrick     LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
247809467b48Spatrick                       << getBlockName(*Iter)
247909467b48Spatrick                       << " to the top: " << Cost.getFrequency() << "\n");
248009467b48Spatrick 
248109467b48Spatrick     if (Cost < SmallestRotationCost) {
248209467b48Spatrick       SmallestRotationCost = Cost;
248309467b48Spatrick       RotationPos = Iter;
248409467b48Spatrick     }
248509467b48Spatrick   }
248609467b48Spatrick 
248709467b48Spatrick   if (RotationPos != LoopChain.end()) {
248809467b48Spatrick     LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
248909467b48Spatrick                       << " to the top\n");
249009467b48Spatrick     std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
249109467b48Spatrick   }
249209467b48Spatrick }
249309467b48Spatrick 
249409467b48Spatrick /// Collect blocks in the given loop that are to be placed.
249509467b48Spatrick ///
249609467b48Spatrick /// When profile data is available, exclude cold blocks from the returned set;
249709467b48Spatrick /// otherwise, collect all blocks in the loop.
249809467b48Spatrick MachineBlockPlacement::BlockFilterSet
249909467b48Spatrick MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
250009467b48Spatrick   BlockFilterSet LoopBlockSet;
250109467b48Spatrick 
250209467b48Spatrick   // Filter cold blocks off from LoopBlockSet when profile data is available.
250309467b48Spatrick   // Collect the sum of frequencies of incoming edges to the loop header from
250409467b48Spatrick   // outside. If we treat the loop as a super block, this is the frequency of
250509467b48Spatrick   // the loop. Then for each block in the loop, we calculate the ratio between
250609467b48Spatrick   // its frequency and the frequency of the loop block. When it is too small,
250709467b48Spatrick   // don't add it to the loop chain. If there are outer loops, then this block
250809467b48Spatrick   // will be merged into the first outer loop chain for which this block is not
250909467b48Spatrick   // cold anymore. This needs precise profile data and we only do this when
251009467b48Spatrick   // profile data is available.
251109467b48Spatrick   if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
251209467b48Spatrick     BlockFrequency LoopFreq(0);
251309467b48Spatrick     for (auto LoopPred : L.getHeader()->predecessors())
251409467b48Spatrick       if (!L.contains(LoopPred))
251509467b48Spatrick         LoopFreq += MBFI->getBlockFreq(LoopPred) *
251609467b48Spatrick                     MBPI->getEdgeProbability(LoopPred, L.getHeader());
251709467b48Spatrick 
251809467b48Spatrick     for (MachineBasicBlock *LoopBB : L.getBlocks()) {
251909467b48Spatrick       auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
252009467b48Spatrick       if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
252109467b48Spatrick         continue;
252209467b48Spatrick       LoopBlockSet.insert(LoopBB);
252309467b48Spatrick     }
252409467b48Spatrick   } else
252509467b48Spatrick     LoopBlockSet.insert(L.block_begin(), L.block_end());
252609467b48Spatrick 
252709467b48Spatrick   return LoopBlockSet;
252809467b48Spatrick }
252909467b48Spatrick 
253009467b48Spatrick /// Forms basic block chains from the natural loop structures.
253109467b48Spatrick ///
253209467b48Spatrick /// These chains are designed to preserve the existing *structure* of the code
253309467b48Spatrick /// as much as possible. We can then stitch the chains together in a way which
253409467b48Spatrick /// both preserves the topological structure and minimizes taken conditional
253509467b48Spatrick /// branches.
253609467b48Spatrick void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
253709467b48Spatrick   // First recurse through any nested loops, building chains for those inner
253809467b48Spatrick   // loops.
253909467b48Spatrick   for (const MachineLoop *InnerLoop : L)
254009467b48Spatrick     buildLoopChains(*InnerLoop);
254109467b48Spatrick 
254209467b48Spatrick   assert(BlockWorkList.empty() &&
254309467b48Spatrick          "BlockWorkList not empty when starting to build loop chains.");
254409467b48Spatrick   assert(EHPadWorkList.empty() &&
254509467b48Spatrick          "EHPadWorkList not empty when starting to build loop chains.");
254609467b48Spatrick   BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
254709467b48Spatrick 
254809467b48Spatrick   // Check if we have profile data for this function. If yes, we will rotate
254909467b48Spatrick   // this loop by modeling costs more precisely which requires the profile data
255009467b48Spatrick   // for better layout.
255109467b48Spatrick   bool RotateLoopWithProfile =
255209467b48Spatrick       ForcePreciseRotationCost ||
255309467b48Spatrick       (PreciseRotationCost && F->getFunction().hasProfileData());
255409467b48Spatrick 
255509467b48Spatrick   // First check to see if there is an obviously preferable top block for the
255609467b48Spatrick   // loop. This will default to the header, but may end up as one of the
255709467b48Spatrick   // predecessors to the header if there is one which will result in strictly
255809467b48Spatrick   // fewer branches in the loop body.
255909467b48Spatrick   MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
256009467b48Spatrick 
256109467b48Spatrick   // If we selected just the header for the loop top, look for a potentially
256209467b48Spatrick   // profitable exit block in the event that rotating the loop can eliminate
256309467b48Spatrick   // branches by placing an exit edge at the bottom.
256409467b48Spatrick   //
256509467b48Spatrick   // Loops are processed innermost to uttermost, make sure we clear
256609467b48Spatrick   // PreferredLoopExit before processing a new loop.
256709467b48Spatrick   PreferredLoopExit = nullptr;
256809467b48Spatrick   BlockFrequency ExitFreq;
256909467b48Spatrick   if (!RotateLoopWithProfile && LoopTop == L.getHeader())
257009467b48Spatrick     PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
257109467b48Spatrick 
257209467b48Spatrick   BlockChain &LoopChain = *BlockToChain[LoopTop];
257309467b48Spatrick 
257409467b48Spatrick   // FIXME: This is a really lame way of walking the chains in the loop: we
257509467b48Spatrick   // walk the blocks, and use a set to prevent visiting a particular chain
257609467b48Spatrick   // twice.
257709467b48Spatrick   SmallPtrSet<BlockChain *, 4> UpdatedPreds;
257809467b48Spatrick   assert(LoopChain.UnscheduledPredecessors == 0 &&
257909467b48Spatrick          "LoopChain should not have unscheduled predecessors.");
258009467b48Spatrick   UpdatedPreds.insert(&LoopChain);
258109467b48Spatrick 
258209467b48Spatrick   for (const MachineBasicBlock *LoopBB : LoopBlockSet)
258309467b48Spatrick     fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
258409467b48Spatrick 
258509467b48Spatrick   buildChain(LoopTop, LoopChain, &LoopBlockSet);
258609467b48Spatrick 
258709467b48Spatrick   if (RotateLoopWithProfile)
258809467b48Spatrick     rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
258909467b48Spatrick   else
259009467b48Spatrick     rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
259109467b48Spatrick 
259209467b48Spatrick   LLVM_DEBUG({
259309467b48Spatrick     // Crash at the end so we get all of the debugging output first.
259409467b48Spatrick     bool BadLoop = false;
259509467b48Spatrick     if (LoopChain.UnscheduledPredecessors) {
259609467b48Spatrick       BadLoop = true;
259709467b48Spatrick       dbgs() << "Loop chain contains a block without its preds placed!\n"
259809467b48Spatrick              << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
259909467b48Spatrick              << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
260009467b48Spatrick     }
260109467b48Spatrick     for (MachineBasicBlock *ChainBB : LoopChain) {
260209467b48Spatrick       dbgs() << "          ... " << getBlockName(ChainBB) << "\n";
260309467b48Spatrick       if (!LoopBlockSet.remove(ChainBB)) {
260409467b48Spatrick         // We don't mark the loop as bad here because there are real situations
260509467b48Spatrick         // where this can occur. For example, with an unanalyzable fallthrough
260609467b48Spatrick         // from a loop block to a non-loop block or vice versa.
260709467b48Spatrick         dbgs() << "Loop chain contains a block not contained by the loop!\n"
260809467b48Spatrick                << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
260909467b48Spatrick                << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
261009467b48Spatrick                << "  Bad block:    " << getBlockName(ChainBB) << "\n";
261109467b48Spatrick       }
261209467b48Spatrick     }
261309467b48Spatrick 
261409467b48Spatrick     if (!LoopBlockSet.empty()) {
261509467b48Spatrick       BadLoop = true;
261609467b48Spatrick       for (const MachineBasicBlock *LoopBB : LoopBlockSet)
261709467b48Spatrick         dbgs() << "Loop contains blocks never placed into a chain!\n"
261809467b48Spatrick                << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
261909467b48Spatrick                << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
262009467b48Spatrick                << "  Bad block:    " << getBlockName(LoopBB) << "\n";
262109467b48Spatrick     }
262209467b48Spatrick     assert(!BadLoop && "Detected problems with the placement of this loop.");
262309467b48Spatrick   });
262409467b48Spatrick 
262509467b48Spatrick   BlockWorkList.clear();
262609467b48Spatrick   EHPadWorkList.clear();
262709467b48Spatrick }
262809467b48Spatrick 
262909467b48Spatrick void MachineBlockPlacement::buildCFGChains() {
263009467b48Spatrick   // Ensure that every BB in the function has an associated chain to simplify
263109467b48Spatrick   // the assumptions of the remaining algorithm.
2632*097a140dSpatrick   SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
263309467b48Spatrick   for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
263409467b48Spatrick        ++FI) {
263509467b48Spatrick     MachineBasicBlock *BB = &*FI;
263609467b48Spatrick     BlockChain *Chain =
263709467b48Spatrick         new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
263809467b48Spatrick     // Also, merge any blocks which we cannot reason about and must preserve
263909467b48Spatrick     // the exact fallthrough behavior for.
264009467b48Spatrick     while (true) {
264109467b48Spatrick       Cond.clear();
2642*097a140dSpatrick       MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
264309467b48Spatrick       if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
264409467b48Spatrick         break;
264509467b48Spatrick 
264609467b48Spatrick       MachineFunction::iterator NextFI = std::next(FI);
264709467b48Spatrick       MachineBasicBlock *NextBB = &*NextFI;
264809467b48Spatrick       // Ensure that the layout successor is a viable block, as we know that
264909467b48Spatrick       // fallthrough is a possibility.
265009467b48Spatrick       assert(NextFI != FE && "Can't fallthrough past the last block.");
265109467b48Spatrick       LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
265209467b48Spatrick                         << getBlockName(BB) << " -> " << getBlockName(NextBB)
265309467b48Spatrick                         << "\n");
265409467b48Spatrick       Chain->merge(NextBB, nullptr);
265509467b48Spatrick #ifndef NDEBUG
265609467b48Spatrick       BlocksWithUnanalyzableExits.insert(&*BB);
265709467b48Spatrick #endif
265809467b48Spatrick       FI = NextFI;
265909467b48Spatrick       BB = NextBB;
266009467b48Spatrick     }
266109467b48Spatrick   }
266209467b48Spatrick 
266309467b48Spatrick   // Build any loop-based chains.
266409467b48Spatrick   PreferredLoopExit = nullptr;
266509467b48Spatrick   for (MachineLoop *L : *MLI)
266609467b48Spatrick     buildLoopChains(*L);
266709467b48Spatrick 
266809467b48Spatrick   assert(BlockWorkList.empty() &&
266909467b48Spatrick          "BlockWorkList should be empty before building final chain.");
267009467b48Spatrick   assert(EHPadWorkList.empty() &&
267109467b48Spatrick          "EHPadWorkList should be empty before building final chain.");
267209467b48Spatrick 
267309467b48Spatrick   SmallPtrSet<BlockChain *, 4> UpdatedPreds;
267409467b48Spatrick   for (MachineBasicBlock &MBB : *F)
267509467b48Spatrick     fillWorkLists(&MBB, UpdatedPreds);
267609467b48Spatrick 
267709467b48Spatrick   BlockChain &FunctionChain = *BlockToChain[&F->front()];
267809467b48Spatrick   buildChain(&F->front(), FunctionChain);
267909467b48Spatrick 
268009467b48Spatrick #ifndef NDEBUG
268109467b48Spatrick   using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
268209467b48Spatrick #endif
268309467b48Spatrick   LLVM_DEBUG({
268409467b48Spatrick     // Crash at the end so we get all of the debugging output first.
268509467b48Spatrick     bool BadFunc = false;
268609467b48Spatrick     FunctionBlockSetType FunctionBlockSet;
268709467b48Spatrick     for (MachineBasicBlock &MBB : *F)
268809467b48Spatrick       FunctionBlockSet.insert(&MBB);
268909467b48Spatrick 
269009467b48Spatrick     for (MachineBasicBlock *ChainBB : FunctionChain)
269109467b48Spatrick       if (!FunctionBlockSet.erase(ChainBB)) {
269209467b48Spatrick         BadFunc = true;
269309467b48Spatrick         dbgs() << "Function chain contains a block not in the function!\n"
269409467b48Spatrick                << "  Bad block:    " << getBlockName(ChainBB) << "\n";
269509467b48Spatrick       }
269609467b48Spatrick 
269709467b48Spatrick     if (!FunctionBlockSet.empty()) {
269809467b48Spatrick       BadFunc = true;
269909467b48Spatrick       for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
270009467b48Spatrick         dbgs() << "Function contains blocks never placed into a chain!\n"
270109467b48Spatrick                << "  Bad block:    " << getBlockName(RemainingBB) << "\n";
270209467b48Spatrick     }
270309467b48Spatrick     assert(!BadFunc && "Detected problems with the block placement.");
270409467b48Spatrick   });
270509467b48Spatrick 
2706*097a140dSpatrick   // Remember original layout ordering, so we can update terminators after
2707*097a140dSpatrick   // reordering to point to the original layout successor.
2708*097a140dSpatrick   SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2709*097a140dSpatrick       F->getNumBlockIDs());
2710*097a140dSpatrick   {
2711*097a140dSpatrick     MachineBasicBlock *LastMBB = nullptr;
2712*097a140dSpatrick     for (auto &MBB : *F) {
2713*097a140dSpatrick       if (LastMBB != nullptr)
2714*097a140dSpatrick         OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2715*097a140dSpatrick       LastMBB = &MBB;
2716*097a140dSpatrick     }
2717*097a140dSpatrick     OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2718*097a140dSpatrick   }
2719*097a140dSpatrick 
272009467b48Spatrick   // Splice the blocks into place.
272109467b48Spatrick   MachineFunction::iterator InsertPos = F->begin();
272209467b48Spatrick   LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
272309467b48Spatrick   for (MachineBasicBlock *ChainBB : FunctionChain) {
272409467b48Spatrick     LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
272509467b48Spatrick                                                             : "          ... ")
272609467b48Spatrick                       << getBlockName(ChainBB) << "\n");
272709467b48Spatrick     if (InsertPos != MachineFunction::iterator(ChainBB))
272809467b48Spatrick       F->splice(InsertPos, ChainBB);
272909467b48Spatrick     else
273009467b48Spatrick       ++InsertPos;
273109467b48Spatrick 
273209467b48Spatrick     // Update the terminator of the previous block.
273309467b48Spatrick     if (ChainBB == *FunctionChain.begin())
273409467b48Spatrick       continue;
273509467b48Spatrick     MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
273609467b48Spatrick 
273709467b48Spatrick     // FIXME: It would be awesome of updateTerminator would just return rather
273809467b48Spatrick     // than assert when the branch cannot be analyzed in order to remove this
273909467b48Spatrick     // boiler plate.
274009467b48Spatrick     Cond.clear();
2741*097a140dSpatrick     MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
274209467b48Spatrick 
274309467b48Spatrick #ifndef NDEBUG
274409467b48Spatrick     if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
274509467b48Spatrick       // Given the exact block placement we chose, we may actually not _need_ to
274609467b48Spatrick       // be able to edit PrevBB's terminator sequence, but not being _able_ to
274709467b48Spatrick       // do that at this point is a bug.
274809467b48Spatrick       assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
274909467b48Spatrick               !PrevBB->canFallThrough()) &&
275009467b48Spatrick              "Unexpected block with un-analyzable fallthrough!");
275109467b48Spatrick       Cond.clear();
275209467b48Spatrick       TBB = FBB = nullptr;
275309467b48Spatrick     }
275409467b48Spatrick #endif
275509467b48Spatrick 
275609467b48Spatrick     // The "PrevBB" is not yet updated to reflect current code layout, so,
275709467b48Spatrick     //   o. it may fall-through to a block without explicit "goto" instruction
275809467b48Spatrick     //      before layout, and no longer fall-through it after layout; or
275909467b48Spatrick     //   o. just opposite.
276009467b48Spatrick     //
276109467b48Spatrick     // analyzeBranch() may return erroneous value for FBB when these two
276209467b48Spatrick     // situations take place. For the first scenario FBB is mistakenly set NULL;
276309467b48Spatrick     // for the 2nd scenario, the FBB, which is expected to be NULL, is
276409467b48Spatrick     // mistakenly pointing to "*BI".
276509467b48Spatrick     // Thus, if the future change needs to use FBB before the layout is set, it
276609467b48Spatrick     // has to correct FBB first by using the code similar to the following:
276709467b48Spatrick     //
276809467b48Spatrick     // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
276909467b48Spatrick     //   PrevBB->updateTerminator();
277009467b48Spatrick     //   Cond.clear();
277109467b48Spatrick     //   TBB = FBB = nullptr;
277209467b48Spatrick     //   if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
277309467b48Spatrick     //     // FIXME: This should never take place.
277409467b48Spatrick     //     TBB = FBB = nullptr;
277509467b48Spatrick     //   }
277609467b48Spatrick     // }
2777*097a140dSpatrick     if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2778*097a140dSpatrick       PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2779*097a140dSpatrick     }
278009467b48Spatrick   }
278109467b48Spatrick 
278209467b48Spatrick   // Fixup the last block.
278309467b48Spatrick   Cond.clear();
2784*097a140dSpatrick   MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2785*097a140dSpatrick   if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2786*097a140dSpatrick     MachineBasicBlock *PrevBB = &F->back();
2787*097a140dSpatrick     PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2788*097a140dSpatrick   }
278909467b48Spatrick 
279009467b48Spatrick   BlockWorkList.clear();
279109467b48Spatrick   EHPadWorkList.clear();
279209467b48Spatrick }
279309467b48Spatrick 
279409467b48Spatrick void MachineBlockPlacement::optimizeBranches() {
279509467b48Spatrick   BlockChain &FunctionChain = *BlockToChain[&F->front()];
2796*097a140dSpatrick   SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
279709467b48Spatrick 
279809467b48Spatrick   // Now that all the basic blocks in the chain have the proper layout,
2799*097a140dSpatrick   // make a final call to analyzeBranch with AllowModify set.
280009467b48Spatrick   // Indeed, the target may be able to optimize the branches in a way we
280109467b48Spatrick   // cannot because all branches may not be analyzable.
280209467b48Spatrick   // E.g., the target may be able to remove an unconditional branch to
280309467b48Spatrick   // a fallthrough when it occurs after predicated terminators.
280409467b48Spatrick   for (MachineBasicBlock *ChainBB : FunctionChain) {
280509467b48Spatrick     Cond.clear();
2806*097a140dSpatrick     MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
280709467b48Spatrick     if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
280809467b48Spatrick       // If PrevBB has a two-way branch, try to re-order the branches
280909467b48Spatrick       // such that we branch to the successor with higher probability first.
281009467b48Spatrick       if (TBB && !Cond.empty() && FBB &&
281109467b48Spatrick           MBPI->getEdgeProbability(ChainBB, FBB) >
281209467b48Spatrick               MBPI->getEdgeProbability(ChainBB, TBB) &&
281309467b48Spatrick           !TII->reverseBranchCondition(Cond)) {
281409467b48Spatrick         LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
281509467b48Spatrick                           << getBlockName(ChainBB) << "\n");
281609467b48Spatrick         LLVM_DEBUG(dbgs() << "    Edge probability: "
281709467b48Spatrick                           << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
281809467b48Spatrick                           << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
281909467b48Spatrick         DebugLoc dl; // FIXME: this is nowhere
282009467b48Spatrick         TII->removeBranch(*ChainBB);
282109467b48Spatrick         TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
282209467b48Spatrick       }
282309467b48Spatrick     }
282409467b48Spatrick   }
282509467b48Spatrick }
282609467b48Spatrick 
282709467b48Spatrick void MachineBlockPlacement::alignBlocks() {
282809467b48Spatrick   // Walk through the backedges of the function now that we have fully laid out
282909467b48Spatrick   // the basic blocks and align the destination of each backedge. We don't rely
283009467b48Spatrick   // exclusively on the loop info here so that we can align backedges in
283109467b48Spatrick   // unnatural CFGs and backedges that were introduced purely because of the
283209467b48Spatrick   // loop rotations done during this layout pass.
283309467b48Spatrick   if (F->getFunction().hasMinSize() ||
283409467b48Spatrick       (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
283509467b48Spatrick     return;
283609467b48Spatrick   BlockChain &FunctionChain = *BlockToChain[&F->front()];
283709467b48Spatrick   if (FunctionChain.begin() == FunctionChain.end())
283809467b48Spatrick     return; // Empty chain.
283909467b48Spatrick 
284009467b48Spatrick   const BranchProbability ColdProb(1, 5); // 20%
284109467b48Spatrick   BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
284209467b48Spatrick   BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
284309467b48Spatrick   for (MachineBasicBlock *ChainBB : FunctionChain) {
284409467b48Spatrick     if (ChainBB == *FunctionChain.begin())
284509467b48Spatrick       continue;
284609467b48Spatrick 
284709467b48Spatrick     // Don't align non-looping basic blocks. These are unlikely to execute
284809467b48Spatrick     // enough times to matter in practice. Note that we'll still handle
284909467b48Spatrick     // unnatural CFGs inside of a natural outer loop (the common case) and
285009467b48Spatrick     // rotated loops.
285109467b48Spatrick     MachineLoop *L = MLI->getLoopFor(ChainBB);
285209467b48Spatrick     if (!L)
285309467b48Spatrick       continue;
285409467b48Spatrick 
285509467b48Spatrick     const Align Align = TLI->getPrefLoopAlignment(L);
285609467b48Spatrick     if (Align == 1)
285709467b48Spatrick       continue; // Don't care about loop alignment.
285809467b48Spatrick 
285909467b48Spatrick     // If the block is cold relative to the function entry don't waste space
286009467b48Spatrick     // aligning it.
286109467b48Spatrick     BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
286209467b48Spatrick     if (Freq < WeightedEntryFreq)
286309467b48Spatrick       continue;
286409467b48Spatrick 
286509467b48Spatrick     // If the block is cold relative to its loop header, don't align it
286609467b48Spatrick     // regardless of what edges into the block exist.
286709467b48Spatrick     MachineBasicBlock *LoopHeader = L->getHeader();
286809467b48Spatrick     BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
286909467b48Spatrick     if (Freq < (LoopHeaderFreq * ColdProb))
287009467b48Spatrick       continue;
287109467b48Spatrick 
287209467b48Spatrick     // If the global profiles indicates so, don't align it.
2873*097a140dSpatrick     if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
287409467b48Spatrick         !TLI->alignLoopsWithOptSize())
287509467b48Spatrick       continue;
287609467b48Spatrick 
287709467b48Spatrick     // Check for the existence of a non-layout predecessor which would benefit
287809467b48Spatrick     // from aligning this block.
287909467b48Spatrick     MachineBasicBlock *LayoutPred =
288009467b48Spatrick         &*std::prev(MachineFunction::iterator(ChainBB));
288109467b48Spatrick 
288209467b48Spatrick     // Force alignment if all the predecessors are jumps. We already checked
288309467b48Spatrick     // that the block isn't cold above.
288409467b48Spatrick     if (!LayoutPred->isSuccessor(ChainBB)) {
288509467b48Spatrick       ChainBB->setAlignment(Align);
288609467b48Spatrick       continue;
288709467b48Spatrick     }
288809467b48Spatrick 
288909467b48Spatrick     // Align this block if the layout predecessor's edge into this block is
289009467b48Spatrick     // cold relative to the block. When this is true, other predecessors make up
289109467b48Spatrick     // all of the hot entries into the block and thus alignment is likely to be
289209467b48Spatrick     // important.
289309467b48Spatrick     BranchProbability LayoutProb =
289409467b48Spatrick         MBPI->getEdgeProbability(LayoutPred, ChainBB);
289509467b48Spatrick     BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
289609467b48Spatrick     if (LayoutEdgeFreq <= (Freq * ColdProb))
289709467b48Spatrick       ChainBB->setAlignment(Align);
289809467b48Spatrick   }
289909467b48Spatrick }
290009467b48Spatrick 
290109467b48Spatrick /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
290209467b48Spatrick /// it was duplicated into its chain predecessor and removed.
290309467b48Spatrick /// \p BB    - Basic block that may be duplicated.
290409467b48Spatrick ///
290509467b48Spatrick /// \p LPred - Chosen layout predecessor of \p BB.
290609467b48Spatrick ///            Updated to be the chain end if LPred is removed.
290709467b48Spatrick /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
290809467b48Spatrick /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
290909467b48Spatrick ///                  Used to identify which blocks to update predecessor
291009467b48Spatrick ///                  counts.
291109467b48Spatrick /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
291209467b48Spatrick ///                          chosen in the given order due to unnatural CFG
291309467b48Spatrick ///                          only needed if \p BB is removed and
291409467b48Spatrick ///                          \p PrevUnplacedBlockIt pointed to \p BB.
291509467b48Spatrick /// @return true if \p BB was removed.
291609467b48Spatrick bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
291709467b48Spatrick     MachineBasicBlock *BB, MachineBasicBlock *&LPred,
291809467b48Spatrick     const MachineBasicBlock *LoopHeaderBB,
291909467b48Spatrick     BlockChain &Chain, BlockFilterSet *BlockFilter,
292009467b48Spatrick     MachineFunction::iterator &PrevUnplacedBlockIt) {
292109467b48Spatrick   bool Removed, DuplicatedToLPred;
292209467b48Spatrick   bool DuplicatedToOriginalLPred;
292309467b48Spatrick   Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
292409467b48Spatrick                                     PrevUnplacedBlockIt,
292509467b48Spatrick                                     DuplicatedToLPred);
292609467b48Spatrick   if (!Removed)
292709467b48Spatrick     return false;
292809467b48Spatrick   DuplicatedToOriginalLPred = DuplicatedToLPred;
292909467b48Spatrick   // Iteratively try to duplicate again. It can happen that a block that is
293009467b48Spatrick   // duplicated into is still small enough to be duplicated again.
293109467b48Spatrick   // No need to call markBlockSuccessors in this case, as the blocks being
293209467b48Spatrick   // duplicated from here on are already scheduled.
2933*097a140dSpatrick   while (DuplicatedToLPred && Removed) {
293409467b48Spatrick     MachineBasicBlock *DupBB, *DupPred;
293509467b48Spatrick     // The removal callback causes Chain.end() to be updated when a block is
293609467b48Spatrick     // removed. On the first pass through the loop, the chain end should be the
293709467b48Spatrick     // same as it was on function entry. On subsequent passes, because we are
293809467b48Spatrick     // duplicating the block at the end of the chain, if it is removed the
293909467b48Spatrick     // chain will have shrunk by one block.
294009467b48Spatrick     BlockChain::iterator ChainEnd = Chain.end();
294109467b48Spatrick     DupBB = *(--ChainEnd);
294209467b48Spatrick     // Now try to duplicate again.
294309467b48Spatrick     if (ChainEnd == Chain.begin())
294409467b48Spatrick       break;
294509467b48Spatrick     DupPred = *std::prev(ChainEnd);
294609467b48Spatrick     Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
294709467b48Spatrick                                       PrevUnplacedBlockIt,
294809467b48Spatrick                                       DuplicatedToLPred);
294909467b48Spatrick   }
295009467b48Spatrick   // If BB was duplicated into LPred, it is now scheduled. But because it was
295109467b48Spatrick   // removed, markChainSuccessors won't be called for its chain. Instead we
295209467b48Spatrick   // call markBlockSuccessors for LPred to achieve the same effect. This must go
295309467b48Spatrick   // at the end because repeating the tail duplication can increase the number
295409467b48Spatrick   // of unscheduled predecessors.
295509467b48Spatrick   LPred = *std::prev(Chain.end());
295609467b48Spatrick   if (DuplicatedToOriginalLPred)
295709467b48Spatrick     markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
295809467b48Spatrick   return true;
295909467b48Spatrick }
296009467b48Spatrick 
296109467b48Spatrick /// Tail duplicate \p BB into (some) predecessors if profitable.
296209467b48Spatrick /// \p BB    - Basic block that may be duplicated
296309467b48Spatrick /// \p LPred - Chosen layout predecessor of \p BB
296409467b48Spatrick /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
296509467b48Spatrick /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
296609467b48Spatrick ///                  Used to identify which blocks to update predecessor
296709467b48Spatrick ///                  counts.
296809467b48Spatrick /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
296909467b48Spatrick ///                          chosen in the given order due to unnatural CFG
297009467b48Spatrick ///                          only needed if \p BB is removed and
297109467b48Spatrick ///                          \p PrevUnplacedBlockIt pointed to \p BB.
2972*097a140dSpatrick /// \p DuplicatedToLPred - True if the block was duplicated into LPred.
297309467b48Spatrick /// \return  - True if the block was duplicated into all preds and removed.
297409467b48Spatrick bool MachineBlockPlacement::maybeTailDuplicateBlock(
297509467b48Spatrick     MachineBasicBlock *BB, MachineBasicBlock *LPred,
297609467b48Spatrick     BlockChain &Chain, BlockFilterSet *BlockFilter,
297709467b48Spatrick     MachineFunction::iterator &PrevUnplacedBlockIt,
297809467b48Spatrick     bool &DuplicatedToLPred) {
297909467b48Spatrick   DuplicatedToLPred = false;
298009467b48Spatrick   if (!shouldTailDuplicate(BB))
298109467b48Spatrick     return false;
298209467b48Spatrick 
298309467b48Spatrick   LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
298409467b48Spatrick                     << "\n");
298509467b48Spatrick 
298609467b48Spatrick   // This has to be a callback because none of it can be done after
298709467b48Spatrick   // BB is deleted.
298809467b48Spatrick   bool Removed = false;
298909467b48Spatrick   auto RemovalCallback =
299009467b48Spatrick       [&](MachineBasicBlock *RemBB) {
299109467b48Spatrick         // Signal to outer function
299209467b48Spatrick         Removed = true;
299309467b48Spatrick 
299409467b48Spatrick         // Conservative default.
299509467b48Spatrick         bool InWorkList = true;
299609467b48Spatrick         // Remove from the Chain and Chain Map
299709467b48Spatrick         if (BlockToChain.count(RemBB)) {
299809467b48Spatrick           BlockChain *Chain = BlockToChain[RemBB];
299909467b48Spatrick           InWorkList = Chain->UnscheduledPredecessors == 0;
300009467b48Spatrick           Chain->remove(RemBB);
300109467b48Spatrick           BlockToChain.erase(RemBB);
300209467b48Spatrick         }
300309467b48Spatrick 
300409467b48Spatrick         // Handle the unplaced block iterator
300509467b48Spatrick         if (&(*PrevUnplacedBlockIt) == RemBB) {
300609467b48Spatrick           PrevUnplacedBlockIt++;
300709467b48Spatrick         }
300809467b48Spatrick 
300909467b48Spatrick         // Handle the Work Lists
301009467b48Spatrick         if (InWorkList) {
301109467b48Spatrick           SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
301209467b48Spatrick           if (RemBB->isEHPad())
301309467b48Spatrick             RemoveList = EHPadWorkList;
301409467b48Spatrick           RemoveList.erase(
301509467b48Spatrick               llvm::remove_if(RemoveList,
301609467b48Spatrick                               [RemBB](MachineBasicBlock *BB) {
301709467b48Spatrick                                 return BB == RemBB;
301809467b48Spatrick                               }),
301909467b48Spatrick               RemoveList.end());
302009467b48Spatrick         }
302109467b48Spatrick 
302209467b48Spatrick         // Handle the filter set
302309467b48Spatrick         if (BlockFilter) {
302409467b48Spatrick           BlockFilter->remove(RemBB);
302509467b48Spatrick         }
302609467b48Spatrick 
302709467b48Spatrick         // Remove the block from loop info.
302809467b48Spatrick         MLI->removeBlock(RemBB);
302909467b48Spatrick         if (RemBB == PreferredLoopExit)
303009467b48Spatrick           PreferredLoopExit = nullptr;
303109467b48Spatrick 
303209467b48Spatrick         LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
303309467b48Spatrick                           << getBlockName(RemBB) << "\n");
303409467b48Spatrick       };
303509467b48Spatrick   auto RemovalCallbackRef =
303609467b48Spatrick       function_ref<void(MachineBasicBlock*)>(RemovalCallback);
303709467b48Spatrick 
303809467b48Spatrick   SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
303909467b48Spatrick   bool IsSimple = TailDup.isSimpleBB(BB);
3040*097a140dSpatrick   SmallVector<MachineBasicBlock *, 8> CandidatePreds;
3041*097a140dSpatrick   SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3042*097a140dSpatrick   if (F->getFunction().hasProfileData()) {
3043*097a140dSpatrick     // We can do partial duplication with precise profile information.
3044*097a140dSpatrick     findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3045*097a140dSpatrick     if (CandidatePreds.size() == 0)
3046*097a140dSpatrick       return false;
3047*097a140dSpatrick     if (CandidatePreds.size() < BB->pred_size())
3048*097a140dSpatrick       CandidatePtr = &CandidatePreds;
3049*097a140dSpatrick   }
3050*097a140dSpatrick   TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3051*097a140dSpatrick                                  &RemovalCallbackRef, CandidatePtr);
305209467b48Spatrick 
305309467b48Spatrick   // Update UnscheduledPredecessors to reflect tail-duplication.
305409467b48Spatrick   DuplicatedToLPred = false;
305509467b48Spatrick   for (MachineBasicBlock *Pred : DuplicatedPreds) {
305609467b48Spatrick     // We're only looking for unscheduled predecessors that match the filter.
305709467b48Spatrick     BlockChain* PredChain = BlockToChain[Pred];
305809467b48Spatrick     if (Pred == LPred)
305909467b48Spatrick       DuplicatedToLPred = true;
306009467b48Spatrick     if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
306109467b48Spatrick         || PredChain == &Chain)
306209467b48Spatrick       continue;
306309467b48Spatrick     for (MachineBasicBlock *NewSucc : Pred->successors()) {
306409467b48Spatrick       if (BlockFilter && !BlockFilter->count(NewSucc))
306509467b48Spatrick         continue;
306609467b48Spatrick       BlockChain *NewChain = BlockToChain[NewSucc];
306709467b48Spatrick       if (NewChain != &Chain && NewChain != PredChain)
306809467b48Spatrick         NewChain->UnscheduledPredecessors++;
306909467b48Spatrick     }
307009467b48Spatrick   }
307109467b48Spatrick   return Removed;
307209467b48Spatrick }
307309467b48Spatrick 
3074*097a140dSpatrick // Count the number of actual machine instructions.
3075*097a140dSpatrick static uint64_t countMBBInstruction(MachineBasicBlock *MBB) {
3076*097a140dSpatrick   uint64_t InstrCount = 0;
3077*097a140dSpatrick   for (MachineInstr &MI : *MBB) {
3078*097a140dSpatrick     if (!MI.isPHI() && !MI.isMetaInstruction())
3079*097a140dSpatrick       InstrCount += 1;
3080*097a140dSpatrick   }
3081*097a140dSpatrick   return InstrCount;
3082*097a140dSpatrick }
3083*097a140dSpatrick 
3084*097a140dSpatrick // The size cost of duplication is the instruction size of the duplicated block.
3085*097a140dSpatrick // So we should scale the threshold accordingly. But the instruction size is not
3086*097a140dSpatrick // available on all targets, so we use the number of instructions instead.
3087*097a140dSpatrick BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3088*097a140dSpatrick   return DupThreshold.getFrequency() * countMBBInstruction(BB);
3089*097a140dSpatrick }
3090*097a140dSpatrick 
3091*097a140dSpatrick // Returns true if BB is Pred's best successor.
3092*097a140dSpatrick bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3093*097a140dSpatrick                                             MachineBasicBlock *Pred,
3094*097a140dSpatrick                                             BlockFilterSet *BlockFilter) {
3095*097a140dSpatrick   if (BB == Pred)
3096*097a140dSpatrick     return false;
3097*097a140dSpatrick   if (BlockFilter && !BlockFilter->count(Pred))
3098*097a140dSpatrick     return false;
3099*097a140dSpatrick   BlockChain *PredChain = BlockToChain[Pred];
3100*097a140dSpatrick   if (PredChain && (Pred != *std::prev(PredChain->end())))
3101*097a140dSpatrick     return false;
3102*097a140dSpatrick 
3103*097a140dSpatrick   // Find the successor with largest probability excluding BB.
3104*097a140dSpatrick   BranchProbability BestProb = BranchProbability::getZero();
3105*097a140dSpatrick   for (MachineBasicBlock *Succ : Pred->successors())
3106*097a140dSpatrick     if (Succ != BB) {
3107*097a140dSpatrick       if (BlockFilter && !BlockFilter->count(Succ))
3108*097a140dSpatrick         continue;
3109*097a140dSpatrick       BlockChain *SuccChain = BlockToChain[Succ];
3110*097a140dSpatrick       if (SuccChain && (Succ != *SuccChain->begin()))
3111*097a140dSpatrick         continue;
3112*097a140dSpatrick       BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3113*097a140dSpatrick       if (SuccProb > BestProb)
3114*097a140dSpatrick         BestProb = SuccProb;
3115*097a140dSpatrick     }
3116*097a140dSpatrick 
3117*097a140dSpatrick   BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3118*097a140dSpatrick   if (BBProb <= BestProb)
3119*097a140dSpatrick     return false;
3120*097a140dSpatrick 
3121*097a140dSpatrick   // Compute the number of reduced taken branches if Pred falls through to BB
3122*097a140dSpatrick   // instead of another successor. Then compare it with threshold.
3123*097a140dSpatrick   BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
3124*097a140dSpatrick   BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3125*097a140dSpatrick   return Gain > scaleThreshold(BB);
3126*097a140dSpatrick }
3127*097a140dSpatrick 
3128*097a140dSpatrick // Find out the predecessors of BB and BB can be beneficially duplicated into
3129*097a140dSpatrick // them.
3130*097a140dSpatrick void MachineBlockPlacement::findDuplicateCandidates(
3131*097a140dSpatrick     SmallVectorImpl<MachineBasicBlock *> &Candidates,
3132*097a140dSpatrick     MachineBasicBlock *BB,
3133*097a140dSpatrick     BlockFilterSet *BlockFilter) {
3134*097a140dSpatrick   MachineBasicBlock *Fallthrough = nullptr;
3135*097a140dSpatrick   BranchProbability DefaultBranchProb = BranchProbability::getZero();
3136*097a140dSpatrick   BlockFrequency BBDupThreshold(scaleThreshold(BB));
3137*097a140dSpatrick   SmallVector<MachineBasicBlock *, 8> Preds(BB->pred_begin(), BB->pred_end());
3138*097a140dSpatrick   SmallVector<MachineBasicBlock *, 8> Succs(BB->succ_begin(), BB->succ_end());
3139*097a140dSpatrick 
3140*097a140dSpatrick   // Sort for highest frequency.
3141*097a140dSpatrick   auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3142*097a140dSpatrick     return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3143*097a140dSpatrick   };
3144*097a140dSpatrick   auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3145*097a140dSpatrick     return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3146*097a140dSpatrick   };
3147*097a140dSpatrick   llvm::stable_sort(Succs, CmpSucc);
3148*097a140dSpatrick   llvm::stable_sort(Preds, CmpPred);
3149*097a140dSpatrick 
3150*097a140dSpatrick   auto SuccIt = Succs.begin();
3151*097a140dSpatrick   if (SuccIt != Succs.end()) {
3152*097a140dSpatrick     DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3153*097a140dSpatrick   }
3154*097a140dSpatrick 
3155*097a140dSpatrick   // For each predecessors of BB, compute the benefit of duplicating BB,
3156*097a140dSpatrick   // if it is larger than the threshold, add it into Candidates.
3157*097a140dSpatrick   //
3158*097a140dSpatrick   // If we have following control flow.
3159*097a140dSpatrick   //
3160*097a140dSpatrick   //     PB1 PB2 PB3 PB4
3161*097a140dSpatrick   //      \   |  /    /\
3162*097a140dSpatrick   //       \  | /    /  \
3163*097a140dSpatrick   //        \ |/    /    \
3164*097a140dSpatrick   //         BB----/     OB
3165*097a140dSpatrick   //         /\
3166*097a140dSpatrick   //        /  \
3167*097a140dSpatrick   //      SB1 SB2
3168*097a140dSpatrick   //
3169*097a140dSpatrick   // And it can be partially duplicated as
3170*097a140dSpatrick   //
3171*097a140dSpatrick   //   PB2+BB
3172*097a140dSpatrick   //      |  PB1 PB3 PB4
3173*097a140dSpatrick   //      |   |  /    /\
3174*097a140dSpatrick   //      |   | /    /  \
3175*097a140dSpatrick   //      |   |/    /    \
3176*097a140dSpatrick   //      |  BB----/     OB
3177*097a140dSpatrick   //      |\ /|
3178*097a140dSpatrick   //      | X |
3179*097a140dSpatrick   //      |/ \|
3180*097a140dSpatrick   //     SB2 SB1
3181*097a140dSpatrick   //
3182*097a140dSpatrick   // The benefit of duplicating into a predecessor is defined as
3183*097a140dSpatrick   //         Orig_taken_branch - Duplicated_taken_branch
3184*097a140dSpatrick   //
3185*097a140dSpatrick   // The Orig_taken_branch is computed with the assumption that predecessor
3186*097a140dSpatrick   // jumps to BB and the most possible successor is laid out after BB.
3187*097a140dSpatrick   //
3188*097a140dSpatrick   // The Duplicated_taken_branch is computed with the assumption that BB is
3189*097a140dSpatrick   // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3190*097a140dSpatrick   // SB2 for PB2 in our case). If there is no available successor, the combined
3191*097a140dSpatrick   // block jumps to all BB's successor, like PB3 in this example.
3192*097a140dSpatrick   //
3193*097a140dSpatrick   // If a predecessor has multiple successors, so BB can't be duplicated into
3194*097a140dSpatrick   // it. But it can beneficially fall through to BB, and duplicate BB into other
3195*097a140dSpatrick   // predecessors.
3196*097a140dSpatrick   for (MachineBasicBlock *Pred : Preds) {
3197*097a140dSpatrick     BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
3198*097a140dSpatrick 
3199*097a140dSpatrick     if (!TailDup.canTailDuplicate(BB, Pred)) {
3200*097a140dSpatrick       // BB can't be duplicated into Pred, but it is possible to be layout
3201*097a140dSpatrick       // below Pred.
3202*097a140dSpatrick       if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3203*097a140dSpatrick         Fallthrough = Pred;
3204*097a140dSpatrick         if (SuccIt != Succs.end())
3205*097a140dSpatrick           SuccIt++;
3206*097a140dSpatrick       }
3207*097a140dSpatrick       continue;
3208*097a140dSpatrick     }
3209*097a140dSpatrick 
3210*097a140dSpatrick     BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3211*097a140dSpatrick     BlockFrequency DupCost;
3212*097a140dSpatrick     if (SuccIt == Succs.end()) {
3213*097a140dSpatrick       // Jump to all successors;
3214*097a140dSpatrick       if (Succs.size() > 0)
3215*097a140dSpatrick         DupCost += PredFreq;
3216*097a140dSpatrick     } else {
3217*097a140dSpatrick       // Fallthrough to *SuccIt, jump to all other successors;
3218*097a140dSpatrick       DupCost += PredFreq;
3219*097a140dSpatrick       DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3220*097a140dSpatrick     }
3221*097a140dSpatrick 
3222*097a140dSpatrick     assert(OrigCost >= DupCost);
3223*097a140dSpatrick     OrigCost -= DupCost;
3224*097a140dSpatrick     if (OrigCost > BBDupThreshold) {
3225*097a140dSpatrick       Candidates.push_back(Pred);
3226*097a140dSpatrick       if (SuccIt != Succs.end())
3227*097a140dSpatrick         SuccIt++;
3228*097a140dSpatrick     }
3229*097a140dSpatrick   }
3230*097a140dSpatrick 
3231*097a140dSpatrick   // No predecessors can optimally fallthrough to BB.
3232*097a140dSpatrick   // So we can change one duplication into fallthrough.
3233*097a140dSpatrick   if (!Fallthrough) {
3234*097a140dSpatrick     if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3235*097a140dSpatrick       Candidates[0] = Candidates.back();
3236*097a140dSpatrick       Candidates.pop_back();
3237*097a140dSpatrick     }
3238*097a140dSpatrick   }
3239*097a140dSpatrick }
3240*097a140dSpatrick 
3241*097a140dSpatrick void MachineBlockPlacement::initDupThreshold() {
3242*097a140dSpatrick   DupThreshold = 0;
3243*097a140dSpatrick   if (!F->getFunction().hasProfileData())
3244*097a140dSpatrick     return;
3245*097a140dSpatrick 
3246*097a140dSpatrick   BlockFrequency MaxFreq = 0;
3247*097a140dSpatrick   for (MachineBasicBlock &MBB : *F) {
3248*097a140dSpatrick     BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3249*097a140dSpatrick     if (Freq > MaxFreq)
3250*097a140dSpatrick       MaxFreq = Freq;
3251*097a140dSpatrick   }
3252*097a140dSpatrick 
3253*097a140dSpatrick   // FIXME: we may use profile count instead of frequency,
3254*097a140dSpatrick   // and need more fine tuning.
3255*097a140dSpatrick   BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3256*097a140dSpatrick   DupThreshold = MaxFreq * ThresholdProb;
3257*097a140dSpatrick }
3258*097a140dSpatrick 
325909467b48Spatrick bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
326009467b48Spatrick   if (skipFunction(MF.getFunction()))
326109467b48Spatrick     return false;
326209467b48Spatrick 
326309467b48Spatrick   // Check for single-block functions and skip them.
326409467b48Spatrick   if (std::next(MF.begin()) == MF.end())
326509467b48Spatrick     return false;
326609467b48Spatrick 
326709467b48Spatrick   F = &MF;
326809467b48Spatrick   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3269*097a140dSpatrick   MBFI = std::make_unique<MBFIWrapper>(
327009467b48Spatrick       getAnalysis<MachineBlockFrequencyInfo>());
327109467b48Spatrick   MLI = &getAnalysis<MachineLoopInfo>();
327209467b48Spatrick   TII = MF.getSubtarget().getInstrInfo();
327309467b48Spatrick   TLI = MF.getSubtarget().getTargetLowering();
327409467b48Spatrick   MPDT = nullptr;
327509467b48Spatrick   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
327609467b48Spatrick 
3277*097a140dSpatrick   initDupThreshold();
3278*097a140dSpatrick 
327909467b48Spatrick   // Initialize PreferredLoopExit to nullptr here since it may never be set if
328009467b48Spatrick   // there are no MachineLoops.
328109467b48Spatrick   PreferredLoopExit = nullptr;
328209467b48Spatrick 
328309467b48Spatrick   assert(BlockToChain.empty() &&
328409467b48Spatrick          "BlockToChain map should be empty before starting placement.");
328509467b48Spatrick   assert(ComputedEdges.empty() &&
328609467b48Spatrick          "Computed Edge map should be empty before starting placement.");
328709467b48Spatrick 
328809467b48Spatrick   unsigned TailDupSize = TailDupPlacementThreshold;
328909467b48Spatrick   // If only the aggressive threshold is explicitly set, use it.
329009467b48Spatrick   if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
329109467b48Spatrick       TailDupPlacementThreshold.getNumOccurrences() == 0)
329209467b48Spatrick     TailDupSize = TailDupPlacementAggressiveThreshold;
329309467b48Spatrick 
329409467b48Spatrick   TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
329509467b48Spatrick   // For aggressive optimization, we can adjust some thresholds to be less
329609467b48Spatrick   // conservative.
329709467b48Spatrick   if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
329809467b48Spatrick     // At O3 we should be more willing to copy blocks for tail duplication. This
329909467b48Spatrick     // increases size pressure, so we only do it at O3
330009467b48Spatrick     // Do this unless only the regular threshold is explicitly set.
330109467b48Spatrick     if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
330209467b48Spatrick         TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
330309467b48Spatrick       TailDupSize = TailDupPlacementAggressiveThreshold;
330409467b48Spatrick   }
330509467b48Spatrick 
330609467b48Spatrick   if (allowTailDupPlacement()) {
330709467b48Spatrick     MPDT = &getAnalysis<MachinePostDominatorTree>();
330809467b48Spatrick     bool OptForSize = MF.getFunction().hasOptSize() ||
330909467b48Spatrick                       llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
331009467b48Spatrick     if (OptForSize)
331109467b48Spatrick       TailDupSize = 1;
331209467b48Spatrick     bool PreRegAlloc = false;
3313*097a140dSpatrick     TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
331409467b48Spatrick                    /* LayoutMode */ true, TailDupSize);
331509467b48Spatrick     precomputeTriangleChains();
331609467b48Spatrick   }
331709467b48Spatrick 
331809467b48Spatrick   buildCFGChains();
331909467b48Spatrick 
332009467b48Spatrick   // Changing the layout can create new tail merging opportunities.
332109467b48Spatrick   // TailMerge can create jump into if branches that make CFG irreducible for
332209467b48Spatrick   // HW that requires structured CFG.
332309467b48Spatrick   bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
332409467b48Spatrick                          PassConfig->getEnableTailMerge() &&
332509467b48Spatrick                          BranchFoldPlacement;
332609467b48Spatrick   // No tail merging opportunities if the block number is less than four.
332709467b48Spatrick   if (MF.size() > 3 && EnableTailMerge) {
332809467b48Spatrick     unsigned TailMergeSize = TailDupSize + 1;
332909467b48Spatrick     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
333009467b48Spatrick                     *MBPI, PSI, TailMergeSize);
333109467b48Spatrick 
3332*097a140dSpatrick     if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
333309467b48Spatrick                             /*AfterPlacement=*/true)) {
333409467b48Spatrick       // Redo the layout if tail merging creates/removes/moves blocks.
333509467b48Spatrick       BlockToChain.clear();
333609467b48Spatrick       ComputedEdges.clear();
333709467b48Spatrick       // Must redo the post-dominator tree if blocks were changed.
333809467b48Spatrick       if (MPDT)
333909467b48Spatrick         MPDT->runOnMachineFunction(MF);
334009467b48Spatrick       ChainAllocator.DestroyAll();
334109467b48Spatrick       buildCFGChains();
334209467b48Spatrick     }
334309467b48Spatrick   }
334409467b48Spatrick 
334509467b48Spatrick   optimizeBranches();
334609467b48Spatrick   alignBlocks();
334709467b48Spatrick 
334809467b48Spatrick   BlockToChain.clear();
334909467b48Spatrick   ComputedEdges.clear();
335009467b48Spatrick   ChainAllocator.DestroyAll();
335109467b48Spatrick 
335209467b48Spatrick   if (AlignAllBlock)
335309467b48Spatrick     // Align all of the blocks in the function to a specific alignment.
335409467b48Spatrick     for (MachineBasicBlock &MBB : MF)
335509467b48Spatrick       MBB.setAlignment(Align(1ULL << AlignAllBlock));
335609467b48Spatrick   else if (AlignAllNonFallThruBlocks) {
335709467b48Spatrick     // Align all of the blocks that have no fall-through predecessors to a
335809467b48Spatrick     // specific alignment.
335909467b48Spatrick     for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
336009467b48Spatrick       auto LayoutPred = std::prev(MBI);
336109467b48Spatrick       if (!LayoutPred->isSuccessor(&*MBI))
336209467b48Spatrick         MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
336309467b48Spatrick     }
336409467b48Spatrick   }
336509467b48Spatrick   if (ViewBlockLayoutWithBFI != GVDT_None &&
336609467b48Spatrick       (ViewBlockFreqFuncName.empty() ||
336709467b48Spatrick        F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
336809467b48Spatrick     MBFI->view("MBP." + MF.getName(), false);
336909467b48Spatrick   }
337009467b48Spatrick 
337109467b48Spatrick 
337209467b48Spatrick   // We always return true as we have no way to track whether the final order
337309467b48Spatrick   // differs from the original order.
337409467b48Spatrick   return true;
337509467b48Spatrick }
337609467b48Spatrick 
337709467b48Spatrick namespace {
337809467b48Spatrick 
337909467b48Spatrick /// A pass to compute block placement statistics.
338009467b48Spatrick ///
338109467b48Spatrick /// A separate pass to compute interesting statistics for evaluating block
338209467b48Spatrick /// placement. This is separate from the actual placement pass so that they can
338309467b48Spatrick /// be computed in the absence of any placement transformations or when using
338409467b48Spatrick /// alternative placement strategies.
338509467b48Spatrick class MachineBlockPlacementStats : public MachineFunctionPass {
338609467b48Spatrick   /// A handle to the branch probability pass.
338709467b48Spatrick   const MachineBranchProbabilityInfo *MBPI;
338809467b48Spatrick 
338909467b48Spatrick   /// A handle to the function-wide block frequency pass.
339009467b48Spatrick   const MachineBlockFrequencyInfo *MBFI;
339109467b48Spatrick 
339209467b48Spatrick public:
339309467b48Spatrick   static char ID; // Pass identification, replacement for typeid
339409467b48Spatrick 
339509467b48Spatrick   MachineBlockPlacementStats() : MachineFunctionPass(ID) {
339609467b48Spatrick     initializeMachineBlockPlacementStatsPass(*PassRegistry::getPassRegistry());
339709467b48Spatrick   }
339809467b48Spatrick 
339909467b48Spatrick   bool runOnMachineFunction(MachineFunction &F) override;
340009467b48Spatrick 
340109467b48Spatrick   void getAnalysisUsage(AnalysisUsage &AU) const override {
340209467b48Spatrick     AU.addRequired<MachineBranchProbabilityInfo>();
340309467b48Spatrick     AU.addRequired<MachineBlockFrequencyInfo>();
340409467b48Spatrick     AU.setPreservesAll();
340509467b48Spatrick     MachineFunctionPass::getAnalysisUsage(AU);
340609467b48Spatrick   }
340709467b48Spatrick };
340809467b48Spatrick 
340909467b48Spatrick } // end anonymous namespace
341009467b48Spatrick 
341109467b48Spatrick char MachineBlockPlacementStats::ID = 0;
341209467b48Spatrick 
341309467b48Spatrick char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
341409467b48Spatrick 
341509467b48Spatrick INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
341609467b48Spatrick                       "Basic Block Placement Stats", false, false)
341709467b48Spatrick INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
341809467b48Spatrick INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
341909467b48Spatrick INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
342009467b48Spatrick                     "Basic Block Placement Stats", false, false)
342109467b48Spatrick 
342209467b48Spatrick bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
342309467b48Spatrick   // Check for single-block functions and skip them.
342409467b48Spatrick   if (std::next(F.begin()) == F.end())
342509467b48Spatrick     return false;
342609467b48Spatrick 
342709467b48Spatrick   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
342809467b48Spatrick   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
342909467b48Spatrick 
343009467b48Spatrick   for (MachineBasicBlock &MBB : F) {
343109467b48Spatrick     BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
343209467b48Spatrick     Statistic &NumBranches =
343309467b48Spatrick         (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
343409467b48Spatrick     Statistic &BranchTakenFreq =
343509467b48Spatrick         (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
343609467b48Spatrick     for (MachineBasicBlock *Succ : MBB.successors()) {
343709467b48Spatrick       // Skip if this successor is a fallthrough.
343809467b48Spatrick       if (MBB.isLayoutSuccessor(Succ))
343909467b48Spatrick         continue;
344009467b48Spatrick 
344109467b48Spatrick       BlockFrequency EdgeFreq =
344209467b48Spatrick           BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
344309467b48Spatrick       ++NumBranches;
344409467b48Spatrick       BranchTakenFreq += EdgeFreq.getFrequency();
344509467b48Spatrick     }
344609467b48Spatrick   }
344709467b48Spatrick 
344809467b48Spatrick   return false;
344909467b48Spatrick }
3450