//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative. It transforms if and
/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
///   1. Consider as candidates only CMOVs in innermost loops (assume that
///      most hotspots are represented by these loops).
///   2. Given a group of CMOV instructions that are using the same EFLAGS def
///      instruction:
///      a. Consider them as candidates only if all have the same condition
///         code or the opposite one, to prevent generating more than one
///         conditional jump per EFLAGS def instruction.
///      b. Consider them as candidates only if all are profitable to be
///         converted (assume that one bad conversion may cause a degradation).
///   3. Apply conversion only for loops that are found profitable and only for
///      CMOV candidates that were found profitable.
///      a. A loop is considered profitable only if conversion will reduce its
///         depth cost by some threshold.
///      b. A CMOV is considered profitable if the cost of its condition is
///         higher than the average cost of its true-value and false-value by
///         25% of the branch-misprediction-penalty. This assures no
///         degradation even with 25% branch misprediction.
///
/// Note: This pass is assumed to run on SSA machine code.
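///
/// Example for condition 3.b: assuming a (hypothetical) 20-cycle
/// branch-misprediction-penalty, a CMOV is converted only when its condition
/// cost exceeds the cost of its value operands by at least 5 cycles
/// (25% of 20).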
//
//===----------------------------------------------------------------------===//
//
//  External interfaces:
//      FunctionPass *llvm::createX86CmovConverterPass();
//      bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/CGPassBuilderOption.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "x86-cmov-conversion"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");

// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
    EnableCmovConverter("x86-cmov-converter",
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
                        cl::init(true), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("x86-cmov-converter-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<bool> ForceMemOperand(
    "x86-cmov-converter-force-mem-operand",
    cl::desc("Convert cmovs to branches whenever they have memory operands."),
    cl::init(true), cl::Hidden);

static cl::opt<bool> ForceAll(
    "x86-cmov-converter-force-all",
    cl::desc("Convert all cmovs to branches."),
    cl::init(false), cl::Hidden);

namespace {

/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
  X86CmovConverterPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 cmov Conversion"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  MachineRegisterInfo *MRI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MachineLoopInfo *MLI = nullptr;
  TargetSchedModel TSchedModel;

  /// List of consecutive CMOV instructions.
  using CmovGroup = SmallVector<MachineInstr *, 2>;
  using CmovGroups = SmallVector<CmovGroup, 2>;

  /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
  /// CmovInstGroups accordingly.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff it found any CMOV-group-candidate.
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                             CmovGroups &CmovInstGroups,
                             bool IncludeLoads = false);

  /// Check if it is profitable to transform each CMOV-group-candidate into a
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

} // end anonymous namespace

char X86CmovConverterPass::ID = 0;

void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfoWrapperPass>();
}

bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  if (!EnableCmovConverter)
    return false;

  // If the SelectOptimize pass is enabled, cmovs have already been optimized.
  if (!getCGPassBuilderOption().DisableSelectOptimize)
    return false;

  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << "**********\n");

  bool Changed = false;
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  // Before we handle the more subtle cases of register-register CMOVs inside
  // of potentially hot loops, we want to quickly remove all CMOVs (ForceAll)
  // or the ones with a memory operand (ForceMemOperand option). The latter
  // CMOVs risk a stall waiting for the load to complete, a stall that
  // speculative execution behind a branch is better suited to handle on
  // modern x86 chips.
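  // (Unlike a branch, a cmov has a data dependence on its load operand, so it
  // must wait for the load to complete even when the condition ultimately
  // selects the other operand.)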
  if (ForceMemOperand || ForceAll) {
    CmovGroups AllCmovGroups;
    SmallVector<MachineBasicBlock *, 4> Blocks;
    for (auto &MBB : MF)
      Blocks.push_back(&MBB);
    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
      for (auto &Group : AllCmovGroups) {
        // Skip any group that doesn't do at least one memory operand cmov.
        if (ForceMemOperand && !ForceAll &&
            llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
          continue;

        // For CMOV groups which we can rewrite and which contain a memory
        // load, always rewrite them. On x86, a CMOV will dramatically amplify
        // any memory latency by blocking speculative execution.
        Changed = true;
        convertCmovInstsToBranches(Group);
      }
    }
    // Early return as ForceAll converts all CmovGroups.
    if (ForceAll)
      return Changed;
  }

  //===--------------------------------------------------------------------===//
  // Register-operand Conversion Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //     For each profitable CMOV-group-candidate
  //       convertCmovInstsToBranches() {
  //           * Create FalseBB, SinkBB, Conditional branch to SinkBB.
  //           * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //       }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI->begin(), MLI->end());
  // Note that we need to check size on each iteration as we accumulate child
  // loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    for (MachineLoop *Child : Loops[i]->getSubLoops())
      Loops.push_back(Child);

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}

bool X86CmovConverterPass::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
    bool IncludeLoads) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions in the same MBB that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions are
  //     1. consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. have only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvements (TODOs):
  // --------------------------------------
  // TODO: Add support for X86::CMOVrm instructions.
  // TODO: Add support for X86::SETcc instructions.
  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
  //===--------------------------------------------------------------------===//

  // Currently processed CMOV-group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range, and its opposite condition code.
    X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
                  MemOpCC = X86::COND_INVALID;
    // Indicates a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicates whether the currently processed CMOV-group should be skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      // Skip debug instructions.
      if (I.isDebugInstr())
        continue;

      X86::CondCode CC = X86::getCondFromCMov(I);
      // Check if we found an X86::CMOVrr instruction. If it is marked as
      // unpredictable, skip it and do not convert it to a branch.
      if (CC != X86::COND_INVALID &&
          !I.getFlag(MachineInstr::MIFlag::Unpredictable) &&
          (IncludeLoads || !I.mayLoad())) {
        if (Group.empty()) {
          // We found the first CMOV in the range; reset the flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          // Clear out the prior group's memory operand CC.
          MemOpCC = X86::COND_INVALID;
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or has a
        // different condition code than FirstCC or FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Mark the SkipGroup indicator to skip the currently processed
          // CMOV-group.
          SkipGroup = true;
        if (I.mayLoad()) {
          if (MemOpCC == X86::COND_INVALID)
            // The first memory operand CMOV.
            MemOpCC = CC;
          else if (CC != MemOpCC)
            // Can't handle mixed conditions with memory operands.
            SkipGroup = true;
        }
        // Check if we were relying on zero-extending behavior of the CMOV.
        if (!SkipGroup &&
            llvm::any_of(
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
                [&](MachineInstr &UseI) {
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
                }))
          // FIXME: We should model the cost of using an explicit MOV to handle
          // the zero-extension rather than just refusing to handle this.
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non-X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS to determine the end of the
      // processed range, as there would be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS, /*TRI=*/nullptr)) {
        // Check if the currently processed CMOV-group should not be skipped
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // The end of the basic block is considered the end of the range; check if
    // the currently processed CMOV-group should not be skipped and add it as
    // a CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of CMOV instruction as if it was converted into branch.
/// \param TrueOpDepth depth cost of CMOV true value operand.
/// \param FalseOpDepth depth cost of CMOV false value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  // The depth of the result after branch conversion is
  // TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
  // As we have no info about branch weight, we assume 75% for one and 25% for
  // the other, and pick the result with the largest resulting depth.
  // E.g., for TrueOpDepth = 8 and FalseOpDepth = 2 this yields
  // max(ceil((8*3+2)/4), ceil((2*3+8)/4)) = max(7, 4) = 7.
  return std::max(
      divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
      divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
}

bool X86CmovConverterPass::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations for which to calculate the depth.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type, maps the register to its last def instruction.
  DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps a register operand to its def instruction, which can be nullptr if
  /// it is unknown (e.g., the operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set depth of unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert(Group.begin(), Group.end());

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //   * For a CMOV instruction in the optimized loop the depth is calculated
  //     as:
  //       CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  // Note: the instruction with max depth represents the critical-path in the
  // loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for first `i` iterations.
  // Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in first `i` iterations by optimizing the loop.
  //===--------------------------------------------------------------------===//
  for (DepthInfo &MaxDepth : LoopDepth) {
    for (auto *MBB : Blocks) {
      // Clear physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        // Skip debug instructions.
        if (MI.isDebugInstr())
          continue;
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Check for "isUse()" because "uses()" also returns implicit
          // definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          Register Reg = MO.getReg();
          auto &RDM = RegDefMaps[Reg.isVirtual()];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          Register Reg = MO.getReg();
          RegDefMaps[Reg.isVirtual()][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           The critical-path is iteration independent - there is no
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to
  //           have a depth 12.5% lower than the original loop, per iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           The critical-path is iteration dependent - there is a
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to
  //           the change in Loop-Depth between the 1st and 2nd iterations.
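  //           (I.e., gradient = (Diff[1] - Diff[0]) /
  //                             (LoopDepth[1].Depth - LoopDepth[0].Depth).)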
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with a very small gain, the
  //   gain (in cycles) after the 2nd iteration should not be less than a
  //   given threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must
  //   apply.
  //
  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate whether it is worth
  // optimizing.
  // Worth-Optimize-Group:
  //   Iff it is worth optimizing all CMOV instructions in the group.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than a CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction-penalty.
  //===--------------------------------------------------------------------===//
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid a CMOV instruction whose value is used as a pointer to load
      // from. This is another conservative check, to avoid converting a CMOV
      // used by tree-search-like algorithms, where the branch is hard to
      // predict.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (!UIs.empty() && ++UIs.begin() == UIs.end()) { // Exactly one use.
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS, /*TRI=*/nullptr))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether the EFLAGS operand should be live after the MI
  // instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return true;
    if (I->definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return false;
  }

  // We hit the end of the block; check whether EFLAGS is live into a
  // successor.
  for (MachineBasicBlock *Succ : BB->successors())
    if (Succ->isLiveIn(X86::EFLAGS))
      return true;

  return false;
}

/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
/// group of CMOV instructions, which may contain debug instructions in
/// between, move all debug instructions to after the last CMOV instruction,
/// making the CMOV group consecutive.
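/// For example, a group {CMOV, DBG_VALUE, CMOV} becomes
/// {CMOV, CMOV, DBG_VALUE}.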
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
  assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
         "Last instruction in a CMOV group must be a CMOV instruction");

  SmallVector<MachineInstr *, 2> DBGInstructions;
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
    if (I->isDebugInstr())
      DBGInstructions.push_back(&*I);
  }

  // Splice the debug instructions after the cmov group.
  MachineBasicBlock *MBB = First->getParent();
  for (auto *MI : DBGInstructions)
    MBB->insertAfter(Last, MI->removeFromParent());
}

void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions
  // between the first CMOV and the last CMOV, then pack the group and make
  // the CMOV instructions consecutive by moving the debug instructions to
  // after the last CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern. The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory operand CMOV instructions to cope with this and we ensure
  // memory operand CMOVs are only included with a single condition code.
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
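  // The branch is taken directly to SinkMBB when CC holds; otherwise control
  // falls through to FalseMBB, which was inserted immediately after MBB.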
  BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

  // Add the sink block to the false block's successors.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but it is
  // simpler as the memory operands are always on the false path so we can
  // simply take that input, whatever it is.
  DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input.
      Register FalseReg =
          MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMov(MI) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    Register TmpReg = MRI->createVirtualRegister(RC);

    // Retain debug instr number when unfolded.
    unsigned OldDebugInstrNum = MI.peekDebugInstrNum();
    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    if (OldDebugInstrNum)
      NewCMOV->setDebugInstrNum(OldDebugInstrNum);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that are from other cmovs to the inputs for this
      // block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    MBB->erase(&MI);

    // Add this PHI to the rewrite table.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHIs'
  // destination registers, and the registers that went into the PHI.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    Register DestReg = MIIt->getOperand(0).getReg();
    Register Op1Reg = MIIt->getOperand(1).getReg();
    Register Op2Reg = MIIt->getOperand(2).getReg();

    // If the CMOV we are processing has the opposite condition from the jump
    // we generated, then we have to swap the operands for the PHI that is
    // going to be generated.
    if (X86::getCondFromCMov(*MIIt) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    // SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    //   ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    (void)MIB;
    LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // debug-info: we can just copy the instr-ref number from one instruction
    // to the other, seeing how it's a one-for-one substitution.
    if (unsigned InstrNum = MIIt->peekDebugInstrNum())
      MIB->setDebugInstrNum(InstrNum);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Reset the NoPHIs property if a PHI was inserted to prevent a conflict with
  // the MachineVerifier during testing.
  if (MIItBegin != MIItEnd)
    F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);

  // Add new basic blocks to MachineLoopInfo.
  if (MachineLoop *L = MLI->getLoopFor(MBB)) {
    L->addBasicBlockToLoop(FalseMBB, *MLI);
    L->addBasicBlockToLoop(SinkMBB, *MLI);
  }
}

INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)

FunctionPass *llvm::createX86CmovConverterPass() {
  return new X86CmovConverterPass();
}