//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the DAG scheduling mutation
/// to pair instructions back to back.
//
//===----------------------------------------------------------------------===//

#include "X86MacroFusion.h"
#include "X86Subtarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"

#define DEBUG_TYPE "misched"

using namespace llvm;

static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden,
    cl::desc("Enable scheduling for macro fusion."), cl::init(true));

namespace {

/// \brief Verify that the instruction pair, First and Second,
/// should be scheduled back to back. If either instruction is unspecified,
/// then verify that the other instruction may be part of a pair at all.
static bool shouldScheduleAdjacent(const X86Subtarget &ST,
                                   const MachineInstr *First,
                                   const MachineInstr *Second) {
  // Check if this processor supports macro-fusion. Since this is a minor
  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
  // proxy for SandyBridge+.
  if (!ST.hasAVX())
    return false;
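  // For example, on a fusion-capable core a flag-setting instruction that is
  // immediately followed by a conditional branch (illustrative assembly):
  //
  //   testl %eax, %eax
  //   je    .LBB0_2
  //
  // can decode as a single macro-fused micro-op, which is why the scheduler
  // wants such pairs placed back to back.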

  enum {
    FuseTest,
    FuseCmp,
    FuseInc
  } FuseKind;

  assert((First || Second) && "At least one instr must be specified");
  unsigned FirstOpcode = First
                             ? First->getOpcode()
                             : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
  unsigned SecondOpcode = Second
                              ? Second->getOpcode()
                              : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);

  switch (SecondOpcode) {
  default:
    return false;
  case X86::JE_1:
  case X86::JNE_1:
  case X86::JL_1:
  case X86::JLE_1:
  case X86::JG_1:
  case X86::JGE_1:
    FuseKind = FuseInc;
    break;
  case X86::JB_1:
  case X86::JBE_1:
  case X86::JA_1:
  case X86::JAE_1:
    FuseKind = FuseCmp;
    break;
  case X86::JS_1:
  case X86::JNS_1:
  case X86::JP_1:
  case X86::JNP_1:
  case X86::JO_1:
  case X86::JNO_1:
    FuseKind = FuseTest;
    break;
  }

  switch (FirstOpcode) {
  default:
    return false;
  case X86::TEST8rr:
  case X86::TEST16rr:
  case X86::TEST32rr:
  case X86::TEST64rr:
  case X86::TEST8ri:
  case X86::TEST16ri:
  case X86::TEST32ri:
  case X86::TEST32i32:
  case X86::TEST64i32:
  case X86::TEST64ri32:
  case X86::TEST8rm:
  case X86::TEST16rm:
  case X86::TEST32rm:
  case X86::TEST64rm:
  case X86::TEST8ri_NOREX:
  case X86::AND16i16:
  case X86::AND16ri:
  case X86::AND16ri8:
  case X86::AND16rm:
  case X86::AND16rr:
  case X86::AND32i32:
  case X86::AND32ri:
  case X86::AND32ri8:
  case X86::AND32rm:
  case X86::AND32rr:
  case X86::AND64i32:
  case X86::AND64ri32:
  case X86::AND64ri8:
  case X86::AND64rm:
  case X86::AND64rr:
  case X86::AND8i8:
  case X86::AND8ri:
  case X86::AND8rm:
  case X86::AND8rr:
    return true;
  case X86::CMP16i16:
  case X86::CMP16ri:
  case X86::CMP16ri8:
  case X86::CMP16rm:
  case X86::CMP16rr:
  case X86::CMP32i32:
  case X86::CMP32ri:
  case X86::CMP32ri8:
  case X86::CMP32rm:
  case X86::CMP32rr:
  case X86::CMP64i32:
  case X86::CMP64ri32:
  case X86::CMP64ri8:
  case X86::CMP64rm:
  case X86::CMP64rr:
  case X86::CMP8i8:
  case X86::CMP8ri:
  case X86::CMP8rm:
  case X86::CMP8rr:
  case X86::ADD16i16:
  case X86::ADD16ri:
  case X86::ADD16ri8:
  case X86::ADD16ri8_DB:
  case X86::ADD16ri_DB:
  case X86::ADD16rm:
  case X86::ADD16rr:
  case X86::ADD16rr_DB:
  case X86::ADD32i32:
  case X86::ADD32ri:
  case X86::ADD32ri8:
  case X86::ADD32ri8_DB:
  case X86::ADD32ri_DB:
  case X86::ADD32rm:
  case X86::ADD32rr:
  case X86::ADD32rr_DB:
  case X86::ADD64i32:
  case X86::ADD64ri32:
  case X86::ADD64ri32_DB:
  case X86::ADD64ri8:
  case X86::ADD64ri8_DB:
  case X86::ADD64rm:
  case X86::ADD64rr:
  case X86::ADD64rr_DB:
  case X86::ADD8i8:
  case X86::ADD8mi:
  case X86::ADD8mr:
  case X86::ADD8ri:
  case X86::ADD8rm:
  case X86::ADD8rr:
  case X86::SUB16i16:
  case X86::SUB16ri:
  case X86::SUB16ri8:
  case X86::SUB16rm:
  case X86::SUB16rr:
  case X86::SUB32i32:
  case X86::SUB32ri:
  case X86::SUB32ri8:
  case X86::SUB32rm:
  case X86::SUB32rr:
  case X86::SUB64i32:
  case X86::SUB64ri32:
  case X86::SUB64ri8:
  case X86::SUB64rm:
  case X86::SUB64rr:
  case X86::SUB8i8:
  case X86::SUB8ri:
  case X86::SUB8rm:
  case X86::SUB8rr:
    return FuseKind == FuseCmp || FuseKind == FuseInc;
  case X86::INC16r:
  case X86::INC32r:
  case X86::INC64r:
  case X86::INC8r:
  case X86::DEC16r:
  case X86::DEC32r:
  case X86::DEC64r:
  case X86::DEC8r:
    return FuseKind == FuseInc;
  case X86::INSTRUCTION_LIST_END:
    return true;
  }
}
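// Note the sentinel handling above: a null First (or Second) is mapped to
// X86::INSTRUCTION_LIST_END. The FirstOpcode switch accepts that sentinel
// unconditionally, so a call such as
//   shouldScheduleAdjacent(ST, nullptr, Branch)
// asks only whether Branch could be the second half of some fused pair; the
// SecondOpcode switch has no such case, so a null Second always rejects.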

/// \brief Post-process the DAG to create cluster edges between instructions
/// that may be fused by the processor into a single operation.
class X86MacroFusion : public ScheduleDAGMutation {
public:
  X86MacroFusion() {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};

void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>();

  // For now, assume targets can only fuse with the branch.
  SUnit &ExitSU = DAG->ExitSU;
  MachineInstr *Branch = ExitSU.getInstr();
  if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch))
    return;

  for (SDep &PredDep : ExitSU.Preds) {
    if (PredDep.isWeak())
      continue;
    SUnit &SU = *PredDep.getSUnit();
    MachineInstr &Pred = *SU.getInstr();
    if (!shouldScheduleAdjacent(ST, &Pred, Branch))
      continue;

    // Create a single weak edge from SU to ExitSU. The only effect is to
    // cause bottom-up scheduling to heavily prioritize the clustered SU.
    // There is no need to copy predecessor edges from ExitSU to SU, since
    // top-down scheduling cannot prioritize ExitSU anyway. To defer top-down
    // scheduling of SU, we could create an artificial edge from the deepest
    // root, but it hasn't been needed yet.
    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
    (void)Success;
    assert(Success && "No DAG nodes should be reachable from ExitSU");

    // Adjust latency of data deps between the nodes.
    for (SDep &Dep : ExitSU.Preds)
      if (Dep.getSUnit() == &SU)
        Dep.setLatency(0);
    for (SDep &SuccDep : SU.Succs)
      if (SuccDep.getSUnit() == &ExitSU)
        SuccDep.setLatency(0);

    DEBUG(dbgs() << "Macro fuse ";
          SU.print(dbgs(), DAG);
          dbgs() << " - ExitSU" << '\n');

    break;
  }
}

} // end anonymous namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createX86MacroFusionDAGMutation() {
  return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr;
}

} // end namespace llvm
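
// How the mutation is typically registered (a sketch, assuming the usual
// LLVM pattern; the actual hook lives in the target's pass config in
// X86TargetMachine.cpp, not in this file):
//
//   ScheduleDAGInstrs *
//   X86PassConfig::createMachineScheduler(MachineSchedContext *C) const {
//     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
//     DAG->addMutation(createX86MacroFusionDAGMutation());
//     return DAG;
//   }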