//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file contains the X86 implementation of the DAG scheduling mutation to
// pair instructions back to back.
//
//===----------------------------------------------------------------------===//

#include "X86MacroFusion.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetInstrInfo.h"

#define DEBUG_TYPE "misched"

STATISTIC(NumFused, "Number of instr pairs fused");

using namespace llvm;

static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden,
    cl::desc("Enable scheduling for macro fusion."), cl::init(true));

namespace {

/// \brief Verify that the instruction pair, First and Second, should be
/// scheduled back to back. If either instruction is unspecified, then verify
/// that the other instruction may be part of a pair at all.
static bool shouldScheduleAdjacent(const X86Subtarget &ST,
                                   const MachineInstr *First,
                                   const MachineInstr *Second) {
  // Check if this processor supports macro-fusion. Since this is a minor
  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
  // proxy for SandyBridge+.
  if (!ST.hasAVX())
    return false;

  enum {
    FuseTest,
    FuseCmp,
    FuseInc
  } FuseKind;

  assert((First || Second) && "At least one instr must be specified");
  unsigned FirstOpcode = First
                         ? First->getOpcode()
                         : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
  unsigned SecondOpcode = Second
                          ? Second->getOpcode()
                          : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);

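  // Classify the second instruction (the branch) by the EFLAGS it reads,
  // since that determines which flag-producing instructions the decoder can
  // fuse it with. The mapping below follows the macro-fusion tables in
  // Intel's optimization reference manual: TEST/AND pair with any fusible
  // Jcc; CMP/ADD/SUB are excluded from the sign/parity/overflow branches;
  // and INC/DEC, which leave CF unmodified, are additionally excluded from
  // the CF-reading (unsigned comparison) branches.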
  switch (SecondOpcode) {
  default:
    return false;
  case X86::JE_1:
  case X86::JNE_1:
  case X86::JL_1:
  case X86::JLE_1:
  case X86::JG_1:
  case X86::JGE_1:
    FuseKind = FuseInc;
    break;
  case X86::JB_1:
  case X86::JBE_1:
  case X86::JA_1:
  case X86::JAE_1:
    FuseKind = FuseCmp;
    break;
  case X86::JS_1:
  case X86::JNS_1:
  case X86::JP_1:
  case X86::JNP_1:
  case X86::JO_1:
  case X86::JNO_1:
    FuseKind = FuseTest;
    break;
  }

  switch (FirstOpcode) {
  default:
    return false;
  case X86::TEST8rr:
  case X86::TEST16rr:
  case X86::TEST32rr:
  case X86::TEST64rr:
  case X86::TEST8ri:
  case X86::TEST16ri:
  case X86::TEST32ri:
  case X86::TEST32i32:
  case X86::TEST64i32:
  case X86::TEST64ri32:
  case X86::TEST8rm:
  case X86::TEST16rm:
  case X86::TEST32rm:
  case X86::TEST64rm:
  case X86::TEST8ri_NOREX:
  case X86::AND16i16:
  case X86::AND16ri:
  case X86::AND16ri8:
  case X86::AND16rm:
  case X86::AND16rr:
  case X86::AND32i32:
  case X86::AND32ri:
  case X86::AND32ri8:
  case X86::AND32rm:
  case X86::AND32rr:
  case X86::AND64i32:
  case X86::AND64ri32:
  case X86::AND64ri8:
  case X86::AND64rm:
  case X86::AND64rr:
  case X86::AND8i8:
  case X86::AND8ri:
  case X86::AND8rm:
  case X86::AND8rr:
    return true;
  case X86::CMP16i16:
  case X86::CMP16ri:
  case X86::CMP16ri8:
  case X86::CMP16rm:
  case X86::CMP16rr:
  case X86::CMP32i32:
  case X86::CMP32ri:
  case X86::CMP32ri8:
  case X86::CMP32rm:
  case X86::CMP32rr:
  case X86::CMP64i32:
  case X86::CMP64ri32:
  case X86::CMP64ri8:
  case X86::CMP64rm:
  case X86::CMP64rr:
  case X86::CMP8i8:
  case X86::CMP8ri:
  case X86::CMP8rm:
  case X86::CMP8rr:
  case X86::ADD16i16:
  case X86::ADD16ri:
  case X86::ADD16ri8:
  case X86::ADD16ri8_DB:
  case X86::ADD16ri_DB:
  case X86::ADD16rm:
  case X86::ADD16rr:
  case X86::ADD16rr_DB:
  case X86::ADD32i32:
  case X86::ADD32ri:
  case X86::ADD32ri8:
  case X86::ADD32ri8_DB:
  case X86::ADD32ri_DB:
  case X86::ADD32rm:
  case X86::ADD32rr:
  case X86::ADD32rr_DB:
  case X86::ADD64i32:
  case X86::ADD64ri32:
  case X86::ADD64ri32_DB:
  case X86::ADD64ri8:
  case X86::ADD64ri8_DB:
  case X86::ADD64rm:
  case X86::ADD64rr:
  case X86::ADD64rr_DB:
  case X86::ADD8i8:
  case X86::ADD8mi:
  case X86::ADD8mr:
  case X86::ADD8ri:
  case X86::ADD8rm:
  case X86::ADD8rr:
  case X86::SUB16i16:
  case X86::SUB16ri:
  case X86::SUB16ri8:
  case X86::SUB16rm:
  case X86::SUB16rr:
  case X86::SUB32i32:
  case X86::SUB32ri:
  case X86::SUB32ri8:
  case X86::SUB32rm:
  case X86::SUB32rr:
  case X86::SUB64i32:
  case X86::SUB64ri32:
  case X86::SUB64ri8:
  case X86::SUB64rm:
  case X86::SUB64rr:
  case X86::SUB8i8:
  case X86::SUB8ri:
  case X86::SUB8rm:
  case X86::SUB8rr:
    return FuseKind == FuseCmp || FuseKind == FuseInc;
  case X86::INC16r:
  case X86::INC32r:
  case X86::INC64r:
  case X86::INC8r:
  case X86::DEC16r:
  case X86::DEC32r:
  case X86::DEC64r:
  case X86::DEC8r:
    return FuseKind == FuseInc;
  case X86::INSTRUCTION_LIST_END:
    return true;
  }
}

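// As an illustration (assembly sketch, not taken from a real test case):
// keeping
//
//   dec %ecx
//   jne .Lloop
//
// back to back lets a SandyBridge-class decoder combine the DEC/JNE pair
// into a single macro-fused uop; if the scheduler placed another instruction
// between them, the fusion opportunity would be lost.
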
/// \brief Post-process the DAG to create cluster edges between instructions
/// that may be fused by the processor into a single operation.
class X86MacroFusion : public ScheduleDAGMutation {
public:
  X86MacroFusion() {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};

void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>();

  // For now, assume targets can only fuse with the branch.
  SUnit &ExitSU = DAG->ExitSU;
  MachineInstr *Branch = ExitSU.getInstr();
  if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch))
    return;

  for (SDep &PredDep : ExitSU.Preds) {
    if (PredDep.isWeak())
      continue;
    SUnit &SU = *PredDep.getSUnit();
    MachineInstr &Pred = *SU.getInstr();
    if (!shouldScheduleAdjacent(ST, &Pred, Branch))
      continue;

    // Create a single weak edge from SU to ExitSU. The only effect is to
    // cause bottom-up scheduling to heavily prioritize the clustered SU.
    // There is no need to copy predecessor edges from ExitSU to SU, since
    // top-down scheduling cannot prioritize ExitSU anyway. To defer top-down
    // scheduling of SU, we could create an artificial edge from the deepest
    // root, but it hasn't been needed yet.
    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
    (void)Success;
    assert(Success && "No DAG nodes should be reachable from ExitSU");

    // Adjust latency of data deps between the nodes.
    for (SDep &ExitDep : ExitSU.Preds)
      if (ExitDep.getSUnit() == &SU)
        ExitDep.setLatency(0);
    for (SDep &SuccDep : SU.Succs)
      if (SuccDep.getSUnit() == &ExitSU)
        SuccDep.setLatency(0);

    ++NumFused;
    DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
          SU.print(dbgs(), DAG);
          dbgs() << " - ExitSU"
                 << " / " << DAG->TII->getName(Pred.getOpcode()) << " - "
                 << DAG->TII->getName(Branch->getOpcode()) << '\n';);

    break;
  }
}

} // end anonymous namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createX86MacroFusionDAGMutation() {
  return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr;
}

} // end namespace llvm
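
// Registration happens outside this file. As a sketch (the hook below lives
// in the target's pass config, e.g. X86TargetMachine.cpp), the mutation is
// attached when the MachineScheduler DAG is created:
//
//   ScheduleDAGInstrs *
//   createMachineScheduler(MachineSchedContext *C) const override {
//     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
//     DAG->addMutation(createX86MacroFusionDAGMutation());
//     return DAG;
//   }
//
// The factory returns nullptr when -x86-misched-fusion is off, which
// addMutation is expected to ignore, so the option cleanly disables the
// mutation.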