//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "machine-combiner"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

STATISTIC(NumInstCombined, "Number of machine instructions combined");

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  const char *getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  preservesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                           MachineTraceMetrics::Trace BlockTrace,
                           SmallVectorImpl<MachineInstr *> &InsInstrs,
                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth, etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}
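
// Illustration with hypothetical virtual registers: given an instruction
// like "%vreg5 = ADDWrr %vreg3, %vreg4", calling getOperandDef on the
// %vreg3 operand yields the unique instruction that defines %vreg3, and
// nullptr when %vreg3 has no unique definition or is defined by a PHI.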

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map from virtual register to the index
/// of the defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {

  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on its
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each new instruction
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register that is not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}
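
// Worked example with hypothetical cycle counts: suppose NewRoot has two use
// operands. One is defined inside the trace at depth 3 by an instruction with
// an operand latency of 4; the other is defined by the first new instruction
// (InstrDepth[0] == 2) with an operand latency of 1. NewRoot's depth is then
// max(3 + 4, 2 + 1) = 7.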

/// Computes instruction latency as the maximum latency of the defined
/// operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {

  assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode());
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// True when the new instruction sequence does not lengthen the critical
/// path. The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the
/// new sequence to replace the old sequence is that it cannot lengthen the
/// critical path. This is decided by the formula
/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack).
/// The slack is the number of cycles Root can be delayed before the critical
/// path becomes longer.
bool MachineCombiner::preservesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {

  assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  // Get depth and latency of NewRoot.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);

  // Get depth, latency and slack of Root.
  unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
  unsigned RootSlack = BlockTrace.getInstrSlack(Root);

  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth
               << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency
               << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack "
               << RootDepth + RootLatency + RootSlack << "\n";);

  // True when the new sequence does not lengthen the critical path.
  return ((NewRootDepth + NewRootLatency) <=
          (RootDepth + RootLatency + RootSlack));
}
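
// Worked example with hypothetical numbers: if NewRootDepth = 6 and
// NewRootLatency = 3, the new sequence completes at cycle 9. With
// RootDepth = 4, RootLatency = 2 and RootSlack = 3, the old sequence may
// finish as late as cycle 9 without lengthening the critical path, so
// 9 <= 9 holds and the substitution is allowed.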

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase the resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {

  // Compute the current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with scheduling classes rather than instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}
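
// Worked example with hypothetical unit counts: if the MUL + ADD being
// replaced occupy 3 cycles on the most constrained pipeline while the single
// MADD that replaces them occupies 2, the resource length does not grow and
// the check passes; had the MADD needed 4 cycles on that pipeline, the
// substitution would be rejected.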

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModel())
    return true;
  return false;
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Pattern;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size, it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met, the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and hasPattern() can order patterns based on an
    // internal cost heuristic.

    if (TII->hasPattern(MI, Pattern)) {
      for (auto P : Pattern) {
        SmallVector<MachineInstr *, 16> InsInstrs;
        SmallVector<MachineInstr *, 16> DelInstrs;
        DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
        if (!MinInstr)
          MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        Traces->verifyAnalysis();
        TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                        InstrIdxForVirtReg);
        // Found a pattern, but did not generate an alternative sequence.
        // This can happen e.g. when an immediate could not be materialized
        // in a single instruction.
        if (InsInstrs.empty())
          continue;
        // Substitute when we optimize for code size and the new sequence has
        // fewer instructions OR
        // the new sequence neither lengthens the critical path nor increases
        // resource pressure.
        if (doSubstitute(InsInstrs.size(), DelInstrs.size()) ||
            (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                      InstrIdxForVirtReg) &&
             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
          for (auto *InstrPtr : InsInstrs)
            MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
          for (auto *InstrPtr : DelInstrs)
            InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

          Changed = true;
          ++NumInstCombined;

          Traces->invalidate(MBB);
          Traces->verifyAnalysis();
          // Eagerly stop after the first pattern fires.
          break;
        } else {
          // Clean up the instructions of the alternative code sequence. There
          // is no use for them.
          for (auto *InstrPtr : InsInstrs) {
            MachineFunction *MF = MBB->getParent();
            MF->DeleteMachineInstr(InstrPtr);
          }
        }
        InstrIdxForVirtReg.clear();
      }
    }
  }

  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;

  OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs()
          << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}