//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "machine-combiner"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

STATISTIC(NumInstCombined, "Number of machine instructions combined");

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  const char *getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          bool NewCodeHasLessInsts);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
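  //
  // For illustration only (made-up numbers): if an operand's defining
  // instruction sits at depth 2 in the trace and the def-to-use latency of
  // that operand is 3, the operand contributes 2 + 3 = 5 cycles; the depth of
  // the new instruction is the maximum of such sums over all its use operands.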
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register that is not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO; the increment skips over the
    // entry for the definition itself in the register's use-def chain.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// True when the new instruction sequence does not lengthen the critical path
/// and the new sequence has fewer instructions or the new sequence improves
/// the critical path.
/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the
/// critical path. This is decided by the formula:
/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack).
/// If the new sequence has an equal length critical path but does not reduce
/// the number of instructions (NewCodeHasLessInsts is false), then it is not
/// considered an improvement. The slack is the number of cycles Root can be
/// delayed before the critical path becomes longer.
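///
/// For illustration only (made-up cycle counts): with NewRootDepth = 4,
/// NewRootLatency = 2, RootDepth = 3, RootLatency = 2 and RootSlack = 1, the
/// new sequence costs 4 + 2 = 6 cycles against 3 + 2 + 1 = 6 for the old one,
/// so it is accepted only if it also has fewer instructions.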
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    bool NewCodeHasLessInsts) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  // Get depth and latency of NewRoot.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);

  // Get depth, latency and slack of Root.
  unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
  unsigned RootSlack = BlockTrace.getInstrSlack(Root);

  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth
               << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency
               << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency = "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack = "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  if (NewCodeHasLessInsts)
    return NewCycleCount <= OldCycleCount;
  else
    return NewCycleCount < OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes.
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with scheduling classes rather than instructions.
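  // Note that getResourceLength() accepts the sched classes of hypothetically
  // inserted and removed instructions, so the alternative sequence can be
  // costed below without actually mutating the block or the trace.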
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.
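    //
    // For illustration only (made-up latencies): if ADD takes 1 cycle, MUL
    // takes 4 and MADD takes 5, fusing an on-path ADD with an off-path MUL
    // replaces 1 cycle on the critical path with 5; the checks below are what
    // reject such a substitution.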

    if (TII->getMachineCombinerPatterns(MI, Patterns)) {
      for (auto P : Patterns) {
        SmallVector<MachineInstr *, 16> InsInstrs;
        SmallVector<MachineInstr *, 16> DelInstrs;
        DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
        if (!MinInstr)
          MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        Traces->verifyAnalysis();
        TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                        InstrIdxForVirtReg);
        unsigned NewInstCount = InsInstrs.size();
        unsigned OldInstCount = DelInstrs.size();
        // Found pattern, but did not generate alternative sequence.
        // This can happen e.g. when an immediate could not be materialized
        // in a single instruction.
        if (!NewInstCount)
          continue;
        // Substitute when we optimize for code size and the new sequence has
        // fewer instructions OR
        // the new sequence neither lengthens the critical path nor increases
        // resource pressure.
        if (doSubstitute(NewInstCount, OldInstCount) ||
            (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                     InstrIdxForVirtReg,
                                     NewInstCount < OldInstCount) &&
             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
          for (auto *InstrPtr : InsInstrs)
            MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
          for (auto *InstrPtr : DelInstrs)
            InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

          Changed = true;
          ++NumInstCombined;

          Traces->invalidate(MBB);
          Traces->verifyAnalysis();
          // Eagerly stop after the first pattern fires.
          break;
        } else {
          // Clean up the instructions of the alternative code sequence. There
          // is no use for them.
          MachineFunction *MF = MBB->getParent();
          for (auto *InstrPtr : InsInstrs)
            MF->DeleteMachineInstr(InstrPtr);
        }
        InstrIdxForVirtReg.clear();
      }
    }
  }

  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;
  OptSize = MF.getFunction()->optForSize();

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs()
          << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}