//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "machine-combiner"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

STATISTIC(NumInstCombined, "Number of machine instructions combined");

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  const char *getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          bool NewCodeHasLessInsts);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth, etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}
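
// Illustrative note (not part of the original source; the register and opcode
// names are made up): for SSA machine code such as
//   %vreg1 = LDRWui <fi#0>, 0
//   %vreg2 = ADDWrr %vreg0, %vreg1
// getOperandDef on the %vreg1 use of the ADDWrr returns the LDRWui via
// MRI->getUniqueVRegDef(), while a physical-register operand or a value
// defined by a PHI resolves to nullptr.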

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map from virtual register to the index
/// of the defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of the last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {

  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register that is not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}
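
// A hypothetical walk-through of getDepth (all register names and cycle
// numbers are made up): assume InsInstrs holds the two-instruction sequence
//   I0: %vreg10 = MUL %vreg1, %vreg2   // %vreg1/%vreg2 are in the trace at
//                                      // depths 1 and 2, operand latency 4
//   I1: %vreg11 = ADD %vreg10, %vreg3  // %vreg10 is new; InstrIdxForVirtReg
//                                      // maps it to index 0
// Then InstrDepth[0] = max(1 + 4, 2 + 4) = 6 and, with a MUL->ADD operand
// latency of 4 and %vreg3 at depth 0 with latency 1, the returned depth of
// NewRoot is InstrDepth[1] = max(6 + 4, 0 + 1) = 10.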

/// Computes instruction latency as the maximum latency of its defined
/// operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {

  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// True when the new instruction sequence does not lengthen the critical path
/// and the new sequence has fewer instructions or the new sequence improves
/// the critical path.
/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the
/// critical path. This is decided by the formula:
/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack).
/// If the new sequence has an equal length critical path but does not reduce
/// the number of instructions (NewCodeHasLessInsts is false), then it is not
/// considered an improvement. The slack is the number of cycles Root can be
/// delayed before the critical path becomes longer.
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    bool NewCodeHasLessInsts) {

  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  // Get depth and latency of NewRoot.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);

  // Get depth, latency and slack of Root.
  unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
  unsigned RootSlack = BlockTrace.getInstrSlack(Root);

  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth
               << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency
               << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  if (NewCodeHasLessInsts)
    return NewCycleCount <= OldCycleCount;
  else
    return NewCycleCount < OldCycleCount;
}
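
// Plugging made-up numbers into the formula above: with NewRootDepth = 6,
// NewRootLatency = 4, RootDepth = 7, RootLatency = 1 and RootSlack = 2, the
// new sequence costs 6 + 4 = 10 cycles against 7 + 1 + 2 = 10 for the old
// one. The critical path length is equal, so the substitution is accepted
// only when NewCodeHasLessInsts is true, i.e. the tie must be paid for by a
// shorter instruction sequence.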

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when the new instruction sequence should be generated
/// independent of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size, it can unfortunately hurt performance
    // when the ADD is on the critical path but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.
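    //
    // As a concrete (hypothetical, AArch64-flavored) instance of the
    // motivating example above, a matched pattern would rewrite
    //   %vreg4 = MUL %vreg0, %vreg1
    //   %vreg5 = ADD %vreg4, %vreg3              // Root
    // into the single instruction
    //   %vreg5 = MADD %vreg0, %vreg1, %vreg3     // NewRoot
    // provided the checks below accept the new sequence.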

    if (TII->getMachineCombinerPatterns(MI, Patterns)) {
      for (auto P : Patterns) {
        SmallVector<MachineInstr *, 16> InsInstrs;
        SmallVector<MachineInstr *, 16> DelInstrs;
        DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
        if (!MinInstr)
          MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        Traces->verifyAnalysis();
        TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                        InstrIdxForVirtReg);
        unsigned NewInstCount = InsInstrs.size();
        unsigned OldInstCount = DelInstrs.size();
        // Found pattern, but did not generate an alternative sequence.
        // This can happen e.g. when an immediate could not be materialized
        // in a single instruction.
        if (!NewInstCount)
          continue;
        // Substitute when we optimize for code size and the new sequence has
        // fewer instructions OR
        // the new sequence neither lengthens the critical path nor increases
        // resource pressure.
        if (doSubstitute(NewInstCount, OldInstCount) ||
            (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                     InstrIdxForVirtReg,
                                     NewInstCount < OldInstCount) &&
             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
          for (auto *InstrPtr : InsInstrs)
            MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
          for (auto *InstrPtr : DelInstrs)
            InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

          Changed = true;
          ++NumInstCombined;

          Traces->invalidate(MBB);
          Traces->verifyAnalysis();
          // Eagerly stop after the first pattern fires.
          break;
        } else {
          // Clean up instructions of the alternative code sequence. There is
          // no use for them.
          MachineFunction *MF = MBB->getParent();
          for (auto *InstrPtr : InsInstrs)
            MF->DeleteMachineInstr(InstrPtr);
        }
        InstrIdxForVirtReg.clear();
      }
    }
  }

  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;
  OptSize = MF.getFunction()->optForSize();

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs()
          << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}