//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  const char *getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          bool NewCodeHasLessInsts);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
} // namespace

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {

  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
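
  // Illustrative example (hypothetical cycle counts): if a use operand is
  // defined at depth 2 in the trace and the def-use operand latency is 3
  // cycles, then the using instruction cannot issue before cycle 2 + 3 = 5.
  // The depth of an instruction is the maximum of DepthOp + LatencyOp over
  // all of its use operands, which is what the loop below computes.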

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as the maximum latency of the defined
/// operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {

  assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) {
      // The user is in the trace: compute an accurate def-use latency.
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      // Otherwise fall back to the generic latency of the opcode.
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode());
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}
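
// A small worked example of the check in improvesCriticalPathLen below, using
// hypothetical cycle counts: with NewRootDepth = 6 and NewRootLatency = 3 the
// new sequence completes at cycle 9; with RootDepth = 4, RootLatency = 4 and
// RootSlack = 1 the old sequence may finish as late as cycle 9 without
// growing the critical path. Since 9 <= 9, the new sequence is accepted only
// if it also has fewer instructions (NewCodeHasLessInsts).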
/// True when the new instruction sequence does not lengthen the critical path
/// and the new sequence has fewer instructions or the new sequence improves
/// the critical path.
/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the
/// critical path. This is decided by the formula:
/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack).
/// If the new sequence has an equal-length critical path but does not reduce
/// the number of instructions (NewCodeHasLessInsts is false), then it is not
/// considered an improvement. The slack is the number of cycles Root can be
/// delayed before the critical path becomes longer.
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    bool NewCodeHasLessInsts) {

  assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  // Get depth and latency of NewRoot.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);

  // Get depth, latency and slack of Root.
  unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
  unsigned RootSlack = BlockTrace.getInstrSlack(Root);

  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth
               << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency
               << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  if (NewCodeHasLessInsts)
    return NewCycleCount <= OldCycleCount;
  else
    return NewCycleCount < OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {

  // Compute the current resource length of the original code.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
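
  // The resource length is the number of cycles the trace occupies on its
  // most constrained processor resource. E.g., on a hypothetical target with
  // a single multiply pipe, a trace containing ten multiplies needs at least
  // ten cycles on that pipe, independent of the critical path length.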

  // Deal with scheduling classes (SC) rather than instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModel())
    return true;
  return false;
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2 Other
    //      \    /               \      |     /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.
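    //
    // E.g., for the MADD pattern above the target hook would fill in
    // (hypothetical virtual registers):
    //   DelInstrs = { %v1 = MUL %a, %b;  %v2 = ADD %v1, %c }  (Root = ADD)
    //   InsInstrs = { %v2 = MADD %a, %b, %c }                 (NewRoot = MADD)
    // InstrIdxForVirtReg maps each newly created virtual register to the
    // index of its defining instruction in InsInstrs (empty in this simple
    // case).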

    if (TII->getMachineCombinerPatterns(MI, Patterns)) {
      for (auto P : Patterns) {
        SmallVector<MachineInstr *, 16> InsInstrs;
        SmallVector<MachineInstr *, 16> DelInstrs;
        DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
        if (!MinInstr)
          MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        Traces->verifyAnalysis();
        TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                        InstrIdxForVirtReg);
        unsigned NewInstCount = InsInstrs.size();
        unsigned OldInstCount = DelInstrs.size();
        // Found pattern, but did not generate alternative sequence.
        // This can happen e.g. when an immediate could not be materialized
        // in a single instruction.
        if (!NewInstCount)
          continue;
        // Substitute when we optimize for code size and the new sequence has
        // fewer instructions OR
        // the new sequence neither lengthens the critical path nor increases
        // resource pressure.
        if (doSubstitute(NewInstCount, OldInstCount) ||
            (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                     InstrIdxForVirtReg,
                                     NewInstCount < OldInstCount) &&
             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
          for (auto *InstrPtr : InsInstrs)
            MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
          for (auto *InstrPtr : DelInstrs)
            InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

          Changed = true;
          ++NumInstCombined;

          Traces->invalidate(MBB);
          Traces->verifyAnalysis();
          // Eagerly stop after the first pattern fires.
          break;
        } else {
          // Cleanup instructions of the alternative code sequence. There is
          // no use for them.
          MachineFunction *MF = MBB->getParent();
          for (auto *InstrPtr : InsInstrs)
            MF->DeleteMachineInstr(InstrPtr);
        }
        InstrIdxForVirtReg.clear();
      }
    }
  }

  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;

  OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs()
          << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}