//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or increase resource length.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instrs combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif
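// Note: the cl::opt flags above go through LLVM's generic command-line
// machinery, so they can be set from any cl-enabled driver. An illustrative
// invocation (flag values chosen arbitrarily):
//   llc -machine-combiner-inc-threshold=100 \
//       -machine-combiner-verify-pattern-order input.ll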
namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;
  MachineBlockFrequencyInfo *MBFI;
  ProfileSummaryInfo *PSI;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<MachineCombinerPattern, 16> &Patterns);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}
/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map from virtual register to the index
/// of the defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}
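// Worked example (hypothetical cycle counts, for illustration only): assume
// InsInstrs = {MUL, ADD} where the ADD consumes the MUL result. If the MUL's
// trace operands have depths 1 and 3, each with an operand latency of 2, then
// IDepth(MUL) = max(1 + 2, 3 + 2) = 5. If the MUL->ADD operand latency is 4
// and the ADD has no deeper operand, the returned NewRoot depth is
// IDepth(ADD) = 5 + 4 = 9.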
/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth,            // The data dependency chain must be improved.
  MustReduceRegisterPressure, // The register pressure must be reduced.
  Default                     // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
  case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
  case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
    return CombinerObjective::MustReduceDepth;
  case MachineCombinerPattern::REASSOC_XY_BCA:
  case MachineCombinerPattern::REASSOC_XY_BAC:
    return CombinerObjective::MustReduceRegisterPressure;
  default:
    return CombinerObjective::Default;
  }
}

/// Estimate the latency of the new and original instruction sequences by
/// summing up the latencies of the inserted and deleted instructions. This
/// assumes that the inserted and deleted instructions are dependent
/// instruction chains, which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}
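// Illustration (hypothetical latencies): with InsInstrs = {MOV, MADD} of
// latencies {1, 4} and DelInstrs = {MUL, ADD} of latencies {4, 1}, this
// returns {1 + getLatency(Root, MADD, BlockTrace), 4 + 1} -- both sides are
// treated as straight-line dependence chains, per the caveat above.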
bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineCombinerPattern Pattern) {
  // FIXME: for now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we can do better if we make the
  // RegPressureTracker class aware of the TIE attribute. Then we can get an
  // accurate comparison of register pressure with DelInstrs or InsInstrs.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern,
    bool SlackIsAccurate) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << "  Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t and it does it\n"
                   : dbgs() << "\t but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions.
  unsigned NewRootLatency, RootLatency;
  std::tie(NewRootLatency, RootLatency) =
      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
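  // Illustrative numbers: with NewRootDepth = 10, NewRootLatency = 4,
  // RootDepth = 8, RootLatency = 5, RootSlack = 2 and accurate slack,
  // NewCycleCount = 14 <= OldCycleCount = 15, so the transform is accepted
  // even though the new depth alone (10 vs. 8) is worse.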
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t It IMPROVES PathLen because"
                 : dbgs() << "\n\t It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions to scheduling classes.
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <=
              ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t As a result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t As a result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize,
                                   bool OptForSize) {
  if (OptForSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}
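// For example, under -Oz a pattern that replaces three instructions with two
// passes the check above unconditionally, even if the new sequence might
// lengthen the critical path.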
/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param MinInstr is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///        otherwise invalidate the trace
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                     SmallVector<MachineInstr *, 16> InsInstrs,
                                     SmallVector<MachineInstr *, 16> DelInstrs,
                                     MachineTraceMetrics::Ensemble *MinInstr,
                                     SparseSet<LiveRegUnit> &RegUnits,
                                     const TargetInstrInfo *TII,
                                     MachineCombinerPattern Pattern,
                                     bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because genAlternativeCodeSequence has not yet decided
  // whether InsInstrs or DelInstrs is the better pattern, so it must not
  // produce side effects on the function. For example, constant pool entry
  // creation is delayed until here, after InsInstrs has been selected as the
  // better pattern; otherwise a constant pool entry created for InsInstrs
  // would not be deleted even if InsInstrs turned out not to be the better
  // pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        ++I;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    MinInstr->invalidate(MBB);

  NumInstCombined++;
}
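// Example of the invariant verified below: if the target produces patterns
// {P0, P1} where P0 saves 3 cycles and P1 saves 1, the ordering is valid;
// if P1 instead saved 4 cycles, the assert would fire, flagging a
// sub-optimal pattern order returned by getMachineCombinerPatterns().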
// Check that the difference between original and new latency is decreasing for
// later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(
    MachineBasicBlock *MBB, MachineInstr &Root,
    SmallVector<MachineCombinerPattern, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!MinInstr)
    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.
    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
      });

      bool SubstituteAlways = false;
      if (ML && TII->isThroughputPattern(P))
        SubstituteAlways = true;

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed = true;

          // Go back to previous instruction as it may have an ILP
          // reassociation opportunity.
          BlockIter--;
          break;
        }
      }

      // Substitute when we optimize for code size and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (SubstituteAlways ||
          doSubstitute(NewInstCount, OldInstCount, OptForSize)) {
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
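        // With the default -machine-combiner-inc-threshold of 500, this
        // incremental mode only kicks in for blocks with more than 500
        // instructions; smaller blocks simply invalidate the trace and let
        // it be recomputed on demand.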
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                   RegUnits, TII, P, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary())
             ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
             : nullptr;
  MinInstr = nullptr;
  OptSize = MF.getFunction().hasOptSize();
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}