1 //===- Construction of pass pipelines -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// 10 /// This file provides the implementation of the PassBuilder based on our 11 /// static pass registry as well as related functionality. It also provides 12 /// helpers to aid in analyzing, debugging, and testing passes and pass 13 /// pipelines. 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/Statistic.h" 18 #include "llvm/Analysis/AliasAnalysis.h" 19 #include "llvm/Analysis/BasicAliasAnalysis.h" 20 #include "llvm/Analysis/CGSCCPassManager.h" 21 #include "llvm/Analysis/CtxProfAnalysis.h" 22 #include "llvm/Analysis/GlobalsModRef.h" 23 #include "llvm/Analysis/InlineAdvisor.h" 24 #include "llvm/Analysis/ProfileSummaryInfo.h" 25 #include "llvm/Analysis/ScopedNoAliasAA.h" 26 #include "llvm/Analysis/TypeBasedAliasAnalysis.h" 27 #include "llvm/CodeGen/GlobalMergeFunctions.h" 28 #include "llvm/IR/PassManager.h" 29 #include "llvm/Pass.h" 30 #include "llvm/Passes/OptimizationLevel.h" 31 #include "llvm/Passes/PassBuilder.h" 32 #include "llvm/Support/CommandLine.h" 33 #include "llvm/Support/ErrorHandling.h" 34 #include "llvm/Support/PGOOptions.h" 35 #include "llvm/Support/VirtualFileSystem.h" 36 #include "llvm/Target/TargetMachine.h" 37 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" 38 #include "llvm/Transforms/Coroutines/CoroAnnotationElide.h" 39 #include "llvm/Transforms/Coroutines/CoroCleanup.h" 40 #include "llvm/Transforms/Coroutines/CoroConditionalWrapper.h" 41 #include "llvm/Transforms/Coroutines/CoroEarly.h" 42 #include "llvm/Transforms/Coroutines/CoroElide.h" 43 #include 
"llvm/Transforms/Coroutines/CoroSplit.h" 44 #include "llvm/Transforms/HipStdPar/HipStdPar.h" 45 #include "llvm/Transforms/IPO/AlwaysInliner.h" 46 #include "llvm/Transforms/IPO/Annotation2Metadata.h" 47 #include "llvm/Transforms/IPO/ArgumentPromotion.h" 48 #include "llvm/Transforms/IPO/Attributor.h" 49 #include "llvm/Transforms/IPO/CalledValuePropagation.h" 50 #include "llvm/Transforms/IPO/ConstantMerge.h" 51 #include "llvm/Transforms/IPO/CrossDSOCFI.h" 52 #include "llvm/Transforms/IPO/DeadArgumentElimination.h" 53 #include "llvm/Transforms/IPO/ElimAvailExtern.h" 54 #include "llvm/Transforms/IPO/EmbedBitcodePass.h" 55 #include "llvm/Transforms/IPO/ExpandVariadics.h" 56 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" 57 #include "llvm/Transforms/IPO/FunctionAttrs.h" 58 #include "llvm/Transforms/IPO/GlobalDCE.h" 59 #include "llvm/Transforms/IPO/GlobalOpt.h" 60 #include "llvm/Transforms/IPO/GlobalSplit.h" 61 #include "llvm/Transforms/IPO/HotColdSplitting.h" 62 #include "llvm/Transforms/IPO/IROutliner.h" 63 #include "llvm/Transforms/IPO/InferFunctionAttrs.h" 64 #include "llvm/Transforms/IPO/Inliner.h" 65 #include "llvm/Transforms/IPO/LowerTypeTests.h" 66 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" 67 #include "llvm/Transforms/IPO/MergeFunctions.h" 68 #include "llvm/Transforms/IPO/ModuleInliner.h" 69 #include "llvm/Transforms/IPO/OpenMPOpt.h" 70 #include "llvm/Transforms/IPO/PartialInlining.h" 71 #include "llvm/Transforms/IPO/SCCP.h" 72 #include "llvm/Transforms/IPO/SampleProfile.h" 73 #include "llvm/Transforms/IPO/SampleProfileProbe.h" 74 #include "llvm/Transforms/IPO/WholeProgramDevirt.h" 75 #include "llvm/Transforms/InstCombine/InstCombine.h" 76 #include "llvm/Transforms/Instrumentation/CGProfile.h" 77 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" 78 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" 79 #include "llvm/Transforms/Instrumentation/InstrProfiling.h" 80 #include 
"llvm/Transforms/Instrumentation/MemProfiler.h" 81 #include "llvm/Transforms/Instrumentation/PGOCtxProfFlattening.h" 82 #include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h" 83 #include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h" 84 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 85 #include "llvm/Transforms/Scalar/ADCE.h" 86 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" 87 #include "llvm/Transforms/Scalar/AnnotationRemarks.h" 88 #include "llvm/Transforms/Scalar/BDCE.h" 89 #include "llvm/Transforms/Scalar/CallSiteSplitting.h" 90 #include "llvm/Transforms/Scalar/ConstraintElimination.h" 91 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" 92 #include "llvm/Transforms/Scalar/DFAJumpThreading.h" 93 #include "llvm/Transforms/Scalar/DeadStoreElimination.h" 94 #include "llvm/Transforms/Scalar/DivRemPairs.h" 95 #include "llvm/Transforms/Scalar/EarlyCSE.h" 96 #include "llvm/Transforms/Scalar/Float2Int.h" 97 #include "llvm/Transforms/Scalar/GVN.h" 98 #include "llvm/Transforms/Scalar/IndVarSimplify.h" 99 #include "llvm/Transforms/Scalar/InferAlignment.h" 100 #include "llvm/Transforms/Scalar/InstSimplifyPass.h" 101 #include "llvm/Transforms/Scalar/JumpTableToSwitch.h" 102 #include "llvm/Transforms/Scalar/JumpThreading.h" 103 #include "llvm/Transforms/Scalar/LICM.h" 104 #include "llvm/Transforms/Scalar/LoopDeletion.h" 105 #include "llvm/Transforms/Scalar/LoopDistribute.h" 106 #include "llvm/Transforms/Scalar/LoopFlatten.h" 107 #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" 108 #include "llvm/Transforms/Scalar/LoopInstSimplify.h" 109 #include "llvm/Transforms/Scalar/LoopInterchange.h" 110 #include "llvm/Transforms/Scalar/LoopLoadElimination.h" 111 #include "llvm/Transforms/Scalar/LoopPassManager.h" 112 #include "llvm/Transforms/Scalar/LoopRotation.h" 113 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" 114 #include "llvm/Transforms/Scalar/LoopSink.h" 115 #include 
"llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" 116 #include "llvm/Transforms/Scalar/LoopUnrollPass.h" 117 #include "llvm/Transforms/Scalar/LoopVersioningLICM.h" 118 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" 119 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" 120 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" 121 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" 122 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" 123 #include "llvm/Transforms/Scalar/NewGVN.h" 124 #include "llvm/Transforms/Scalar/Reassociate.h" 125 #include "llvm/Transforms/Scalar/SCCP.h" 126 #include "llvm/Transforms/Scalar/SROA.h" 127 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" 128 #include "llvm/Transforms/Scalar/SimplifyCFG.h" 129 #include "llvm/Transforms/Scalar/SpeculativeExecution.h" 130 #include "llvm/Transforms/Scalar/TailRecursionElimination.h" 131 #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" 132 #include "llvm/Transforms/Utils/AddDiscriminators.h" 133 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" 134 #include "llvm/Transforms/Utils/CanonicalizeAliases.h" 135 #include "llvm/Transforms/Utils/CountVisits.h" 136 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" 137 #include "llvm/Transforms/Utils/ExtraPassManager.h" 138 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 139 #include "llvm/Transforms/Utils/LibCallsShrinkWrap.h" 140 #include "llvm/Transforms/Utils/Mem2Reg.h" 141 #include "llvm/Transforms/Utils/MoveAutoInit.h" 142 #include "llvm/Transforms/Utils/NameAnonGlobals.h" 143 #include "llvm/Transforms/Utils/RelLookupTableConverter.h" 144 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" 145 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 146 #include "llvm/Transforms/Vectorize/SLPVectorizer.h" 147 #include "llvm/Transforms/Vectorize/VectorCombine.h" 148 149 using namespace llvm; 150 151 static cl::opt<InliningAdvisorMode> UseInlineAdvisor( 152 "enable-ml-inliner", 
cl::init(InliningAdvisorMode::Default), cl::Hidden, 153 cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"), 154 cl::values(clEnumValN(InliningAdvisorMode::Default, "default", 155 "Heuristics-based inliner version"), 156 clEnumValN(InliningAdvisorMode::Development, "development", 157 "Use development mode (runtime-loadable model)"), 158 clEnumValN(InliningAdvisorMode::Release, "release", 159 "Use release mode (AOT-compiled model)"))); 160 161 /// Flag to enable inline deferral during PGO. 162 static cl::opt<bool> 163 EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true), 164 cl::Hidden, 165 cl::desc("Enable inline deferral during PGO")); 166 167 static cl::opt<bool> EnableModuleInliner("enable-module-inliner", 168 cl::init(false), cl::Hidden, 169 cl::desc("Enable module inliner")); 170 171 static cl::opt<bool> PerformMandatoryInliningsFirst( 172 "mandatory-inlining-first", cl::init(false), cl::Hidden, 173 cl::desc("Perform mandatory inlinings module-wide, before performing " 174 "inlining")); 175 176 static cl::opt<bool> EnableEagerlyInvalidateAnalyses( 177 "eagerly-invalidate-analyses", cl::init(true), cl::Hidden, 178 cl::desc("Eagerly invalidate more analyses in default pipelines")); 179 180 static cl::opt<bool> EnableMergeFunctions( 181 "enable-merge-functions", cl::init(false), cl::Hidden, 182 cl::desc("Enable function merging as part of the optimization pipeline")); 183 184 static cl::opt<bool> EnablePostPGOLoopRotation( 185 "enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden, 186 cl::desc("Run the loop rotation transformation after PGO instrumentation")); 187 188 static cl::opt<bool> EnableGlobalAnalyses( 189 "enable-global-analyses", cl::init(true), cl::Hidden, 190 cl::desc("Enable inter-procedural analyses")); 191 192 static cl::opt<bool> RunPartialInlining("enable-partial-inlining", 193 cl::init(false), cl::Hidden, 194 cl::desc("Run Partial inlining pass")); 195 196 static cl::opt<bool> ExtraVectorizerPasses( 
197 "extra-vectorizer-passes", cl::init(false), cl::Hidden, 198 cl::desc("Run cleanup optimization passes after vectorization")); 199 200 static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden, 201 cl::desc("Run the NewGVN pass")); 202 203 static cl::opt<bool> EnableLoopInterchange( 204 "enable-loopinterchange", cl::init(false), cl::Hidden, 205 cl::desc("Enable the experimental LoopInterchange Pass")); 206 207 static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", 208 cl::init(false), cl::Hidden, 209 cl::desc("Enable Unroll And Jam Pass")); 210 211 static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false), 212 cl::Hidden, 213 cl::desc("Enable the LoopFlatten Pass")); 214 215 // Experimentally allow loop header duplication. This should allow for better 216 // optimization at Oz, since loop-idiom recognition can then recognize things 217 // like memcpy. If this ends up being useful for many targets, we should drop 218 // this flag and make a code generation option that can be controlled 219 // independent of the opt level and exposed through the frontend. 
220 static cl::opt<bool> EnableLoopHeaderDuplication( 221 "enable-loop-header-duplication", cl::init(false), cl::Hidden, 222 cl::desc("Enable loop header duplication at any optimization level")); 223 224 static cl::opt<bool> 225 EnableDFAJumpThreading("enable-dfa-jump-thread", 226 cl::desc("Enable DFA jump threading"), 227 cl::init(false), cl::Hidden); 228 229 static cl::opt<bool> 230 EnableHotColdSplit("hot-cold-split", 231 cl::desc("Enable hot-cold splitting pass")); 232 233 static cl::opt<bool> EnableIROutliner("ir-outliner", cl::init(false), 234 cl::Hidden, 235 cl::desc("Enable ir outliner pass")); 236 237 static cl::opt<bool> 238 DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden, 239 cl::desc("Disable pre-instrumentation inliner")); 240 241 static cl::opt<int> PreInlineThreshold( 242 "preinline-threshold", cl::Hidden, cl::init(75), 243 cl::desc("Control the amount of inlining in pre-instrumentation inliner " 244 "(default = 75)")); 245 246 static cl::opt<bool> 247 EnableGVNHoist("enable-gvn-hoist", 248 cl::desc("Enable the GVN hoisting pass (default = off)")); 249 250 static cl::opt<bool> 251 EnableGVNSink("enable-gvn-sink", 252 cl::desc("Enable the GVN sinking pass (default = off)")); 253 254 static cl::opt<bool> EnableJumpTableToSwitch( 255 "enable-jump-table-to-switch", 256 cl::desc("Enable JumpTableToSwitch pass (default = off)")); 257 258 // This option is used in simplifying testing SampleFDO optimizations for 259 // profile loading. 
260 static cl::opt<bool> 261 EnableCHR("enable-chr", cl::init(true), cl::Hidden, 262 cl::desc("Enable control height reduction optimization (CHR)")); 263 264 static cl::opt<bool> FlattenedProfileUsed( 265 "flattened-profile-used", cl::init(false), cl::Hidden, 266 cl::desc("Indicate the sample profile being used is flattened, i.e., " 267 "no inline hierarchy exists in the profile")); 268 269 static cl::opt<bool> EnableOrderFileInstrumentation( 270 "enable-order-file-instrumentation", cl::init(false), cl::Hidden, 271 cl::desc("Enable order file instrumentation (default = off)")); 272 273 static cl::opt<bool> 274 EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, 275 cl::desc("Enable lowering of the matrix intrinsics")); 276 277 static cl::opt<bool> EnableConstraintElimination( 278 "enable-constraint-elimination", cl::init(true), cl::Hidden, 279 cl::desc( 280 "Enable pass to eliminate conditions based on linear constraints")); 281 282 static cl::opt<AttributorRunOption> AttributorRun( 283 "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE), 284 cl::desc("Enable the attributor inter-procedural deduction pass"), 285 cl::values(clEnumValN(AttributorRunOption::ALL, "all", 286 "enable all attributor runs"), 287 clEnumValN(AttributorRunOption::MODULE, "module", 288 "enable module-wide attributor runs"), 289 clEnumValN(AttributorRunOption::CGSCC, "cgscc", 290 "enable call graph SCC attributor runs"), 291 clEnumValN(AttributorRunOption::NONE, "none", 292 "disable attributor runs"))); 293 294 static cl::opt<bool> EnableSampledInstr( 295 "enable-sampled-instrumentation", cl::init(false), cl::Hidden, 296 cl::desc("Enable profile instrumentation sampling (default = off)")); 297 static cl::opt<bool> UseLoopVersioningLICM( 298 "enable-loop-versioning-licm", cl::init(false), cl::Hidden, 299 cl::desc("Enable the experimental Loop Versioning LICM pass")); 300 301 static cl::opt<std::string> InstrumentColdFuncOnlyPath( 302 "instrument-cold-function-only-path", 
cl::init(""), 303 cl::desc("File path for cold function only instrumentation(requires use " 304 "with --pgo-instrument-cold-function-only)"), 305 cl::Hidden); 306 307 extern cl::opt<std::string> UseCtxProfile; 308 extern cl::opt<bool> PGOInstrumentColdFunctionOnly; 309 310 namespace llvm { 311 extern cl::opt<bool> EnableMemProfContextDisambiguation; 312 } // namespace llvm 313 314 PipelineTuningOptions::PipelineTuningOptions() { 315 LoopInterleaving = true; 316 LoopVectorization = true; 317 SLPVectorization = false; 318 LoopUnrolling = true; 319 ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; 320 LicmMssaOptCap = SetLicmMssaOptCap; 321 LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; 322 CallGraphProfile = true; 323 UnifiedLTO = false; 324 MergeFunctions = EnableMergeFunctions; 325 InlinerThreshold = -1; 326 EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; 327 } 328 329 namespace llvm { 330 extern cl::opt<unsigned> MaxDevirtIterations; 331 } // namespace llvm 332 333 void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM, 334 OptimizationLevel Level) { 335 for (auto &C : PeepholeEPCallbacks) 336 C(FPM, Level); 337 } 338 void PassBuilder::invokeLateLoopOptimizationsEPCallbacks( 339 LoopPassManager &LPM, OptimizationLevel Level) { 340 for (auto &C : LateLoopOptimizationsEPCallbacks) 341 C(LPM, Level); 342 } 343 void PassBuilder::invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM, 344 OptimizationLevel Level) { 345 for (auto &C : LoopOptimizerEndEPCallbacks) 346 C(LPM, Level); 347 } 348 void PassBuilder::invokeScalarOptimizerLateEPCallbacks( 349 FunctionPassManager &FPM, OptimizationLevel Level) { 350 for (auto &C : ScalarOptimizerLateEPCallbacks) 351 C(FPM, Level); 352 } 353 void PassBuilder::invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM, 354 OptimizationLevel Level) { 355 for (auto &C : CGSCCOptimizerLateEPCallbacks) 356 C(CGPM, Level); 357 } 358 void 
PassBuilder::invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM, 359 OptimizationLevel Level) { 360 for (auto &C : VectorizerStartEPCallbacks) 361 C(FPM, Level); 362 } 363 void PassBuilder::invokeVectorizerEndEPCallbacks(FunctionPassManager &FPM, 364 OptimizationLevel Level) { 365 for (auto &C : VectorizerEndEPCallbacks) 366 C(FPM, Level); 367 } 368 void PassBuilder::invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM, 369 OptimizationLevel Level, 370 ThinOrFullLTOPhase Phase) { 371 for (auto &C : OptimizerEarlyEPCallbacks) 372 C(MPM, Level, Phase); 373 } 374 void PassBuilder::invokeOptimizerLastEPCallbacks(ModulePassManager &MPM, 375 OptimizationLevel Level, 376 ThinOrFullLTOPhase Phase) { 377 for (auto &C : OptimizerLastEPCallbacks) 378 C(MPM, Level, Phase); 379 } 380 void PassBuilder::invokeFullLinkTimeOptimizationEarlyEPCallbacks( 381 ModulePassManager &MPM, OptimizationLevel Level) { 382 for (auto &C : FullLinkTimeOptimizationEarlyEPCallbacks) 383 C(MPM, Level); 384 } 385 void PassBuilder::invokeFullLinkTimeOptimizationLastEPCallbacks( 386 ModulePassManager &MPM, OptimizationLevel Level) { 387 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) 388 C(MPM, Level); 389 } 390 void PassBuilder::invokePipelineStartEPCallbacks(ModulePassManager &MPM, 391 OptimizationLevel Level) { 392 for (auto &C : PipelineStartEPCallbacks) 393 C(MPM, Level); 394 } 395 void PassBuilder::invokePipelineEarlySimplificationEPCallbacks( 396 ModulePassManager &MPM, OptimizationLevel Level, ThinOrFullLTOPhase Phase) { 397 for (auto &C : PipelineEarlySimplificationEPCallbacks) 398 C(MPM, Level, Phase); 399 } 400 401 // Helper to add AnnotationRemarksPass. 
402 static void addAnnotationRemarksPass(ModulePassManager &MPM) { 403 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 404 } 405 406 // Helper to check if the current compilation phase is preparing for LTO 407 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { 408 return Phase == ThinOrFullLTOPhase::ThinLTOPreLink || 409 Phase == ThinOrFullLTOPhase::FullLTOPreLink; 410 } 411 412 // TODO: Investigate the cost/benefit of tail call elimination on debugging. 413 FunctionPassManager 414 PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, 415 ThinOrFullLTOPhase Phase) { 416 417 FunctionPassManager FPM; 418 419 if (AreStatisticsEnabled()) 420 FPM.addPass(CountVisitsPass()); 421 422 // Form SSA out of local memory accesses after breaking apart aggregates into 423 // scalars. 424 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 425 426 // Catch trivial redundancies 427 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 428 429 // Hoisting of scalars and load expressions. 430 FPM.addPass( 431 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 432 FPM.addPass(InstCombinePass()); 433 434 FPM.addPass(LibCallsShrinkWrapPass()); 435 436 invokePeepholeEPCallbacks(FPM, Level); 437 438 FPM.addPass( 439 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 440 441 // Form canonically associated expression trees, and simplify the trees using 442 // basic mathematical properties. For example, this will form (nearly) 443 // minimal multiplication trees. 444 FPM.addPass(ReassociatePass()); 445 446 // Add the primary loop simplification pipeline. 447 // FIXME: Currently this is split into two loop pass pipelines because we run 448 // some function passes in between them. These can and should be removed 449 // and/or replaced by scheduling the loop pass equivalents in the correct 450 // positions. But those equivalent passes aren't powerful enough yet. 
451 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 452 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 453 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 454 // `LoopInstSimplify`. 455 LoopPassManager LPM1, LPM2; 456 457 // Simplify the loop body. We do this initially to clean up after other loop 458 // passes run, either when iterating on a loop or on inner loops with 459 // implications on the outer loop. 460 LPM1.addPass(LoopInstSimplifyPass()); 461 LPM1.addPass(LoopSimplifyCFGPass()); 462 463 // Try to remove as much code from the loop header as possible, 464 // to reduce amount of IR that will have to be duplicated. However, 465 // do not perform speculative hoisting the first time as LICM 466 // will destroy metadata that may not need to be destroyed if run 467 // after loop rotation. 468 // TODO: Investigate promotion cap for O1. 469 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 470 /*AllowSpeculation=*/false)); 471 472 LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, 473 isLTOPreLink(Phase))); 474 // TODO: Investigate promotion cap for O1. 475 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 476 /*AllowSpeculation=*/true)); 477 LPM1.addPass(SimpleLoopUnswitchPass()); 478 if (EnableLoopFlatten) 479 LPM1.addPass(LoopFlattenPass()); 480 481 LPM2.addPass(LoopIdiomRecognizePass()); 482 LPM2.addPass(IndVarSimplifyPass()); 483 484 invokeLateLoopOptimizationsEPCallbacks(LPM2, Level); 485 486 LPM2.addPass(LoopDeletionPass()); 487 488 if (EnableLoopInterchange) 489 LPM2.addPass(LoopInterchangePass()); 490 491 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 492 // because it changes IR to makes profile annotation in back compile 493 // inaccurate. 
The normal unroller doesn't pay attention to forced full unroll 494 // attributes so we need to make sure and allow the full unroll pass to pay 495 // attention to it. 496 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 497 PGOOpt->Action != PGOOptions::SampleUse) 498 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 499 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 500 PTO.ForgetAllSCEVInLoopUnroll)); 501 502 invokeLoopOptimizerEndEPCallbacks(LPM2, Level); 503 504 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 505 /*UseMemorySSA=*/true, 506 /*UseBlockFrequencyInfo=*/true)); 507 FPM.addPass( 508 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 509 FPM.addPass(InstCombinePass()); 510 // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. 511 // *All* loop passes must preserve it, in order to be able to use it. 512 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 513 /*UseMemorySSA=*/false, 514 /*UseBlockFrequencyInfo=*/false)); 515 516 // Delete small array after loop unroll. 517 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 518 519 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 520 FPM.addPass(MemCpyOptPass()); 521 522 // Sparse conditional constant propagation. 523 // FIXME: It isn't clear why we do this *after* loop passes rather than 524 // before... 525 FPM.addPass(SCCPPass()); 526 527 // Delete dead bit computations (instcombine runs after to fold away the dead 528 // computations, and then ADCE will run later to exploit any new DCE 529 // opportunities that creates). 530 FPM.addPass(BDCEPass()); 531 532 // Run instcombine after redundancy and dead bit elimination to exploit 533 // opportunities opened up by them. 
534 FPM.addPass(InstCombinePass()); 535 invokePeepholeEPCallbacks(FPM, Level); 536 537 FPM.addPass(CoroElidePass()); 538 539 invokeScalarOptimizerLateEPCallbacks(FPM, Level); 540 541 // Finally, do an expensive DCE pass to catch all the dead code exposed by 542 // the simplifications and basic cleanup after all the simplifications. 543 // TODO: Investigate if this is too expensive. 544 FPM.addPass(ADCEPass()); 545 FPM.addPass( 546 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 547 FPM.addPass(InstCombinePass()); 548 invokePeepholeEPCallbacks(FPM, Level); 549 550 return FPM; 551 } 552 553 FunctionPassManager 554 PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, 555 ThinOrFullLTOPhase Phase) { 556 assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); 557 558 // The O1 pipeline has a separate pipeline creation function to simplify 559 // construction readability. 560 if (Level.getSpeedupLevel() == 1) 561 return buildO1FunctionSimplificationPipeline(Level, Phase); 562 563 FunctionPassManager FPM; 564 565 if (AreStatisticsEnabled()) 566 FPM.addPass(CountVisitsPass()); 567 568 // Form SSA out of local memory accesses after breaking apart aggregates into 569 // scalars. 570 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 571 572 // Catch trivial redundancies 573 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 574 if (EnableKnowledgeRetention) 575 FPM.addPass(AssumeSimplifyPass()); 576 577 // Hoisting of scalars and load expressions. 578 if (EnableGVNHoist) 579 FPM.addPass(GVNHoistPass()); 580 581 // Global value numbering based sinking. 582 if (EnableGVNSink) { 583 FPM.addPass(GVNSinkPass()); 584 FPM.addPass( 585 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 586 } 587 588 // Speculative execution if the target has divergent branches; otherwise nop. 
589 FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true)); 590 591 // Optimize based on known information about branches, and cleanup afterward. 592 FPM.addPass(JumpThreadingPass()); 593 FPM.addPass(CorrelatedValuePropagationPass()); 594 595 // Jump table to switch conversion. 596 if (EnableJumpTableToSwitch) 597 FPM.addPass(JumpTableToSwitchPass()); 598 599 FPM.addPass( 600 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 601 FPM.addPass(InstCombinePass()); 602 FPM.addPass(AggressiveInstCombinePass()); 603 604 if (!Level.isOptimizingForSize()) 605 FPM.addPass(LibCallsShrinkWrapPass()); 606 607 invokePeepholeEPCallbacks(FPM, Level); 608 609 // For PGO use pipeline, try to optimize memory intrinsics such as memcpy 610 // using the size value profile. Don't perform this when optimizing for size. 611 if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && 612 !Level.isOptimizingForSize()) 613 FPM.addPass(PGOMemOPSizeOpt()); 614 615 FPM.addPass(TailCallElimPass()); 616 FPM.addPass( 617 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 618 619 // Form canonically associated expression trees, and simplify the trees using 620 // basic mathematical properties. For example, this will form (nearly) 621 // minimal multiplication trees. 622 FPM.addPass(ReassociatePass()); 623 624 if (EnableConstraintElimination) 625 FPM.addPass(ConstraintEliminationPass()); 626 627 // Add the primary loop simplification pipeline. 628 // FIXME: Currently this is split into two loop pass pipelines because we run 629 // some function passes in between them. These can and should be removed 630 // and/or replaced by scheduling the loop pass equivalents in the correct 631 // positions. But those equivalent passes aren't powerful enough yet. 632 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 633 // used. 
We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 634 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 635 // `LoopInstSimplify`. 636 LoopPassManager LPM1, LPM2; 637 638 // Simplify the loop body. We do this initially to clean up after other loop 639 // passes run, either when iterating on a loop or on inner loops with 640 // implications on the outer loop. 641 LPM1.addPass(LoopInstSimplifyPass()); 642 LPM1.addPass(LoopSimplifyCFGPass()); 643 644 // Try to remove as much code from the loop header as possible, 645 // to reduce amount of IR that will have to be duplicated. However, 646 // do not perform speculative hoisting the first time as LICM 647 // will destroy metadata that may not need to be destroyed if run 648 // after loop rotation. 649 // TODO: Investigate promotion cap for O1. 650 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 651 /*AllowSpeculation=*/false)); 652 653 // Disable header duplication in loop rotation at -Oz. 654 LPM1.addPass(LoopRotatePass(EnableLoopHeaderDuplication || 655 Level != OptimizationLevel::Oz, 656 isLTOPreLink(Phase))); 657 // TODO: Investigate promotion cap for O1. 
658 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 659 /*AllowSpeculation=*/true)); 660 LPM1.addPass( 661 SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); 662 if (EnableLoopFlatten) 663 LPM1.addPass(LoopFlattenPass()); 664 665 LPM2.addPass(LoopIdiomRecognizePass()); 666 LPM2.addPass(IndVarSimplifyPass()); 667 668 { 669 ExtraLoopPassManager<ShouldRunExtraSimpleLoopUnswitch> ExtraPasses; 670 ExtraPasses.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == 671 OptimizationLevel::O3)); 672 LPM2.addPass(std::move(ExtraPasses)); 673 } 674 675 invokeLateLoopOptimizationsEPCallbacks(LPM2, Level); 676 677 LPM2.addPass(LoopDeletionPass()); 678 679 if (EnableLoopInterchange) 680 LPM2.addPass(LoopInterchangePass()); 681 682 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 683 // because it changes IR to makes profile annotation in back compile 684 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 685 // attributes so we need to make sure and allow the full unroll pass to pay 686 // attention to it. 687 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 688 PGOOpt->Action != PGOOptions::SampleUse) 689 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 690 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 691 PTO.ForgetAllSCEVInLoopUnroll)); 692 693 invokeLoopOptimizerEndEPCallbacks(LPM2, Level); 694 695 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 696 /*UseMemorySSA=*/true, 697 /*UseBlockFrequencyInfo=*/true)); 698 FPM.addPass( 699 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 700 FPM.addPass(InstCombinePass()); 701 // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, 702 // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. 703 // *All* loop passes must preserve it, in order to be able to use it. 
704 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 705 /*UseMemorySSA=*/false, 706 /*UseBlockFrequencyInfo=*/false)); 707 708 // Delete small array after loop unroll. 709 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 710 711 // Try vectorization/scalarization transforms that are both improvements 712 // themselves and can allow further folds with GVN and InstCombine. 713 FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true)); 714 715 // Eliminate redundancies. 716 FPM.addPass(MergedLoadStoreMotionPass()); 717 if (RunNewGVN) 718 FPM.addPass(NewGVNPass()); 719 else 720 FPM.addPass(GVNPass()); 721 722 // Sparse conditional constant propagation. 723 // FIXME: It isn't clear why we do this *after* loop passes rather than 724 // before... 725 FPM.addPass(SCCPPass()); 726 727 // Delete dead bit computations (instcombine runs after to fold away the dead 728 // computations, and then ADCE will run later to exploit any new DCE 729 // opportunities that creates). 730 FPM.addPass(BDCEPass()); 731 732 // Run instcombine after redundancy and dead bit elimination to exploit 733 // opportunities opened up by them. 734 FPM.addPass(InstCombinePass()); 735 invokePeepholeEPCallbacks(FPM, Level); 736 737 // Re-consider control flow based optimizations after redundancy elimination, 738 // redo DCE, etc. 739 if (EnableDFAJumpThreading) 740 FPM.addPass(DFAJumpThreadingPass()); 741 742 FPM.addPass(JumpThreadingPass()); 743 FPM.addPass(CorrelatedValuePropagationPass()); 744 745 // Finally, do an expensive DCE pass to catch all the dead code exposed by 746 // the simplifications and basic cleanup after all the simplifications. 747 // TODO: Investigate if this is too expensive. 748 FPM.addPass(ADCEPass()); 749 750 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 
751 FPM.addPass(MemCpyOptPass()); 752 753 FPM.addPass(DSEPass()); 754 FPM.addPass(MoveAutoInitPass()); 755 756 FPM.addPass(createFunctionToLoopPassAdaptor( 757 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 758 /*AllowSpeculation=*/true), 759 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); 760 761 FPM.addPass(CoroElidePass()); 762 763 invokeScalarOptimizerLateEPCallbacks(FPM, Level); 764 765 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 766 .convertSwitchRangeToICmp(true) 767 .hoistCommonInsts(true) 768 .sinkCommonInsts(true))); 769 FPM.addPass(InstCombinePass()); 770 invokePeepholeEPCallbacks(FPM, Level); 771 772 return FPM; 773 } 774 775 void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { 776 MPM.addPass(CanonicalizeAliasesPass()); 777 MPM.addPass(NameAnonGlobalPass()); 778 } 779 780 void PassBuilder::addPreInlinerPasses(ModulePassManager &MPM, 781 OptimizationLevel Level, 782 ThinOrFullLTOPhase LTOPhase) { 783 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 784 if (DisablePreInliner) 785 return; 786 InlineParams IP; 787 788 IP.DefaultThreshold = PreInlineThreshold; 789 790 // FIXME: The hint threshold has the same value used by the regular inliner 791 // when not optimzing for size. This should probably be lowered after 792 // performance testing. 793 // FIXME: this comment is cargo culted from the old pass manager, revisit). 794 IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; 795 ModuleInlinerWrapperPass MIWP( 796 IP, /* MandatoryFirst */ true, 797 InlineContext{LTOPhase, InlinePass::EarlyInliner}); 798 CGSCCPassManager &CGPipeline = MIWP.getPM(); 799 800 FunctionPassManager FPM; 801 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 802 FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. 803 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( 804 true))); // Merge & remove basic blocks. 
805 FPM.addPass(InstCombinePass()); // Combine silly sequences. 806 invokePeepholeEPCallbacks(FPM, Level); 807 808 CGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 809 std::move(FPM), PTO.EagerlyInvalidateAnalyses)); 810 811 MPM.addPass(std::move(MIWP)); 812 813 // Delete anything that is now dead to make sure that we don't instrument 814 // dead code. Instrumentation can end up keeping dead code around and 815 // dramatically increase code size. 816 MPM.addPass(GlobalDCEPass()); 817 } 818 819 void PassBuilder::addPostPGOLoopRotation(ModulePassManager &MPM, 820 OptimizationLevel Level) { 821 if (EnablePostPGOLoopRotation) { 822 // Disable header duplication in loop rotation at -Oz. 823 MPM.addPass(createModuleToFunctionPassAdaptor( 824 createFunctionToLoopPassAdaptor( 825 LoopRotatePass(EnableLoopHeaderDuplication || 826 Level != OptimizationLevel::Oz), 827 /*UseMemorySSA=*/false, 828 /*UseBlockFrequencyInfo=*/false), 829 PTO.EagerlyInvalidateAnalyses)); 830 } 831 } 832 833 void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, 834 OptimizationLevel Level, bool RunProfileGen, 835 bool IsCS, bool AtomicCounterUpdate, 836 std::string ProfileFile, 837 std::string ProfileRemappingFile, 838 IntrusiveRefCntPtr<vfs::FileSystem> FS) { 839 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 840 841 if (!RunProfileGen) { 842 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 843 MPM.addPass( 844 PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS)); 845 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 846 // RequireAnalysisPass for PSI before subsequent non-module passes. 847 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 848 return; 849 } 850 851 // Perform PGO instrumentation. 852 MPM.addPass(PGOInstrumentationGen(IsCS ? PGOInstrumentationType::CSFDO 853 : PGOInstrumentationType::FDO)); 854 855 addPostPGOLoopRotation(MPM, Level); 856 // Add the profile lowering pass. 
857 InstrProfOptions Options; 858 if (!ProfileFile.empty()) 859 Options.InstrProfileOutput = ProfileFile; 860 // Do counter promotion at Level greater than O0. 861 Options.DoCounterPromotion = true; 862 Options.UseBFIInPromotion = IsCS; 863 if (EnableSampledInstr) { 864 Options.Sampling = true; 865 // With sampling, there is little beneifit to enable counter promotion. 866 // But note that sampling does work with counter promotion. 867 Options.DoCounterPromotion = false; 868 } 869 Options.Atomic = AtomicCounterUpdate; 870 MPM.addPass(InstrProfilingLoweringPass(Options, IsCS)); 871 } 872 873 void PassBuilder::addPGOInstrPassesForO0( 874 ModulePassManager &MPM, bool RunProfileGen, bool IsCS, 875 bool AtomicCounterUpdate, std::string ProfileFile, 876 std::string ProfileRemappingFile, IntrusiveRefCntPtr<vfs::FileSystem> FS) { 877 if (!RunProfileGen) { 878 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 879 MPM.addPass( 880 PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS)); 881 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 882 // RequireAnalysisPass for PSI before subsequent non-module passes. 883 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 884 return; 885 } 886 887 // Perform PGO instrumentation. 888 MPM.addPass(PGOInstrumentationGen(IsCS ? PGOInstrumentationType::CSFDO 889 : PGOInstrumentationType::FDO)); 890 // Add the profile lowering pass. 891 InstrProfOptions Options; 892 if (!ProfileFile.empty()) 893 Options.InstrProfileOutput = ProfileFile; 894 // Do not do counter promotion at O0. 
895 Options.DoCounterPromotion = false; 896 Options.UseBFIInPromotion = IsCS; 897 Options.Atomic = AtomicCounterUpdate; 898 MPM.addPass(InstrProfilingLoweringPass(Options, IsCS)); 899 } 900 901 static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) { 902 return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); 903 } 904 905 ModuleInlinerWrapperPass 906 PassBuilder::buildInlinerPipeline(OptimizationLevel Level, 907 ThinOrFullLTOPhase Phase) { 908 InlineParams IP; 909 if (PTO.InlinerThreshold == -1) 910 IP = getInlineParamsFromOptLevel(Level); 911 else 912 IP = getInlineParams(PTO.InlinerThreshold); 913 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 914 // disable hot callsite inline (as much as possible [1]) because it makes 915 // profile annotation in the backend inaccurate. 916 // 917 // [1] Note the cost of a function could be below zero due to erased 918 // prologue / epilogue. 919 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 920 PGOOpt->Action == PGOOptions::SampleUse) 921 IP.HotCallSiteThreshold = 0; 922 923 if (PGOOpt) 924 IP.EnableDeferral = EnablePGOInlineDeferral; 925 926 ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst, 927 InlineContext{Phase, InlinePass::CGSCCInliner}, 928 UseInlineAdvisor, MaxDevirtIterations); 929 930 // Require the GlobalsAA analysis for the module so we can query it within 931 // the CGSCC pipeline. 932 if (EnableGlobalAnalyses) { 933 MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>()); 934 // Invalidate AAManager so it can be recreated and pick up the newly 935 // available GlobalsAA. 936 MIWP.addModulePass( 937 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 938 } 939 940 // Require the ProfileSummaryAnalysis for the module so we can query it within 941 // the inliner pass. 942 MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 943 944 // Now begin the main postorder CGSCC pipeline. 
945 // FIXME: The current CGSCC pipeline has its origins in the legacy pass 946 // manager and trying to emulate its precise behavior. Much of this doesn't 947 // make a lot of sense and we should revisit the core CGSCC structure. 948 CGSCCPassManager &MainCGPipeline = MIWP.getPM(); 949 950 // Note: historically, the PruneEH pass was run first to deduce nounwind and 951 // generally clean up exception handling overhead. It isn't clear this is 952 // valuable as the inliner doesn't currently care whether it is inlining an 953 // invoke or a call. 954 955 if (AttributorRun & AttributorRunOption::CGSCC) 956 MainCGPipeline.addPass(AttributorCGSCCPass()); 957 958 // Deduce function attributes. We do another run of this after the function 959 // simplification pipeline, so this only needs to run when it could affect the 960 // function simplification pipeline, which is only the case with recursive 961 // functions. 962 MainCGPipeline.addPass(PostOrderFunctionAttrsPass(/*SkipNonRecursive*/ true)); 963 964 // When at O3 add argument promotion to the pass pipeline. 965 // FIXME: It isn't at all clear why this should be limited to O3. 966 if (Level == OptimizationLevel::O3) 967 MainCGPipeline.addPass(ArgumentPromotionPass()); 968 969 // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if 970 // there are no OpenMP runtime calls present in the module. 971 if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) 972 MainCGPipeline.addPass(OpenMPOptCGSCCPass()); 973 974 invokeCGSCCOptimizerLateEPCallbacks(MainCGPipeline, Level); 975 976 // Add the core function simplification pipeline nested inside the 977 // CGSCC walk. 978 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 979 buildFunctionSimplificationPipeline(Level, Phase), 980 PTO.EagerlyInvalidateAnalyses, /*NoRerun=*/true)); 981 982 // Finally, deduce any function attributes based on the fully simplified 983 // function. 
984 MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); 985 986 // Mark that the function is fully simplified and that it shouldn't be 987 // simplified again if we somehow revisit it due to CGSCC mutations unless 988 // it's been modified since. 989 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 990 RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>())); 991 992 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { 993 MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); 994 MainCGPipeline.addPass(CoroAnnotationElidePass()); 995 } 996 997 // Make sure we don't affect potential future NoRerun CGSCC adaptors. 998 MIWP.addLateModulePass(createModuleToFunctionPassAdaptor( 999 InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>())); 1000 1001 return MIWP; 1002 } 1003 1004 ModulePassManager 1005 PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, 1006 ThinOrFullLTOPhase Phase) { 1007 ModulePassManager MPM; 1008 1009 InlineParams IP = getInlineParamsFromOptLevel(Level); 1010 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 1011 // disable hot callsite inline (as much as possible [1]) because it makes 1012 // profile annotation in the backend inaccurate. 1013 // 1014 // [1] Note the cost of a function could be below zero due to erased 1015 // prologue / epilogue. 1016 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 1017 PGOOpt->Action == PGOOptions::SampleUse) 1018 IP.HotCallSiteThreshold = 0; 1019 1020 if (PGOOpt) 1021 IP.EnableDeferral = EnablePGOInlineDeferral; 1022 1023 // The inline deferral logic is used to avoid losing some 1024 // inlining chance in future. It is helpful in SCC inliner, in which 1025 // inlining is processed in bottom-up order. 1026 // While in module inliner, the inlining order is a priority-based order 1027 // by default. The inline deferral is unnecessary there. So we disable the 1028 // inline deferral logic in module inliner. 
1029 IP.EnableDeferral = false; 1030 1031 MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor, Phase)); 1032 if (!UseCtxProfile.empty() && Phase == ThinOrFullLTOPhase::ThinLTOPostLink) { 1033 MPM.addPass(GlobalOptPass()); 1034 MPM.addPass(GlobalDCEPass()); 1035 MPM.addPass(PGOCtxProfFlatteningPass()); 1036 } 1037 1038 MPM.addPass(createModuleToFunctionPassAdaptor( 1039 buildFunctionSimplificationPipeline(Level, Phase), 1040 PTO.EagerlyInvalidateAnalyses)); 1041 1042 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { 1043 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( 1044 CoroSplitPass(Level != OptimizationLevel::O0))); 1045 MPM.addPass( 1046 createModuleToPostOrderCGSCCPassAdaptor(CoroAnnotationElidePass())); 1047 } 1048 1049 return MPM; 1050 } 1051 1052 ModulePassManager 1053 PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, 1054 ThinOrFullLTOPhase Phase) { 1055 assert(Level != OptimizationLevel::O0 && 1056 "Should not be used for O0 pipeline"); 1057 1058 assert(Phase != ThinOrFullLTOPhase::FullLTOPostLink && 1059 "FullLTOPostLink shouldn't call buildModuleSimplificationPipeline!"); 1060 1061 ModulePassManager MPM; 1062 1063 // Place pseudo probe instrumentation as the first pass of the pipeline to 1064 // minimize the impact of optimization changes. 1065 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1066 Phase != ThinOrFullLTOPhase::ThinLTOPostLink) 1067 MPM.addPass(SampleProfileProbePass(TM)); 1068 1069 bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); 1070 1071 // In ThinLTO mode, when flattened profile is used, all the available 1072 // profile information will be annotated in PreLink phase so there is 1073 // no need to load the profile again in PostLink. 1074 bool LoadSampleProfile = 1075 HasSampleProfile && 1076 !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink); 1077 1078 // During the ThinLTO backend phase we perform early indirect call promotion 1079 // here, before globalopt. 
Otherwise imported available_externally functions 1080 // look unreferenced and are removed. If we are going to load the sample 1081 // profile then defer until later. 1082 // TODO: See if we can move later and consolidate with the location where 1083 // we perform ICP when we are loading a sample profile. 1084 // TODO: We pass HasSampleProfile (whether there was a sample profile file 1085 // passed to the compile) to the SamplePGO flag of ICP. This is used to 1086 // determine whether the new direct calls are annotated with prof metadata. 1087 // Ideally this should be determined from whether the IR is annotated with 1088 // sample profile, and not whether the a sample profile was provided on the 1089 // command line. E.g. for flattened profiles where we will not be reloading 1090 // the sample profile in the ThinLTO backend, we ideally shouldn't have to 1091 // provide the sample profile file. 1092 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile) 1093 MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile)); 1094 1095 // Create an early function pass manager to cleanup the output of the 1096 // frontend. Not necessary with LTO post link pipelines since the pre link 1097 // pipeline already cleaned up the frontend output. 1098 if (Phase != ThinOrFullLTOPhase::ThinLTOPostLink) { 1099 // Do basic inference of function attributes from known properties of system 1100 // libraries and other oracles. 1101 MPM.addPass(InferFunctionAttrsPass()); 1102 MPM.addPass(CoroEarlyPass()); 1103 1104 FunctionPassManager EarlyFPM; 1105 EarlyFPM.addPass(EntryExitInstrumenterPass(/*PostInlining=*/false)); 1106 // Lower llvm.expect to metadata before attempting transforms. 1107 // Compare/branch metadata may alter the behavior of passes like 1108 // SimplifyCFG. 
1109 EarlyFPM.addPass(LowerExpectIntrinsicPass()); 1110 EarlyFPM.addPass(SimplifyCFGPass()); 1111 EarlyFPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 1112 EarlyFPM.addPass(EarlyCSEPass()); 1113 if (Level == OptimizationLevel::O3) 1114 EarlyFPM.addPass(CallSiteSplittingPass()); 1115 MPM.addPass(createModuleToFunctionPassAdaptor( 1116 std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses)); 1117 } 1118 1119 if (LoadSampleProfile) { 1120 // Annotate sample profile right after early FPM to ensure freshness of 1121 // the debug info. 1122 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 1123 PGOOpt->ProfileRemappingFile, Phase)); 1124 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 1125 // RequireAnalysisPass for PSI before subsequent non-module passes. 1126 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 1127 // Do not invoke ICP in the LTOPrelink phase as it makes it hard 1128 // for the profile annotation to be accurate in the LTO backend. 1129 if (!isLTOPreLink(Phase)) 1130 // We perform early indirect call promotion here, before globalopt. 1131 // This is important for the ThinLTO backend phase because otherwise 1132 // imported available_externally functions look unreferenced and are 1133 // removed. 1134 MPM.addPass( 1135 PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); 1136 } 1137 1138 // Try to perform OpenMP specific optimizations on the module. This is a 1139 // (quick!) no-op if there are no OpenMP runtime calls present in the module. 1140 MPM.addPass(OpenMPOptPass()); 1141 1142 if (AttributorRun & AttributorRunOption::MODULE) 1143 MPM.addPass(AttributorPass()); 1144 1145 // Lower type metadata and the type.test intrinsic in the ThinLTO 1146 // post link pipeline after ICP. This is to enable usage of the type 1147 // tests in ICP sequences. 
1148 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) 1149 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, 1150 lowertypetests::DropTestKind::Assume)); 1151 1152 invokePipelineEarlySimplificationEPCallbacks(MPM, Level, Phase); 1153 1154 // Interprocedural constant propagation now that basic cleanup has occurred 1155 // and prior to optimizing globals. 1156 // FIXME: This position in the pipeline hasn't been carefully considered in 1157 // years, it should be re-analyzed. 1158 MPM.addPass(IPSCCPPass( 1159 IPSCCPOptions(/*AllowFuncSpec=*/ 1160 Level != OptimizationLevel::Os && 1161 Level != OptimizationLevel::Oz && 1162 !isLTOPreLink(Phase)))); 1163 1164 // Attach metadata to indirect call sites indicating the set of functions 1165 // they may target at run-time. This should follow IPSCCP. 1166 MPM.addPass(CalledValuePropagationPass()); 1167 1168 // Optimize globals to try and fold them into constants. 1169 MPM.addPass(GlobalOptPass()); 1170 1171 // Create a small function pass pipeline to cleanup after all the global 1172 // optimizations. 1173 FunctionPassManager GlobalCleanupPM; 1174 // FIXME: Should this instead by a run of SROA? 1175 GlobalCleanupPM.addPass(PromotePass()); 1176 GlobalCleanupPM.addPass(InstCombinePass()); 1177 invokePeepholeEPCallbacks(GlobalCleanupPM, Level); 1178 GlobalCleanupPM.addPass( 1179 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1180 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), 1181 PTO.EagerlyInvalidateAnalyses)); 1182 1183 // We already asserted this happens in non-FullLTOPostLink earlier. 
1184 const bool IsPreLink = Phase != ThinOrFullLTOPhase::ThinLTOPostLink; 1185 const bool IsPGOPreLink = PGOOpt && IsPreLink; 1186 const bool IsPGOInstrGen = 1187 IsPGOPreLink && PGOOpt->Action == PGOOptions::IRInstr; 1188 const bool IsPGOInstrUse = 1189 IsPGOPreLink && PGOOpt->Action == PGOOptions::IRUse; 1190 const bool IsMemprofUse = IsPGOPreLink && !PGOOpt->MemoryProfile.empty(); 1191 // We don't want to mix pgo ctx gen and pgo gen; we also don't currently 1192 // enable ctx profiling from the frontend. 1193 assert(!(IsPGOInstrGen && PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled()) && 1194 "Enabling both instrumented PGO and contextual instrumentation is not " 1195 "supported."); 1196 // Enable contextual profiling instrumentation. 1197 const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink && 1198 PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled(); 1199 const bool IsCtxProfUse = 1200 !UseCtxProfile.empty() && Phase == ThinOrFullLTOPhase::ThinLTOPreLink; 1201 1202 assert( 1203 (InstrumentColdFuncOnlyPath.empty() || PGOInstrumentColdFunctionOnly) && 1204 "--instrument-cold-function-only-path is provided but " 1205 "--pgo-instrument-cold-function-only is not enabled"); 1206 const bool IsColdFuncOnlyInstrGen = PGOInstrumentColdFunctionOnly && 1207 IsPGOPreLink && 1208 !InstrumentColdFuncOnlyPath.empty(); 1209 1210 if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen || 1211 IsCtxProfUse || IsColdFuncOnlyInstrGen) 1212 addPreInlinerPasses(MPM, Level, Phase); 1213 1214 // Add all the requested passes for instrumentation PGO, if requested. 1215 if (IsPGOInstrGen || IsPGOInstrUse) { 1216 addPGOInstrPasses(MPM, Level, 1217 /*RunProfileGen=*/IsPGOInstrGen, 1218 /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, 1219 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, 1220 PGOOpt->FS); 1221 } else if (IsCtxProfGen || IsCtxProfUse) { 1222 MPM.addPass(PGOInstrumentationGen(PGOInstrumentationType::CTXPROF)); 1223 // In pre-link, we just want the instrumented IR. 
We use the contextual 1224 // profile in the post-thinlink phase. 1225 // The instrumentation will be removed in post-thinlink after IPO. 1226 // FIXME(mtrofin): move AssignGUIDPass if there is agreement to use this 1227 // mechanism for GUIDs. 1228 MPM.addPass(AssignGUIDPass()); 1229 if (IsCtxProfUse) 1230 return MPM; 1231 addPostPGOLoopRotation(MPM, Level); 1232 MPM.addPass(PGOCtxProfLoweringPass()); 1233 } else if (IsColdFuncOnlyInstrGen) { 1234 addPGOInstrPasses( 1235 MPM, Level, /* RunProfileGen */ true, /* IsCS */ false, 1236 /* AtomicCounterUpdate */ false, InstrumentColdFuncOnlyPath, 1237 /* ProfileRemappingFile */ "", IntrusiveRefCntPtr<vfs::FileSystem>()); 1238 } 1239 1240 if (IsPGOInstrGen || IsPGOInstrUse || IsCtxProfGen) 1241 MPM.addPass(PGOIndirectCallPromotion(false, false)); 1242 1243 if (IsPGOPreLink && PGOOpt->CSAction == PGOOptions::CSIRInstr) 1244 MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile, 1245 EnableSampledInstr)); 1246 1247 if (IsMemprofUse) 1248 MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS)); 1249 1250 if (PGOOpt && (PGOOpt->Action == PGOOptions::IRUse || 1251 PGOOpt->Action == PGOOptions::SampleUse)) 1252 MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType)); 1253 1254 MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true)); 1255 1256 if (EnableModuleInliner) 1257 MPM.addPass(buildModuleInlinerPipeline(Level, Phase)); 1258 else 1259 MPM.addPass(buildInlinerPipeline(Level, Phase)); 1260 1261 // Remove any dead arguments exposed by cleanups, constant folding globals, 1262 // and argument promotion. 1263 MPM.addPass(DeadArgumentEliminationPass()); 1264 1265 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink) 1266 MPM.addPass(CoroCleanupPass()); 1267 1268 // Optimize globals now that functions are fully simplified. 1269 MPM.addPass(GlobalOptPass()); 1270 MPM.addPass(GlobalDCEPass()); 1271 1272 return MPM; 1273 } 1274 1275 /// TODO: Should LTO cause any differences to this set of passes? 
1276 void PassBuilder::addVectorPasses(OptimizationLevel Level, 1277 FunctionPassManager &FPM, bool IsFullLTO) { 1278 FPM.addPass(LoopVectorizePass( 1279 LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); 1280 1281 FPM.addPass(InferAlignmentPass()); 1282 if (IsFullLTO) { 1283 // The vectorizer may have significantly shortened a loop body; unroll 1284 // again. Unroll small loops to hide loop backedge latency and saturate any 1285 // parallel execution resources of an out-of-order processor. We also then 1286 // need to clean up redundancies and loop invariant code. 1287 // FIXME: It would be really good to use a loop-integrated instruction 1288 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1289 // across the loop nests. 1290 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1291 if (EnableUnrollAndJam && PTO.LoopUnrolling) 1292 FPM.addPass(createFunctionToLoopPassAdaptor( 1293 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1294 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1295 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1296 PTO.ForgetAllSCEVInLoopUnroll))); 1297 FPM.addPass(WarnMissedTransformationsPass()); 1298 // Now that we are done with loop unrolling, be it either by LoopVectorizer, 1299 // or LoopUnroll passes, some variable-offset GEP's into alloca's could have 1300 // become constant-offset, thus enabling SROA and alloca promotion. Do so. 1301 // NOTE: we are very late in the pipeline, and we don't have any LICM 1302 // or SimplifyCFG passes scheduled after us, that would cleanup 1303 // the CFG mess this may created if allowed to modify CFG, so forbid that. 1304 FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); 1305 } 1306 1307 if (!IsFullLTO) { 1308 // Eliminate loads by forwarding stores from the previous iteration to loads 1309 // of the current iteration. 
1310 FPM.addPass(LoopLoadEliminationPass()); 1311 } 1312 // Cleanup after the loop optimization passes. 1313 FPM.addPass(InstCombinePass()); 1314 1315 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1316 ExtraFunctionPassManager<ShouldRunExtraVectorPasses> ExtraPasses; 1317 // At higher optimization levels, try to clean up any runtime overlap and 1318 // alignment checks inserted by the vectorizer. We want to track correlated 1319 // runtime checks for two inner loops in the same outer loop, fold any 1320 // common computations, hoist loop-invariant aspects out of any outer loop, 1321 // and unswitch the runtime checks if possible. Once hoisted, we may have 1322 // dead (or speculatable) control flows or more combining opportunities. 1323 ExtraPasses.addPass(EarlyCSEPass()); 1324 ExtraPasses.addPass(CorrelatedValuePropagationPass()); 1325 ExtraPasses.addPass(InstCombinePass()); 1326 LoopPassManager LPM; 1327 LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1328 /*AllowSpeculation=*/true)); 1329 LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == 1330 OptimizationLevel::O3)); 1331 ExtraPasses.addPass( 1332 createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, 1333 /*UseBlockFrequencyInfo=*/true)); 1334 ExtraPasses.addPass( 1335 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1336 ExtraPasses.addPass(InstCombinePass()); 1337 FPM.addPass(std::move(ExtraPasses)); 1338 } 1339 1340 // Now that we've formed fast to execute loop structures, we do further 1341 // optimizations. These are run afterward as they might block doing complex 1342 // analyses and transforms such as what are needed for loop vectorization. 1343 1344 // Cleanup after loop vectorization, etc. Simplification passes like CVP and 1345 // GVN, loop transforms, and others have already run, so it's now better to 1346 // convert to more optimized IR using more aggressive simplify CFG options. 
1347 // The extra sinking transform can create larger basic blocks, so do this 1348 // before SLP vectorization. 1349 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 1350 .forwardSwitchCondToPhi(true) 1351 .convertSwitchRangeToICmp(true) 1352 .convertSwitchToLookupTable(true) 1353 .needCanonicalLoops(false) 1354 .hoistCommonInsts(true) 1355 .sinkCommonInsts(true))); 1356 1357 if (IsFullLTO) { 1358 FPM.addPass(SCCPPass()); 1359 FPM.addPass(InstCombinePass()); 1360 FPM.addPass(BDCEPass()); 1361 } 1362 1363 // Optimize parallel scalar instruction chains into SIMD instructions. 1364 if (PTO.SLPVectorization) { 1365 FPM.addPass(SLPVectorizerPass()); 1366 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1367 FPM.addPass(EarlyCSEPass()); 1368 } 1369 } 1370 // Enhance/cleanup vector code. 1371 FPM.addPass(VectorCombinePass()); 1372 1373 if (!IsFullLTO) { 1374 FPM.addPass(InstCombinePass()); 1375 // Unroll small loops to hide loop backedge latency and saturate any 1376 // parallel execution resources of an out-of-order processor. We also then 1377 // need to clean up redundancies and loop invariant code. 1378 // FIXME: It would be really good to use a loop-integrated instruction 1379 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1380 // across the loop nests. 
1381 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1382 if (EnableUnrollAndJam && PTO.LoopUnrolling) { 1383 FPM.addPass(createFunctionToLoopPassAdaptor( 1384 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1385 } 1386 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1387 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1388 PTO.ForgetAllSCEVInLoopUnroll))); 1389 FPM.addPass(WarnMissedTransformationsPass()); 1390 // Now that we are done with loop unrolling, be it either by LoopVectorizer, 1391 // or LoopUnroll passes, some variable-offset GEP's into alloca's could have 1392 // become constant-offset, thus enabling SROA and alloca promotion. Do so. 1393 // NOTE: we are very late in the pipeline, and we don't have any LICM 1394 // or SimplifyCFG passes scheduled after us, that would cleanup 1395 // the CFG mess this may created if allowed to modify CFG, so forbid that. 1396 FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); 1397 } 1398 1399 FPM.addPass(InferAlignmentPass()); 1400 FPM.addPass(InstCombinePass()); 1401 1402 // This is needed for two reasons: 1403 // 1. It works around problems that instcombine introduces, such as sinking 1404 // expensive FP divides into loops containing multiplications using the 1405 // divide result. 1406 // 2. It helps to clean up some loop-invariant code created by the loop 1407 // unroll pass when IsFullLTO=false. 1408 FPM.addPass(createFunctionToLoopPassAdaptor( 1409 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1410 /*AllowSpeculation=*/true), 1411 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); 1412 1413 // Now that we've vectorized and unrolled loops, we may have more refined 1414 // alignment information, try to re-derive it here. 
FPM.addPass(AlignmentFromAssumptionsPass());
}

/// Build the module-level optimization pipeline for \p Level.
///
/// The returned manager holds, in order: optional partial inlining, removal of
/// available_externally definitions, optional order-file instrumentation, RPO
/// function-attribute inference, optional context-sensitive PGO passes, an
/// optional GlobalsAA recomputation, a single function pipeline (OptimizePM)
/// carrying the loop/vector passes and late cleanups, and finally a series of
/// module passes.
///
/// \param Level    Optimization level. It is forwarded to the extension-point
///                 callbacks and helper builders and compared against O3 / Oz.
/// \param LTOPhase LTO phase being built for. When isLTOPreLink(LTOPhase) is
///                 true, available_externally elimination, CS PGO, hot/cold
///                 splitting, CGProfile and RelLookupTableConverter are all
///                 skipped.
/// \returns The populated ModulePassManager.
ModulePassManager
PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
                                             ThinOrFullLTOPhase LTOPhase) {
  const bool LTOPreLink = isLTOPreLink(LTOPhase);
  ModulePassManager MPM;

  // Run partial inlining pass to partially inline functions that have
  // large bodies.
  if (RunPartialInlining)
    MPM.addPass(PartialInlinerPass());

  // Remove avail extern fns and globals definitions since we aren't compiling
  // an object file for later LTO. For LTO we want to preserve these so they
  // are eligible for inlining at link-time. Note if they are unreferenced they
  // will be removed by GlobalDCE later, so this only impacts referenced
  // available externally globals. Eventually they will be suppressed during
  // codegen, but eliminating here enables more opportunity for GlobalDCE as it
  // may make globals referenced by available external functions dead and saves
  // running remaining passes on the eliminated functions. These should be
  // preserved during prelinking for link-time inlining decisions.
  if (!LTOPreLink)
    MPM.addPass(EliminateAvailableExternallyPass());

  if (EnableOrderFileInstrumentation)
    MPM.addPass(InstrOrderFilePass());

  // Do RPO function attribute inference across the module to forward-propagate
  // attributes where applicable.
  // FIXME: Is this really an optimization rather than a canonicalization?
  MPM.addPass(ReversePostOrderFunctionAttrsPass());

  // Do a post inline PGO instrumentation and use pass. This is a context
  // sensitive PGO pass. We don't want to do this in LTOPreLink phase as
  // cross-module inline has not been done yet. The context sensitive
  // instrumentation is after all the inlines are done.
  if (!LTOPreLink && PGOOpt) {
    // CSIRInstr generates a profile (RunProfileGen=true, CSProfileGenFile);
    // CSIRUse consumes one (RunProfileGen=false, ProfileFile).
    if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
      addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true,
                        /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
                        PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile,
                        PGOOpt->FS);
    else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
      addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false,
                        /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
                        PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
                        PGOOpt->FS);
  }

  // Re-compute GlobalsAA here prior to function passes. This is particularly
  // useful as the above will have inlined, DCE'ed, and function-attr
  // propagated everything. We should at this point have a reasonably minimal
  // and richly annotated call graph. By computing aliasing and mod/ref
  // information for all local globals here, the late loop passes and notably
  // the vectorizer will be able to use them to help recognize vectorizable
  // memory operations.
  if (EnableGlobalAnalyses)
    MPM.addPass(RecomputeGlobalsAAPass());

  invokeOptimizerEarlyEPCallbacks(MPM, Level, LTOPhase);

  // Everything added to OptimizePM below runs as one function pipeline; it is
  // only attached to MPM once fully populated (see "Add the core optimizing
  // pipeline").
  FunctionPassManager OptimizePM;
  // Scheduling LoopVersioningLICM when inlining is over, because after that
  // we may see more accurate aliasing. Reason to run this late is that too
  // early versioning may prevent further inlining due to increase of code
  // size. Other optimizations which run later might benefit from the no-alias
  // assumption in the cloned loop.
  if (UseLoopVersioningLICM) {
    OptimizePM.addPass(
        createFunctionToLoopPassAdaptor(LoopVersioningLICMPass()));
    // LoopVersioningLICM pass might expose new LICM opportunities.
    OptimizePM.addPass(createFunctionToLoopPassAdaptor(
        LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
                 /*AllowSpeculation=*/true),
        /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
  }

  OptimizePM.addPass(Float2IntPass());
  OptimizePM.addPass(LowerConstantIntrinsicsPass());

  if (EnableMatrix) {
    OptimizePM.addPass(LowerMatrixIntrinsicsPass());
    OptimizePM.addPass(EarlyCSEPass());
  }

  // CHR pass should only be applied with the profile information.
  // The check is to check the profile summary information in CHR.
  if (EnableCHR && Level == OptimizationLevel::O3)
    OptimizePM.addPass(ControlHeightReductionPass());

  // FIXME: We need to run some loop optimizations to re-rotate loops after
  // simplifycfg and others undo their rotation.

  // Optimize the loop execution. These passes operate on entire loop nests
  // rather than on each loop in an inside-out manner, and so they are actually
  // function passes.

  invokeVectorizerStartEPCallbacks(OptimizePM, Level);

  LoopPassManager LPM;
  // First rotate loops that may have been un-rotated by prior passes.
  // Disable header duplication at -Oz (unless EnableLoopHeaderDuplication is
  // set, which turns it back on at every level).
  LPM.addPass(LoopRotatePass(EnableLoopHeaderDuplication ||
                                 Level != OptimizationLevel::Oz,
                             LTOPreLink));
  // Some loops may have become dead by now. Try to delete them.
  // FIXME: see discussion in https://reviews.llvm.org/D112851,
  //        this may need to be revisited once we run GVN before loop deletion
  //        in the simplification pipeline.
  LPM.addPass(LoopDeletionPass());
  OptimizePM.addPass(createFunctionToLoopPassAdaptor(
      std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));

  // Distribute loops to allow partial vectorization. I.e. isolate dependences
  // into separate loop that would otherwise inhibit vectorization. This is
  // currently only performed for loops marked with the metadata
  // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
  OptimizePM.addPass(LoopDistributePass());

  // Populates the VFABI attribute with the scalar-to-vector mappings
  // from the TargetLibraryInfo.
  OptimizePM.addPass(InjectTLIMappings());

  addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);

  invokeVectorizerEndEPCallbacks(OptimizePM, Level);

  // LoopSink pass sinks instructions hoisted by LICM, which serves as a
  // canonicalization pass that enables other optimizations. As a result,
  // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
  // result too early.
  OptimizePM.addPass(LoopSinkPass());

  // And finally clean up LCSSA form before generating code.
  OptimizePM.addPass(InstSimplifyPass());

  // This hoists/decomposes div/rem ops. It should run after other sink/hoist
  // passes to avoid re-sinking, but before SimplifyCFG because it can allow
  // flattening of blocks.
  OptimizePM.addPass(DivRemPairsPass());

  // Try to annotate calls that were created during optimization.
  OptimizePM.addPass(TailCallElimPass());

  // LoopSink (and other loop passes since the last simplifyCFG) might have
  // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
  OptimizePM.addPass(
      SimplifyCFGPass(SimplifyCFGOptions()
                          .convertSwitchRangeToICmp(true)
                          .speculateUnpredictables(true)
                          .hoistLoadsStoresWithCondFaulting(true)));

  // Add the core optimizing pipeline.
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM),
                                                PTO.EagerlyInvalidateAnalyses));

  invokeOptimizerLastEPCallbacks(MPM, Level, LTOPhase);

  // Split out cold code. Splitting is done late to avoid hiding context from
  // other optimizations and inadvertently regressing performance. The tradeoff
  // is that this has a higher code size cost than splitting early.
  if (EnableHotColdSplit && !LTOPreLink)
    MPM.addPass(HotColdSplittingPass());

  // Search the code for similar regions of code. If enough similar regions can
  // be found where extracting the regions into their own function will decrease
  // the size of the program, we extract the regions, and deduplicate the
  // structurally similar regions.
  if (EnableIROutliner)
    MPM.addPass(IROutlinerPass());

  // Now we need to do some global optimization transforms.
  // FIXME: It would seem like these should come first in the optimization
  // pipeline and maybe be the bottom of the canonicalization pipeline? Weird
  // ordering here.
  MPM.addPass(GlobalDCEPass());
  MPM.addPass(ConstantMergePass());

  // Merge functions if requested. It has a better chance to merge functions
  // after ConstantMerge folded jump tables.
  if (PTO.MergeFunctions)
    MPM.addPass(MergeFunctionsPass());

  // Never added in a pre-link phase; the pass argument is true only for the
  // two post-link phases.
  if (PTO.CallGraphProfile && !LTOPreLink)
    MPM.addPass(CGProfilePass(LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
                              LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink));

  // RelLookupTableConverterPass runs later in LTO post-link pipeline.
  if (!LTOPreLink)
    MPM.addPass(RelLookupTableConverterPass());

  return MPM;
}

/// Build the default per-module pipeline for \p Level in LTO phase \p Phase.
///
/// At O0 this returns buildO0DefaultPipeline(Level, Phase) unchanged.
/// Otherwise the pipeline is: annotation/attribute setup, the pipeline-start
/// callbacks, the module simplification pipeline, the module optimization
/// pipeline, an optional pseudo-probe update, annotation remarks and, when
/// isLTOPreLink(Phase) is true, whatever addRequiredLTOPreLinkPasses() adds.
ModulePassManager
PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
                                           ThinOrFullLTOPhase Phase) {
  if (Level == OptimizationLevel::O0)
    return buildO0DefaultPipeline(Level, Phase);

  ModulePassManager MPM;

  // Convert @llvm.global.annotations to !annotation metadata.
  MPM.addPass(Annotation2MetadataPass());

  // Force any function attributes we want the rest of the pipeline to observe.
  MPM.addPass(ForceFunctionAttrsPass());

  if (PGOOpt && PGOOpt->DebugInfoForProfiling)
    MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));

  // Apply module pipeline start EP callback.
  invokePipelineStartEPCallbacks(MPM, Level);

  // Add the core simplification pipeline.
  MPM.addPass(buildModuleSimplificationPipeline(Level, Phase));

  // Now add the optimization pipeline.
  MPM.addPass(buildModuleOptimizationPipeline(Level, Phase));

  if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
      PGOOpt->Action == PGOOptions::SampleUse)
    MPM.addPass(PseudoProbeUpdatePass());

  // Emit annotation remarks.
  addAnnotationRemarksPass(MPM);

  if (isLTOPreLink(Phase))
    addRequiredLTOPreLinkPasses(MPM);
  return MPM;
}

/// Build the FatLTO pipeline for \p Level.
///
/// The pipeline first runs the ThinLTO or full-LTO pre-link pipeline (chosen
/// by \p ThinLTO), then adds EmbedBitcodePass(ThinLTO, EmitSummary) and a
/// LowerTypeTestsPass that drops all type tests. It finishes with either the
/// ThinLTO default pipeline (ThinLTO with sample-profile use, no import
/// summary) or the module optimization pipeline for ThinOrFullLTOPhase::None
/// followed by annotation remarks.
ModulePassManager
PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO,
                                        bool EmitSummary) {
  ModulePassManager MPM;
  if (ThinLTO)
    MPM.addPass(buildThinLTOPreLinkDefaultPipeline(Level));
  else
    MPM.addPass(buildLTOPreLinkDefaultPipeline(Level));
  MPM.addPass(EmbedBitcodePass(ThinLTO, EmitSummary));

  // If we're doing FatLTO w/ CFI enabled, we don't want the type tests in the
  // object code, only in the bitcode section, so drop it before we run
  // module optimization and generate machine code. If llvm.type.test() isn't in
  // the IR, this won't do anything.
  MPM.addPass(
      LowerTypeTestsPass(nullptr, nullptr, lowertypetests::DropTestKind::All));

  // Use the ThinLTO post-link pipeline with sample profiling
  if (ThinLTO && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)
    MPM.addPass(buildThinLTODefaultPipeline(Level, /*ImportSummary=*/nullptr));
  else {
    // otherwise, just use module optimization
    MPM.addPass(
        buildModuleOptimizationPipeline(Level, ThinOrFullLTOPhase::None));
    // Emit annotation remarks.
    addAnnotationRemarksPass(MPM);
  }
  return MPM;
}

/// Build the ThinLTO pre-link pipeline for \p Level.
///
/// At O0 this returns buildO0DefaultPipeline() for the ThinLTOPreLink phase.
/// Otherwise only the simplification pipeline is run (no module optimization
/// pipeline). When UseCtxProfile is non-empty the function returns right
/// after simplification plus the required pre-link passes.
ModulePassManager
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
  if (Level == OptimizationLevel::O0)
    return buildO0DefaultPipeline(Level, ThinOrFullLTOPhase::ThinLTOPreLink);

  ModulePassManager MPM;

  // Convert @llvm.global.annotations to !annotation metadata.
  MPM.addPass(Annotation2MetadataPass());

  // Force any function attributes we want the rest of the pipeline to observe.
  MPM.addPass(ForceFunctionAttrsPass());

  if (PGOOpt && PGOOpt->DebugInfoForProfiling)
    MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));

  // Apply module pipeline start EP callback.
  invokePipelineStartEPCallbacks(MPM, Level);

  // If we are planning to perform ThinLTO later, we don't bloat the code with
  // unrolling/vectorization/... now. Just simplify the module as much as we
  // can.
  MPM.addPass(buildModuleSimplificationPipeline(
      Level, ThinOrFullLTOPhase::ThinLTOPreLink));
  // In pre-link, for ctx prof use, we stop here with an instrumented IR. We let
  // thinlto use the contextual info to perform imports; then use the contextual
  // profile in the post-thinlink phase.
  if (!UseCtxProfile.empty()) {
    addRequiredLTOPreLinkPasses(MPM);
    return MPM;
  }

  // Run partial inlining pass to partially inline functions that have
  // large bodies.
  // FIXME: It isn't clear whether this is really the right place to run this
  // in ThinLTO. Because there is another canonicalization and simplification
  // phase that will run after the thin link, running this here ends up with
  // less information than will be available later and it may grow functions in
  // ways that aren't beneficial.
  if (RunPartialInlining)
    MPM.addPass(PartialInlinerPass());

  if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
      PGOOpt->Action == PGOOptions::SampleUse)
    MPM.addPass(PseudoProbeUpdatePass());

  // Handle Optimizer{Early,Last}EPCallbacks added by clang on PreLink. Actual
  // optimization is going to be done in PostLink stage, but clang can't add
  // callbacks there in case of in-process ThinLTO called by linker.
  invokeOptimizerEarlyEPCallbacks(MPM, Level,
                                  /*Phase=*/ThinOrFullLTOPhase::ThinLTOPreLink);
  invokeOptimizerLastEPCallbacks(MPM, Level,
                                 /*Phase=*/ThinOrFullLTOPhase::ThinLTOPreLink);

  // Emit annotation remarks.
  addAnnotationRemarksPass(MPM);

  addRequiredLTOPreLinkPasses(MPM);

  return MPM;
}

/// Build the ThinLTO post-link (backend) pipeline for \p Level.
///
/// \param ImportSummary Summary index used by the import-time passes. It may
///        be null, in which case MemProf context disambiguation,
///        WholeProgramDevirt and the summary-driven LowerTypeTests are not
///        added at all.
///
/// At O0 only type-test cleanup, available_externally elimination and
/// GlobalDCE follow. Otherwise the pipeline continues with either the module
/// inliner pipeline (UseCtxProfile non-empty) or the simplification pipeline,
/// then the module optimization pipeline and annotation remarks.
ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
    OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
  ModulePassManager MPM;

  if (ImportSummary) {
    // For ThinLTO we must apply the context disambiguation decisions early, to
    // ensure we can correctly match the callsites to summary data.
    if (EnableMemProfContextDisambiguation)
      MPM.addPass(MemProfContextDisambiguation(
          ImportSummary, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));

    // These passes import type identifier resolutions for whole-program
    // devirtualization and CFI. They must run early because other passes may
    // disturb the specific instruction patterns that these passes look for,
    // creating dependencies on resolutions that may not appear in the summary.
    //
    // For example, GVN may transform the pattern assume(type.test) appearing in
    // two basic blocks into assume(phi(type.test, type.test)), which would
    // transform a dependency on a WPD resolution into a dependency on a type
    // identifier resolution for CFI.
    //
    // Also, WPD has access to more precise information than ICP and can
    // devirtualize more effectively, so it should operate on the IR first.
    //
    // The WPD and LowerTypeTest passes need to run at -O0 to lower type
    // metadata and intrinsics.
    MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary));
    MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
  }

  if (Level == OptimizationLevel::O0) {
    // Run a second time to clean up any type tests left behind by WPD for use
    // in ICP.
    MPM.addPass(LowerTypeTestsPass(nullptr, nullptr,
                                   lowertypetests::DropTestKind::Assume));
    // Drop available_externally and unreferenced globals. This is necessary
    // with ThinLTO in order to avoid leaving undefined references to dead
    // globals in the object file.
    MPM.addPass(EliminateAvailableExternallyPass());
    MPM.addPass(GlobalDCEPass());
    return MPM;
  }
  if (!UseCtxProfile.empty()) {
    MPM.addPass(
        buildModuleInlinerPipeline(Level, ThinOrFullLTOPhase::ThinLTOPostLink));
  } else {
    // Add the core simplification pipeline.
    MPM.addPass(buildModuleSimplificationPipeline(
        Level, ThinOrFullLTOPhase::ThinLTOPostLink));
  }
  // Now add the optimization pipeline.
  MPM.addPass(buildModuleOptimizationPipeline(
      Level, ThinOrFullLTOPhase::ThinLTOPostLink));

  // Emit annotation remarks.
  addAnnotationRemarksPass(MPM);

  return MPM;
}

/// Build the full-LTO pre-link pipeline for \p Level. Currently this just
/// forwards to buildPerModuleDefaultPipeline() with the FullLTOPreLink phase.
ModulePassManager
PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
  // FIXME: We should use a customized pre-link pipeline!
  return buildPerModuleDefaultPipeline(Level,
                                       ThinOrFullLTOPhase::FullLTOPreLink);
}

/// Build the full-LTO post-link pipeline for \p Level.
///
/// \param ExportSummary Passed to WholeProgramDevirtPass and to the first
///        LowerTypeTestsPass on every path through this function.
///
/// There are three exits: O0 (devirtualization and type-test lowering only),
/// O1 (stops after whole-program devirtualization and type-test lowering) and
/// the full pipeline. Each exit invokes the full-LTO "last" extension-point
/// callbacks and adds annotation remarks before returning.
ModulePassManager
PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
                                     ModuleSummaryIndex *ExportSummary) {
  ModulePassManager MPM;

  invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level);

  // Create a function that performs CFI checks for cross-DSO calls with targets
  // in the current module.
  MPM.addPass(CrossDSOCFIPass());

  if (Level == OptimizationLevel::O0) {
    // The WPD and LowerTypeTest passes need to run at -O0 to lower type
    // metadata and intrinsics.
    MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
    MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
    // Run a second time to clean up any type tests left behind by WPD for use
    // in ICP.
    MPM.addPass(LowerTypeTestsPass(nullptr, nullptr,
                                   lowertypetests::DropTestKind::Assume));

    invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);

    // Emit annotation remarks.
    addAnnotationRemarksPass(MPM);

    return MPM;
  }

  if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) {
    // Load sample profile before running the LTO optimization pipeline.
    MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
                                        PGOOpt->ProfileRemappingFile,
                                        ThinOrFullLTOPhase::FullLTOPostLink));
    // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
    // RequireAnalysisPass for PSI before subsequent non-module passes.
    MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
  }

  // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
  MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));

  // Remove unused virtual tables to improve the quality of code generated by
  // whole-program devirtualization and bitset lowering.
  MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));

  // Do basic inference of function attributes from known properties of system
  // libraries and other oracles.
  MPM.addPass(InferFunctionAttrsPass());

  if (Level.getSpeedupLevel() > 1) {
    MPM.addPass(createModuleToFunctionPassAdaptor(
        CallSiteSplittingPass(), PTO.EagerlyInvalidateAnalyses));

    // Indirect call promotion. This should promote all the targets that are
    // left by the earlier promotion pass that promotes intra-module targets.
    // This two-step promotion is to save the compile time. For LTO, it should
    // produce the same result as if we only do promotion here.
    MPM.addPass(PGOIndirectCallPromotion(
        true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));

    // Promoting by-reference arguments to by-value exposes more constants to
    // IPSCCP.
    CGSCCPassManager CGPM;
    CGPM.addPass(PostOrderFunctionAttrsPass());
    CGPM.addPass(ArgumentPromotionPass());
    CGPM.addPass(
        createCGSCCToFunctionPassAdaptor(SROAPass(SROAOptions::ModifyCFG)));
    MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));

    // Propagate constants at call sites into the functions they call. This
    // opens opportunities for globalopt (and inlining) by substituting function
    // pointers passed as arguments to direct uses of functions.
    MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
                                         Level != OptimizationLevel::Os &&
                                         Level != OptimizationLevel::Oz)));

    // Attach metadata to indirect call sites indicating the set of functions
    // they may target at run-time. This should follow IPSCCP.
    MPM.addPass(CalledValuePropagationPass());
  }

  // Do RPO function attribute inference across the module to forward-propagate
  // attributes where applicable.
  // FIXME: Is this really an optimization rather than a canonicalization?
  MPM.addPass(ReversePostOrderFunctionAttrsPass());

  // Use in-range annotations on GEP indices to split globals where beneficial.
  MPM.addPass(GlobalSplitPass());

  // Run whole program optimization of virtual call when the list of callees
  // is fixed.
  MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));

  // Stop here at -O1.
  if (Level == OptimizationLevel::O1) {
    // The LowerTypeTestsPass needs to run to lower type metadata and the
    // type.test intrinsics. The pass does nothing if CFI is disabled.
    MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
    // Run a second time to clean up any type tests left behind by WPD for use
    // in ICP (which is performed earlier than this in the regular LTO
    // pipeline).
    MPM.addPass(LowerTypeTestsPass(nullptr, nullptr,
                                   lowertypetests::DropTestKind::Assume));

    invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);

    // Emit annotation remarks.
    addAnnotationRemarksPass(MPM);

    return MPM;
  }

  // Optimize globals to try and fold them into constants.
  MPM.addPass(GlobalOptPass());

  // Promote any localized globals to SSA registers.
  MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));

  // Linking modules together can lead to duplicate global constant, only
  // keep one copy of each constant.
  MPM.addPass(ConstantMergePass());

  // Remove unused arguments from functions.
  MPM.addPass(DeadArgumentEliminationPass());

  // Reduce the code after globalopt and ipsccp. Both can open up significant
  // simplification opportunities, and both can propagate functions through
  // function pointers. When this happens, we often have to resolve varargs
  // calls, etc, so let instcombine do this.
  FunctionPassManager PeepholeFPM;
  PeepholeFPM.addPass(InstCombinePass());
  if (Level.getSpeedupLevel() > 1)
    PeepholeFPM.addPass(AggressiveInstCombinePass());
  invokePeepholeEPCallbacks(PeepholeFPM, Level);

  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
                                                PTO.EagerlyInvalidateAnalyses));

  // Lower variadic functions for supported targets prior to inlining.
  MPM.addPass(ExpandVariadicsPass(ExpandVariadicsMode::Optimize));

  // Note: historically, the PruneEH pass was run first to deduce nounwind and
  // generally clean up exception handling overhead. It isn't clear this is
  // valuable as the inliner doesn't currently care whether it is inlining an
  // invoke or a call.
  // Run the inliner now.
  if (EnableModuleInliner) {
    MPM.addPass(ModuleInlinerPass(getInlineParamsFromOptLevel(Level),
                                  UseInlineAdvisor,
                                  ThinOrFullLTOPhase::FullLTOPostLink));
  } else {
    MPM.addPass(ModuleInlinerWrapperPass(
        getInlineParamsFromOptLevel(Level),
        /* MandatoryFirst */ true,
        InlineContext{ThinOrFullLTOPhase::FullLTOPostLink,
                      InlinePass::CGSCCInliner}));
  }

  // Perform context disambiguation after inlining, since that would reduce the
  // amount of additional cloning required to distinguish the allocation
  // contexts.
  if (EnableMemProfContextDisambiguation)
    MPM.addPass(MemProfContextDisambiguation(
        /*Summary=*/nullptr,
        PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));

  // Optimize globals again after we ran the inliner.
  MPM.addPass(GlobalOptPass());

  // Run the OpenMPOpt pass again after global optimizations.
  MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));

  // Garbage collect dead functions.
  MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));

  // If we didn't decide to inline a function, check to see if we can
  // transform it to pass arguments by value instead of by reference.
  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass()));

  FunctionPassManager FPM;
  // The IPO Passes may leave cruft around. Clean up after them.
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  if (EnableConstraintElimination)
    FPM.addPass(ConstraintEliminationPass());

  FPM.addPass(JumpThreadingPass());

  // Do a post inline PGO instrumentation and use pass. This is a context
  // sensitive PGO pass.
  // Note that addPGOInstrPasses is handed MPM here, not FPM; FPM itself is
  // only attached to MPM further down, after these calls.
  if (PGOOpt) {
    if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
      addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true,
                        /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
                        PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile,
                        PGOOpt->FS);
    else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
      addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false,
                        /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
                        PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
                        PGOOpt->FS);
  }

  // Break up allocas
  FPM.addPass(SROAPass(SROAOptions::ModifyCFG));

  // LTO provides additional opportunities for tailcall elimination due to
  // link-time inlining, and visibility of nocapture attribute.
  FPM.addPass(TailCallElimPass());

  // Run a few AA driver optimizations here and now to cleanup the code.
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM),
                                                PTO.EagerlyInvalidateAnalyses));

  MPM.addPass(
      createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));

  // Require the GlobalsAA analysis for the module so we can query it within
  // MainFPM.
  if (EnableGlobalAnalyses) {
    MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
    // Invalidate AAManager so it can be recreated and pick up the newly
    // available GlobalsAA.
    MPM.addPass(
        createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));
  }

  FunctionPassManager MainFPM;
  MainFPM.addPass(createFunctionToLoopPassAdaptor(
      LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
               /*AllowSpeculation=*/true),
      /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));

  if (RunNewGVN)
    MainFPM.addPass(NewGVNPass());
  else
    MainFPM.addPass(GVNPass());

  // Remove dead memcpy()'s.
  MainFPM.addPass(MemCpyOptPass());

  // Nuke dead stores.
  MainFPM.addPass(DSEPass());
  MainFPM.addPass(MoveAutoInitPass());
  MainFPM.addPass(MergedLoadStoreMotionPass());

  invokeVectorizerStartEPCallbacks(MainFPM, Level);

  LoopPassManager LPM;
  if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
    LPM.addPass(LoopFlattenPass());
  LPM.addPass(IndVarSimplifyPass());
  LPM.addPass(LoopDeletionPass());
  // FIXME: Add loop interchange.

  // Unroll small loops and perform peeling.
  LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
                                 /* OnlyWhenForced= */ !PTO.LoopUnrolling,
                                 PTO.ForgetAllSCEVInLoopUnroll));
  // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
  // *All* loop passes must preserve it, in order to be able to use it.
  MainFPM.addPass(createFunctionToLoopPassAdaptor(
      std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));

  MainFPM.addPass(LoopDistributePass());

  addVectorPasses(Level, MainFPM, /* IsFullLTO */ true);

  invokeVectorizerEndEPCallbacks(MainFPM, Level);

  // Run the OpenMPOpt CGSCC pass again late.
  // This is added to MPM before MainFPM is, even though MainFPM is still
  // being populated at this point.
  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
      OpenMPOptCGSCCPass(ThinOrFullLTOPhase::FullLTOPostLink)));

  invokePeepholeEPCallbacks(MainFPM, Level);
  MainFPM.addPass(JumpThreadingPass());
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM),
                                                PTO.EagerlyInvalidateAnalyses));

  // Lower type metadata and the type.test intrinsic. This pass supports
  // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
  // to be run at link time if CFI is enabled. This pass does nothing if
  // CFI is disabled.
  MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
  // Run a second time to clean up any type tests left behind by WPD for use
  // in ICP (which is performed earlier than this in the regular LTO pipeline).
  MPM.addPass(LowerTypeTestsPass(nullptr, nullptr,
                                 lowertypetests::DropTestKind::Assume));

  // Enable splitting late in the FullLTO post-link pipeline.
  if (EnableHotColdSplit)
    MPM.addPass(HotColdSplittingPass());

  // Add late LTO optimization passes.
  FunctionPassManager LateFPM;

  // LoopSink pass sinks instructions hoisted by LICM, which serves as a
  // canonicalization pass that enables other optimizations. As a result,
  // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
  // result too early.
  LateFPM.addPass(LoopSinkPass());

  // This hoists/decomposes div/rem ops. It should run after other sink/hoist
  // passes to avoid re-sinking, but before SimplifyCFG because it can allow
  // flattening of blocks.
  LateFPM.addPass(DivRemPairsPass());

  // Delete basic blocks, which optimization passes may have killed.
  LateFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                      .convertSwitchRangeToICmp(true)
                                      .hoistCommonInsts(true)
                                      .speculateUnpredictables(true)));
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));

  // Drop bodies of available externally objects to improve GlobalDCE.
  MPM.addPass(EliminateAvailableExternallyPass());

  // Now that we have optimized the program, discard unreachable functions.
  MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));

  if (PTO.MergeFunctions)
    MPM.addPass(MergeFunctionsPass());

  MPM.addPass(RelLookupTableConverterPass());

  if (PTO.CallGraphProfile)
    MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true));

  invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);

  // Emit annotation remarks.
  addAnnotationRemarksPass(MPM);

  return MPM;
}

/// Build the O0 pipeline for LTO phase \p Phase.
///
/// \p Level must be OptimizationLevel::O0 (asserted). Beyond the passes
/// required at O0 (profiling/instrumentation set-up, always-inlining,
/// optional function merging and matrix lowering, coroutine lowering), the
/// function still gives each extension point a chance to contribute. Callbacks
/// that take a non-module pass manager are invoked on a temporary manager,
/// which is added to the pipeline only if a callback actually put something
/// in it.
ModulePassManager
PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
                                    ThinOrFullLTOPhase Phase) {
  assert(Level == OptimizationLevel::O0 &&
         "buildO0DefaultPipeline should only be used with O0");

  ModulePassManager MPM;

  // Perform pseudo probe instrumentation in O0 mode. This is for the
  // consistency between different build modes. For example, a LTO build can be
  // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in
  // the postlink will require pseudo probe instrumentation in the prelink.
  if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
    MPM.addPass(SampleProfileProbePass(TM));

  if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr ||
                 PGOOpt->Action == PGOOptions::IRUse))
    addPGOInstrPassesForO0(
        MPM,
        /*RunProfileGen=*/(PGOOpt->Action == PGOOptions::IRInstr),
        /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, PGOOpt->ProfileFile,
        PGOOpt->ProfileRemappingFile, PGOOpt->FS);

  // Instrument function entry and exit before all inlining.
  MPM.addPass(createModuleToFunctionPassAdaptor(
      EntryExitInstrumenterPass(/*PostInlining=*/false)));

  invokePipelineStartEPCallbacks(MPM, Level);

  if (PGOOpt && PGOOpt->DebugInfoForProfiling)
    MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));

  if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) {
    // Explicitly disable sample loader inlining and use flattened profile in O0
    // pipeline.
    MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
                                        PGOOpt->ProfileRemappingFile,
                                        ThinOrFullLTOPhase::None, nullptr,
                                        /*DisableSampleProfileInlining=*/true,
                                        /*UseFlattenedProfile=*/true));
    // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
    // RequireAnalysisPass for PSI before subsequent non-module passes.
    MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
  }

  invokePipelineEarlySimplificationEPCallbacks(MPM, Level, Phase);

  // Build a minimal pipeline based on the semantics required by LLVM,
  // which is just that always inlining occurs. Further, disable generating
  // lifetime intrinsics to avoid enabling further optimizations during
  // code generation.
  MPM.addPass(AlwaysInlinerPass(
      /*InsertLifetimeIntrinsics=*/false));

  if (PTO.MergeFunctions)
    MPM.addPass(MergeFunctionsPass());

  if (EnableMatrix)
    MPM.addPass(
        createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true)));

  // Extension points whose callbacks take a CGSCC/loop/function pass manager:
  // each is run on a scratch manager, and an adaptor is added to MPM only when
  // the scratch manager ended up non-empty.
  if (!CGSCCOptimizerLateEPCallbacks.empty()) {
    CGSCCPassManager CGPM;
    invokeCGSCCOptimizerLateEPCallbacks(CGPM, Level);
    if (!CGPM.isEmpty())
      MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
  }
  if (!LateLoopOptimizationsEPCallbacks.empty()) {
    LoopPassManager LPM;
    invokeLateLoopOptimizationsEPCallbacks(LPM, Level);
    if (!LPM.isEmpty()) {
      MPM.addPass(createModuleToFunctionPassAdaptor(
          createFunctionToLoopPassAdaptor(std::move(LPM))));
    }
  }
  if (!LoopOptimizerEndEPCallbacks.empty()) {
    LoopPassManager LPM;
    invokeLoopOptimizerEndEPCallbacks(LPM, Level);
    if (!LPM.isEmpty()) {
      MPM.addPass(createModuleToFunctionPassAdaptor(
          createFunctionToLoopPassAdaptor(std::move(LPM))));
    }
  }
  if (!ScalarOptimizerLateEPCallbacks.empty()) {
    FunctionPassManager FPM;
    invokeScalarOptimizerLateEPCallbacks(FPM, Level);
    if (!FPM.isEmpty())
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  }

  invokeOptimizerEarlyEPCallbacks(MPM, Level, Phase);

  if (!VectorizerStartEPCallbacks.empty()) {
    FunctionPassManager FPM;
    invokeVectorizerStartEPCallbacks(FPM, Level);
    if (!FPM.isEmpty())
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  }

  if (!VectorizerEndEPCallbacks.empty()) {
    FunctionPassManager FPM;
    invokeVectorizerEndEPCallbacks(FPM, Level);
    if (!FPM.isEmpty())
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  }

  // Coroutine passes are grouped into their own module pipeline and handed to
  // CoroConditionalWrapper as a unit.
  ModulePassManager CoroPM;
  CoroPM.addPass(CoroEarlyPass());
  CGSCCPassManager CGPM;
  CGPM.addPass(CoroSplitPass());
  CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
  CoroPM.addPass(CoroCleanupPass());
  CoroPM.addPass(GlobalDCEPass());
  MPM.addPass(CoroConditionalWrapper(std::move(CoroPM)));

  invokeOptimizerLastEPCallbacks(MPM, Level, Phase);

  if (isLTOPreLink(Phase))
    addRequiredLTOPreLinkPasses(MPM);

  MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));

  return MPM;
}

/// Build the default alias-analysis pipeline.
///
/// Registers BasicAA, ScopedNoAliasAA and TypeBasedAA (in that order), then
/// GlobalsAA when EnableGlobalAnalyses is set, and finally lets the target
/// machine, if one is present, register its own analyses.
///
/// \returns The configured AAManager.
AAManager PassBuilder::buildDefaultAAPipeline() {
  AAManager AA;

  // The order in which these are registered determines their priority when
  // being queried.

  // First we register the basic alias analysis that provides the majority of
  // per-function local AA logic. This is a stateless, on-demand local set of
  // AA techniques.
  AA.registerFunctionAnalysis<BasicAA>();

  // Next we query fast, specialized alias analyses that wrap IR-embedded
  // information about aliasing.
  AA.registerFunctionAnalysis<ScopedNoAliasAA>();
  AA.registerFunctionAnalysis<TypeBasedAA>();

  // Add support for querying global aliasing information when available.
  // Because the `AAManager` is a function analysis and `GlobalsAA` is a module
  // analysis, all that the `AAManager` can do is query for any *cached*
  // results from `GlobalsAA` through a readonly proxy.
  if (EnableGlobalAnalyses)
    AA.registerModuleAnalysis<GlobalsAA>();

  // Add target-specific alias analyses.
  if (TM)
    TM->registerDefaultAliasAnalyses(AA);

  return AA;
}