//===- Construction of pass pipelines -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file provides the implementation of the PassBuilder based on our
/// static pass registry as well as related functionality. It also provides
/// helpers to aid in analyzing, debugging, and testing passes and pass
/// pipelines.
///
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/OptimizationLevel.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/PGOOptions.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "llvm/Transforms/Coroutines/CoroCleanup.h"
#include "llvm/Transforms/Coroutines/CoroEarly.h"
#include "llvm/Transforms/Coroutines/CoroElide.h"
#include "llvm/Transforms/Coroutines/CoroSplit.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/Annotation2Metadata.h"
#include "llvm/Transforms/IPO/ArgumentPromotion.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/IPO/CalledValuePropagation.h"
#include "llvm/Transforms/IPO/ConstantMerge.h"
#include "llvm/Transforms/IPO/CrossDSOCFI.h"
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
#include "llvm/Transforms/IPO/ElimAvailExtern.h"
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
#include "llvm/Transforms/IPO/Inliner.h"
#include "llvm/Transforms/IPO/LowerTypeTests.h"
#include "llvm/Transforms/IPO/MergeFunctions.h"
#include "llvm/Transforms/IPO/ModuleInliner.h"
#include "llvm/Transforms/IPO/OpenMPOpt.h"
#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/Transforms/IPO/SCCP.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
#include "llvm/Transforms/IPO/SampleProfileProbe.h"
#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Instrumentation/CGProfile.h"
#include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
#include "llvm/Transforms/Instrumentation/MemProfiler.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Scalar/ADCE.h"
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
#include "llvm/Transforms/Scalar/BDCE.h"
#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
#include "llvm/Transforms/Scalar/ConstraintElimination.h"
#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
#include "llvm/Transforms/Scalar/DFAJumpThreading.h"
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
#include "llvm/Transforms/Scalar/DivRemPairs.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/Transforms/Scalar/Float2Int.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/IndVarSimplify.h"
#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopDeletion.h"
#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/Transforms/Scalar/LoopFlatten.h"
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
#include "llvm/Transforms/Scalar/LoopInterchange.h"
#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
#include "llvm/Transforms/Scalar/NewGVN.h"
#include "llvm/Transforms/Scalar/Reassociate.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
#include "llvm/Transforms/Utils/Mem2Reg.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"

using namespace llvm;

/// Hidden option `-enable-ml-inliner`: selects the InliningAdvisorMode that is
/// handed to the inliner passes constructed in this file. Defaults to the
/// heuristics-based advisor (`default`); `development` and `release` select
/// the model-based modes described in the value help strings below.
static cl::opt<InliningAdvisorMode> UseInlineAdvisor(
    "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden,
    cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"),
    cl::values(clEnumValN(InliningAdvisorMode::Default, "default",
                          "Heuristics-based inliner version."),
               clEnumValN(InliningAdvisorMode::Development, "development",
                          "Use development mode (runtime-loadable model)."),
               clEnumValN(InliningAdvisorMode::Release, "release",
                          "Use release mode (AOT-compiled model).")));

/// Hidden option `-enable-npm-synthetic-counts` (off by default): requests the
/// synthetic function entry count generation pass.
static cl::opt<bool> EnableSyntheticCounts(
    "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Run synthetic function entry count generation "
             "pass"));

/// Flag to enable inline deferral during PGO.
148 static cl::opt<bool> 149 EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true), 150 cl::Hidden, 151 cl::desc("Enable inline deferral during PGO")); 152 153 static cl::opt<bool> EnableMemProfiler("enable-mem-prof", cl::init(false), 154 cl::Hidden, cl::ZeroOrMore, 155 cl::desc("Enable memory profiler")); 156 157 static cl::opt<bool> EnableModuleInliner("enable-module-inliner", 158 cl::init(false), cl::Hidden, 159 cl::desc("Enable module inliner")); 160 161 static cl::opt<bool> PerformMandatoryInliningsFirst( 162 "mandatory-inlining-first", cl::init(true), cl::Hidden, cl::ZeroOrMore, 163 cl::desc("Perform mandatory inlinings module-wide, before performing " 164 "inlining.")); 165 166 static cl::opt<bool> EnableO3NonTrivialUnswitching( 167 "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden, 168 cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3")); 169 170 static cl::opt<bool> EnableEagerlyInvalidateAnalyses( 171 "eagerly-invalidate-analyses", cl::init(true), cl::Hidden, 172 cl::desc("Eagerly invalidate more analyses in default pipelines")); 173 174 static cl::opt<bool> EnableNoRerunSimplificationPipeline( 175 "enable-no-rerun-simplification-pipeline", cl::init(false), cl::Hidden, 176 cl::desc( 177 "Prevent running the simplification pipeline on a function more " 178 "than once in the case that SCC mutations cause a function to be " 179 "visited multiple times as long as the function has not been changed")); 180 181 static cl::opt<bool> EnableMergeFunctions( 182 "enable-merge-functions", cl::init(false), cl::Hidden, 183 cl::desc("Enable function merging as part of the optimization pipeline")); 184 185 PipelineTuningOptions::PipelineTuningOptions() { 186 LoopInterleaving = true; 187 LoopVectorization = true; 188 SLPVectorization = false; 189 LoopUnrolling = true; 190 ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; 191 LicmMssaOptCap = SetLicmMssaOptCap; 192 LicmMssaNoAccForPromotionCap = 
SetLicmMssaNoAccForPromotionCap; 193 CallGraphProfile = true; 194 MergeFunctions = EnableMergeFunctions; 195 EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; 196 } 197 198 namespace llvm { 199 200 extern cl::opt<unsigned> MaxDevirtIterations; 201 extern cl::opt<bool> EnableConstraintElimination; 202 extern cl::opt<bool> EnableFunctionSpecialization; 203 extern cl::opt<bool> EnableGVNHoist; 204 extern cl::opt<bool> EnableGVNSink; 205 extern cl::opt<bool> EnableHotColdSplit; 206 extern cl::opt<bool> EnableIROutliner; 207 extern cl::opt<bool> EnableOrderFileInstrumentation; 208 extern cl::opt<bool> EnableCHR; 209 extern cl::opt<bool> EnableLoopInterchange; 210 extern cl::opt<bool> EnableUnrollAndJam; 211 extern cl::opt<bool> EnableLoopFlatten; 212 extern cl::opt<bool> EnableDFAJumpThreading; 213 extern cl::opt<bool> RunNewGVN; 214 extern cl::opt<bool> RunPartialInlining; 215 extern cl::opt<bool> ExtraVectorizerPasses; 216 217 extern cl::opt<bool> FlattenedProfileUsed; 218 219 extern cl::opt<AttributorRunOption> AttributorRun; 220 extern cl::opt<bool> EnableKnowledgeRetention; 221 222 extern cl::opt<bool> EnableMatrix; 223 224 extern cl::opt<bool> DisablePreInliner; 225 extern cl::opt<int> PreInlineThreshold; 226 } // namespace llvm 227 228 void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM, 229 OptimizationLevel Level) { 230 for (auto &C : PeepholeEPCallbacks) 231 C(FPM, Level); 232 } 233 234 // Helper to add AnnotationRemarksPass. 
235 static void addAnnotationRemarksPass(ModulePassManager &MPM) { 236 FunctionPassManager FPM; 237 FPM.addPass(AnnotationRemarksPass()); 238 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 239 } 240 241 // Helper to check if the current compilation phase is preparing for LTO 242 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { 243 return Phase == ThinOrFullLTOPhase::ThinLTOPreLink || 244 Phase == ThinOrFullLTOPhase::FullLTOPreLink; 245 } 246 247 // TODO: Investigate the cost/benefit of tail call elimination on debugging. 248 FunctionPassManager 249 PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, 250 ThinOrFullLTOPhase Phase) { 251 252 FunctionPassManager FPM; 253 254 // Form SSA out of local memory accesses after breaking apart aggregates into 255 // scalars. 256 FPM.addPass(SROAPass()); 257 258 // Catch trivial redundancies 259 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 260 261 // Hoisting of scalars and load expressions. 262 FPM.addPass(SimplifyCFGPass()); 263 FPM.addPass(InstCombinePass()); 264 265 FPM.addPass(LibCallsShrinkWrapPass()); 266 267 invokePeepholeEPCallbacks(FPM, Level); 268 269 FPM.addPass(SimplifyCFGPass()); 270 271 // Form canonically associated expression trees, and simplify the trees using 272 // basic mathematical properties. For example, this will form (nearly) 273 // minimal multiplication trees. 274 FPM.addPass(ReassociatePass()); 275 276 // Add the primary loop simplification pipeline. 277 // FIXME: Currently this is split into two loop pass pipelines because we run 278 // some function passes in between them. These can and should be removed 279 // and/or replaced by scheduling the loop pass equivalents in the correct 280 // positions. But those equivalent passes aren't powerful enough yet. 281 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 282 // used. 
We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 283 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 284 // `LoopInstSimplify`. 285 LoopPassManager LPM1, LPM2; 286 287 // Simplify the loop body. We do this initially to clean up after other loop 288 // passes run, either when iterating on a loop or on inner loops with 289 // implications on the outer loop. 290 LPM1.addPass(LoopInstSimplifyPass()); 291 LPM1.addPass(LoopSimplifyCFGPass()); 292 293 // Try to remove as much code from the loop header as possible, 294 // to reduce amount of IR that will have to be duplicated. 295 // TODO: Investigate promotion cap for O1. 296 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); 297 298 LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, 299 isLTOPreLink(Phase))); 300 // TODO: Investigate promotion cap for O1. 301 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); 302 LPM1.addPass(SimpleLoopUnswitchPass()); 303 if (EnableLoopFlatten) 304 LPM1.addPass(LoopFlattenPass()); 305 306 LPM2.addPass(LoopIdiomRecognizePass()); 307 LPM2.addPass(IndVarSimplifyPass()); 308 309 for (auto &C : LateLoopOptimizationsEPCallbacks) 310 C(LPM2, Level); 311 312 LPM2.addPass(LoopDeletionPass()); 313 314 if (EnableLoopInterchange) 315 LPM2.addPass(LoopInterchangePass()); 316 317 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 318 // because it changes IR to makes profile annotation in back compile 319 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 320 // attributes so we need to make sure and allow the full unroll pass to pay 321 // attention to it. 
322 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 323 PGOOpt->Action != PGOOptions::SampleUse) 324 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 325 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 326 PTO.ForgetAllSCEVInLoopUnroll)); 327 328 for (auto &C : LoopOptimizerEndEPCallbacks) 329 C(LPM2, Level); 330 331 // We provide the opt remark emitter pass for LICM to use. We only need to do 332 // this once as it is immutable. 333 FPM.addPass( 334 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 335 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 336 /*UseMemorySSA=*/true, 337 /*UseBlockFrequencyInfo=*/true)); 338 FPM.addPass(SimplifyCFGPass()); 339 FPM.addPass(InstCombinePass()); 340 // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. 341 // *All* loop passes must preserve it, in order to be able to use it. 342 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 343 /*UseMemorySSA=*/false, 344 /*UseBlockFrequencyInfo=*/false)); 345 346 // Delete small array after loop unroll. 347 FPM.addPass(SROAPass()); 348 349 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 350 FPM.addPass(MemCpyOptPass()); 351 352 // Sparse conditional constant propagation. 353 // FIXME: It isn't clear why we do this *after* loop passes rather than 354 // before... 355 FPM.addPass(SCCPPass()); 356 357 // Delete dead bit computations (instcombine runs after to fold away the dead 358 // computations, and then ADCE will run later to exploit any new DCE 359 // opportunities that creates). 360 FPM.addPass(BDCEPass()); 361 362 // Run instcombine after redundancy and dead bit elimination to exploit 363 // opportunities opened up by them. 
364 FPM.addPass(InstCombinePass()); 365 invokePeepholeEPCallbacks(FPM, Level); 366 367 FPM.addPass(CoroElidePass()); 368 369 for (auto &C : ScalarOptimizerLateEPCallbacks) 370 C(FPM, Level); 371 372 // Finally, do an expensive DCE pass to catch all the dead code exposed by 373 // the simplifications and basic cleanup after all the simplifications. 374 // TODO: Investigate if this is too expensive. 375 FPM.addPass(ADCEPass()); 376 FPM.addPass(SimplifyCFGPass()); 377 FPM.addPass(InstCombinePass()); 378 invokePeepholeEPCallbacks(FPM, Level); 379 380 return FPM; 381 } 382 383 FunctionPassManager 384 PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, 385 ThinOrFullLTOPhase Phase) { 386 assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); 387 388 // The O1 pipeline has a separate pipeline creation function to simplify 389 // construction readability. 390 if (Level.getSpeedupLevel() == 1) 391 return buildO1FunctionSimplificationPipeline(Level, Phase); 392 393 FunctionPassManager FPM; 394 395 // Form SSA out of local memory accesses after breaking apart aggregates into 396 // scalars. 397 FPM.addPass(SROAPass()); 398 399 // Catch trivial redundancies 400 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 401 if (EnableKnowledgeRetention) 402 FPM.addPass(AssumeSimplifyPass()); 403 404 // Hoisting of scalars and load expressions. 405 if (EnableGVNHoist) 406 FPM.addPass(GVNHoistPass()); 407 408 // Global value numbering based sinking. 409 if (EnableGVNSink) { 410 FPM.addPass(GVNSinkPass()); 411 FPM.addPass(SimplifyCFGPass()); 412 } 413 414 if (EnableConstraintElimination) 415 FPM.addPass(ConstraintEliminationPass()); 416 417 // Speculative execution if the target has divergent branches; otherwise nop. 418 FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true)); 419 420 // Optimize based on known information about branches, and cleanup afterward. 
421 FPM.addPass(JumpThreadingPass()); 422 FPM.addPass(CorrelatedValuePropagationPass()); 423 424 FPM.addPass(SimplifyCFGPass()); 425 FPM.addPass(InstCombinePass()); 426 if (Level == OptimizationLevel::O3) 427 FPM.addPass(AggressiveInstCombinePass()); 428 429 if (!Level.isOptimizingForSize()) 430 FPM.addPass(LibCallsShrinkWrapPass()); 431 432 invokePeepholeEPCallbacks(FPM, Level); 433 434 // For PGO use pipeline, try to optimize memory intrinsics such as memcpy 435 // using the size value profile. Don't perform this when optimizing for size. 436 if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && 437 !Level.isOptimizingForSize()) 438 FPM.addPass(PGOMemOPSizeOpt()); 439 440 FPM.addPass(TailCallElimPass()); 441 FPM.addPass(SimplifyCFGPass()); 442 443 // Form canonically associated expression trees, and simplify the trees using 444 // basic mathematical properties. For example, this will form (nearly) 445 // minimal multiplication trees. 446 FPM.addPass(ReassociatePass()); 447 448 // Add the primary loop simplification pipeline. 449 // FIXME: Currently this is split into two loop pass pipelines because we run 450 // some function passes in between them. These can and should be removed 451 // and/or replaced by scheduling the loop pass equivalents in the correct 452 // positions. But those equivalent passes aren't powerful enough yet. 453 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 454 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 455 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 456 // `LoopInstSimplify`. 457 LoopPassManager LPM1, LPM2; 458 459 // Simplify the loop body. We do this initially to clean up after other loop 460 // passes run, either when iterating on a loop or on inner loops with 461 // implications on the outer loop. 
462 LPM1.addPass(LoopInstSimplifyPass()); 463 LPM1.addPass(LoopSimplifyCFGPass()); 464 465 // Try to remove as much code from the loop header as possible, 466 // to reduce amount of IR that will have to be duplicated. 467 // TODO: Investigate promotion cap for O1. 468 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); 469 470 // Disable header duplication in loop rotation at -Oz. 471 LPM1.addPass( 472 LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase))); 473 // TODO: Investigate promotion cap for O1. 474 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); 475 LPM1.addPass( 476 SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 && 477 EnableO3NonTrivialUnswitching)); 478 if (EnableLoopFlatten) 479 LPM1.addPass(LoopFlattenPass()); 480 481 LPM2.addPass(LoopIdiomRecognizePass()); 482 LPM2.addPass(IndVarSimplifyPass()); 483 484 for (auto &C : LateLoopOptimizationsEPCallbacks) 485 C(LPM2, Level); 486 487 LPM2.addPass(LoopDeletionPass()); 488 489 if (EnableLoopInterchange) 490 LPM2.addPass(LoopInterchangePass()); 491 492 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 493 // because it changes IR to makes profile annotation in back compile 494 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 495 // attributes so we need to make sure and allow the full unroll pass to pay 496 // attention to it. 497 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 498 PGOOpt->Action != PGOOptions::SampleUse) 499 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 500 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 501 PTO.ForgetAllSCEVInLoopUnroll)); 502 503 for (auto &C : LoopOptimizerEndEPCallbacks) 504 C(LPM2, Level); 505 506 // We provide the opt remark emitter pass for LICM to use. We only need to do 507 // this once as it is immutable. 
508 FPM.addPass( 509 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 510 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 511 /*UseMemorySSA=*/true, 512 /*UseBlockFrequencyInfo=*/true)); 513 FPM.addPass(SimplifyCFGPass()); 514 FPM.addPass(InstCombinePass()); 515 // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, 516 // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. 517 // *All* loop passes must preserve it, in order to be able to use it. 518 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 519 /*UseMemorySSA=*/false, 520 /*UseBlockFrequencyInfo=*/false)); 521 522 // Delete small array after loop unroll. 523 FPM.addPass(SROAPass()); 524 525 // The matrix extension can introduce large vector operations early, which can 526 // benefit from running vector-combine early on. 527 if (EnableMatrix) 528 FPM.addPass(VectorCombinePass(/*ScalarizationOnly=*/true)); 529 530 // Eliminate redundancies. 531 FPM.addPass(MergedLoadStoreMotionPass()); 532 if (RunNewGVN) 533 FPM.addPass(NewGVNPass()); 534 else 535 FPM.addPass(GVNPass()); 536 537 // Sparse conditional constant propagation. 538 // FIXME: It isn't clear why we do this *after* loop passes rather than 539 // before... 540 FPM.addPass(SCCPPass()); 541 542 // Delete dead bit computations (instcombine runs after to fold away the dead 543 // computations, and then ADCE will run later to exploit any new DCE 544 // opportunities that creates). 545 FPM.addPass(BDCEPass()); 546 547 // Run instcombine after redundancy and dead bit elimination to exploit 548 // opportunities opened up by them. 549 FPM.addPass(InstCombinePass()); 550 invokePeepholeEPCallbacks(FPM, Level); 551 552 // Re-consider control flow based optimizations after redundancy elimination, 553 // redo DCE, etc. 
554 if (EnableDFAJumpThreading && Level.getSizeLevel() == 0) 555 FPM.addPass(DFAJumpThreadingPass()); 556 557 FPM.addPass(JumpThreadingPass()); 558 FPM.addPass(CorrelatedValuePropagationPass()); 559 560 // Finally, do an expensive DCE pass to catch all the dead code exposed by 561 // the simplifications and basic cleanup after all the simplifications. 562 // TODO: Investigate if this is too expensive. 563 FPM.addPass(ADCEPass()); 564 565 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 566 FPM.addPass(MemCpyOptPass()); 567 568 FPM.addPass(DSEPass()); 569 FPM.addPass(createFunctionToLoopPassAdaptor( 570 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), 571 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 572 573 FPM.addPass(CoroElidePass()); 574 575 for (auto &C : ScalarOptimizerLateEPCallbacks) 576 C(FPM, Level); 577 578 FPM.addPass(SimplifyCFGPass( 579 SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true))); 580 FPM.addPass(InstCombinePass()); 581 invokePeepholeEPCallbacks(FPM, Level); 582 583 if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt && 584 (PGOOpt->Action == PGOOptions::IRUse || 585 PGOOpt->Action == PGOOptions::SampleUse)) 586 FPM.addPass(ControlHeightReductionPass()); 587 588 return FPM; 589 } 590 591 void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { 592 MPM.addPass(CanonicalizeAliasesPass()); 593 MPM.addPass(NameAnonGlobalPass()); 594 } 595 596 void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, 597 OptimizationLevel Level, bool RunProfileGen, 598 bool IsCS, std::string ProfileFile, 599 std::string ProfileRemappingFile) { 600 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 601 if (!IsCS && !DisablePreInliner) { 602 InlineParams IP; 603 604 IP.DefaultThreshold = PreInlineThreshold; 605 606 // FIXME: The hint threshold has the same value used by the regular inliner 607 // when not optimzing for size. 
This should probably be lowered after 608 // performance testing. 609 // FIXME: this comment is cargo culted from the old pass manager, revisit). 610 IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; 611 ModuleInlinerWrapperPass MIWP(IP); 612 CGSCCPassManager &CGPipeline = MIWP.getPM(); 613 614 FunctionPassManager FPM; 615 FPM.addPass(SROAPass()); 616 FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. 617 FPM.addPass(SimplifyCFGPass()); // Merge & remove basic blocks. 618 FPM.addPass(InstCombinePass()); // Combine silly sequences. 619 invokePeepholeEPCallbacks(FPM, Level); 620 621 CGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 622 std::move(FPM), PTO.EagerlyInvalidateAnalyses)); 623 624 MPM.addPass(std::move(MIWP)); 625 626 // Delete anything that is now dead to make sure that we don't instrument 627 // dead code. Instrumentation can end up keeping dead code around and 628 // dramatically increase code size. 629 MPM.addPass(GlobalDCEPass()); 630 } 631 632 if (!RunProfileGen) { 633 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 634 MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); 635 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 636 // RequireAnalysisPass for PSI before subsequent non-module passes. 637 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 638 return; 639 } 640 641 // Perform PGO instrumentation. 642 MPM.addPass(PGOInstrumentationGen(IsCS)); 643 644 FunctionPassManager FPM; 645 // Disable header duplication in loop rotation at -Oz. 646 FPM.addPass(createFunctionToLoopPassAdaptor( 647 LoopRotatePass(Level != OptimizationLevel::Oz), /*UseMemorySSA=*/false, 648 /*UseBlockFrequencyInfo=*/false)); 649 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), 650 PTO.EagerlyInvalidateAnalyses)); 651 652 // Add the profile lowering pass. 
653 InstrProfOptions Options; 654 if (!ProfileFile.empty()) 655 Options.InstrProfileOutput = ProfileFile; 656 // Do counter promotion at Level greater than O0. 657 Options.DoCounterPromotion = true; 658 Options.UseBFIInPromotion = IsCS; 659 MPM.addPass(InstrProfiling(Options, IsCS)); 660 } 661 662 void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM, 663 bool RunProfileGen, bool IsCS, 664 std::string ProfileFile, 665 std::string ProfileRemappingFile) { 666 if (!RunProfileGen) { 667 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 668 MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); 669 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 670 // RequireAnalysisPass for PSI before subsequent non-module passes. 671 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 672 return; 673 } 674 675 // Perform PGO instrumentation. 676 MPM.addPass(PGOInstrumentationGen(IsCS)); 677 // Add the profile lowering pass. 678 InstrProfOptions Options; 679 if (!ProfileFile.empty()) 680 Options.InstrProfileOutput = ProfileFile; 681 // Do not do counter promotion at O0. 
682 Options.DoCounterPromotion = false; 683 Options.UseBFIInPromotion = IsCS; 684 MPM.addPass(InstrProfiling(Options, IsCS)); 685 } 686 687 static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) { 688 return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); 689 } 690 691 ModuleInlinerWrapperPass 692 PassBuilder::buildInlinerPipeline(OptimizationLevel Level, 693 ThinOrFullLTOPhase Phase) { 694 InlineParams IP = getInlineParamsFromOptLevel(Level); 695 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 696 PGOOpt->Action == PGOOptions::SampleUse) 697 IP.HotCallSiteThreshold = 0; 698 699 if (PGOOpt) 700 IP.EnableDeferral = EnablePGOInlineDeferral; 701 702 ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst, 703 UseInlineAdvisor, MaxDevirtIterations); 704 705 // Require the GlobalsAA analysis for the module so we can query it within 706 // the CGSCC pipeline. 707 MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>()); 708 // Invalidate AAManager so it can be recreated and pick up the newly available 709 // GlobalsAA. 710 MIWP.addModulePass( 711 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 712 713 // Require the ProfileSummaryAnalysis for the module so we can query it within 714 // the inliner pass. 715 MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 716 717 // Now begin the main postorder CGSCC pipeline. 718 // FIXME: The current CGSCC pipeline has its origins in the legacy pass 719 // manager and trying to emulate its precise behavior. Much of this doesn't 720 // make a lot of sense and we should revisit the core CGSCC structure. 721 CGSCCPassManager &MainCGPipeline = MIWP.getPM(); 722 723 // Note: historically, the PruneEH pass was run first to deduce nounwind and 724 // generally clean up exception handling overhead. It isn't clear this is 725 // valuable as the inliner doesn't currently care whether it is inlining an 726 // invoke or a call. 
727 728 if (AttributorRun & AttributorRunOption::CGSCC) 729 MainCGPipeline.addPass(AttributorCGSCCPass()); 730 731 // Now deduce any function attributes based in the current code. 732 MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); 733 734 // When at O3 add argument promotion to the pass pipeline. 735 // FIXME: It isn't at all clear why this should be limited to O3. 736 if (Level == OptimizationLevel::O3) 737 MainCGPipeline.addPass(ArgumentPromotionPass()); 738 739 // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if 740 // there are no OpenMP runtime calls present in the module. 741 if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) 742 MainCGPipeline.addPass(OpenMPOptCGSCCPass()); 743 744 for (auto &C : CGSCCOptimizerLateEPCallbacks) 745 C(MainCGPipeline, Level); 746 747 // Lastly, add the core function simplification pipeline nested inside the 748 // CGSCC walk. 749 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 750 buildFunctionSimplificationPipeline(Level, Phase), 751 PTO.EagerlyInvalidateAnalyses, EnableNoRerunSimplificationPipeline)); 752 753 MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); 754 755 if (EnableNoRerunSimplificationPipeline) 756 MIWP.addLateModulePass(createModuleToFunctionPassAdaptor( 757 InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>())); 758 759 return MIWP; 760 } 761 762 ModulePassManager 763 PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, 764 ThinOrFullLTOPhase Phase) { 765 ModulePassManager MPM; 766 767 InlineParams IP = getInlineParamsFromOptLevel(Level); 768 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 769 PGOOpt->Action == PGOOptions::SampleUse) 770 IP.HotCallSiteThreshold = 0; 771 772 if (PGOOpt) 773 IP.EnableDeferral = EnablePGOInlineDeferral; 774 775 // The inline deferral logic is used to avoid losing some 776 // inlining chance in future. 
It is helpful in SCC inliner, in which 777 // inlining is processed in bottom-up order. 778 // While in module inliner, the inlining order is a priority-based order 779 // by default. The inline deferral is unnecessary there. So we disable the 780 // inline deferral logic in module inliner. 781 IP.EnableDeferral = false; 782 783 MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor)); 784 785 MPM.addPass(createModuleToFunctionPassAdaptor( 786 buildFunctionSimplificationPipeline(Level, Phase), 787 PTO.EagerlyInvalidateAnalyses)); 788 789 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( 790 CoroSplitPass(Level != OptimizationLevel::O0))); 791 792 return MPM; 793 } 794 795 ModulePassManager 796 PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, 797 ThinOrFullLTOPhase Phase) { 798 ModulePassManager MPM; 799 800 // Place pseudo probe instrumentation as the first pass of the pipeline to 801 // minimize the impact of optimization changes. 802 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 803 Phase != ThinOrFullLTOPhase::ThinLTOPostLink) 804 MPM.addPass(SampleProfileProbePass(TM)); 805 806 bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); 807 808 // In ThinLTO mode, when flattened profile is used, all the available 809 // profile information will be annotated in PreLink phase so there is 810 // no need to load the profile again in PostLink. 811 bool LoadSampleProfile = 812 HasSampleProfile && 813 !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink); 814 815 // During the ThinLTO backend phase we perform early indirect call promotion 816 // here, before globalopt. Otherwise imported available_externally functions 817 // look unreferenced and are removed. If we are going to load the sample 818 // profile then defer until later. 819 // TODO: See if we can move later and consolidate with the location where 820 // we perform ICP when we are loading a sample profile. 
821 // TODO: We pass HasSampleProfile (whether there was a sample profile file 822 // passed to the compile) to the SamplePGO flag of ICP. This is used to 823 // determine whether the new direct calls are annotated with prof metadata. 824 // Ideally this should be determined from whether the IR is annotated with 825 // sample profile, and not whether the a sample profile was provided on the 826 // command line. E.g. for flattened profiles where we will not be reloading 827 // the sample profile in the ThinLTO backend, we ideally shouldn't have to 828 // provide the sample profile file. 829 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile) 830 MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile)); 831 832 // Do basic inference of function attributes from known properties of system 833 // libraries and other oracles. 834 MPM.addPass(InferFunctionAttrsPass()); 835 836 // Create an early function pass manager to cleanup the output of the 837 // frontend. 838 FunctionPassManager EarlyFPM; 839 // Lower llvm.expect to metadata before attempting transforms. 840 // Compare/branch metadata may alter the behavior of passes like SimplifyCFG. 841 EarlyFPM.addPass(LowerExpectIntrinsicPass()); 842 EarlyFPM.addPass(SimplifyCFGPass()); 843 EarlyFPM.addPass(SROAPass()); 844 EarlyFPM.addPass(EarlyCSEPass()); 845 EarlyFPM.addPass(CoroEarlyPass()); 846 if (Level == OptimizationLevel::O3) 847 EarlyFPM.addPass(CallSiteSplittingPass()); 848 849 // In SamplePGO ThinLTO backend, we need instcombine before profile annotation 850 // to convert bitcast to direct calls so that they can be inlined during the 851 // profile annotation prepration step. 852 // More details about SamplePGO design can be found in: 853 // https://research.google.com/pubs/pub45290.html 854 // FIXME: revisit how SampleProfileLoad/Inliner/ICP is structured. 
855 if (LoadSampleProfile) 856 EarlyFPM.addPass(InstCombinePass()); 857 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM), 858 PTO.EagerlyInvalidateAnalyses)); 859 860 if (LoadSampleProfile) { 861 // Annotate sample profile right after early FPM to ensure freshness of 862 // the debug info. 863 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 864 PGOOpt->ProfileRemappingFile, Phase)); 865 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 866 // RequireAnalysisPass for PSI before subsequent non-module passes. 867 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 868 // Do not invoke ICP in the LTOPrelink phase as it makes it hard 869 // for the profile annotation to be accurate in the LTO backend. 870 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink && 871 Phase != ThinOrFullLTOPhase::FullLTOPreLink) 872 // We perform early indirect call promotion here, before globalopt. 873 // This is important for the ThinLTO backend phase because otherwise 874 // imported available_externally functions look unreferenced and are 875 // removed. 876 MPM.addPass( 877 PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); 878 } 879 880 // Try to perform OpenMP specific optimizations on the module. This is a 881 // (quick!) no-op if there are no OpenMP runtime calls present in the module. 882 if (Level != OptimizationLevel::O0) 883 MPM.addPass(OpenMPOptPass()); 884 885 if (AttributorRun & AttributorRunOption::MODULE) 886 MPM.addPass(AttributorPass()); 887 888 // Lower type metadata and the type.test intrinsic in the ThinLTO 889 // post link pipeline after ICP. This is to enable usage of the type 890 // tests in ICP sequences. 891 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) 892 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 893 894 for (auto &C : PipelineEarlySimplificationEPCallbacks) 895 C(MPM, Level); 896 897 // Specialize functions with IPSCCP. 
898 if (EnableFunctionSpecialization && Level == OptimizationLevel::O3) 899 MPM.addPass(FunctionSpecializationPass()); 900 901 // Interprocedural constant propagation now that basic cleanup has occurred 902 // and prior to optimizing globals. 903 // FIXME: This position in the pipeline hasn't been carefully considered in 904 // years, it should be re-analyzed. 905 MPM.addPass(IPSCCPPass()); 906 907 // Attach metadata to indirect call sites indicating the set of functions 908 // they may target at run-time. This should follow IPSCCP. 909 MPM.addPass(CalledValuePropagationPass()); 910 911 // Optimize globals to try and fold them into constants. 912 MPM.addPass(GlobalOptPass()); 913 914 // Promote any localized globals to SSA registers. 915 // FIXME: Should this instead by a run of SROA? 916 // FIXME: We should probably run instcombine and simplifycfg afterward to 917 // delete control flows that are dead once globals have been folded to 918 // constants. 919 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); 920 921 // Remove any dead arguments exposed by cleanups and constant folding 922 // globals. 923 MPM.addPass(DeadArgumentEliminationPass()); 924 925 // Create a small function pass pipeline to cleanup after all the global 926 // optimizations. 927 FunctionPassManager GlobalCleanupPM; 928 GlobalCleanupPM.addPass(InstCombinePass()); 929 invokePeepholeEPCallbacks(GlobalCleanupPM, Level); 930 931 GlobalCleanupPM.addPass(SimplifyCFGPass()); 932 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), 933 PTO.EagerlyInvalidateAnalyses)); 934 935 // Add all the requested passes for instrumentation PGO, if requested. 
936 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 937 (PGOOpt->Action == PGOOptions::IRInstr || 938 PGOOpt->Action == PGOOptions::IRUse)) { 939 addPGOInstrPasses(MPM, Level, 940 /* RunProfileGen */ PGOOpt->Action == PGOOptions::IRInstr, 941 /* IsCS */ false, PGOOpt->ProfileFile, 942 PGOOpt->ProfileRemappingFile); 943 MPM.addPass(PGOIndirectCallPromotion(false, false)); 944 } 945 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 946 PGOOpt->CSAction == PGOOptions::CSIRInstr) 947 MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile)); 948 949 // Synthesize function entry counts for non-PGO compilation. 950 if (EnableSyntheticCounts && !PGOOpt) 951 MPM.addPass(SyntheticCountsPropagation()); 952 953 if (EnableModuleInliner) 954 MPM.addPass(buildModuleInlinerPipeline(Level, Phase)); 955 else 956 MPM.addPass(buildInlinerPipeline(Level, Phase)); 957 958 if (EnableMemProfiler && Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { 959 MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass())); 960 MPM.addPass(ModuleMemProfilerPass()); 961 } 962 963 return MPM; 964 } 965 966 /// TODO: Should LTO cause any differences to this set of passes? 967 void PassBuilder::addVectorPasses(OptimizationLevel Level, 968 FunctionPassManager &FPM, bool IsFullLTO) { 969 FPM.addPass(LoopVectorizePass( 970 LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); 971 972 if (IsFullLTO) { 973 // The vectorizer may have significantly shortened a loop body; unroll 974 // again. Unroll small loops to hide loop backedge latency and saturate any 975 // parallel execution resources of an out-of-order processor. We also then 976 // need to clean up redundancies and loop invariant code. 977 // FIXME: It would be really good to use a loop-integrated instruction 978 // combiner for cleanup here so that the unrolling and LICM can be pipelined 979 // across the loop nests. 
980 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 981 if (EnableUnrollAndJam && PTO.LoopUnrolling) 982 FPM.addPass(createFunctionToLoopPassAdaptor( 983 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 984 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 985 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 986 PTO.ForgetAllSCEVInLoopUnroll))); 987 FPM.addPass(WarnMissedTransformationsPass()); 988 } 989 990 if (!IsFullLTO) { 991 // Eliminate loads by forwarding stores from the previous iteration to loads 992 // of the current iteration. 993 FPM.addPass(LoopLoadEliminationPass()); 994 } 995 // Cleanup after the loop optimization passes. 996 FPM.addPass(InstCombinePass()); 997 998 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 999 ExtraVectorPassManager ExtraPasses; 1000 // At higher optimization levels, try to clean up any runtime overlap and 1001 // alignment checks inserted by the vectorizer. We want to track correlated 1002 // runtime checks for two inner loops in the same outer loop, fold any 1003 // common computations, hoist loop-invariant aspects out of any outer loop, 1004 // and unswitch the runtime checks if possible. Once hoisted, we may have 1005 // dead (or speculatable) control flows or more combining opportunities. 
1006 ExtraPasses.addPass(EarlyCSEPass()); 1007 ExtraPasses.addPass(CorrelatedValuePropagationPass()); 1008 ExtraPasses.addPass(InstCombinePass()); 1009 LoopPassManager LPM; 1010 LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); 1011 LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == 1012 OptimizationLevel::O3)); 1013 ExtraPasses.addPass( 1014 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 1015 ExtraPasses.addPass( 1016 createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, 1017 /*UseBlockFrequencyInfo=*/true)); 1018 ExtraPasses.addPass(SimplifyCFGPass()); 1019 ExtraPasses.addPass(InstCombinePass()); 1020 FPM.addPass(std::move(ExtraPasses)); 1021 } 1022 1023 // Now that we've formed fast to execute loop structures, we do further 1024 // optimizations. These are run afterward as they might block doing complex 1025 // analyses and transforms such as what are needed for loop vectorization. 1026 1027 // Cleanup after loop vectorization, etc. Simplification passes like CVP and 1028 // GVN, loop transforms, and others have already run, so it's now better to 1029 // convert to more optimized IR using more aggressive simplify CFG options. 1030 // The extra sinking transform can create larger basic blocks, so do this 1031 // before SLP vectorization. 1032 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 1033 .forwardSwitchCondToPhi(true) 1034 .convertSwitchToLookupTable(true) 1035 .needCanonicalLoops(false) 1036 .hoistCommonInsts(true) 1037 .sinkCommonInsts(true))); 1038 1039 if (IsFullLTO) { 1040 FPM.addPass(SCCPPass()); 1041 FPM.addPass(InstCombinePass()); 1042 FPM.addPass(BDCEPass()); 1043 } 1044 1045 // Optimize parallel scalar instruction chains into SIMD instructions. 1046 if (PTO.SLPVectorization) { 1047 FPM.addPass(SLPVectorizerPass()); 1048 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1049 FPM.addPass(EarlyCSEPass()); 1050 } 1051 } 1052 // Enhance/cleanup vector code. 
1053 FPM.addPass(VectorCombinePass()); 1054 1055 if (!IsFullLTO) { 1056 FPM.addPass(InstCombinePass()); 1057 // Unroll small loops to hide loop backedge latency and saturate any 1058 // parallel execution resources of an out-of-order processor. We also then 1059 // need to clean up redundancies and loop invariant code. 1060 // FIXME: It would be really good to use a loop-integrated instruction 1061 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1062 // across the loop nests. 1063 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1064 if (EnableUnrollAndJam && PTO.LoopUnrolling) { 1065 FPM.addPass(createFunctionToLoopPassAdaptor( 1066 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1067 } 1068 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1069 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1070 PTO.ForgetAllSCEVInLoopUnroll))); 1071 FPM.addPass(WarnMissedTransformationsPass()); 1072 FPM.addPass(InstCombinePass()); 1073 FPM.addPass( 1074 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 1075 FPM.addPass(createFunctionToLoopPassAdaptor( 1076 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), 1077 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 1078 } 1079 1080 // Now that we've vectorized and unrolled loops, we may have more refined 1081 // alignment information, try to re-derive it here. 1082 FPM.addPass(AlignmentFromAssumptionsPass()); 1083 1084 if (IsFullLTO) 1085 FPM.addPass(InstCombinePass()); 1086 } 1087 1088 ModulePassManager 1089 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, 1090 bool LTOPreLink) { 1091 ModulePassManager MPM; 1092 1093 // Optimize globals now that the module is fully simplified. 1094 MPM.addPass(GlobalOptPass()); 1095 MPM.addPass(GlobalDCEPass()); 1096 1097 // Run partial inlining pass to partially inline functions that have 1098 // large bodies. 
1099 if (RunPartialInlining) 1100 MPM.addPass(PartialInlinerPass()); 1101 1102 // Remove avail extern fns and globals definitions since we aren't compiling 1103 // an object file for later LTO. For LTO we want to preserve these so they 1104 // are eligible for inlining at link-time. Note if they are unreferenced they 1105 // will be removed by GlobalDCE later, so this only impacts referenced 1106 // available externally globals. Eventually they will be suppressed during 1107 // codegen, but eliminating here enables more opportunity for GlobalDCE as it 1108 // may make globals referenced by available external functions dead and saves 1109 // running remaining passes on the eliminated functions. These should be 1110 // preserved during prelinking for link-time inlining decisions. 1111 if (!LTOPreLink) 1112 MPM.addPass(EliminateAvailableExternallyPass()); 1113 1114 if (EnableOrderFileInstrumentation) 1115 MPM.addPass(InstrOrderFilePass()); 1116 1117 // Do RPO function attribute inference across the module to forward-propagate 1118 // attributes where applicable. 1119 // FIXME: Is this really an optimization rather than a canonicalization? 1120 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1121 1122 // Do a post inline PGO instrumentation and use pass. This is a context 1123 // sensitive PGO pass. We don't want to do this in LTOPreLink phrase as 1124 // cross-module inline has not been done yet. The context sensitive 1125 // instrumentation is after all the inlines are done. 1126 if (!LTOPreLink && PGOOpt) { 1127 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1128 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, 1129 /* IsCS */ true, PGOOpt->CSProfileGenFile, 1130 PGOOpt->ProfileRemappingFile); 1131 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1132 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, 1133 /* IsCS */ true, PGOOpt->ProfileFile, 1134 PGOOpt->ProfileRemappingFile); 1135 } 1136 1137 // Re-require GloblasAA here prior to function passes. 
This is particularly 1138 // useful as the above will have inlined, DCE'ed, and function-attr 1139 // propagated everything. We should at this point have a reasonably minimal 1140 // and richly annotated call graph. By computing aliasing and mod/ref 1141 // information for all local globals here, the late loop passes and notably 1142 // the vectorizer will be able to use them to help recognize vectorizable 1143 // memory operations. 1144 MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>()); 1145 1146 FunctionPassManager OptimizePM; 1147 OptimizePM.addPass(Float2IntPass()); 1148 OptimizePM.addPass(LowerConstantIntrinsicsPass()); 1149 1150 if (EnableMatrix) { 1151 OptimizePM.addPass(LowerMatrixIntrinsicsPass()); 1152 OptimizePM.addPass(EarlyCSEPass()); 1153 } 1154 1155 // FIXME: We need to run some loop optimizations to re-rotate loops after 1156 // simplifycfg and others undo their rotation. 1157 1158 // Optimize the loop execution. These passes operate on entire loop nests 1159 // rather than on each loop in an inside-out manner, and so they are actually 1160 // function passes. 1161 1162 for (auto &C : VectorizerStartEPCallbacks) 1163 C(OptimizePM, Level); 1164 1165 LoopPassManager LPM; 1166 // First rotate loops that may have been un-rotated by prior passes. 1167 // Disable header duplication at -Oz. 1168 LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink)); 1169 // Some loops may have become dead by now. Try to delete them. 1170 // FIXME: see discussion in https://reviews.llvm.org/D112851, 1171 // this may need to be revisited once we run GVN before loop deletion 1172 // in the simplification pipeline. 1173 LPM.addPass(LoopDeletionPass()); 1174 OptimizePM.addPass(createFunctionToLoopPassAdaptor( 1175 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); 1176 1177 // Distribute loops to allow partial vectorization. I.e. isolate dependences 1178 // into separate loop that would otherwise inhibit vectorization. 
This is 1179 // currently only performed for loops marked with the metadata 1180 // llvm.loop.distribute=true or when -enable-loop-distribute is specified. 1181 OptimizePM.addPass(LoopDistributePass()); 1182 1183 // Populates the VFABI attribute with the scalar-to-vector mappings 1184 // from the TargetLibraryInfo. 1185 OptimizePM.addPass(InjectTLIMappings()); 1186 1187 addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); 1188 1189 // LoopSink pass sinks instructions hoisted by LICM, which serves as a 1190 // canonicalization pass that enables other optimizations. As a result, 1191 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM 1192 // result too early. 1193 OptimizePM.addPass(LoopSinkPass()); 1194 1195 // And finally clean up LCSSA form before generating code. 1196 OptimizePM.addPass(InstSimplifyPass()); 1197 1198 // This hoists/decomposes div/rem ops. It should run after other sink/hoist 1199 // passes to avoid re-sinking, but before SimplifyCFG because it can allow 1200 // flattening of blocks. 1201 OptimizePM.addPass(DivRemPairsPass()); 1202 1203 // LoopSink (and other loop passes since the last simplifyCFG) might have 1204 // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. 1205 OptimizePM.addPass(SimplifyCFGPass()); 1206 1207 OptimizePM.addPass(CoroCleanupPass()); 1208 1209 // Add the core optimizing pipeline. 1210 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), 1211 PTO.EagerlyInvalidateAnalyses)); 1212 1213 for (auto &C : OptimizerLastEPCallbacks) 1214 C(MPM, Level); 1215 1216 // Split out cold code. Splitting is done late to avoid hiding context from 1217 // other optimizations and inadvertently regressing performance. The tradeoff 1218 // is that this has a higher code size cost than splitting early. 1219 if (EnableHotColdSplit && !LTOPreLink) 1220 MPM.addPass(HotColdSplittingPass()); 1221 1222 // Search the code for similar regions of code. 
If enough similar regions can 1223 // be found where extracting the regions into their own function will decrease 1224 // the size of the program, we extract the regions, a deduplicate the 1225 // structurally similar regions. 1226 if (EnableIROutliner) 1227 MPM.addPass(IROutlinerPass()); 1228 1229 // Merge functions if requested. 1230 if (PTO.MergeFunctions) 1231 MPM.addPass(MergeFunctionsPass()); 1232 1233 if (PTO.CallGraphProfile) 1234 MPM.addPass(CGProfilePass()); 1235 1236 // Now we need to do some global optimization transforms. 1237 // FIXME: It would seem like these should come first in the optimization 1238 // pipeline and maybe be the bottom of the canonicalization pipeline? Weird 1239 // ordering here. 1240 MPM.addPass(GlobalDCEPass()); 1241 MPM.addPass(ConstantMergePass()); 1242 1243 // TODO: Relative look table converter pass caused an issue when full lto is 1244 // enabled. See https://reviews.llvm.org/D94355 for more details. 1245 // Until the issue fixed, disable this pass during pre-linking phase. 1246 if (!LTOPreLink) 1247 MPM.addPass(RelLookupTableConverterPass()); 1248 1249 return MPM; 1250 } 1251 1252 ModulePassManager 1253 PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, 1254 bool LTOPreLink) { 1255 assert(Level != OptimizationLevel::O0 && 1256 "Must request optimizations for the default pipeline!"); 1257 1258 ModulePassManager MPM; 1259 1260 // Convert @llvm.global.annotations to !annotation metadata. 1261 MPM.addPass(Annotation2MetadataPass()); 1262 1263 // Force any function attributes we want the rest of the pipeline to observe. 1264 MPM.addPass(ForceFunctionAttrsPass()); 1265 1266 // Apply module pipeline start EP callback. 1267 for (auto &C : PipelineStartEPCallbacks) 1268 C(MPM, Level); 1269 1270 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1271 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1272 1273 // Add the core simplification pipeline. 
1274 MPM.addPass(buildModuleSimplificationPipeline( 1275 Level, LTOPreLink ? ThinOrFullLTOPhase::FullLTOPreLink 1276 : ThinOrFullLTOPhase::None)); 1277 1278 // Now add the optimization pipeline. 1279 MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPreLink)); 1280 1281 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1282 PGOOpt->Action == PGOOptions::SampleUse) 1283 MPM.addPass(PseudoProbeUpdatePass()); 1284 1285 // Emit annotation remarks. 1286 addAnnotationRemarksPass(MPM); 1287 1288 if (LTOPreLink) 1289 addRequiredLTOPreLinkPasses(MPM); 1290 1291 return MPM; 1292 } 1293 1294 ModulePassManager 1295 PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1296 assert(Level != OptimizationLevel::O0 && 1297 "Must request optimizations for the default pipeline!"); 1298 1299 ModulePassManager MPM; 1300 1301 // Convert @llvm.global.annotations to !annotation metadata. 1302 MPM.addPass(Annotation2MetadataPass()); 1303 1304 // Force any function attributes we want the rest of the pipeline to observe. 1305 MPM.addPass(ForceFunctionAttrsPass()); 1306 1307 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1308 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1309 1310 // Apply module pipeline start EP callback. 1311 for (auto &C : PipelineStartEPCallbacks) 1312 C(MPM, Level); 1313 1314 // If we are planning to perform ThinLTO later, we don't bloat the code with 1315 // unrolling/vectorization/... now. Just simplify the module as much as we 1316 // can. 1317 MPM.addPass(buildModuleSimplificationPipeline( 1318 Level, ThinOrFullLTOPhase::ThinLTOPreLink)); 1319 1320 // Run partial inlining pass to partially inline functions that have 1321 // large bodies. 1322 // FIXME: It isn't clear whether this is really the right place to run this 1323 // in ThinLTO. 
Because there is another canonicalization and simplification 1324 // phase that will run after the thin link, running this here ends up with 1325 // less information than will be available later and it may grow functions in 1326 // ways that aren't beneficial. 1327 if (RunPartialInlining) 1328 MPM.addPass(PartialInlinerPass()); 1329 1330 // Reduce the size of the IR as much as possible. 1331 MPM.addPass(GlobalOptPass()); 1332 1333 // Module simplification splits coroutines, but does not fully clean up 1334 // coroutine intrinsics. To ensure ThinLTO optimization passes don't trip up 1335 // on these, we schedule the cleanup here. 1336 MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass())); 1337 1338 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1339 PGOOpt->Action == PGOOptions::SampleUse) 1340 MPM.addPass(PseudoProbeUpdatePass()); 1341 1342 // Handle OptimizerLastEPCallbacks added by clang on PreLink. Actual 1343 // optimization is going to be done in PostLink stage, but clang can't 1344 // add callbacks there in case of in-process ThinLTO called by linker. 1345 for (auto &C : OptimizerLastEPCallbacks) 1346 C(MPM, Level); 1347 1348 // Emit annotation remarks. 1349 addAnnotationRemarksPass(MPM); 1350 1351 addRequiredLTOPreLinkPasses(MPM); 1352 1353 return MPM; 1354 } 1355 1356 ModulePassManager PassBuilder::buildThinLTODefaultPipeline( 1357 OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) { 1358 ModulePassManager MPM; 1359 1360 // Convert @llvm.global.annotations to !annotation metadata. 1361 MPM.addPass(Annotation2MetadataPass()); 1362 1363 if (ImportSummary) { 1364 // These passes import type identifier resolutions for whole-program 1365 // devirtualization and CFI. They must run early because other passes may 1366 // disturb the specific instruction patterns that these passes look for, 1367 // creating dependencies on resolutions that may not appear in the summary. 
1368 // 1369 // For example, GVN may transform the pattern assume(type.test) appearing in 1370 // two basic blocks into assume(phi(type.test, type.test)), which would 1371 // transform a dependency on a WPD resolution into a dependency on a type 1372 // identifier resolution for CFI. 1373 // 1374 // Also, WPD has access to more precise information than ICP and can 1375 // devirtualize more effectively, so it should operate on the IR first. 1376 // 1377 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1378 // metadata and intrinsics. 1379 MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary)); 1380 MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary)); 1381 } 1382 1383 if (Level == OptimizationLevel::O0) { 1384 // Run a second time to clean up any type tests left behind by WPD for use 1385 // in ICP. 1386 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1387 // Drop available_externally and unreferenced globals. This is necessary 1388 // with ThinLTO in order to avoid leaving undefined references to dead 1389 // globals in the object file. 1390 MPM.addPass(EliminateAvailableExternallyPass()); 1391 MPM.addPass(GlobalDCEPass()); 1392 return MPM; 1393 } 1394 1395 // Force any function attributes we want the rest of the pipeline to observe. 1396 MPM.addPass(ForceFunctionAttrsPass()); 1397 1398 // Add the core simplification pipeline. 1399 MPM.addPass(buildModuleSimplificationPipeline( 1400 Level, ThinOrFullLTOPhase::ThinLTOPostLink)); 1401 1402 // Now add the optimization pipeline. 1403 MPM.addPass(buildModuleOptimizationPipeline(Level)); 1404 1405 // Emit annotation remarks. 1406 addAnnotationRemarksPass(MPM); 1407 1408 return MPM; 1409 } 1410 1411 ModulePassManager 1412 PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1413 assert(Level != OptimizationLevel::O0 && 1414 "Must request optimizations for the default pipeline!"); 1415 // FIXME: We should use a customized pre-link pipeline! 
1416 return buildPerModuleDefaultPipeline(Level, 1417 /* LTOPreLink */ true); 1418 } 1419 1420 ModulePassManager 1421 PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, 1422 ModuleSummaryIndex *ExportSummary) { 1423 ModulePassManager MPM; 1424 1425 // Convert @llvm.global.annotations to !annotation metadata. 1426 MPM.addPass(Annotation2MetadataPass()); 1427 1428 // Create a function that performs CFI checks for cross-DSO calls with targets 1429 // in the current module. 1430 MPM.addPass(CrossDSOCFIPass()); 1431 1432 if (Level == OptimizationLevel::O0) { 1433 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1434 // metadata and intrinsics. 1435 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1436 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1437 // Run a second time to clean up any type tests left behind by WPD for use 1438 // in ICP. 1439 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1440 1441 // Emit annotation remarks. 1442 addAnnotationRemarksPass(MPM); 1443 1444 return MPM; 1445 } 1446 1447 if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) { 1448 // Load sample profile before running the LTO optimization pipeline. 1449 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 1450 PGOOpt->ProfileRemappingFile, 1451 ThinOrFullLTOPhase::FullLTOPostLink)); 1452 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 1453 // RequireAnalysisPass for PSI before subsequent non-module passes. 1454 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 1455 } 1456 1457 // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present. 1458 MPM.addPass(OpenMPOptPass()); 1459 1460 // Remove unused virtual tables to improve the quality of code generated by 1461 // whole-program devirtualization and bitset lowering. 1462 MPM.addPass(GlobalDCEPass()); 1463 1464 // Force any function attributes we want the rest of the pipeline to observe. 
1465 MPM.addPass(ForceFunctionAttrsPass()); 1466 1467 // Do basic inference of function attributes from known properties of system 1468 // libraries and other oracles. 1469 MPM.addPass(InferFunctionAttrsPass()); 1470 1471 if (Level.getSpeedupLevel() > 1) { 1472 FunctionPassManager EarlyFPM; 1473 EarlyFPM.addPass(CallSiteSplittingPass()); 1474 MPM.addPass(createModuleToFunctionPassAdaptor( 1475 std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses)); 1476 1477 // Indirect call promotion. This should promote all the targets that are 1478 // left by the earlier promotion pass that promotes intra-module targets. 1479 // This two-step promotion is to save the compile time. For LTO, it should 1480 // produce the same result as if we only do promotion here. 1481 MPM.addPass(PGOIndirectCallPromotion( 1482 true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); 1483 1484 if (EnableFunctionSpecialization && Level == OptimizationLevel::O3) 1485 MPM.addPass(FunctionSpecializationPass()); 1486 // Propagate constants at call sites into the functions they call. This 1487 // opens opportunities for globalopt (and inlining) by substituting function 1488 // pointers passed as arguments to direct uses of functions. 1489 MPM.addPass(IPSCCPPass()); 1490 1491 // Attach metadata to indirect call sites indicating the set of functions 1492 // they may target at run-time. This should follow IPSCCP. 1493 MPM.addPass(CalledValuePropagationPass()); 1494 } 1495 1496 // Now deduce any function attributes based in the current code. 1497 MPM.addPass( 1498 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1499 1500 // Do RPO function attribute inference across the module to forward-propagate 1501 // attributes where applicable. 1502 // FIXME: Is this really an optimization rather than a canonicalization? 1503 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1504 1505 // Use in-range annotations on GEP indices to split globals where beneficial. 
1506 MPM.addPass(GlobalSplitPass()); 1507 1508 // Run whole program optimization of virtual call when the list of callees 1509 // is fixed. 1510 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1511 1512 // Stop here at -O1. 1513 if (Level == OptimizationLevel::O1) { 1514 // The LowerTypeTestsPass needs to run to lower type metadata and the 1515 // type.test intrinsics. The pass does nothing if CFI is disabled. 1516 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1517 // Run a second time to clean up any type tests left behind by WPD for use 1518 // in ICP (which is performed earlier than this in the regular LTO 1519 // pipeline). 1520 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1521 1522 // Emit annotation remarks. 1523 addAnnotationRemarksPass(MPM); 1524 1525 return MPM; 1526 } 1527 1528 // Optimize globals to try and fold them into constants. 1529 MPM.addPass(GlobalOptPass()); 1530 1531 // Promote any localized globals to SSA registers. 1532 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); 1533 1534 // Linking modules together can lead to duplicate global constant, only 1535 // keep one copy of each constant. 1536 MPM.addPass(ConstantMergePass()); 1537 1538 // Remove unused arguments from functions. 1539 MPM.addPass(DeadArgumentEliminationPass()); 1540 1541 // Reduce the code after globalopt and ipsccp. Both can open up significant 1542 // simplification opportunities, and both can propagate functions through 1543 // function pointers. When this happens, we often have to resolve varargs 1544 // calls, etc, so let instcombine do this. 
1545 FunctionPassManager PeepholeFPM; 1546 PeepholeFPM.addPass(InstCombinePass()); 1547 if (Level == OptimizationLevel::O3) 1548 PeepholeFPM.addPass(AggressiveInstCombinePass()); 1549 invokePeepholeEPCallbacks(PeepholeFPM, Level); 1550 1551 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM), 1552 PTO.EagerlyInvalidateAnalyses)); 1553 1554 // Note: historically, the PruneEH pass was run first to deduce nounwind and 1555 // generally clean up exception handling overhead. It isn't clear this is 1556 // valuable as the inliner doesn't currently care whether it is inlining an 1557 // invoke or a call. 1558 // Run the inliner now. 1559 MPM.addPass(ModuleInlinerWrapperPass(getInlineParamsFromOptLevel(Level))); 1560 1561 // Optimize globals again after we ran the inliner. 1562 MPM.addPass(GlobalOptPass()); 1563 1564 // Garbage collect dead functions. 1565 MPM.addPass(GlobalDCEPass()); 1566 1567 // If we didn't decide to inline a function, check to see if we can 1568 // transform it to pass arguments by value instead of by reference. 1569 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass())); 1570 1571 FunctionPassManager FPM; 1572 // The IPO Passes may leave cruft around. Clean up after them. 1573 FPM.addPass(InstCombinePass()); 1574 invokePeepholeEPCallbacks(FPM, Level); 1575 1576 FPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); 1577 1578 // Do a post inline PGO instrumentation and use pass. This is a context 1579 // sensitive PGO pass. 
1580 if (PGOOpt) { 1581 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1582 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, 1583 /* IsCS */ true, PGOOpt->CSProfileGenFile, 1584 PGOOpt->ProfileRemappingFile); 1585 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1586 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, 1587 /* IsCS */ true, PGOOpt->ProfileFile, 1588 PGOOpt->ProfileRemappingFile); 1589 } 1590 1591 // Break up allocas 1592 FPM.addPass(SROAPass()); 1593 1594 // LTO provides additional opportunities for tailcall elimination due to 1595 // link-time inlining, and visibility of nocapture attribute. 1596 FPM.addPass(TailCallElimPass()); 1597 1598 // Run a few AA driver optimizations here and now to cleanup the code. 1599 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), 1600 PTO.EagerlyInvalidateAnalyses)); 1601 1602 MPM.addPass( 1603 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1604 1605 // Require the GlobalsAA analysis for the module so we can query it within 1606 // MainFPM. 1607 MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>()); 1608 // Invalidate AAManager so it can be recreated and pick up the newly available 1609 // GlobalsAA. 1610 MPM.addPass( 1611 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 1612 1613 FunctionPassManager MainFPM; 1614 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1615 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), 1616 /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 1617 1618 if (RunNewGVN) 1619 MainFPM.addPass(NewGVNPass()); 1620 else 1621 MainFPM.addPass(GVNPass()); 1622 1623 // Remove dead memcpy()'s. 1624 MainFPM.addPass(MemCpyOptPass()); 1625 1626 // Nuke dead stores. 
1627 MainFPM.addPass(DSEPass()); 1628 MainFPM.addPass(MergedLoadStoreMotionPass()); 1629 1630 1631 if (EnableConstraintElimination) 1632 MainFPM.addPass(ConstraintEliminationPass()); 1633 1634 LoopPassManager LPM; 1635 if (EnableLoopFlatten && Level.getSpeedupLevel() > 1) 1636 LPM.addPass(LoopFlattenPass()); 1637 LPM.addPass(IndVarSimplifyPass()); 1638 LPM.addPass(LoopDeletionPass()); 1639 // FIXME: Add loop interchange. 1640 1641 // Unroll small loops and perform peeling. 1642 LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 1643 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 1644 PTO.ForgetAllSCEVInLoopUnroll)); 1645 // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA. 1646 // *All* loop passes must preserve it, in order to be able to use it. 1647 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1648 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true)); 1649 1650 MainFPM.addPass(LoopDistributePass()); 1651 1652 addVectorPasses(Level, MainFPM, /* IsFullLTO */ true); 1653 1654 // Run the OpenMPOpt CGSCC pass again late. 1655 MPM.addPass( 1656 createModuleToPostOrderCGSCCPassAdaptor(OpenMPOptCGSCCPass())); 1657 1658 invokePeepholeEPCallbacks(MainFPM, Level); 1659 MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); 1660 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM), 1661 PTO.EagerlyInvalidateAnalyses)); 1662 1663 // Lower type metadata and the type.test intrinsic. This pass supports 1664 // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs 1665 // to be run at link time if CFI is enabled. This pass does nothing if 1666 // CFI is disabled. 1667 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1668 // Run a second time to clean up any type tests left behind by WPD for use 1669 // in ICP (which is performed earlier than this in the regular LTO pipeline). 
1670 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1671 1672 // Enable splitting late in the FullLTO post-link pipeline. This is done in 1673 // the same stage in the old pass manager (\ref addLateLTOOptimizationPasses). 1674 if (EnableHotColdSplit) 1675 MPM.addPass(HotColdSplittingPass()); 1676 1677 // Add late LTO optimization passes. 1678 // Delete basic blocks, which optimization passes may have killed. 1679 MPM.addPass(createModuleToFunctionPassAdaptor( 1680 SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)))); 1681 1682 // Drop bodies of available eternally objects to improve GlobalDCE. 1683 MPM.addPass(EliminateAvailableExternallyPass()); 1684 1685 // Now that we have optimized the program, discard unreachable functions. 1686 MPM.addPass(GlobalDCEPass()); 1687 1688 if (PTO.MergeFunctions) 1689 MPM.addPass(MergeFunctionsPass()); 1690 1691 // Emit annotation remarks. 1692 addAnnotationRemarksPass(MPM); 1693 1694 return MPM; 1695 } 1696 1697 ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, 1698 bool LTOPreLink) { 1699 assert(Level == OptimizationLevel::O0 && 1700 "buildO0DefaultPipeline should only be used with O0"); 1701 1702 ModulePassManager MPM; 1703 1704 // Perform pseudo probe instrumentation in O0 mode. This is for the 1705 // consistency between different build modes. For example, a LTO build can be 1706 // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in 1707 // the postlink will require pseudo probe instrumentation in the prelink. 
1708 if (PGOOpt && PGOOpt->PseudoProbeForProfiling) 1709 MPM.addPass(SampleProfileProbePass(TM)); 1710 1711 if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr || 1712 PGOOpt->Action == PGOOptions::IRUse)) 1713 addPGOInstrPassesForO0( 1714 MPM, 1715 /* RunProfileGen */ (PGOOpt->Action == PGOOptions::IRInstr), 1716 /* IsCS */ false, PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile); 1717 1718 for (auto &C : PipelineStartEPCallbacks) 1719 C(MPM, Level); 1720 1721 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1722 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1723 1724 for (auto &C : PipelineEarlySimplificationEPCallbacks) 1725 C(MPM, Level); 1726 1727 // Build a minimal pipeline based on the semantics required by LLVM, 1728 // which is just that always inlining occurs. Further, disable generating 1729 // lifetime intrinsics to avoid enabling further optimizations during 1730 // code generation. 1731 MPM.addPass(AlwaysInlinerPass( 1732 /*InsertLifetimeIntrinsics=*/false)); 1733 1734 if (PTO.MergeFunctions) 1735 MPM.addPass(MergeFunctionsPass()); 1736 1737 if (EnableMatrix) 1738 MPM.addPass( 1739 createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true))); 1740 1741 if (!CGSCCOptimizerLateEPCallbacks.empty()) { 1742 CGSCCPassManager CGPM; 1743 for (auto &C : CGSCCOptimizerLateEPCallbacks) 1744 C(CGPM, Level); 1745 if (!CGPM.isEmpty()) 1746 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 1747 } 1748 if (!LateLoopOptimizationsEPCallbacks.empty()) { 1749 LoopPassManager LPM; 1750 for (auto &C : LateLoopOptimizationsEPCallbacks) 1751 C(LPM, Level); 1752 if (!LPM.isEmpty()) { 1753 MPM.addPass(createModuleToFunctionPassAdaptor( 1754 createFunctionToLoopPassAdaptor(std::move(LPM)))); 1755 } 1756 } 1757 if (!LoopOptimizerEndEPCallbacks.empty()) { 1758 LoopPassManager LPM; 1759 for (auto &C : LoopOptimizerEndEPCallbacks) 1760 C(LPM, Level); 1761 if (!LPM.isEmpty()) { 1762 MPM.addPass(createModuleToFunctionPassAdaptor( 
1763 createFunctionToLoopPassAdaptor(std::move(LPM)))); 1764 } 1765 } 1766 if (!ScalarOptimizerLateEPCallbacks.empty()) { 1767 FunctionPassManager FPM; 1768 for (auto &C : ScalarOptimizerLateEPCallbacks) 1769 C(FPM, Level); 1770 if (!FPM.isEmpty()) 1771 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 1772 } 1773 if (!VectorizerStartEPCallbacks.empty()) { 1774 FunctionPassManager FPM; 1775 for (auto &C : VectorizerStartEPCallbacks) 1776 C(FPM, Level); 1777 if (!FPM.isEmpty()) 1778 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 1779 } 1780 1781 MPM.addPass(createModuleToFunctionPassAdaptor(CoroEarlyPass())); 1782 CGSCCPassManager CGPM; 1783 CGPM.addPass(CoroSplitPass()); 1784 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 1785 MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass())); 1786 1787 for (auto &C : OptimizerLastEPCallbacks) 1788 C(MPM, Level); 1789 1790 if (LTOPreLink) 1791 addRequiredLTOPreLinkPasses(MPM); 1792 1793 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 1794 1795 return MPM; 1796 } 1797 1798 AAManager PassBuilder::buildDefaultAAPipeline() { 1799 AAManager AA; 1800 1801 // The order in which these are registered determines their priority when 1802 // being queried. 1803 1804 // First we register the basic alias analysis that provides the majority of 1805 // per-function local AA logic. This is a stateless, on-demand local set of 1806 // AA techniques. 1807 AA.registerFunctionAnalysis<BasicAA>(); 1808 1809 // Next we query fast, specialized alias analyses that wrap IR-embedded 1810 // information about aliasing. 1811 AA.registerFunctionAnalysis<ScopedNoAliasAA>(); 1812 AA.registerFunctionAnalysis<TypeBasedAA>(); 1813 1814 // Add support for querying global aliasing information when available. 
1815 // Because the `AAManager` is a function analysis and `GlobalsAA` is a module 1816 // analysis, all that the `AAManager` can do is query for any *cached* 1817 // results from `GlobalsAA` through a readonly proxy. 1818 AA.registerModuleAnalysis<GlobalsAA>(); 1819 1820 // Add target-specific alias analyses. 1821 if (TM) 1822 TM->registerDefaultAliasAnalyses(AA); 1823 1824 return AA; 1825 } 1826