1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass generates an entirely new schedule tree from the data dependences 10 // and iteration domains. The new schedule tree is computed in two steps: 11 // 12 // 1) The isl scheduling optimizer is run 13 // 14 // The isl scheduling optimizer creates a new schedule tree that maximizes 15 // parallelism and tileability and minimizes data-dependence distances. The 16 // algorithm used is a modified version of the ``Pluto'' algorithm: 17 // 18 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 19 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 20 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 21 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 22 // 23 // 2) A set of post-scheduling transformations is applied on the schedule tree. 24 // 25 // These optimizations include: 26 // 27 // - Tiling of the innermost tilable bands 28 // - Prevectorization - The choice of a possible outer loop that is strip-mined 29 // to the innermost level to enable inner-loop 30 // vectorization. 31 // - Some optimizations for spatial locality are also planned. 32 // 33 // For a detailed description of the schedule tree itself please see section 6 34 // of: 35 // 36 // Polyhedral AST generation is more than scanning polyhedra 37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen 38 // ACM Transactions on Programming Languages and Systems (TOPLAS), 39 // 37(4), July 2015 40 // http://www.grosser.es/#pub-polyhedral-AST-generation 41 // 42 // This publication also contains a detailed discussion of the different options 43 // for polyhedral loop unrolling, full/partial tile separation and other uses 44 // of the schedule tree. 45 // 46 //===----------------------------------------------------------------------===// 47 48 #include "polly/ScheduleOptimizer.h" 49 #include "polly/CodeGen/CodeGeneration.h" 50 #include "polly/DependenceInfo.h" 51 #include "polly/ManualOptimizer.h" 52 #include "polly/MatmulOptimizer.h" 53 #include "polly/Options.h" 54 #include "polly/ScheduleTreeTransform.h" 55 #include "polly/Support/ISLOStream.h" 56 #include "polly/Support/ISLTools.h" 57 #include "llvm/ADT/Sequence.h" 58 #include "llvm/ADT/Statistic.h" 59 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 60 #include "llvm/InitializePasses.h" 61 #include "llvm/Support/CommandLine.h" 62 #include "isl/options.h" 63 64 using namespace llvm; 65 using namespace polly; 66 67 namespace llvm { 68 class Loop; 69 class Module; 70 } // namespace llvm 71 72 #include "polly/Support/PollyDebug.h" 73 #define DEBUG_TYPE "polly-opt-isl" 74 75 static cl::opt<std::string> 76 OptimizeDeps("polly-opt-optimize-only", 77 cl::desc("Only a certain kind of dependences (all/raw)"), 78 cl::Hidden, cl::init("all"), cl::cat(PollyCategory)); 79 80 static cl::opt<std::string> 81 SimplifyDeps("polly-opt-simplify-deps", 82 cl::desc("Dependences should be simplified (yes/no)"), 83 cl::Hidden, cl::init("yes"), cl::cat(PollyCategory)); 84 85 static cl::opt<int> MaxConstantTerm( 86 "polly-opt-max-constant-term", 87 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 88 cl::init(20), cl::cat(PollyCategory)); 89 90 static cl::opt<int> MaxCoefficient( 91 "polly-opt-max-coefficient", 92 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 93 cl::init(20), cl::cat(PollyCategory)); 94 95 static cl::opt<std::string> 96 MaximizeBandDepth("polly-opt-maximize-bands", 97 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 98 cl::init("yes"), cl::cat(PollyCategory)); 99 100 static cl::opt<int> 101 ScheduleComputeOut("polly-schedule-computeout", 102 cl::desc("Bound the scheduler by maximal amount" 103 "of computational steps. "), 104 cl::Hidden, cl::init(300000), cl::ZeroOrMore, 105 cl::cat(PollyCategory)); 106 107 static cl::opt<bool> 108 GreedyFusion("polly-loopfusion-greedy", 109 cl::desc("Aggressively try to fuse everything"), cl::Hidden, 110 cl::cat(PollyCategory)); 111 112 static cl::opt<std::string> OuterCoincidence( 113 "polly-opt-outer-coincidence", 114 cl::desc("Try to construct schedules where the outer member of each band " 115 "satisfies the coincidence constraints (yes/no)"), 116 cl::Hidden, cl::init("no"), cl::cat(PollyCategory)); 117 118 static cl::opt<int> PrevectorWidth( 119 "polly-prevect-width", 120 cl::desc( 121 "The number of loop iterations to strip-mine for pre-vectorization"), 122 cl::Hidden, cl::init(4), cl::cat(PollyCategory)); 123 124 static cl::opt<bool> FirstLevelTiling("polly-tiling", 125 cl::desc("Enable loop tiling"), 126 cl::init(true), cl::cat(PollyCategory)); 127 128 static cl::opt<int> FirstLevelDefaultTileSize( 129 "polly-default-tile-size", 130 cl::desc("The default tile size (if not enough were provided by" 131 " --polly-tile-sizes)"), 132 cl::Hidden, cl::init(32), cl::cat(PollyCategory)); 133 134 static cl::list<int> 135 FirstLevelTileSizes("polly-tile-sizes", 136 cl::desc("A tile size for each loop dimension, filled " 137 "with --polly-default-tile-size"), 138 cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory)); 139 140 static cl::opt<bool> 141 SecondLevelTiling("polly-2nd-level-tiling", 142 cl::desc("Enable a 2nd level loop of loop tiling"), 143 cl::cat(PollyCategory)); 144 145 static cl::opt<int> SecondLevelDefaultTileSize( 146 "polly-2nd-level-default-tile-size", 147 cl::desc("The default 2nd-level tile size (if not enough were provided by" 148 " --polly-2nd-level-tile-sizes)"), 149 cl::Hidden, cl::init(16), cl::cat(PollyCategory)); 150 151 static cl::list<int> 152 SecondLevelTileSizes("polly-2nd-level-tile-sizes", 153 cl::desc("A tile size for each loop dimension, filled " 154 "with --polly-default-tile-size"), 155 cl::Hidden, cl::CommaSeparated, 156 cl::cat(PollyCategory)); 157 158 static cl::opt<bool> RegisterTiling("polly-register-tiling", 159 cl::desc("Enable register tiling"), 160 cl::cat(PollyCategory)); 161 162 static cl::opt<int> RegisterDefaultTileSize( 163 "polly-register-tiling-default-tile-size", 164 cl::desc("The default register tile size (if not enough were provided by" 165 " --polly-register-tile-sizes)"), 166 cl::Hidden, cl::init(2), cl::cat(PollyCategory)); 167 168 static cl::list<int> 169 RegisterTileSizes("polly-register-tile-sizes", 170 cl::desc("A tile size for each loop dimension, filled " 171 "with --polly-register-tile-size"), 172 cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory)); 173 174 static cl::opt<bool> PragmaBasedOpts( 175 "polly-pragma-based-opts", 176 cl::desc("Apply user-directed transformation from metadata"), 177 cl::init(true), cl::cat(PollyCategory)); 178 179 static cl::opt<bool> EnableReschedule("polly-reschedule", 180 cl::desc("Optimize SCoPs using ISL"), 181 cl::init(true), cl::cat(PollyCategory)); 182 183 static cl::opt<bool> 184 PMBasedOpts("polly-pattern-matching-based-opts", 185 cl::desc("Perform optimizations based on pattern matching"), 186 cl::init(true), cl::cat(PollyCategory)); 187 188 static cl::opt<bool> 189 EnablePostopts("polly-postopts", 190 cl::desc("Apply post-rescheduling optimizations such as " 191 "tiling (requires -polly-reschedule)"), 192 cl::init(true), cl::cat(PollyCategory)); 193 194 static cl::opt<bool> OptimizedScops( 195 "polly-optimized-scops", 196 cl::desc("Polly - Dump polyhedral description of Scops optimized with " 197 "the isl scheduling optimizer and the set of post-scheduling " 198 "transformations is applied on the schedule tree"), 199 cl::cat(PollyCategory)); 200 201 STATISTIC(ScopsProcessed, "Number of scops processed"); 202 STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); 203 STATISTIC(ScopsOptimized, "Number of scops optimized"); 204 205 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized"); 206 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized"); 207 208 #define THREE_STATISTICS(VARNAME, DESC) \ 209 static Statistic VARNAME[3] = { \ 210 {DEBUG_TYPE, #VARNAME "0", DESC " (original)"}, \ 211 {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"}, \ 212 {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}} 213 214 THREE_STATISTICS(NumBands, "Number of bands"); 215 THREE_STATISTICS(NumBandMembers, "Number of band members"); 216 THREE_STATISTICS(NumCoincident, "Number of coincident band members"); 217 THREE_STATISTICS(NumPermutable, "Number of permutable bands"); 218 THREE_STATISTICS(NumFilters, "Number of filter nodes"); 219 THREE_STATISTICS(NumExtension, "Number of extension nodes"); 220 221 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied"); 222 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied"); 223 STATISTIC(RegisterTileOpts, "Number of register tiling applied"); 224 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied"); 225 STATISTIC(MatMulOpts, 226 "Number of matrix multiplication patterns detected and optimized"); 227 228 namespace { 229 /// Additional parameters of the schedule optimizer. 230 /// 231 /// Target Transform Info and the SCoP dependencies used by the schedule 232 /// optimizer. 233 struct OptimizerAdditionalInfoTy { 234 const llvm::TargetTransformInfo *TTI; 235 const Dependences *D; 236 bool PatternOpts; 237 bool Postopts; 238 bool Prevect; 239 bool &DepsChanged; 240 }; 241 242 class ScheduleTreeOptimizer final { 243 public: 244 /// Apply schedule tree transformations. 245 /// 246 /// This function takes an (possibly already optimized) schedule tree and 247 /// applies a set of additional optimizations on the schedule tree. The 248 /// transformations applied include: 249 /// 250 /// - Pattern-based optimizations 251 /// - Tiling 252 /// - Prevectorization 253 /// 254 /// @param Schedule The schedule object the transformations will be applied 255 /// to. 256 /// @param OAI Target Transform Info and the SCoP dependencies. 257 /// @returns The transformed schedule. 258 static isl::schedule 259 optimizeSchedule(isl::schedule Schedule, 260 const OptimizerAdditionalInfoTy *OAI = nullptr); 261 262 /// Apply schedule tree transformations. 263 /// 264 /// This function takes a node in an (possibly already optimized) schedule 265 /// tree and applies a set of additional optimizations on this schedule tree 266 /// node and its descendants. The transformations applied include: 267 /// 268 /// - Pattern-based optimizations 269 /// - Tiling 270 /// - Prevectorization 271 /// 272 /// @param Node The schedule object post-transformations will be applied to. 273 /// @param OAI Target Transform Info and the SCoP dependencies. 274 /// @returns The transformed schedule. 275 static isl::schedule_node 276 optimizeScheduleNode(isl::schedule_node Node, 277 const OptimizerAdditionalInfoTy *OAI = nullptr); 278 279 /// Decide if the @p NewSchedule is profitable for @p S. 280 /// 281 /// @param S The SCoP we optimize. 282 /// @param NewSchedule The new schedule we computed. 283 /// 284 /// @return True, if we believe @p NewSchedule is an improvement for @p S. 285 static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule); 286 287 /// Isolate a set of partial tile prefixes. 288 /// 289 /// This set should ensure that it contains only partial tile prefixes that 290 /// have exactly VectorWidth iterations. 291 /// 292 /// @param Node A schedule node band, which is a parent of a band node, 293 /// that contains a vector loop. 294 /// @return Modified isl_schedule_node. 295 static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node, 296 int VectorWidth); 297 298 private: 299 /// Check if this node is a band node we want to tile. 300 /// 301 /// We look for innermost band nodes where individual dimensions are marked as 302 /// permutable. 303 /// 304 /// @param Node The node to check. 305 static bool isTileableBandNode(isl::schedule_node Node); 306 307 /// Check if this node is a band node we want to transform using pattern 308 /// matching. 309 /// 310 /// We look for innermost band nodes where individual dimensions are marked as 311 /// permutable. There is no restriction on the number of individual 312 /// dimensions. 313 /// 314 /// @param Node The node to check. 315 static bool isPMOptimizableBandNode(isl::schedule_node Node); 316 317 /// Pre-vectorizes one scheduling dimension of a schedule band. 318 /// 319 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and 320 /// sinks the resulting point loop. 321 /// 322 /// Example (DimToVectorize=0, VectorWidth=4): 323 /// 324 /// | Before transformation: 325 /// | 326 /// | A[i,j] -> [i,j] 327 /// | 328 /// | for (i = 0; i < 128; i++) 329 /// | for (j = 0; j < 128; j++) 330 /// | A(i,j); 331 /// 332 /// | After transformation: 333 /// | 334 /// | for (it = 0; it < 32; it+=1) 335 /// | for (j = 0; j < 128; j++) 336 /// | for (ip = 0; ip <= 3; ip++) 337 /// | A(4 * it + ip,j); 338 /// 339 /// The goal of this transformation is to create a trivially vectorizable 340 /// loop. This means a parallel loop at the innermost level that has a 341 /// constant number of iterations corresponding to the target vector width. 342 /// 343 /// This transformation creates a loop at the innermost level. The loop has 344 /// a constant number of iterations, if the number of loop iterations at 345 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is 346 /// currently constant and not yet target specific. This function does not 347 /// reason about parallelism. 348 static isl::schedule_node prevectSchedBand(isl::schedule_node Node, 349 unsigned DimToVectorize, 350 int VectorWidth); 351 352 /// Apply additional optimizations on the bands in the schedule tree. 353 /// 354 /// We are looking for an innermost band node and apply the following 355 /// transformations: 356 /// 357 /// - Tile the band 358 /// - if the band is tileable 359 /// - if the band has more than one loop dimension 360 /// 361 /// - Prevectorize the schedule of the band (or the point loop in case of 362 /// tiling). 363 /// - if vectorization is enabled 364 /// 365 /// @param Node The schedule node to (possibly) optimize. 366 /// @param User A pointer to forward some use information 367 /// (currently unused). 368 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); 369 370 /// Apply tiling optimizations on the bands in the schedule tree. 371 /// 372 /// @param Node The schedule node to (possibly) optimize. 373 static isl::schedule_node applyTileBandOpt(isl::schedule_node Node); 374 375 /// Apply prevectorization on the bands in the schedule tree. 376 /// 377 /// @param Node The schedule node to (possibly) prevectorize. 378 static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node); 379 }; 380 381 isl::schedule_node 382 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, 383 int VectorWidth) { 384 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 385 Node = Node.child(0).child(0); 386 isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation(); 387 isl::union_set ScheduleRangeUSet = SchedRelUMap.range(); 388 isl::set ScheduleRange{ScheduleRangeUSet}; 389 isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); 390 auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic"); 391 isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); 392 Node = Node.parent().parent(); 393 isl::union_set Options = IsolateOption.unite(AtomicOption); 394 isl::schedule_node_band Result = 395 Node.as<isl::schedule_node_band>().set_ast_build_options(Options); 396 return Result; 397 } 398 399 struct InsertSimdMarkers final : ScheduleNodeRewriter<InsertSimdMarkers> { 400 isl::schedule_node visitBand(isl::schedule_node_band Band) { 401 isl::schedule_node Node = visitChildren(Band); 402 403 // Only add SIMD markers to innermost bands. 404 if (!Node.first_child().isa<isl::schedule_node_leaf>()) 405 return Node; 406 407 isl::id LoopMarker = isl::id::alloc(Band.ctx(), "SIMD", nullptr); 408 return Band.insert_mark(LoopMarker); 409 } 410 }; 411 412 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( 413 isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { 414 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 415 416 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 417 unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set)); 418 assert(DimToVectorize < ScheduleDimensions); 419 420 if (DimToVectorize > 0) { 421 Node = isl::manage( 422 isl_schedule_node_band_split(Node.release(), DimToVectorize)); 423 Node = Node.child(0); 424 } 425 if (DimToVectorize < ScheduleDimensions - 1) 426 Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1)); 427 Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 428 auto Sizes = isl::multi_val::zero(Space); 429 Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth)); 430 Node = 431 isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release())); 432 Node = isolateFullPartialTiles(Node, VectorWidth); 433 Node = Node.child(0); 434 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, 435 // we will have troubles to match it in the backend. 436 Node = Node.as<isl::schedule_node_band>().set_ast_build_options( 437 isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }")); 438 439 // Sink the inner loop into the smallest possible statements to make them 440 // represent a single vector instruction if possible. 441 Node = isl::manage(isl_schedule_node_band_sink(Node.release())); 442 443 // Add SIMD markers to those vector statements. 444 InsertSimdMarkers SimdMarkerInserter; 445 Node = SimdMarkerInserter.visit(Node); 446 447 PrevectOpts++; 448 return Node.parent(); 449 } 450 451 static bool isSimpleInnermostBand(const isl::schedule_node &Node) { 452 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 453 assert(isl_schedule_node_n_children(Node.get()) == 1); 454 455 auto ChildType = isl_schedule_node_get_type(Node.child(0).get()); 456 457 if (ChildType == isl_schedule_node_leaf) 458 return true; 459 460 if (ChildType != isl_schedule_node_sequence) 461 return false; 462 463 auto Sequence = Node.child(0); 464 465 for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc; 466 ++c) { 467 auto Child = Sequence.child(c); 468 if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter) 469 return false; 470 if (isl_schedule_node_get_type(Child.child(0).get()) != 471 isl_schedule_node_leaf) 472 return false; 473 } 474 return true; 475 } 476 477 /// Check if this node is a band node, which has only one child. 478 /// 479 /// @param Node The node to check. 480 static bool isOneTimeParentBandNode(isl::schedule_node Node) { 481 if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band) 482 return false; 483 484 if (isl_schedule_node_n_children(Node.get()) != 1) 485 return false; 486 487 return true; 488 } 489 490 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { 491 if (!isOneTimeParentBandNode(Node)) 492 return false; 493 494 if (!isl_schedule_node_band_get_permutable(Node.get())) 495 return false; 496 497 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 498 499 if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u) 500 return false; 501 502 return isSimpleInnermostBand(Node); 503 } 504 505 bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node) { 506 if (!isOneTimeParentBandNode(Node)) 507 return false; 508 509 return Node.child(0).isa<isl::schedule_node_leaf>(); 510 } 511 512 __isl_give isl::schedule_node 513 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { 514 if (FirstLevelTiling) { 515 Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, 516 FirstLevelDefaultTileSize); 517 FirstLevelTileOpts++; 518 } 519 520 if (SecondLevelTiling) { 521 Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, 522 SecondLevelDefaultTileSize); 523 SecondLevelTileOpts++; 524 } 525 526 if (RegisterTiling) { 527 Node = 528 applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize); 529 RegisterTileOpts++; 530 } 531 532 return Node; 533 } 534 535 isl::schedule_node 536 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) { 537 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 538 int Dims = unsignedFromIslSize(Space.dim(isl::dim::set)); 539 540 for (int i = Dims - 1; i >= 0; i--) 541 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) { 542 Node = prevectSchedBand(Node, i, PrevectorWidth); 543 break; 544 } 545 546 return Node; 547 } 548 549 __isl_give isl_schedule_node * 550 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg, 551 void *User) { 552 const OptimizerAdditionalInfoTy *OAI = 553 static_cast<const OptimizerAdditionalInfoTy *>(User); 554 assert(OAI && "Expecting optimization options"); 555 556 isl::schedule_node Node = isl::manage(NodeArg); 557 558 if (OAI->PatternOpts && isPMOptimizableBandNode(Node)) { 559 isl::schedule_node PatternOptimizedSchedule = 560 tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D); 561 if (!PatternOptimizedSchedule.is_null()) { 562 MatMulOpts++; 563 OAI->DepsChanged = true; 564 return PatternOptimizedSchedule.release(); 565 } 566 } 567 568 if (!isTileableBandNode(Node)) 569 return Node.release(); 570 571 if (OAI->Postopts) 572 Node = applyTileBandOpt(Node); 573 574 if (OAI->Prevect) { 575 // FIXME: Prevectorization requirements are different from those checked by 576 // isTileableBandNode. 577 Node = applyPrevectBandOpt(Node); 578 } 579 580 return Node.release(); 581 } 582 583 isl::schedule 584 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule, 585 const OptimizerAdditionalInfoTy *OAI) { 586 auto Root = Schedule.get_root(); 587 Root = optimizeScheduleNode(Root, OAI); 588 return Root.get_schedule(); 589 } 590 591 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode( 592 isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) { 593 Node = isl::manage(isl_schedule_node_map_descendant_bottom_up( 594 Node.release(), optimizeBand, 595 const_cast<void *>(static_cast<const void *>(OAI)))); 596 return Node; 597 } 598 599 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S, 600 isl::schedule NewSchedule) { 601 // To understand if the schedule has been optimized we check if the schedule 602 // has changed at all. 603 // TODO: We can improve this by tracking if any necessarily beneficial 604 // transformations have been performed. This can e.g. be tiling, loop 605 // interchange, or ...) We can track this either at the place where the 606 // transformation has been performed or, in case of automatic ILP based 607 // optimizations, by comparing (yet to be defined) performance metrics 608 // before/after the scheduling optimizer 609 // (e.g., #stride-one accesses) 610 // FIXME: A schedule tree whose union_map-conversion is identical to the 611 // original schedule map may still allow for parallelization, i.e. can still 612 // be profitable. 613 auto NewScheduleMap = NewSchedule.get_map(); 614 auto OldSchedule = S.getSchedule(); 615 assert(!OldSchedule.is_null() && 616 "Only IslScheduleOptimizer can insert extension nodes " 617 "that make Scop::getSchedule() return nullptr."); 618 bool changed = !OldSchedule.is_equal(NewScheduleMap); 619 return changed; 620 } 621 622 class IslScheduleOptimizerWrapperPass final : public ScopPass { 623 public: 624 static char ID; 625 626 explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {} 627 628 /// Optimize the schedule of the SCoP @p S. 629 bool runOnScop(Scop &S) override; 630 631 /// Print the new schedule for the SCoP @p S. 632 void printScop(raw_ostream &OS, Scop &S) const override; 633 634 /// Register all analyses and transformation required. 635 void getAnalysisUsage(AnalysisUsage &AU) const override; 636 637 /// Release the internal memory. 638 void releaseMemory() override { 639 LastSchedule = {}; 640 IslCtx.reset(); 641 } 642 643 private: 644 std::shared_ptr<isl_ctx> IslCtx; 645 isl::schedule LastSchedule; 646 }; 647 648 char IslScheduleOptimizerWrapperPass::ID = 0; 649 650 #ifndef NDEBUG 651 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule, 652 StringRef Desc) { 653 isl::ctx Ctx = Schedule.ctx(); 654 isl_printer *P = isl_printer_to_str(Ctx.get()); 655 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 656 P = isl_printer_print_schedule(P, Schedule.get()); 657 char *Str = isl_printer_get_str(P); 658 OS << Desc << ": \n" << Str << "\n"; 659 free(Str); 660 isl_printer_free(P); 661 } 662 #endif 663 664 /// Collect statistics for the schedule tree. 665 /// 666 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is 667 /// ignored. 668 /// @param Version The version of the schedule tree that is analyzed. 669 /// 0 for the original schedule tree before any transformation. 670 /// 1 for the schedule tree after isl's rescheduling. 671 /// 2 for the schedule tree after optimizations are applied 672 /// (tiling, pattern matching) 673 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { 674 auto Root = Schedule.get_root(); 675 if (Root.is_null()) 676 return; 677 678 isl_schedule_node_foreach_descendant_top_down( 679 Root.get(), 680 [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool { 681 isl::schedule_node Node = isl::manage_copy(nodeptr); 682 int Version = *static_cast<int *>(user); 683 684 switch (isl_schedule_node_get_type(Node.get())) { 685 case isl_schedule_node_band: { 686 NumBands[Version]++; 687 if (isl_schedule_node_band_get_permutable(Node.get()) == 688 isl_bool_true) 689 NumPermutable[Version]++; 690 691 int CountMembers = isl_schedule_node_band_n_member(Node.get()); 692 NumBandMembers[Version] += CountMembers; 693 for (int i = 0; i < CountMembers; i += 1) { 694 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) 695 NumCoincident[Version]++; 696 } 697 break; 698 } 699 700 case isl_schedule_node_filter: 701 NumFilters[Version]++; 702 break; 703 704 case isl_schedule_node_extension: 705 NumExtension[Version]++; 706 break; 707 708 default: 709 break; 710 } 711 712 return isl_bool_true; 713 }, 714 &Version); 715 } 716 717 static void runIslScheduleOptimizer( 718 Scop &S, 719 function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, 720 TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, 721 isl::schedule &LastSchedule, bool &DepsChanged) { 722 // Skip empty SCoPs but still allow code generation as it will delete the 723 // loops present but not needed. 724 if (S.getSize() == 0) { 725 S.markAsOptimized(); 726 return; 727 } 728 729 ScopsProcessed++; 730 731 // Schedule without optimizations. 732 isl::schedule Schedule = S.getScheduleTree(); 733 walkScheduleTreeForStatistics(S.getScheduleTree(), 0); 734 POLLY_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree")); 735 736 bool HasUserTransformation = false; 737 if (PragmaBasedOpts) { 738 isl::schedule ManuallyTransformed = applyManualTransformations( 739 &S, Schedule, GetDeps(Dependences::AL_Statement), ORE); 740 if (ManuallyTransformed.is_null()) { 741 POLLY_DEBUG(dbgs() << "Error during manual optimization\n"); 742 return; 743 } 744 745 if (ManuallyTransformed.get() != Schedule.get()) { 746 // User transformations have precedence over other transformations. 747 HasUserTransformation = true; 748 Schedule = std::move(ManuallyTransformed); 749 POLLY_DEBUG( 750 printSchedule(dbgs(), Schedule, "After manual transformations")); 751 } 752 } 753 754 // Only continue if either manual transformations have been applied or we are 755 // allowed to apply heuristics. 756 // TODO: Detect disabled heuristics and no user-directed transformation 757 // metadata earlier in ScopDetection. 758 if (!HasUserTransformation && S.hasDisableHeuristicsHint()) { 759 POLLY_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n"); 760 return; 761 } 762 763 // Get dependency analysis. 764 const Dependences &D = GetDeps(Dependences::AL_Statement); 765 if (D.getSharedIslCtx() != S.getSharedIslCtx()) { 766 POLLY_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n"); 767 return; 768 } 769 if (!D.hasValidDependences()) { 770 POLLY_DEBUG(dbgs() << "Dependency information not available\n"); 771 return; 772 } 773 774 // Apply ISL's algorithm only if not overridden by the user. Note that 775 // post-rescheduling optimizations (tiling, pattern-based, prevectorization) 776 // rely on the coincidence/permutable annotations on schedule tree bands that 777 // are added by the rescheduling analyzer. Therefore, disabling the 778 // rescheduler implicitly also disables these optimizations. 779 if (!EnableReschedule) { 780 POLLY_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n"); 781 } else if (HasUserTransformation) { 782 POLLY_DEBUG( 783 dbgs() << "Skipping rescheduling due to manual transformation\n"); 784 } else { 785 // Build input data. 786 int ValidityKinds = 787 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 788 int ProximityKinds; 789 790 if (OptimizeDeps == "all") 791 ProximityKinds = 792 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 793 else if (OptimizeDeps == "raw") 794 ProximityKinds = Dependences::TYPE_RAW; 795 else { 796 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 797 << " Falling back to optimizing all dependences.\n"; 798 ProximityKinds = 799 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 800 } 801 802 isl::union_set Domain = S.getDomains(); 803 804 if (Domain.is_null()) 805 return; 806 807 isl::union_map Validity = D.getDependences(ValidityKinds); 808 isl::union_map Proximity = D.getDependences(ProximityKinds); 809 810 // Simplify the dependences by removing the constraints introduced by the 811 // domains. This can speed up the scheduling time significantly, as large 812 // constant coefficients will be removed from the dependences. The 813 // introduction of some additional dependences reduces the possible 814 // transformations, but in most cases, such transformation do not seem to be 815 // interesting anyway. In some cases this option may stop the scheduler to 816 // find any schedule. 817 if (SimplifyDeps == "yes") { 818 Validity = Validity.gist_domain(Domain); 819 Validity = Validity.gist_range(Domain); 820 Proximity = Proximity.gist_domain(Domain); 821 Proximity = Proximity.gist_range(Domain); 822 } else if (SimplifyDeps != "no") { 823 errs() 824 << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 825 "or 'no'. Falling back to default: 'yes'\n"; 826 } 827 828 POLLY_DEBUG(dbgs() << "\n\nCompute schedule from: "); 829 POLLY_DEBUG(dbgs() << "Domain := " << Domain << ";\n"); 830 POLLY_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n"); 831 POLLY_DEBUG(dbgs() << "Validity := " << Validity << ";\n"); 832 833 int IslMaximizeBands; 834 if (MaximizeBandDepth == "yes") { 835 IslMaximizeBands = 1; 836 } else if (MaximizeBandDepth == "no") { 837 IslMaximizeBands = 0; 838 } else { 839 errs() 840 << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 841 " or 'no'. Falling back to default: 'yes'\n"; 842 IslMaximizeBands = 1; 843 } 844 845 int IslOuterCoincidence; 846 if (OuterCoincidence == "yes") { 847 IslOuterCoincidence = 1; 848 } else if (OuterCoincidence == "no") { 849 IslOuterCoincidence = 0; 850 } else { 851 errs() << "warning: Option -polly-opt-outer-coincidence should either be " 852 "'yes' or 'no'. Falling back to default: 'no'\n"; 853 IslOuterCoincidence = 0; 854 } 855 856 isl_ctx *Ctx = S.getIslCtx().get(); 857 858 isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); 859 isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); 860 isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); 861 isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient); 862 isl_options_set_tile_scale_tile_loops(Ctx, 0); 863 864 auto OnErrorStatus = isl_options_get_on_error(Ctx); 865 isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE); 866 867 auto SC = isl::schedule_constraints::on_domain(Domain); 868 SC = SC.set_proximity(Proximity); 869 SC = SC.set_validity(Validity); 870 SC = SC.set_coincidence(Validity); 871 872 { 873 IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut); 874 Schedule = SC.compute_schedule(); 875 876 if (MaxOpGuard.hasQuotaExceeded()) 877 POLLY_DEBUG( 878 dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); 879 } 880 881 isl_options_set_on_error(Ctx, OnErrorStatus); 882 883 ScopsRescheduled++; 884 POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); 885 } 886 887 walkScheduleTreeForStatistics(Schedule, 1); 888 889 // In cases the scheduler is not able to optimize the code, we just do not 890 // touch the schedule. 891 if (Schedule.is_null()) 892 return; 893 894 if (GreedyFusion) { 895 isl::union_map Validity = D.getDependences( 896 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW); 897 Schedule = applyGreedyFusion(Schedule, Validity); 898 assert(!Schedule.is_null()); 899 } 900 901 // Apply post-rescheduling optimizations (if enabled) and/or prevectorization. 902 const OptimizerAdditionalInfoTy OAI = { 903 TTI, 904 const_cast<Dependences *>(&D), 905 /*PatternOpts=*/!HasUserTransformation && PMBasedOpts, 906 /*Postopts=*/!HasUserTransformation && EnablePostopts, 907 /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE, 908 DepsChanged}; 909 if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) { 910 Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); 911 Schedule = hoistExtensionNodes(Schedule); 912 POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); 913 walkScheduleTreeForStatistics(Schedule, 2); 914 } 915 916 // Skip profitability check if user transformation(s) have been applied. 917 if (!HasUserTransformation && 918 !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) 919 return; 920 921 auto ScopStats = S.getStatistics(); 922 ScopsOptimized++; 923 NumAffineLoopsOptimized += ScopStats.NumAffineLoops; 924 NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops; 925 LastSchedule = Schedule; 926 927 S.setScheduleTree(Schedule); 928 S.markAsOptimized(); 929 930 if (OptimizedScops) 931 errs() << S; 932 } 933 934 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { 935 releaseMemory(); 936 937 Function &F = S.getFunction(); 938 IslCtx = S.getSharedIslCtx(); 939 940 auto getDependences = 941 [this](Dependences::AnalysisLevel) -> const Dependences & { 942 return getAnalysis<DependenceInfo>().getDependences( 943 Dependences::AL_Statement); 944 }; 945 OptimizationRemarkEmitter &ORE = 946 getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 947 TargetTransformInfo *TTI = 948 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 949 950 bool DepsChanged = false; 951 runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule, 952 DepsChanged); 953 if (DepsChanged) 954 getAnalysis<DependenceInfo>().abandonDependences(); 955 return false; 956 } 957 958 static void runScheduleOptimizerPrinter(raw_ostream &OS, 959 isl::schedule LastSchedule) { 960 isl_printer *p; 961 char *ScheduleStr; 962 963 OS << "Calculated schedule:\n"; 964 965 if (LastSchedule.is_null()) { 966 OS << "n/a\n"; 967 return; 968 } 969 970 p = isl_printer_to_str(LastSchedule.ctx().get()); 971 p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); 972 p = isl_printer_print_schedule(p, LastSchedule.get()); 973 ScheduleStr = isl_printer_get_str(p); 974 isl_printer_free(p); 975 976 OS << ScheduleStr << "\n"; 977 978 free(ScheduleStr); 979 } 980 981 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { 982 runScheduleOptimizerPrinter(OS, LastSchedule); 983 } 984 985 void IslScheduleOptimizerWrapperPass::getAnalysisUsage( 986 AnalysisUsage &AU) const { 987 ScopPass::getAnalysisUsage(AU); 988 AU.addRequired<DependenceInfo>(); 989 AU.addRequired<TargetTransformInfoWrapperPass>(); 990 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 991 992 AU.addPreserved<DependenceInfo>(); 993 AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); 994 } 995 996 } // namespace 997 998 Pass *polly::createIslScheduleOptimizerWrapperPass() { 999 return new IslScheduleOptimizerWrapperPass(); 1000 } 1001 1002 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 1003 "Polly - Optimize schedule of SCoP", false, false); 1004 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 1005 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); 1006 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); 1007 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); 1008 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 1009 "Polly - Optimize schedule of SCoP", false, false) 1010 1011 static llvm::PreservedAnalyses 1012 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, 1013 ScopStandardAnalysisResults &SAR, SPMUpdater &U, 1014 raw_ostream *OS) { 1015 DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR); 1016 auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & { 1017 return Deps.getDependences(Dependences::AL_Statement); 1018 }; 1019 OptimizationRemarkEmitter ORE(&S.getFunction()); 1020 TargetTransformInfo *TTI = &SAR.TTI; 1021 isl::schedule LastSchedule; 1022 bool DepsChanged = false; 1023 runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); 1024 if (DepsChanged) 1025 Deps.abandonDependences(); 1026 1027 if (OS) { 1028 *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '" 1029 << S.getName() << "' in function '" << S.getFunction().getName() 1030 << "':\n"; 1031 runScheduleOptimizerPrinter(*OS, LastSchedule); 1032 } 1033 return PreservedAnalyses::all(); 1034 } 1035 1036 llvm::PreservedAnalyses 1037 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM, 1038 ScopStandardAnalysisResults &SAR, SPMUpdater &U) { 1039 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr); 1040 } 1041 1042 llvm::PreservedAnalyses 1043 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, 1044 ScopStandardAnalysisResults &SAR, 1045 SPMUpdater &U) { 1046 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); 1047 } 1048 1049 //===----------------------------------------------------------------------===// 1050 1051 namespace { 1052 /// Print result from IslScheduleOptimizerWrapperPass. 1053 class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass { 1054 public: 1055 static char ID; 1056 1057 IslScheduleOptimizerPrinterLegacyPass() 1058 : IslScheduleOptimizerPrinterLegacyPass(outs()) {} 1059 explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS) 1060 : ScopPass(ID), OS(OS) {} 1061 1062 bool runOnScop(Scop &S) override { 1063 IslScheduleOptimizerWrapperPass &P = 1064 getAnalysis<IslScheduleOptimizerWrapperPass>(); 1065 1066 OS << "Printing analysis '" << P.getPassName() << "' for region: '" 1067 << S.getRegion().getNameStr() << "' in function '" 1068 << S.getFunction().getName() << "':\n"; 1069 P.printScop(OS, S); 1070 1071 return false; 1072 } 1073 1074 void getAnalysisUsage(AnalysisUsage &AU) const override { 1075 ScopPass::getAnalysisUsage(AU); 1076 AU.addRequired<IslScheduleOptimizerWrapperPass>(); 1077 AU.setPreservesAll(); 1078 } 1079 1080 private: 1081 llvm::raw_ostream &OS; 1082 }; 1083 1084 char IslScheduleOptimizerPrinterLegacyPass::ID = 0; 1085 } // namespace 1086 1087 Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) { 1088 return new IslScheduleOptimizerPrinterLegacyPass(OS); 1089 } 1090 1091 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass, 1092 "polly-print-opt-isl", 1093 "Polly - Print optimizer schedule of SCoP", false, false); 1094 INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass) 1095 INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass, 1096 "polly-print-opt-isl", 1097 "Polly - Print optimizer schedule of SCoP", false, false) 1098