xref: /llvm-project/polly/lib/Transform/ScheduleOptimizer.cpp (revision 5aafc6d58f3405662902cee006be11e599801b88)
1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass generates an entirely new schedule tree from the data dependences
10 // and iteration domains. The new schedule tree is computed in two steps:
11 //
12 // 1) The isl scheduling optimizer is run
13 //
14 // The isl scheduling optimizer creates a new schedule tree that maximizes
15 // parallelism and tileability and minimizes data-dependence distances. The
16 // algorithm used is a modified version of the ``Pluto'' algorithm:
17 //
18 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
22 //
23 // 2) A set of post-scheduling transformations is applied on the schedule tree.
24 //
25 // These optimizations include:
26 //
27 //  - Tiling of the innermost tilable bands
28 //  - Prevectorization - The choice of a possible outer loop that is strip-mined
29 //                       to the innermost level to enable inner-loop
30 //                       vectorization.
31 //  - Some optimizations for spatial locality are also planned.
32 //
33 // For a detailed description of the schedule tree itself please see section 6
34 // of:
35 //
36 // Polyhedral AST generation is more than scanning polyhedra
37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
38 // ACM Transactions on Programming Languages and Systems (TOPLAS),
39 // 37(4), July 2015
40 // http://www.grosser.es/#pub-polyhedral-AST-generation
41 //
42 // This publication also contains a detailed discussion of the different options
43 // for polyhedral loop unrolling, full/partial tile separation and other uses
44 // of the schedule tree.
45 //
46 //===----------------------------------------------------------------------===//
47 
48 #include "polly/ScheduleOptimizer.h"
49 #include "polly/CodeGen/CodeGeneration.h"
50 #include "polly/DependenceInfo.h"
51 #include "polly/ManualOptimizer.h"
52 #include "polly/MatmulOptimizer.h"
53 #include "polly/Options.h"
54 #include "polly/ScheduleTreeTransform.h"
55 #include "polly/Support/ISLOStream.h"
56 #include "polly/Support/ISLTools.h"
57 #include "llvm/ADT/Sequence.h"
58 #include "llvm/ADT/Statistic.h"
59 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
60 #include "llvm/InitializePasses.h"
61 #include "llvm/Support/CommandLine.h"
62 #include "isl/options.h"
63 
64 using namespace llvm;
65 using namespace polly;
66 
67 namespace llvm {
68 class Loop;
69 class Module;
70 } // namespace llvm
71 
72 #include "polly/Support/PollyDebug.h"
73 #define DEBUG_TYPE "polly-opt-isl"
74 
75 static cl::opt<std::string>
76     OptimizeDeps("polly-opt-optimize-only",
77                  cl::desc("Only a certain kind of dependences (all/raw)"),
78                  cl::Hidden, cl::init("all"), cl::cat(PollyCategory));
79 
80 static cl::opt<std::string>
81     SimplifyDeps("polly-opt-simplify-deps",
82                  cl::desc("Dependences should be simplified (yes/no)"),
83                  cl::Hidden, cl::init("yes"), cl::cat(PollyCategory));
84 
85 static cl::opt<int> MaxConstantTerm(
86     "polly-opt-max-constant-term",
87     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
88     cl::init(20), cl::cat(PollyCategory));
89 
90 static cl::opt<int> MaxCoefficient(
91     "polly-opt-max-coefficient",
92     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
93     cl::init(20), cl::cat(PollyCategory));
94 
95 static cl::opt<std::string>
96     MaximizeBandDepth("polly-opt-maximize-bands",
97                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
98                       cl::init("yes"), cl::cat(PollyCategory));
99 
100 static cl::opt<int>
101     ScheduleComputeOut("polly-schedule-computeout",
102                        cl::desc("Bound the scheduler by maximal amount"
103                                 "of computational steps. "),
104                        cl::Hidden, cl::init(300000), cl::ZeroOrMore,
105                        cl::cat(PollyCategory));
106 
107 static cl::opt<bool>
108     GreedyFusion("polly-loopfusion-greedy",
109                  cl::desc("Aggressively try to fuse everything"), cl::Hidden,
110                  cl::cat(PollyCategory));
111 
112 static cl::opt<std::string> OuterCoincidence(
113     "polly-opt-outer-coincidence",
114     cl::desc("Try to construct schedules where the outer member of each band "
115              "satisfies the coincidence constraints (yes/no)"),
116     cl::Hidden, cl::init("no"), cl::cat(PollyCategory));
117 
118 static cl::opt<int> PrevectorWidth(
119     "polly-prevect-width",
120     cl::desc(
121         "The number of loop iterations to strip-mine for pre-vectorization"),
122     cl::Hidden, cl::init(4), cl::cat(PollyCategory));
123 
124 static cl::opt<bool> FirstLevelTiling("polly-tiling",
125                                       cl::desc("Enable loop tiling"),
126                                       cl::init(true), cl::cat(PollyCategory));
127 
128 static cl::opt<int> FirstLevelDefaultTileSize(
129     "polly-default-tile-size",
130     cl::desc("The default tile size (if not enough were provided by"
131              " --polly-tile-sizes)"),
132     cl::Hidden, cl::init(32), cl::cat(PollyCategory));
133 
134 static cl::list<int>
135     FirstLevelTileSizes("polly-tile-sizes",
136                         cl::desc("A tile size for each loop dimension, filled "
137                                  "with --polly-default-tile-size"),
138                         cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
139 
140 static cl::opt<bool>
141     SecondLevelTiling("polly-2nd-level-tiling",
142                       cl::desc("Enable a 2nd level loop of loop tiling"),
143                       cl::cat(PollyCategory));
144 
145 static cl::opt<int> SecondLevelDefaultTileSize(
146     "polly-2nd-level-default-tile-size",
147     cl::desc("The default 2nd-level tile size (if not enough were provided by"
148              " --polly-2nd-level-tile-sizes)"),
149     cl::Hidden, cl::init(16), cl::cat(PollyCategory));
150 
151 static cl::list<int>
152     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
153                          cl::desc("A tile size for each loop dimension, filled "
154                                   "with --polly-default-tile-size"),
155                          cl::Hidden, cl::CommaSeparated,
156                          cl::cat(PollyCategory));
157 
158 static cl::opt<bool> RegisterTiling("polly-register-tiling",
159                                     cl::desc("Enable register tiling"),
160                                     cl::cat(PollyCategory));
161 
162 static cl::opt<int> RegisterDefaultTileSize(
163     "polly-register-tiling-default-tile-size",
164     cl::desc("The default register tile size (if not enough were provided by"
165              " --polly-register-tile-sizes)"),
166     cl::Hidden, cl::init(2), cl::cat(PollyCategory));
167 
168 static cl::list<int>
169     RegisterTileSizes("polly-register-tile-sizes",
170                       cl::desc("A tile size for each loop dimension, filled "
171                                "with --polly-register-tile-size"),
172                       cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
173 
174 static cl::opt<bool> PragmaBasedOpts(
175     "polly-pragma-based-opts",
176     cl::desc("Apply user-directed transformation from metadata"),
177     cl::init(true), cl::cat(PollyCategory));
178 
179 static cl::opt<bool> EnableReschedule("polly-reschedule",
180                                       cl::desc("Optimize SCoPs using ISL"),
181                                       cl::init(true), cl::cat(PollyCategory));
182 
183 static cl::opt<bool>
184     PMBasedOpts("polly-pattern-matching-based-opts",
185                 cl::desc("Perform optimizations based on pattern matching"),
186                 cl::init(true), cl::cat(PollyCategory));
187 
188 static cl::opt<bool>
189     EnablePostopts("polly-postopts",
190                    cl::desc("Apply post-rescheduling optimizations such as "
191                             "tiling (requires -polly-reschedule)"),
192                    cl::init(true), cl::cat(PollyCategory));
193 
194 static cl::opt<bool> OptimizedScops(
195     "polly-optimized-scops",
196     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
197              "the isl scheduling optimizer and the set of post-scheduling "
198              "transformations is applied on the schedule tree"),
199     cl::cat(PollyCategory));
200 
201 STATISTIC(ScopsProcessed, "Number of scops processed");
202 STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
203 STATISTIC(ScopsOptimized, "Number of scops optimized");
204 
205 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized");
206 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized");
207 
208 #define THREE_STATISTICS(VARNAME, DESC)                                        \
209   static Statistic VARNAME[3] = {                                              \
210       {DEBUG_TYPE, #VARNAME "0", DESC " (original)"},                          \
211       {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"},                   \
212       {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}
213 
214 THREE_STATISTICS(NumBands, "Number of bands");
215 THREE_STATISTICS(NumBandMembers, "Number of band members");
216 THREE_STATISTICS(NumCoincident, "Number of coincident band members");
217 THREE_STATISTICS(NumPermutable, "Number of permutable bands");
218 THREE_STATISTICS(NumFilters, "Number of filter nodes");
219 THREE_STATISTICS(NumExtension, "Number of extension nodes");
220 
221 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied");
222 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied");
223 STATISTIC(RegisterTileOpts, "Number of register tiling applied");
224 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied");
225 STATISTIC(MatMulOpts,
226           "Number of matrix multiplication patterns detected and optimized");
227 
228 namespace {
229 /// Additional parameters of the schedule optimizer.
230 ///
231 /// Target Transform Info and the SCoP dependencies used by the schedule
232 /// optimizer.
233 struct OptimizerAdditionalInfoTy {
234   const llvm::TargetTransformInfo *TTI;
235   const Dependences *D;
236   bool PatternOpts;
237   bool Postopts;
238   bool Prevect;
239   bool &DepsChanged;
240 };
241 
242 class ScheduleTreeOptimizer final {
243 public:
244   /// Apply schedule tree transformations.
245   ///
246   /// This function takes an (possibly already optimized) schedule tree and
247   /// applies a set of additional optimizations on the schedule tree. The
248   /// transformations applied include:
249   ///
250   ///   - Pattern-based optimizations
251   ///   - Tiling
252   ///   - Prevectorization
253   ///
254   /// @param Schedule The schedule object the transformations will be applied
255   ///                 to.
256   /// @param OAI      Target Transform Info and the SCoP dependencies.
257   /// @returns        The transformed schedule.
258   static isl::schedule
259   optimizeSchedule(isl::schedule Schedule,
260                    const OptimizerAdditionalInfoTy *OAI = nullptr);
261 
262   /// Apply schedule tree transformations.
263   ///
264   /// This function takes a node in an (possibly already optimized) schedule
265   /// tree and applies a set of additional optimizations on this schedule tree
266   /// node and its descendants. The transformations applied include:
267   ///
268   ///   - Pattern-based optimizations
269   ///   - Tiling
270   ///   - Prevectorization
271   ///
272   /// @param Node The schedule object post-transformations will be applied to.
273   /// @param OAI  Target Transform Info and the SCoP dependencies.
274   /// @returns    The transformed schedule.
275   static isl::schedule_node
276   optimizeScheduleNode(isl::schedule_node Node,
277                        const OptimizerAdditionalInfoTy *OAI = nullptr);
278 
279   /// Decide if the @p NewSchedule is profitable for @p S.
280   ///
281   /// @param S           The SCoP we optimize.
282   /// @param NewSchedule The new schedule we computed.
283   ///
284   /// @return True, if we believe @p NewSchedule is an improvement for @p S.
285   static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);
286 
287   /// Isolate a set of partial tile prefixes.
288   ///
289   /// This set should ensure that it contains only partial tile prefixes that
290   /// have exactly VectorWidth iterations.
291   ///
292   /// @param Node A schedule node band, which is a parent of a band node,
293   ///             that contains a vector loop.
294   /// @return Modified isl_schedule_node.
295   static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
296                                                     int VectorWidth);
297 
298 private:
299   /// Check if this node is a band node we want to tile.
300   ///
301   /// We look for innermost band nodes where individual dimensions are marked as
302   /// permutable.
303   ///
304   /// @param Node The node to check.
305   static bool isTileableBandNode(isl::schedule_node Node);
306 
307   /// Check if this node is a band node we want to transform using pattern
308   /// matching.
309   ///
310   /// We look for innermost band nodes where individual dimensions are marked as
311   /// permutable. There is no restriction on the number of individual
312   /// dimensions.
313   ///
314   /// @param Node The node to check.
315   static bool isPMOptimizableBandNode(isl::schedule_node Node);
316 
317   /// Pre-vectorizes one scheduling dimension of a schedule band.
318   ///
319   /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
320   /// sinks the resulting point loop.
321   ///
322   /// Example (DimToVectorize=0, VectorWidth=4):
323   ///
324   /// | Before transformation:
325   /// |
326   /// | A[i,j] -> [i,j]
327   /// |
328   /// | for (i = 0; i < 128; i++)
329   /// |    for (j = 0; j < 128; j++)
330   /// |      A(i,j);
331   ///
332   /// | After transformation:
333   /// |
334   /// | for (it = 0; it < 32; it+=1)
335   /// |    for (j = 0; j < 128; j++)
336   /// |      for (ip = 0; ip <= 3; ip++)
337   /// |        A(4 * it + ip,j);
338   ///
339   /// The goal of this transformation is to create a trivially vectorizable
340   /// loop.  This means a parallel loop at the innermost level that has a
341   /// constant number of iterations corresponding to the target vector width.
342   ///
343   /// This transformation creates a loop at the innermost level. The loop has
344   /// a constant number of iterations, if the number of loop iterations at
345   /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
346   /// currently constant and not yet target specific. This function does not
347   /// reason about parallelism.
348   static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
349                                              unsigned DimToVectorize,
350                                              int VectorWidth);
351 
352   /// Apply additional optimizations on the bands in the schedule tree.
353   ///
354   /// We are looking for an innermost band node and apply the following
355   /// transformations:
356   ///
357   ///  - Tile the band
358   ///      - if the band is tileable
359   ///      - if the band has more than one loop dimension
360   ///
361   ///  - Prevectorize the schedule of the band (or the point loop in case of
362   ///    tiling).
363   ///      - if vectorization is enabled
364   ///
365   /// @param Node The schedule node to (possibly) optimize.
366   /// @param User A pointer to forward some use information
367   ///        (currently unused).
368   static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);
369 
370   /// Apply tiling optimizations on the bands in the schedule tree.
371   ///
372   /// @param Node The schedule node to (possibly) optimize.
373   static isl::schedule_node applyTileBandOpt(isl::schedule_node Node);
374 
375   /// Apply prevectorization on the bands in the schedule tree.
376   ///
377   /// @param Node The schedule node to (possibly) prevectorize.
378   static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node);
379 };
380 
381 isl::schedule_node
382 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
383                                                int VectorWidth) {
384   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
385   Node = Node.child(0).child(0);
386   isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
387   isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
388   isl::set ScheduleRange{ScheduleRangeUSet};
389   isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
390   auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic");
391   isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1);
392   Node = Node.parent().parent();
393   isl::union_set Options = IsolateOption.unite(AtomicOption);
394   isl::schedule_node_band Result =
395       Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
396   return Result;
397 }
398 
399 struct InsertSimdMarkers final : ScheduleNodeRewriter<InsertSimdMarkers> {
400   isl::schedule_node visitBand(isl::schedule_node_band Band) {
401     isl::schedule_node Node = visitChildren(Band);
402 
403     // Only add SIMD markers to innermost bands.
404     if (!Node.first_child().isa<isl::schedule_node_leaf>())
405       return Node;
406 
407     isl::id LoopMarker = isl::id::alloc(Band.ctx(), "SIMD", nullptr);
408     return Band.insert_mark(LoopMarker);
409   }
410 };
411 
412 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
413     isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
414   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
415 
416   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
417   unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set));
418   assert(DimToVectorize < ScheduleDimensions);
419 
420   if (DimToVectorize > 0) {
421     Node = isl::manage(
422         isl_schedule_node_band_split(Node.release(), DimToVectorize));
423     Node = Node.child(0);
424   }
425   if (DimToVectorize < ScheduleDimensions - 1)
426     Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1));
427   Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
428   auto Sizes = isl::multi_val::zero(Space);
429   Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth));
430   Node =
431       isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
432   Node = isolateFullPartialTiles(Node, VectorWidth);
433   Node = Node.child(0);
434   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
435   // we will have troubles to match it in the backend.
436   Node = Node.as<isl::schedule_node_band>().set_ast_build_options(
437       isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }"));
438 
439   // Sink the inner loop into the smallest possible statements to make them
440   // represent a single vector instruction if possible.
441   Node = isl::manage(isl_schedule_node_band_sink(Node.release()));
442 
443   // Add SIMD markers to those vector statements.
444   InsertSimdMarkers SimdMarkerInserter;
445   Node = SimdMarkerInserter.visit(Node);
446 
447   PrevectOpts++;
448   return Node.parent();
449 }
450 
451 static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
452   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
453   assert(isl_schedule_node_n_children(Node.get()) == 1);
454 
455   auto ChildType = isl_schedule_node_get_type(Node.child(0).get());
456 
457   if (ChildType == isl_schedule_node_leaf)
458     return true;
459 
460   if (ChildType != isl_schedule_node_sequence)
461     return false;
462 
463   auto Sequence = Node.child(0);
464 
465   for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc;
466        ++c) {
467     auto Child = Sequence.child(c);
468     if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter)
469       return false;
470     if (isl_schedule_node_get_type(Child.child(0).get()) !=
471         isl_schedule_node_leaf)
472       return false;
473   }
474   return true;
475 }
476 
477 /// Check if this node is a band node, which has only one child.
478 ///
479 /// @param Node The node to check.
480 static bool isOneTimeParentBandNode(isl::schedule_node Node) {
481   if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
482     return false;
483 
484   if (isl_schedule_node_n_children(Node.get()) != 1)
485     return false;
486 
487   return true;
488 }
489 
490 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
491   if (!isOneTimeParentBandNode(Node))
492     return false;
493 
494   if (!isl_schedule_node_band_get_permutable(Node.get()))
495     return false;
496 
497   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
498 
499   if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u)
500     return false;
501 
502   return isSimpleInnermostBand(Node);
503 }
504 
505 bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node) {
506   if (!isOneTimeParentBandNode(Node))
507     return false;
508 
509   return Node.child(0).isa<isl::schedule_node_leaf>();
510 }
511 
512 __isl_give isl::schedule_node
513 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
514   if (FirstLevelTiling) {
515     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
516                     FirstLevelDefaultTileSize);
517     FirstLevelTileOpts++;
518   }
519 
520   if (SecondLevelTiling) {
521     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
522                     SecondLevelDefaultTileSize);
523     SecondLevelTileOpts++;
524   }
525 
526   if (RegisterTiling) {
527     Node =
528         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
529     RegisterTileOpts++;
530   }
531 
532   return Node;
533 }
534 
535 isl::schedule_node
536 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) {
537   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
538   int Dims = unsignedFromIslSize(Space.dim(isl::dim::set));
539 
540   for (int i = Dims - 1; i >= 0; i--)
541     if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) {
542       Node = prevectSchedBand(Node, i, PrevectorWidth);
543       break;
544     }
545 
546   return Node;
547 }
548 
549 __isl_give isl_schedule_node *
550 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
551                                     void *User) {
552   const OptimizerAdditionalInfoTy *OAI =
553       static_cast<const OptimizerAdditionalInfoTy *>(User);
554   assert(OAI && "Expecting optimization options");
555 
556   isl::schedule_node Node = isl::manage(NodeArg);
557 
558   if (OAI->PatternOpts && isPMOptimizableBandNode(Node)) {
559     isl::schedule_node PatternOptimizedSchedule =
560         tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
561     if (!PatternOptimizedSchedule.is_null()) {
562       MatMulOpts++;
563       OAI->DepsChanged = true;
564       return PatternOptimizedSchedule.release();
565     }
566   }
567 
568   if (!isTileableBandNode(Node))
569     return Node.release();
570 
571   if (OAI->Postopts)
572     Node = applyTileBandOpt(Node);
573 
574   if (OAI->Prevect) {
575     // FIXME: Prevectorization requirements are different from those checked by
576     // isTileableBandNode.
577     Node = applyPrevectBandOpt(Node);
578   }
579 
580   return Node.release();
581 }
582 
583 isl::schedule
584 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
585                                         const OptimizerAdditionalInfoTy *OAI) {
586   auto Root = Schedule.get_root();
587   Root = optimizeScheduleNode(Root, OAI);
588   return Root.get_schedule();
589 }
590 
591 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
592     isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
593   Node = isl::manage(isl_schedule_node_map_descendant_bottom_up(
594       Node.release(), optimizeBand,
595       const_cast<void *>(static_cast<const void *>(OAI))));
596   return Node;
597 }
598 
599 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
600                                                  isl::schedule NewSchedule) {
601   // To understand if the schedule has been optimized we check if the schedule
602   // has changed at all.
603   // TODO: We can improve this by tracking if any necessarily beneficial
604   // transformations have been performed. This can e.g. be tiling, loop
605   // interchange, or ...) We can track this either at the place where the
606   // transformation has been performed or, in case of automatic ILP based
607   // optimizations, by comparing (yet to be defined) performance metrics
608   // before/after the scheduling optimizer
609   // (e.g., #stride-one accesses)
610   // FIXME: A schedule tree whose union_map-conversion is identical to the
611   // original schedule map may still allow for parallelization, i.e. can still
612   // be profitable.
613   auto NewScheduleMap = NewSchedule.get_map();
614   auto OldSchedule = S.getSchedule();
615   assert(!OldSchedule.is_null() &&
616          "Only IslScheduleOptimizer can insert extension nodes "
617          "that make Scop::getSchedule() return nullptr.");
618   bool changed = !OldSchedule.is_equal(NewScheduleMap);
619   return changed;
620 }
621 
622 class IslScheduleOptimizerWrapperPass final : public ScopPass {
623 public:
624   static char ID;
625 
626   explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}
627 
628   /// Optimize the schedule of the SCoP @p S.
629   bool runOnScop(Scop &S) override;
630 
631   /// Print the new schedule for the SCoP @p S.
632   void printScop(raw_ostream &OS, Scop &S) const override;
633 
634   /// Register all analyses and transformation required.
635   void getAnalysisUsage(AnalysisUsage &AU) const override;
636 
637   /// Release the internal memory.
638   void releaseMemory() override {
639     LastSchedule = {};
640     IslCtx.reset();
641   }
642 
643 private:
644   std::shared_ptr<isl_ctx> IslCtx;
645   isl::schedule LastSchedule;
646 };
647 
648 char IslScheduleOptimizerWrapperPass::ID = 0;
649 
650 #ifndef NDEBUG
651 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule,
652                           StringRef Desc) {
653   isl::ctx Ctx = Schedule.ctx();
654   isl_printer *P = isl_printer_to_str(Ctx.get());
655   P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
656   P = isl_printer_print_schedule(P, Schedule.get());
657   char *Str = isl_printer_get_str(P);
658   OS << Desc << ": \n" << Str << "\n";
659   free(Str);
660   isl_printer_free(P);
661 }
662 #endif
663 
664 /// Collect statistics for the schedule tree.
665 ///
666 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is
667 /// ignored.
668 /// @param Version  The version of the schedule tree that is analyzed.
669 ///                 0 for the original schedule tree before any transformation.
670 ///                 1 for the schedule tree after isl's rescheduling.
671 ///                 2 for the schedule tree after optimizations are applied
672 ///                 (tiling, pattern matching)
673 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
674   auto Root = Schedule.get_root();
675   if (Root.is_null())
676     return;
677 
678   isl_schedule_node_foreach_descendant_top_down(
679       Root.get(),
680       [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool {
681         isl::schedule_node Node = isl::manage_copy(nodeptr);
682         int Version = *static_cast<int *>(user);
683 
684         switch (isl_schedule_node_get_type(Node.get())) {
685         case isl_schedule_node_band: {
686           NumBands[Version]++;
687           if (isl_schedule_node_band_get_permutable(Node.get()) ==
688               isl_bool_true)
689             NumPermutable[Version]++;
690 
691           int CountMembers = isl_schedule_node_band_n_member(Node.get());
692           NumBandMembers[Version] += CountMembers;
693           for (int i = 0; i < CountMembers; i += 1) {
694             if (Node.as<isl::schedule_node_band>().member_get_coincident(i))
695               NumCoincident[Version]++;
696           }
697           break;
698         }
699 
700         case isl_schedule_node_filter:
701           NumFilters[Version]++;
702           break;
703 
704         case isl_schedule_node_extension:
705           NumExtension[Version]++;
706           break;
707 
708         default:
709           break;
710         }
711 
712         return isl_bool_true;
713       },
714       &Version);
715 }
716 
717 static void runIslScheduleOptimizer(
718     Scop &S,
719     function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
720     TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
721     isl::schedule &LastSchedule, bool &DepsChanged) {
722   // Skip empty SCoPs but still allow code generation as it will delete the
723   // loops present but not needed.
724   if (S.getSize() == 0) {
725     S.markAsOptimized();
726     return;
727   }
728 
729   ScopsProcessed++;
730 
731   // Schedule without optimizations.
732   isl::schedule Schedule = S.getScheduleTree();
733   walkScheduleTreeForStatistics(S.getScheduleTree(), 0);
734   POLLY_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree"));
735 
736   bool HasUserTransformation = false;
737   if (PragmaBasedOpts) {
738     isl::schedule ManuallyTransformed = applyManualTransformations(
739         &S, Schedule, GetDeps(Dependences::AL_Statement), ORE);
740     if (ManuallyTransformed.is_null()) {
741       POLLY_DEBUG(dbgs() << "Error during manual optimization\n");
742       return;
743     }
744 
745     if (ManuallyTransformed.get() != Schedule.get()) {
746       // User transformations have precedence over other transformations.
747       HasUserTransformation = true;
748       Schedule = std::move(ManuallyTransformed);
749       POLLY_DEBUG(
750           printSchedule(dbgs(), Schedule, "After manual transformations"));
751     }
752   }
753 
754   // Only continue if either manual transformations have been applied or we are
755   // allowed to apply heuristics.
756   // TODO: Detect disabled heuristics and no user-directed transformation
757   // metadata earlier in ScopDetection.
758   if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
759     POLLY_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
760     return;
761   }
762 
763   // Get dependency analysis.
764   const Dependences &D = GetDeps(Dependences::AL_Statement);
765   if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
766     POLLY_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
767     return;
768   }
769   if (!D.hasValidDependences()) {
770     POLLY_DEBUG(dbgs() << "Dependency information not available\n");
771     return;
772   }
773 
774   // Apply ISL's algorithm only if not overridden by the user. Note that
775   // post-rescheduling optimizations (tiling, pattern-based, prevectorization)
776   // rely on the coincidence/permutable annotations on schedule tree bands that
777   // are added by the rescheduling analyzer. Therefore, disabling the
778   // rescheduler implicitly also disables these optimizations.
779   if (!EnableReschedule) {
780     POLLY_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n");
781   } else if (HasUserTransformation) {
782     POLLY_DEBUG(
783         dbgs() << "Skipping rescheduling due to manual transformation\n");
784   } else {
785     // Build input data.
786     int ValidityKinds =
787         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
788     int ProximityKinds;
789 
790     if (OptimizeDeps == "all")
791       ProximityKinds =
792           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
793     else if (OptimizeDeps == "raw")
794       ProximityKinds = Dependences::TYPE_RAW;
795     else {
796       errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
797              << " Falling back to optimizing all dependences.\n";
798       ProximityKinds =
799           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
800     }
801 
802     isl::union_set Domain = S.getDomains();
803 
804     if (Domain.is_null())
805       return;
806 
807     isl::union_map Validity = D.getDependences(ValidityKinds);
808     isl::union_map Proximity = D.getDependences(ProximityKinds);
809 
810     // Simplify the dependences by removing the constraints introduced by the
811     // domains. This can speed up the scheduling time significantly, as large
812     // constant coefficients will be removed from the dependences. The
813     // introduction of some additional dependences reduces the possible
814     // transformations, but in most cases, such transformation do not seem to be
815     // interesting anyway. In some cases this option may stop the scheduler to
816     // find any schedule.
817     if (SimplifyDeps == "yes") {
818       Validity = Validity.gist_domain(Domain);
819       Validity = Validity.gist_range(Domain);
820       Proximity = Proximity.gist_domain(Domain);
821       Proximity = Proximity.gist_range(Domain);
822     } else if (SimplifyDeps != "no") {
823       errs()
824           << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
825              "or 'no'. Falling back to default: 'yes'\n";
826     }
827 
828     POLLY_DEBUG(dbgs() << "\n\nCompute schedule from: ");
829     POLLY_DEBUG(dbgs() << "Domain := " << Domain << ";\n");
830     POLLY_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
831     POLLY_DEBUG(dbgs() << "Validity := " << Validity << ";\n");
832 
833     int IslMaximizeBands;
834     if (MaximizeBandDepth == "yes") {
835       IslMaximizeBands = 1;
836     } else if (MaximizeBandDepth == "no") {
837       IslMaximizeBands = 0;
838     } else {
839       errs()
840           << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
841              " or 'no'. Falling back to default: 'yes'\n";
842       IslMaximizeBands = 1;
843     }
844 
845     int IslOuterCoincidence;
846     if (OuterCoincidence == "yes") {
847       IslOuterCoincidence = 1;
848     } else if (OuterCoincidence == "no") {
849       IslOuterCoincidence = 0;
850     } else {
851       errs() << "warning: Option -polly-opt-outer-coincidence should either be "
852                 "'yes' or 'no'. Falling back to default: 'no'\n";
853       IslOuterCoincidence = 0;
854     }
855 
856     isl_ctx *Ctx = S.getIslCtx().get();
857 
858     isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
859     isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
860     isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
861     isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
862     isl_options_set_tile_scale_tile_loops(Ctx, 0);
863 
864     auto OnErrorStatus = isl_options_get_on_error(Ctx);
865     isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
866 
867     auto SC = isl::schedule_constraints::on_domain(Domain);
868     SC = SC.set_proximity(Proximity);
869     SC = SC.set_validity(Validity);
870     SC = SC.set_coincidence(Validity);
871 
872     {
873       IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut);
874       Schedule = SC.compute_schedule();
875 
876       if (MaxOpGuard.hasQuotaExceeded())
877         POLLY_DEBUG(
878             dbgs() << "Schedule optimizer calculation exceeds ISL quota\n");
879     }
880 
881     isl_options_set_on_error(Ctx, OnErrorStatus);
882 
883     ScopsRescheduled++;
884     POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling"));
885   }
886 
887   walkScheduleTreeForStatistics(Schedule, 1);
888 
889   // In cases the scheduler is not able to optimize the code, we just do not
890   // touch the schedule.
891   if (Schedule.is_null())
892     return;
893 
894   if (GreedyFusion) {
895     isl::union_map Validity = D.getDependences(
896         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW);
897     Schedule = applyGreedyFusion(Schedule, Validity);
898     assert(!Schedule.is_null());
899   }
900 
901   // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
902   const OptimizerAdditionalInfoTy OAI = {
903       TTI,
904       const_cast<Dependences *>(&D),
905       /*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
906       /*Postopts=*/!HasUserTransformation && EnablePostopts,
907       /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE,
908       DepsChanged};
909   if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
910     Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
911     Schedule = hoistExtensionNodes(Schedule);
912     POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations"));
913     walkScheduleTreeForStatistics(Schedule, 2);
914   }
915 
916   // Skip profitability check if user transformation(s) have been applied.
917   if (!HasUserTransformation &&
918       !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
919     return;
920 
921   auto ScopStats = S.getStatistics();
922   ScopsOptimized++;
923   NumAffineLoopsOptimized += ScopStats.NumAffineLoops;
924   NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops;
925   LastSchedule = Schedule;
926 
927   S.setScheduleTree(Schedule);
928   S.markAsOptimized();
929 
930   if (OptimizedScops)
931     errs() << S;
932 }
933 
934 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
935   releaseMemory();
936 
937   Function &F = S.getFunction();
938   IslCtx = S.getSharedIslCtx();
939 
940   auto getDependences =
941       [this](Dependences::AnalysisLevel) -> const Dependences & {
942     return getAnalysis<DependenceInfo>().getDependences(
943         Dependences::AL_Statement);
944   };
945   OptimizationRemarkEmitter &ORE =
946       getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
947   TargetTransformInfo *TTI =
948       &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
949 
950   bool DepsChanged = false;
951   runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule,
952                           DepsChanged);
953   if (DepsChanged)
954     getAnalysis<DependenceInfo>().abandonDependences();
955   return false;
956 }
957 
958 static void runScheduleOptimizerPrinter(raw_ostream &OS,
959                                         isl::schedule LastSchedule) {
960   isl_printer *p;
961   char *ScheduleStr;
962 
963   OS << "Calculated schedule:\n";
964 
965   if (LastSchedule.is_null()) {
966     OS << "n/a\n";
967     return;
968   }
969 
970   p = isl_printer_to_str(LastSchedule.ctx().get());
971   p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
972   p = isl_printer_print_schedule(p, LastSchedule.get());
973   ScheduleStr = isl_printer_get_str(p);
974   isl_printer_free(p);
975 
976   OS << ScheduleStr << "\n";
977 
978   free(ScheduleStr);
979 }
980 
981 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const {
982   runScheduleOptimizerPrinter(OS, LastSchedule);
983 }
984 
985 void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
986     AnalysisUsage &AU) const {
987   ScopPass::getAnalysisUsage(AU);
988   AU.addRequired<DependenceInfo>();
989   AU.addRequired<TargetTransformInfoWrapperPass>();
990   AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
991 
992   AU.addPreserved<DependenceInfo>();
993   AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
994 }
995 
996 } // namespace
997 
998 Pass *polly::createIslScheduleOptimizerWrapperPass() {
999   return new IslScheduleOptimizerWrapperPass();
1000 }
1001 
1002 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
1003                       "Polly - Optimize schedule of SCoP", false, false);
1004 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
1005 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
1006 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
1007 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass);
1008 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
1009                     "Polly - Optimize schedule of SCoP", false, false)
1010 
1011 static llvm::PreservedAnalyses
1012 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
1013                                 ScopStandardAnalysisResults &SAR, SPMUpdater &U,
1014                                 raw_ostream *OS) {
1015   DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR);
1016   auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
1017     return Deps.getDependences(Dependences::AL_Statement);
1018   };
1019   OptimizationRemarkEmitter ORE(&S.getFunction());
1020   TargetTransformInfo *TTI = &SAR.TTI;
1021   isl::schedule LastSchedule;
1022   bool DepsChanged = false;
1023   runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged);
1024   if (DepsChanged)
1025     Deps.abandonDependences();
1026 
1027   if (OS) {
1028     *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
1029         << S.getName() << "' in function '" << S.getFunction().getName()
1030         << "':\n";
1031     runScheduleOptimizerPrinter(*OS, LastSchedule);
1032   }
1033   return PreservedAnalyses::all();
1034 }
1035 
1036 llvm::PreservedAnalyses
1037 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM,
1038                               ScopStandardAnalysisResults &SAR, SPMUpdater &U) {
1039   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr);
1040 }
1041 
1042 llvm::PreservedAnalyses
1043 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
1044                                      ScopStandardAnalysisResults &SAR,
1045                                      SPMUpdater &U) {
1046   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS);
1047 }
1048 
1049 //===----------------------------------------------------------------------===//
1050 
1051 namespace {
1052 /// Print result from IslScheduleOptimizerWrapperPass.
1053 class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass {
1054 public:
1055   static char ID;
1056 
1057   IslScheduleOptimizerPrinterLegacyPass()
1058       : IslScheduleOptimizerPrinterLegacyPass(outs()) {}
1059   explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS)
1060       : ScopPass(ID), OS(OS) {}
1061 
1062   bool runOnScop(Scop &S) override {
1063     IslScheduleOptimizerWrapperPass &P =
1064         getAnalysis<IslScheduleOptimizerWrapperPass>();
1065 
1066     OS << "Printing analysis '" << P.getPassName() << "' for region: '"
1067        << S.getRegion().getNameStr() << "' in function '"
1068        << S.getFunction().getName() << "':\n";
1069     P.printScop(OS, S);
1070 
1071     return false;
1072   }
1073 
1074   void getAnalysisUsage(AnalysisUsage &AU) const override {
1075     ScopPass::getAnalysisUsage(AU);
1076     AU.addRequired<IslScheduleOptimizerWrapperPass>();
1077     AU.setPreservesAll();
1078   }
1079 
1080 private:
1081   llvm::raw_ostream &OS;
1082 };
1083 
1084 char IslScheduleOptimizerPrinterLegacyPass::ID = 0;
1085 } // namespace
1086 
1087 Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) {
1088   return new IslScheduleOptimizerPrinterLegacyPass(OS);
1089 }
1090 
1091 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass,
1092                       "polly-print-opt-isl",
1093                       "Polly - Print optimizer schedule of SCoP", false, false);
1094 INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass)
1095 INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass,
1096                     "polly-print-opt-isl",
1097                     "Polly - Print optimizer schedule of SCoP", false, false)
1098