1 //===- Transforms.h - SCF dialect transformation utilities ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This header file defines transformations on SCF operations. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef MLIR_DIALECT_SCF_TRANSFORMS_TRANSFORMS_H_ 14 #define MLIR_DIALECT_SCF_TRANSFORMS_TRANSFORMS_H_ 15 16 #include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h" 17 #include "mlir/Support/LLVM.h" 18 #include "llvm/ADT/ArrayRef.h" 19 20 namespace mlir { 21 class Region; 22 class RewriterBase; 23 class Operation; 24 class Value; 25 26 namespace scf { 27 28 class IfOp; 29 class ForallOp; 30 class ForOp; 31 class ParallelOp; 32 class WhileOp; 33 34 /// Try converting scf.forall into a set of nested scf.for loops. 35 /// The newly created scf.for ops will be returned through the `results` 36 /// vector if provided. 37 LogicalResult forallToForLoop(RewriterBase &rewriter, ForallOp forallOp, 38 SmallVectorImpl<Operation *> *results = nullptr); 39 40 /// Try converting scf.forall into an scf.parallel loop. 41 /// The conversion is only supported for forall operations with no results. 42 LogicalResult forallToParallelLoop(RewriterBase &rewriter, ForallOp forallOp, 43 ParallelOp *result = nullptr); 44 45 /// Fuses all adjacent scf.parallel operations with identical bounds and step 46 /// into one scf.parallel operations. Uses a naive aliasing and dependency 47 /// analysis. 48 /// User can additionally customize alias checking with `mayAlias` hook. 49 /// `mayAlias` must return false if 2 values are guaranteed to not alias. 50 void naivelyFuseParallelOps(Region ®ion, 51 llvm::function_ref<bool(Value, Value)> mayAlias); 52 53 /// Rewrite a for loop with bounds/step that potentially do not divide evenly 54 /// into a for loop where the step divides the iteration space evenly, followed 55 /// by another scf.for for the last (partial) iteration (if any; returned via 56 /// `partialIteration`). This transformation is called "loop peeling". 57 /// 58 /// This transformation is beneficial for a wide range of transformations such 59 /// as vectorization or loop tiling: It enables additional canonicalizations 60 /// inside the peeled loop body such as rewriting masked loads into unmaked 61 /// loads. 62 /// 63 /// E.g., assuming a lower bound of 0 (for illustration purposes): 64 /// ``` 65 /// scf.for %iv = %c0 to %ub step %c4 { 66 /// (loop body) 67 /// } 68 /// ``` 69 /// is rewritten into the following pseudo IR: 70 /// ``` 71 /// %newUb = %ub - (%ub mod %c4) 72 /// scf.for %iv = %c0 to %newUb step %c4 { 73 /// (loop body) 74 /// } 75 /// scf.for %iv2 = %newUb to %ub { 76 /// (loop body) 77 /// } 78 /// ``` 79 /// 80 /// After loop peeling, this function tries to simplify affine.min and 81 /// affine.max ops in the body of the peeled loop and in the body of the partial 82 /// iteration loop, taking advantage of the fact that the peeled loop has only 83 /// "full" iterations. This simplification is expected to enable further 84 /// canonicalization opportunities through other patterns. 85 /// 86 /// The return value indicates whether the loop was rewritten or not. Loops are 87 /// not rewritten if: 88 /// * Loop step size is 1 or 89 /// * Loop bounds and step size are static, and step already divides the 90 /// iteration space evenly. 91 /// 92 /// Note: This function rewrites the given scf.for loop in-place and creates a 93 /// new scf.for operation for the last iteration. It replaces all uses of the 94 /// unpeeled loop with the results of the newly generated scf.for. 95 LogicalResult peelForLoopAndSimplifyBounds(RewriterBase &rewriter, ForOp forOp, 96 scf::ForOp &partialIteration); 97 98 /// Peel the first iteration out of the scf.for loop. If there is only one 99 /// iteration, return the original loop. 100 LogicalResult peelForLoopFirstIteration(RewriterBase &rewriter, ForOp forOp, 101 scf::ForOp &partialIteration); 102 103 /// Tile a parallel loop of the form 104 /// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) 105 /// step (%arg4, %arg5) 106 /// 107 /// into 108 /// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) 109 /// step (%arg4*tileSize[0], 110 /// %arg5*tileSize[1]) 111 /// scf.parallel (%j0, %j1) = (0, 0) to (min(tileSize[0], %arg2-%j0) 112 /// min(tileSize[1], %arg3-%j1)) 113 /// step (%arg4, %arg5) 114 /// The old loop is replaced with the new one. 115 /// 116 /// The function returns the resulting ParallelOps, i.e. {outer_loop_op, 117 /// inner_loop_op}. 118 std::pair<ParallelOp, ParallelOp> 119 tileParallelLoop(ParallelOp op, llvm::ArrayRef<int64_t> tileSizes, 120 bool noMinMaxBounds); 121 122 /// Options to dictate how loops should be pipelined. 123 struct PipeliningOption { 124 /// Lambda returning all the operation in the forOp, with their stage, in the 125 /// order picked for the pipelined loop. 126 using GetScheduleFnType = std::function<void( 127 scf::ForOp, std::vector<std::pair<Operation *, unsigned>> &)>; 128 GetScheduleFnType getScheduleFn = nullptr; 129 enum class PipelinerPart { 130 Prologue, 131 Kernel, 132 Epilogue, 133 }; 134 /// Lambda called by the pipeliner to allow the user to annotate the IR while 135 /// it is generated. 136 /// The callback passes the operation created along with the part of the 137 /// pipeline and the iteration index. The iteration index is always 0 for the 138 /// kernel. For the prologue and epilogue, it corresponds to the iteration 139 /// peeled out of the loop in the range [0, maxStage[. 140 using AnnotationlFnType = 141 std::function<void(Operation *, PipelinerPart, unsigned)>; 142 AnnotationlFnType annotateFn = nullptr; 143 144 /// Control whether the epilogue should be peeled out of the loop or 145 /// operations should be predicated to skip the early stages in the last loop 146 /// iterations. If the epilogue is predicated; the user needs to provide a 147 /// lambda to generate the predicated version of operations. 148 bool peelEpilogue = true; 149 150 /// Control whether the transformation checks that the number of iterations is 151 /// greater or equal to the number of stages and skip the transformation if 152 /// this is not the case. If the loop is dynamic and this is set to true and 153 /// the loop bounds are not static the pipeliner will have to predicate 154 /// operations in the the prologue/epilogue. 155 bool supportDynamicLoops = false; 156 157 // Callback to predicate operations when the prologue or epilogue are not 158 // peeled. This takes the original operation, an i1 predicate value and the 159 // pattern rewriter. It is expected to replace the given operation with 160 // the predicated equivalent and return it, or return nullptr if the 161 // predication is impossible. In the latter case, pipelining will fail and 162 // may leave IR in a partially transformed state. 163 using PredicateOpFn = 164 std::function<Operation *(RewriterBase &, Operation *, Value)>; 165 PredicateOpFn predicateFn = nullptr; 166 167 // TODO: add option to decide if the prologue should be peeled. 168 }; 169 170 /// Generate a pipelined version of the scf.for loop based on the schedule given 171 /// as option. This applies the mechanical transformation of changing the loop 172 /// and generating the prologue/epilogue for the pipelining and doesn't make any 173 /// decision regarding the schedule. 174 /// Based on the options the loop is split into several stages. 175 /// The transformation assumes that the scheduling given by user is valid. 176 /// For example if we break a loop into 3 stages named S0, S1, S2 we would 177 /// generate the following code with the number in parenthesis as the iteration 178 /// index: 179 /// 180 /// S0(0) // Prologue 181 /// S0(1) S1(0) // Prologue 182 /// scf.for %I = %C0 to %N - 2 { 183 /// S0(I+2) S1(I+1) S2(I) // Pipelined kernel 184 /// } 185 /// S1(N) S2(N-1) // Epilogue 186 /// S2(N) // Epilogue 187 /// 188 /// If `modifiedIR` is provided, it will be set to a value that indicates 189 /// whether pipelining modified the IR before failing, signaling to the caller 190 /// whether they can proceed with different transformations. 191 FailureOr<ForOp> pipelineForLoop(RewriterBase &rewriter, ForOp forOp, 192 const PipeliningOption &options, 193 bool *modifiedIR = nullptr); 194 195 /// Create zero-trip-check around a `while` op and return the new loop op in the 196 /// check. The while loop is rotated to avoid evaluating the condition twice 197 /// 198 /// By default the check won't be created for do-while loop as it is not 199 /// required. `forceCreateCheck` can force the creation. 200 /// 201 /// It turns: 202 /// 203 /// scf.while (%arg0 = %init) : (i32) -> i64 { 204 /// %val = .., %arg0 : i64 205 /// %cond = arith.cmpi .., %arg0 : i32 206 /// scf.condition(%cond) %val : i64 207 /// } do { 208 /// ^bb0(%arg1: i64): 209 /// %next = .., %arg1 : i32 210 /// scf.yield %next : i32 211 /// } 212 /// 213 /// into: 214 /// 215 /// %pre_val = .., %init : i64 216 /// %pre_cond = arith.cmpi .., %init : i32 217 /// scf.if %pre_cond -> i64 { 218 /// %res = scf.while (%arg1 = %va0) : (i64) -> i64 { 219 /// %next = .., %arg1 : i32 220 /// %val = .., %next : i64 221 /// %cond = arith.cmpi .., %next : i32 222 /// scf.condition(%cond) %val : i64 223 /// } do { 224 /// ^bb0(%arg2: i64): 225 /// %scf.yield %arg2 : i32 226 /// } 227 /// scf.yield %res : i64 228 /// } else { 229 /// scf.yield %pre_val : i64 230 /// } 231 /// 232 /// Failure mechanism is not implemented for this function, so it currently 233 /// always returns a `WhileOp` operation: a new one if the transformation took 234 /// place or the input `whileOp` if the loop was already in a `do-while` form 235 /// and `forceCreateCheck` is `false`. 236 FailureOr<WhileOp> wrapWhileLoopInZeroTripCheck(WhileOp whileOp, 237 RewriterBase &rewriter, 238 bool forceCreateCheck = false); 239 240 /// Try to uplift `scf.while` op to `scf.for`. 241 /// Uplifitng expects a specific ops pattern: 242 /// * `before` block consisting of single arith.cmp op 243 /// * `after` block containing arith.addi 244 FailureOr<ForOp> upliftWhileToForLoop(RewriterBase &rewriter, WhileOp loop); 245 246 } // namespace scf 247 } // namespace mlir 248 249 #endif // MLIR_DIALECT_SCF_TRANSFORMS_TRANSFORMS_H_ 250