//===- LoopUtils.h - Loop transformation utilities --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes for various loop transformation utility
// methods: these are not passes by themselves but are used either by passes,
// optimization sequences, or in turn by other transformation utilities.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
#define MLIR_DIALECT_AFFINE_LOOPUTILS_H

#include "mlir/IR/Block.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include <optional>

namespace mlir {
class AffineMap;
class LoopLikeOpInterface;
class OpBuilder;
class Value;
class ValueRange;

namespace func {
class FuncOp;
} // namespace func

namespace scf {
class ForOp;
class ParallelOp;
} // namespace scf

namespace affine {
class AffineForOp;
struct MemRefRegion;

/// Unrolls this for operation completely if the trip count is known to be
/// constant. Returns failure otherwise.
LogicalResult loopUnrollFull(AffineForOp forOp);

/// Unrolls this for operation by the specified unroll factor. Returns failure
/// if the loop cannot be unrolled either due to restrictions or due to invalid
/// unroll factors. Requires positive loop bounds and step. If specified,
/// annotates the ops in each unrolled iteration by applying `annotateFn`.
/// When `cleanUpUnroll` is true, the cleanup loop is unrolled regardless of
/// the unroll factor.
LogicalResult loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
    bool cleanUpUnroll = false);

/// Unrolls this loop by the specified unroll factor or its trip count,
/// whichever is lower.
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);

/// Returns true if `loops` is a perfectly nested loop nest, where the loops
/// appear from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);

/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested iff the first op in its body is
/// another AffineForOp and the second op is a terminator.
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                             AffineForOp root);

/// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
/// with iteration arguments performing supported reductions and its inner loops
/// can have iteration arguments. Returns success if the loop is successfully
/// unroll-jammed.
LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
                                    uint64_t unrollJamFactor);

/// Unrolls and jams this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
                                      uint64_t unrollJamFactor);
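
// Illustrative sketch (not part of this API): one way to drive the unroll
// entry points above. The walk, the helper name, and the "unroll_iter"
// attribute are made up for the example; only loopUnrollByFactor and its
// annotateFn callback are taken from this header.
//
// ```
// void unrollInnermostLoops(func::FuncOp f, uint64_t factor) {
//   SmallVector<AffineForOp> innermost;
//   // Collect loops whose bodies contain no further affine.for.
//   f.walk([&](AffineForOp forOp) {
//     bool hasInnerLoop = false;
//     forOp.getBody()->walk([&](AffineForOp) { hasInnerLoop = true; });
//     if (!hasInnerLoop)
//       innermost.push_back(forOp);
//   });
//   for (AffineForOp forOp : innermost)
//     (void)loopUnrollByFactor(
//         forOp, factor,
//         /*annotateFn=*/[](unsigned i, Operation *op, OpBuilder b) {
//           // Tag each unrolled copy with its iteration index.
//           op->setAttr("unroll_iter", b.getI32IntegerAttr(i));
//         });
// }
// ```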

/// Promotes the loop body of an AffineForOp to its containing block if the
/// loop is known to have a single iteration.
LogicalResult promoteIfSingleIteration(AffineForOp forOp);

/// Promotes all single-iteration AffineForOps in the function, i.e., moves
/// their bodies into the containing Block.
void promoteSingleIterationLoops(func::FuncOp f);

/// Skews the operations in an affine.for's body with the specified
/// operation-wise shifts. The shifts are with respect to the original execution
/// order, and are multiplied by the loop 'step' before being applied. If
/// `unrollPrologueEpilogue` is set, fully unrolls the prologue and epilogue
/// loops when possible.
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                  bool unrollPrologueEpilogue = false);

/// Identifies valid and profitable bands of loops to tile. This is currently
/// just a temporary placeholder to test the mechanics of tiled code
/// generation. Returns all maximal outermost perfect loop nests to tile.
void getTileableBands(func::FuncOp f,
                      std::vector<SmallVector<AffineForOp, 6>> *bands);

/// Tiles the specified band of perfectly nested loops, creating tile-space
/// loops and intra-tile loops. A band is a contiguous set of loops. This
/// utility doesn't check for the validity of tiling itself, but just performs
/// it.
LogicalResult
tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                    ArrayRef<unsigned> tileSizes,
                    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Tiles the specified band of perfectly nested loops, creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LogicalResult tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
/// and 'forOpB' are part of a perfectly nested sequence of loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);

/// Checks if the loop interchange permutation 'loopPermMap', of the perfectly
/// nested sequence of loops in 'loops', would violate dependences (loop 'i' in
/// 'loops' is mapped to location `j = loopPermMap[i]` in the interchange).
bool isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
                                       ArrayRef<unsigned> loopPermMap);

/// Performs a loop permutation on a perfectly nested loop nest `inputNest`
/// (where the contained loops appear from outer to inner) as specified by the
/// permutation `permMap`: loop 'i' in `inputNest` is mapped to location
/// `permMap[i]`, where positions 0, 1, ... run from the outermost position
/// inwards. Returns the position in `inputNest` of the AffineForOp that
/// becomes the new outermost loop of this nest. This method always succeeds
/// and asserts on invalid input / specifications.
unsigned permuteLoops(ArrayRef<AffineForOp> inputNest,
                      ArrayRef<unsigned> permMap);
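
// Illustrative sketch (not part of this API): validating and applying a
// rotation permutation so that the innermost loop of a perfect nest becomes
// the outermost one. The helper name and the rotation choice are hypothetical;
// the calls to getPerfectlyNestedLoops, isValidLoopInterchangePermutation, and
// permuteLoops are the declarations above.
//
// ```
// void rotateInnermostToOutermost(AffineForOp root) {
//   SmallVector<AffineForOp, 4> nest;
//   getPerfectlyNestedLoops(nest, root);
//   unsigned n = nest.size();
//   if (n < 2)
//     return;
//   // Map loop i to position (i + 1) % n: the last (innermost) loop lands at
//   // position 0, i.e., becomes the new outermost loop.
//   SmallVector<unsigned, 4> permMap(n);
//   for (unsigned i = 0; i < n; ++i)
//     permMap[i] = (i + 1) % n;
//   if (isValidLoopInterchangePermutation(nest, permMap))
//     (void)permuteLoops(nest, permMap);
// }
// ```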

// Sinks all sequential loops to the innermost levels (while preserving
// relative order among them) and moves all parallel loops to the
// outermost (while again preserving relative order among them).
// Returns the AffineForOp at the root of the new loop nest after the
// interchanges.
AffineForOp sinkSequentialLoops(AffineForOp forOp);

/// Performs tiling of imperfectly nested loops (with interchange) by
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new AffineForOps, one per (`forOps`, `targets`) pair, nested
/// immediately under each of the `targets`.
SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                 ArrayRef<uint64_t> sizes,
                                                 ArrayRef<AffineForOp> targets);

/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
/// Returns the new AffineForOps, one per `forOps`, nested immediately under
/// `target`.
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                 ArrayRef<uint64_t> sizes, AffineForOp target);

/// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
struct AffineCopyOptions {
  // True if DMAs should be generated instead of point-wise copies.
  bool generateDma;
  // The slower memory space from which data is to be moved.
  unsigned slowMemorySpace;
  // Memory space of the faster one (typically a scratchpad).
  unsigned fastMemorySpace;
  // Memory space to place tags in: only meaningful for DMAs.
  unsigned tagMemorySpace;
  // Capacity of the fast memory space in bytes.
  uint64_t fastMemCapacityBytes;
};

/// Performs explicit copying for the contiguous sequence of operations in the
/// block iterator range [`begin`, `end`), where `end` can't be past the
/// terminator of the block (since additional operations are potentially
/// inserted right before `end`). `copyOptions` provides various parameters, and
/// the output argument `copyNests` is the set of all copy nests inserted, each
/// represented by its root affine.for. Since we generate alloc's and dealloc's
/// for all fast buffers (before and after the range of operations resp., or at
/// a hoisted position), all of the fast memory capacity is assumed to be
/// available for processing this block range. When `filterMemRef` is specified,
/// copies are only generated for the provided MemRef. Returns success if the
/// explicit copying succeeded for all memrefs on which affine loads/stores were
/// encountered. For memrefs whose element types have a size in bytes that
/// can't be computed (e.g., `index` type), their capacity is not accounted for
/// and the `fastMemCapacityBytes` copy option is non-functional in such cases.
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// A convenience version of affineDataCopyGenerate for all ops in the body of
/// an AffineForOp.
LogicalResult affineDataCopyGenerate(AffineForOp forOp,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);
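
// Illustrative sketch (not part of this API): invoking the convenience form of
// affineDataCopyGenerate on a single loop. The memory-space numbers and the
// 32 KiB capacity are made-up values for the example; AffineCopyOptions and
// the call itself are the declarations above.
//
// ```
// void generatePointwiseCopies(AffineForOp forOp) {
//   AffineCopyOptions options;
//   options.generateDma = false;       // point-wise copies, no DMA
//   options.slowMemorySpace = 0;       // assumed default/global space
//   options.fastMemorySpace = 1;       // assumed scratchpad space
//   options.tagMemorySpace = 0;        // unused when generateDma is false
//   options.fastMemCapacityBytes = 32 * 1024;
//   DenseSet<Operation *> copyNests;
//   (void)affineDataCopyGenerate(forOp, options,
//                                /*filterMemRef=*/std::nullopt, copyNests);
// }
// ```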

/// Result for calling generateCopyForMemRegion.
struct CopyGenerateResult {
  // Number of bytes used by alloc.
  uint64_t sizeInBytes;

  // The newly created buffer allocation.
  Operation *alloc;

  // Generated loop nest for copying data between the allocated buffer and the
  // original memref.
  Operation *copyNest;
};

/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
/// with a single memref region. `memrefRegion` is supposed to contain analysis
/// information within `analyzedOp`. The generated prologue and epilogue always
/// surround `analyzedOp`.
///
/// Note that `analyzedOp` is a single op for API convenience, and the
/// [begin, end) version can be added as needed.
///
/// Also note that certain options in `copyOptions` aren't looked at anymore,
/// like slowMemorySpace.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
                                       Operation *analyzedOp,
                                       const AffineCopyOptions &copyOptions,
                                       CopyGenerateResult &result);

/// Replaces a perfect nest of "for" loops with a single linearized loop.
/// Assumes `loops` contains a list of perfectly nested loops, outermost to
/// innermost, that are normalized (step one and lower bound of zero) and with
/// bounds and steps independent of any loop induction variable involved in the
/// nest. Coalescing affine.for loops is not always possible, i.e., the result
/// may not be representable using affine.for.
LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);

/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
/// size given by `numProcessors`. This is achieved by embedding the SSA values
/// corresponding to `processorIds` and `numProcessors` into the bounds and
/// step of the `forOp`. No check is performed on the legality of the rewrite;
/// it is the caller's responsibility to ensure legality.
///
/// Requires that `processorIds` and `numProcessors` have the same size and
/// that, for each idx, `processorIds`[idx] takes, at runtime, all values
/// between 0 and `numProcessors`[idx] - 1. This corresponds to traditional use
/// cases for:
///   1. GPU (threadIdx, get_local_id(), ...)
///   2. MPI (MPI_Comm_rank)
///   3. OpenMP (omp_get_thread_num)
///
/// Example:
/// Assuming a 2-d grid with processorIds = [blockIdx.x, threadIdx.x] and
/// numProcessors = [gridDim.x, blockDim.x], the loop:
///
/// ```
///    scf.for %i = %lb to %ub step %step {
///      ...
///    }
/// ```
///
/// is rewritten into a version resembling the following pseudo-IR:
///
/// ```
///    scf.for %i = %lb + %step * (threadIdx.x + blockIdx.x * blockDim.x)
///       to %ub step %gridDim.x * blockDim.x * %step {
///      ...
///    }
/// ```
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
                           ArrayRef<Value> numProcessors);

/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
void gatherLoops(func::FuncOp func,
                 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops);
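
// Illustrative sketch (not part of this API): using gatherLoops to report how
// many affine.for ops occur at each nesting depth of a function. The helper
// name and the llvm::errs() reporting are made up for the example.
//
// ```
// void reportLoopDepths(func::FuncOp f) {
//   std::vector<SmallVector<AffineForOp, 2>> depthToLoops;
//   gatherLoops(f, depthToLoops);
//   for (unsigned d = 0, e = depthToLoops.size(); d < e; ++d)
//     llvm::errs() << "depth " << d << ": " << depthToLoops[d].size()
//                  << " loops\n";
// }
// ```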

/// Creates an AffineForOp while ensuring that the lower and upper bounds are
/// canonicalized, i.e., unused and duplicate operands are removed, any
/// constant operands are propagated/folded in, and duplicate bound maps are
/// dropped.
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc,
                                           ValueRange lbOperands,
                                           AffineMap lbMap,
                                           ValueRange ubOperands,
                                           AffineMap ubMap, int64_t step = 1);

/// Separates full tiles from partial tiles for a perfect nest `nest` by
/// generating a conditional guard that selects between the full tile version
/// and the partial tile version using an AffineIfOp. The original loop nest
/// is replaced by this guarded two-version form.
///
///    affine.if (cond)
///      // full_tile
///    else
///      // partial tile
///
LogicalResult
separateFullTiles(MutableArrayRef<AffineForOp> nest,
                  SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);

/// Walks an affine.for to find a band to coalesce.
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);

/// Counts the number of loops surrounding `operand` that the operand could be
/// hoisted above.
/// Stops counting at the first loop over which the operand cannot be hoisted.
/// This counts any LoopLikeOpInterface, not just affine.for.
int64_t numEnclosingInvariantLoops(OpOperand &operand);
} // namespace affine
} // namespace mlir

#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H