//===-- Passes.td - Affine pass definition file ------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains definitions for passes within the Affine/ directory.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AFFINE_PASSES
#define MLIR_DIALECT_AFFINE_PASSES

include "mlir/Pass/PassBase.td"

def AffineDataCopyGeneration : Pass<"affine-data-copy-generate", "func::FuncOp"> {
  let summary = "Generate explicit copying for affine memory operations";
  let constructor = "mlir::affine::createAffineDataCopyGenerationPass()";
  let dependentDialects = ["memref::MemRefDialect"];
  let options = [
    Option<"fastMemoryCapacity", "fast-mem-capacity", "uint64_t",
           /*default=*/"std::numeric_limits<uint64_t>::max()",
           "Set fast memory space capacity in KiB (default: unlimited)">,
    Option<"fastMemorySpace", "fast-mem-space", "unsigned",
           /*default=*/"1",
           "Fast memory space identifier for copy generation (default: 1)">,
    Option<"generateDma", "generate-dma", "bool",
           /*default=*/"true", "Generate DMA instead of point-wise copy">,
    Option<"minDmaTransferSize", "min-dma-transfer", "int",
           /*default=*/"1024",
           "Minimum DMA transfer size supported by the target in bytes">,
    Option<"slowMemorySpace", "slow-mem-space", "unsigned",
           /*default=*/"0",
           "Slow memory space identifier for copy generation (default: 0)">,
    Option<"skipNonUnitStrideLoops", "skip-non-unit-stride-loops", "bool",
           /*default=*/"false", "Testing purposes: avoid non-unit stride loop "
                                "choice depths for copy placement">,
    Option<"tagMemorySpace", "tag-mem-space", "unsigned",
           /*default=*/"0",
           "Tag memory space identifier for copy generation (default: 0)">,
  ];
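  let description = [{
    This pass introduces explicit copies (or DMAs, depending on `generate-dma`)
    of the data accessed by affine loop nests, moving it between a slow memory
    space and a fast memory space and rewriting the nest to use the fast
    buffers. A minimal illustrative sketch, not verbatim pass output (it
    assumes `generate-dma=false`, `fast-mem-space=1`, and placeholder names
    such as `%A` and the `"compute"` op):

    ```mlir
    // Original nest reading from the slow memory space 0.
    affine.for %i = 0 to 256 {
      %v = affine.load %A[%i] : memref<256xf32>
      "compute"(%v) : (f32) -> ()
    }
    ```

    is rewritten to stage the accessed region through a buffer placed in memory
    space 1, roughly as:

    ```mlir
    // Buffer in the fast memory space and a point-wise copy-in loop.
    %fast = memref.alloc() : memref<256xf32, 1>
    affine.for %i = 0 to 256 {
      %t = affine.load %A[%i] : memref<256xf32>
      affine.store %t, %fast[%i] : memref<256xf32, 1>
    }
    // The compute loop now reads from the fast buffer.
    affine.for %i = 0 to 256 {
      %v = affine.load %fast[%i] : memref<256xf32, 1>
      "compute"(%v) : (f32) -> ()
    }
    memref.dealloc %fast : memref<256xf32, 1>
    ```
  }];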
}

def AffineLoopFusion : Pass<"affine-loop-fusion"> {
  let summary = "Fuse affine loop nests";
  let description = [{
    This pass performs fusion of loop nests using a slicing-based approach. The
    transformation works on an MLIR `Block` granularity and applies to all
    blocks the pass is run on. It combines two fusion strategies:
    producer-consumer fusion and sibling fusion. Producer-consumer fusion is
    aimed at fusing pairs of loops where the first one writes to a memref that
    the second reads. Sibling fusion targets pairs of loops that share no
    dependences between them but that load from the same memref. The fused loop
    nests, when possible, are rewritten to access significantly smaller local
    buffers instead of the original memref's, and the latter are often either
    completely optimized away or contracted. This transformation leads to
    enhanced locality and a lower memory footprint through the elimination or
    contraction of temporaries/intermediate memref's. These benefits are
    sometimes achieved at the expense of redundant computation, bounded by a
    cost model that evaluates available choices such as the depth at which a
    source slice should be materialized in the destination slice.

    Example 1: Producer-consumer fusion.
    Input:
    ```mlir
    func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
      %0 = memref.alloc() : memref<10xf32>
      %1 = memref.alloc() : memref<10xf32>
      %cst = arith.constant 0.000000e+00 : f32
      affine.for %arg2 = 0 to 10 {
        affine.store %cst, %0[%arg2] : memref<10xf32>
        affine.store %cst, %1[%arg2] : memref<10xf32>
      }
      affine.for %arg2 = 0 to 10 {
        %2 = affine.load %0[%arg2] : memref<10xf32>
        %3 = arith.addf %2, %2 : f32
        affine.store %3, %arg0[%arg2] : memref<10xf32>
      }
      affine.for %arg2 = 0 to 10 {
        %2 = affine.load %1[%arg2] : memref<10xf32>
        %3 = arith.mulf %2, %2 : f32
        affine.store %3, %arg1[%arg2] : memref<10xf32>
      }
      return
    }
    ```
    Output:
    ```mlir
    func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
      %0 = memref.alloc() : memref<1xf32>
      %1 = memref.alloc() : memref<1xf32>
      %cst = arith.constant 0.000000e+00 : f32
      affine.for %arg2 = 0 to 10 {
        affine.store %cst, %0[0] : memref<1xf32>
        affine.store %cst, %1[0] : memref<1xf32>
        %2 = affine.load %1[0] : memref<1xf32>
        %3 = arith.mulf %2, %2 : f32
        affine.store %3, %arg1[%arg2] : memref<10xf32>
        %4 = affine.load %0[0] : memref<1xf32>
        %5 = arith.addf %4, %4 : f32
        affine.store %5, %arg0[%arg2] : memref<10xf32>
      }
      return
    }
    ```

    Example 2: Sibling fusion.
    Input:
    ```mlir
    func.func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
                         %arg4: memref<10x10xf32>) {
      affine.for %arg5 = 0 to 3 {
        affine.for %arg6 = 0 to 3 {
          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
          %2 = arith.mulf %0, %1 : f32
          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
        }
      }
      affine.for %arg5 = 0 to 3 {
        affine.for %arg6 = 0 to 3 {
          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
          %1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
          %2 = arith.addf %0, %1 : f32
          affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
        }
      }
      return
    }
    ```
    Output:
    ```mlir
    func.func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
                         %arg4: memref<10x10xf32>) {
      affine.for %arg5 = 0 to 3 {
        affine.for %arg6 = 0 to 3 {
          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
          %2 = arith.mulf %0, %1 : f32
          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
          %3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
          %4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
          %5 = arith.addf %3, %4 : f32
          affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
        }
      }
      return
    }
    ```
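
    As a usage sketch (the flags correspond to this pass's options), the pass
    can be exercised from `mlir-opt`, e.g.
    `mlir-opt --affine-loop-fusion="mode=producer fusion-maximal=true" input.mlir`.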
  }];
  let constructor = "mlir::affine::createLoopFusionPass()";
  let options = [
    Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
           /*default=*/"0.30f", "Fractional increase in additional computation "
                                "tolerated while fusing">,
    Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
           /*default=*/"0",
           "Faster memory space number to promote fusion buffers to">,
    Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
           /*default=*/"0", "Threshold size (KiB) for promoting local buffers "
                            "to fast memory space">,
    Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
           "Enables maximal loop fusion">,
    Option<"affineFusionMode", "mode", "enum FusionMode",
           "mlir::affine::FusionMode::Greedy", "fusion mode to attempt",
           "llvm::cl::values(clEnumValN(mlir::affine::FusionMode::Greedy,"
           " \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
           "clEnumValN(mlir::affine::FusionMode::ProducerConsumer, "
           "\"producer\", \"Perform only producer-consumer fusion\"), "
           "clEnumValN(mlir::affine::FusionMode::Sibling, "
           "\"sibling\", \"Perform only sibling fusion\"))">,
  ];
  let dependentDialects = ["memref::MemRefDialect"];
}

def AffineLoopInvariantCodeMotion
    : Pass<"affine-loop-invariant-code-motion", "func::FuncOp"> {
  let summary = "Hoist loop invariant instructions outside of affine loops";
  let constructor = "mlir::affine::createAffineLoopInvariantCodeMotionPass()";
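  let description = [{
    A minimal illustrative sketch, not verbatim pass output (memref and value
    names are placeholders): operations whose operands are defined outside the
    loop, such as the `arith.mulf` in

    ```mlir
    affine.for %i = 0 to 10 {
      // %a and %b are defined above the loop, so this product is invariant.
      %inv = arith.mulf %a, %b : f32
      affine.store %inv, %m[%i] : memref<10xf32>
    }
    ```

    are hoisted above the `affine.for`, leaving only the store inside the loop
    body.
  }];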
}

def AffineLoopTiling : Pass<"affine-loop-tile", "func::FuncOp"> {
  let summary = "Tile affine loop nests";
  let constructor = "mlir::affine::createLoopTilingPass()";
  let options = [
    Option<"cacheSizeInKiB", "cache-size", "uint64_t", /*default=*/"512",
           "Set size of cache to tile for in KiB (default: 512)">,
    Option<"separate", "separate", "bool", /*default=*/"false",
           "Separate full and partial tiles (default: false)">,
    Option<"tileSize", "tile-size", "unsigned", /*default=*/"",
           "Use this tile size for all loops">,
    ListOption<"tileSizes", "tile-sizes", "unsigned",
               "List of tile sizes for each perfect nest "
               "(overridden by -tile-size)">,
  ];
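  let description = [{
    A minimal illustrative sketch, not verbatim pass output (the `"compute"` op
    is a placeholder): with a tile size of 32, a loop

    ```mlir
    affine.for %i = 0 to 1024 {
      "compute"(%i) : (index) -> ()
    }
    ```

    is rewritten into an inter-tile loop stepping over tiles and an intra-tile
    loop iterating within each tile, roughly:

    ```mlir
    #lb = affine_map<(d0) -> (d0)>
    #ub = affine_map<(d0) -> (d0 + 32)>
    affine.for %ii = 0 to 1024 step 32 {
      affine.for %i = #lb(%ii) to #ub(%ii) {
        "compute"(%i) : (index) -> ()
      }
    }
    ```

    Tile sizes come from `tile-size`/`tile-sizes` when given; otherwise they
    are derived from a model that targets the `cache-size` option.
  }];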
}

def AffineLoopUnroll : Pass<"affine-loop-unroll", "func::FuncOp"> {
  let summary = "Unroll affine loops";
  let constructor = "mlir::affine::createLoopUnrollPass()";
  let options = [
    Option<"unrollFactor", "unroll-factor", "unsigned", /*default=*/"4",
           "Use this unroll factor for all loops being unrolled">,
    Option<"unrollUpToFactor", "unroll-up-to-factor", "bool",
           /*default=*/"false", "Allow unrolling up to the factor specified">,
    Option<"unrollFull", "unroll-full", "bool", /*default=*/"false",
           "Fully unroll loops">,
    Option<"numRepetitions", "unroll-num-reps", "unsigned", /*default=*/"1",
           "Unroll innermost loops repeatedly this many times">,
    Option<"unrollFullThreshold", "unroll-full-threshold", "unsigned",
           /*default=*/"1",
           "Unroll all loops with trip count less than or equal to this">,
    Option<"cleanUpUnroll", "cleanup-unroll", "bool", /*default=*/"false",
           "Fully unroll the cleanup loop when possible.">,
  ];
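  let description = [{
    A minimal illustrative sketch, not verbatim pass output (the `"compute"` op
    is a placeholder): with `unroll-factor=4`, a loop

    ```mlir
    affine.for %i = 0 to 16 {
      "compute"(%i) : (index) -> ()
    }
    ```

    is rewritten to step by 4 and replicate the body for the in-between
    iterations, roughly:

    ```mlir
    affine.for %i = 0 to 16 step 4 {
      "compute"(%i) : (index) -> ()
      %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
      "compute"(%i1) : (index) -> ()
      %i2 = affine.apply affine_map<(d0) -> (d0 + 2)>(%i)
      "compute"(%i2) : (index) -> ()
      %i3 = affine.apply affine_map<(d0) -> (d0 + 3)>(%i)
      "compute"(%i3) : (index) -> ()
    }
    ```

    The remaining options control full unrolling, repeated unrolling of
    innermost loops, and handling of the cleanup loop.
  }];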
}

def AffineLoopUnrollAndJam : Pass<"affine-loop-unroll-jam", "func::FuncOp"> {
  let summary = "Unroll and jam affine loops";
  let constructor = "mlir::affine::createLoopUnrollAndJamPass()";
  let options = [
    Option<"unrollJamFactor", "unroll-jam-factor", "unsigned",
           /*default=*/"4",
           "Use this unroll jam factor for all loops (default 4)">,
  ];
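  let description = [{
    A minimal illustrative sketch, not verbatim pass output (the `"compute"` op
    is a placeholder): with an unroll-jam factor of 2, the outer loop of

    ```mlir
    affine.for %i = 0 to 8 {
      affine.for %j = 0 to 8 {
        "compute"(%i, %j) : (index, index) -> ()
      }
    }
    ```

    is unrolled by 2 and the resulting copies of the inner loop are jammed into
    a single inner loop, roughly:

    ```mlir
    affine.for %i = 0 to 8 step 2 {
      affine.for %j = 0 to 8 {
        "compute"(%i, %j) : (index, index) -> ()
        %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
        "compute"(%i1, %j) : (index, index) -> ()
      }
    }
    ```
  }];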
}

def AffinePipelineDataTransfer
    : Pass<"affine-pipeline-data-transfer", "func::FuncOp"> {
  let summary = "Pipeline non-blocking data transfers between explicitly "
                "managed levels of the memory hierarchy";
  let description = [{
    This pass performs a transformation to overlap non-blocking DMA operations
    in a loop with computations through double buffering. This is achieved by
    advancing dma_start operations with respect to other operations.

    Input

    ```mlir
    func.func @pipelinedatatransfer() {
      %0 = memref.alloc() : memref<256xf32>
      %1 = memref.alloc() : memref<32xf32, 1>
      %2 = memref.alloc() : memref<1xf32>
      %c0 = arith.constant 0 : index
      %c128 = arith.constant 128 : index
      affine.for %i0 = 0 to 8 {
        affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
        affine.dma_wait %2[%c0], %c128 : memref<1xf32>
        %3 = affine.load %1[%i0] : memref<32xf32, 1>
        %4 = "compute"(%3) : (f32) -> f32
        affine.store %4, %1[%i0] : memref<32xf32, 1>
      }
      return
    }
    ```

    Output

    ```mlir
    module {
      func.func @pipelinedatatransfer() {
        %c8 = arith.constant 8 : index
        %c0 = arith.constant 0 : index
        %0 = memref.alloc() : memref<256xf32>
        %c0_0 = arith.constant 0 : index
        %c128 = arith.constant 128 : index
        %1 = memref.alloc() : memref<2x32xf32, 1>
        %2 = memref.alloc() : memref<2x1xf32>
        affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
        affine.for %arg0 = 1 to 8 {
          affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
          %8 = affine.apply #map3(%arg0)
          %9 = affine.apply #map4(%8)
          %10 = affine.apply #map4(%8)
          affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
          %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
          %12 = "compute"(%11) : (f32) -> f32
          affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
        }
        %3 = affine.apply #map3(%c8)
        %4 = affine.apply #map4(%3)
        %5 = affine.apply #map4(%3)
        affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
        %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
        %7 = "compute"(%6) : (f32) -> f32
        affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
        memref.dealloc %2 : memref<2x1xf32>
        memref.dealloc %1 : memref<2x32xf32, 1>
        return
      }
    }
    ```
  }];
  let constructor = "mlir::affine::createPipelineDataTransferPass()";
}

def AffineScalarReplacement : Pass<"affine-scalrep", "func::FuncOp"> {
  let summary = "Replace affine memref accesses with scalars by forwarding "
                "stores to loads and eliminating redundant loads";
  let description = [{
    This pass performs store to load forwarding and redundant load elimination
    for affine memref accesses and potentially eliminates the entire memref
    if all its accesses are forwarded.

    Input

    ```mlir
    func.func @store_load_affine_apply() -> memref<10x10xf32> {
      %cf7 = arith.constant 7.0 : f32
      %m = memref.alloc() : memref<10x10xf32>
      affine.for %i0 = 0 to 10 {
        affine.for %i1 = 0 to 10 {
          affine.store %cf7, %m[%i0, %i1] : memref<10x10xf32>
          %v0 = affine.load %m[%i0, %i1] : memref<10x10xf32>
          %v1 = arith.addf %v0, %v0 : f32
        }
      }
      return %m : memref<10x10xf32>
    }
    ```

    Output

    ```mlir
    module {
      func.func @store_load_affine_apply() -> memref<10x10xf32> {
        %cst = arith.constant 7.000000e+00 : f32
        %0 = memref.alloc() : memref<10x10xf32>
        affine.for %arg0 = 0 to 10 {
          affine.for %arg1 = 0 to 10 {
            affine.store %cst, %0[%arg0, %arg1] : memref<10x10xf32>
            %1 = arith.addf %cst, %cst : f32
          }
        }
        return %0 : memref<10x10xf32>
      }
    }
    ```
  }];
  let constructor = "mlir::affine::createAffineScalarReplacementPass()";
}

def AffineVectorize : Pass<"affine-super-vectorize", "func::FuncOp"> {
  let summary = "Vectorize to a target independent n-D vector abstraction";
  let dependentDialects = ["vector::VectorDialect"];
  let options = [
    ListOption<"vectorSizes", "virtual-vector-size", "int64_t",
               "Specify an n-D virtual vector size for vectorization. "
               "This must be greater than zero.">,
    // Optionally, the fixed mapping from loop to fastest varying MemRef
    // dimension for all the MemRefs within a loop pattern:
    //   the index represents the loop depth, the value represents the k^th
    //   fastest varying memory dimension.
    // This is voluntarily restrictive and is meant to precisely target a
    // particular loop/op pair, for testing purposes.
    ListOption<"fastestVaryingPattern", "test-fastest-varying", "int64_t",
               "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
               "dimensions to match. See defaultPatterns in Vectorize.cpp for "
               "a description and examples. This is used for testing purposes">,
    Option<"vectorizeReductions", "vectorize-reductions", "bool",
           /*default=*/"false",
           "Vectorize known reductions expressed via iter_args. "
           "Switched off by default.">
  ];
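  let description = [{
    A minimal illustrative sketch, not verbatim pass output (memref names are
    placeholders): with `virtual-vector-size=128`, a scalar loop such as

    ```mlir
    affine.for %i = 0 to 1024 {
      %a = affine.load %A[%i] : memref<1024xf32>
      %b = affine.load %B[%i] : memref<1024xf32>
      %c = arith.addf %a, %b : f32
      affine.store %c, %C[%i] : memref<1024xf32>
    }
    ```

    is rewritten to step by the vector size and operate on 128-element virtual
    vectors, roughly:

    ```mlir
    // Padding value for the vector transfers.
    %pad = arith.constant 0.000000e+00 : f32
    affine.for %i = 0 to 1024 step 128 {
      %a = vector.transfer_read %A[%i], %pad : memref<1024xf32>, vector<128xf32>
      %b = vector.transfer_read %B[%i], %pad : memref<1024xf32>, vector<128xf32>
      %c = arith.addf %a, %b : vector<128xf32>
      vector.transfer_write %c, %C[%i] : vector<128xf32>, memref<1024xf32>
    }
    ```
  }];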
}

def AffineParallelize : Pass<"affine-parallelize", "func::FuncOp"> {
  let summary = "Convert affine.for ops into 1-D affine.parallel";
  let constructor = "mlir::affine::createAffineParallelizePass()";
  let options = [
    Option<"maxNested", "max-nested", "unsigned", /*default=*/"-1u",
           "Maximum number of nested parallel loops to produce. "
           "Defaults to unlimited (UINT_MAX).">,
    Option<"parallelReductions", "parallel-reductions", "bool",
           /*default=*/"false",
           "Whether to parallelize reduction loops. Defaults to false.">
  ];
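  let description = [{
    A minimal illustrative sketch, not verbatim pass output: an `affine.for`
    loop that carries no dependences, such as

    ```mlir
    affine.for %i = 0 to 100 {
      affine.store %cst, %m[%i] : memref<100xf32>
    }
    ```

    is rewritten into a 1-D `affine.parallel` op:

    ```mlir
    affine.parallel (%i) = (0) to (100) {
      affine.store %cst, %m[%i] : memref<100xf32>
    }
    ```

    With `parallel-reductions`, loops whose `iter_args` express supported
    reductions are converted as well.
  }];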
}

def AffineLoopNormalize : Pass<"affine-loop-normalize", "func::FuncOp"> {
  let summary = "Apply normalization transformations to affine loop-like ops";
  let constructor = "mlir::affine::createAffineLoopNormalizePass()";
  let options = [
    Option<"promoteSingleIter", "promote-single-iter", "bool",
           /*default=*/"true", "Promote single iteration loops">,
  ];
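  let description = [{
    A minimal illustrative sketch, not verbatim pass output (the `"compute"` op
    is a placeholder): loops are rewritten to have a zero lower bound and a
    unit step, with uses of the induction variable remapped accordingly, e.g.

    ```mlir
    affine.for %i = 2 to 32 step 2 {
      "compute"(%i) : (index) -> ()
    }
    ```

    becomes roughly

    ```mlir
    affine.for %i = 0 to 15 {
      %i0 = affine.apply affine_map<(d0) -> (d0 * 2 + 2)>(%i)
      "compute"(%i0) : (index) -> ()
    }
    ```

    With `promote-single-iter`, single-iteration loops are promoted into their
    enclosing block.
  }];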
}

def LoopCoalescing : Pass<"affine-loop-coalescing", "func::FuncOp"> {
  let summary = "Coalesce nested loops with independent bounds into a single "
                "loop";
  let constructor = "mlir::affine::createLoopCoalescingPass()";
  let dependentDialects = ["affine::AffineDialect", "arith::ArithDialect"];
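  let description = [{
    A minimal illustrative sketch, not verbatim pass output (the `"compute"` op
    is a placeholder): a perfect nest whose bounds do not depend on each other,
    such as

    ```mlir
    affine.for %i = 0 to 16 {
      affine.for %j = 0 to 64 {
        "compute"(%i, %j) : (index, index) -> ()
      }
    }
    ```

    is collapsed into a single loop over the product of the trip counts, with
    the original induction variables recovered from the coalesced one, roughly:

    ```mlir
    affine.for %k = 0 to 1024 {
      %i = affine.apply affine_map<(d0) -> (d0 floordiv 64)>(%k)
      %j = affine.apply affine_map<(d0) -> (d0 mod 64)>(%k)
      "compute"(%i, %j) : (index, index) -> ()
    }
    ```
  }];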
}

def SimplifyAffineStructures : Pass<"affine-simplify-structures", "func::FuncOp"> {
  let summary = "Simplify affine expressions in maps/sets and normalize "
                "memrefs";
  let constructor = "mlir::affine::createSimplifyAffineStructuresPass()";
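  let description = [{
    A minimal illustrative sketch, not verbatim pass output: affine maps and
    integer sets appearing in operations are simplified, and memrefs with
    non-trivial layout maps are normalized where possible. For example,

    ```mlir
    %0 = affine.apply affine_map<(d0)[s0] -> (d0 + s0 - s0)>(%i)[%n]
    ```

    simplifies to the identity form

    ```mlir
    %0 = affine.apply affine_map<(d0)[s0] -> (d0)>(%i)[%n]
    ```
  }];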
}

def AffineExpandIndexOps : Pass<"affine-expand-index-ops"> {
  let summary = "Lower affine operations operating on indices into more fundamental operations";
  let constructor = "mlir::affine::createAffineExpandIndexOpsPass()";
}

def AffineExpandIndexOpsAsAffine : Pass<"affine-expand-index-ops-as-affine"> {
  let summary = "Lower affine operations operating on indices into affine.apply operations";
  let constructor = "mlir::affine::createAffineExpandIndexOpsAsAffinePass()";
}

#endif // MLIR_DIALECT_AFFINE_PASSES