1//===-- Passes.td - Affine pass definition file ------------*- tablegen -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This file contains definitions for passes within the Affine/ directory. 10// 11//===----------------------------------------------------------------------===// 12 13#ifndef MLIR_DIALECT_AFFINE_PASSES 14#define MLIR_DIALECT_AFFINE_PASSES 15 16include "mlir/Pass/PassBase.td" 17 18def AffineDataCopyGeneration : Pass<"affine-data-copy-generate", "func::FuncOp"> { 19 let summary = "Generate explicit copying for affine memory operations"; 20 let constructor = "mlir::affine::createAffineDataCopyGenerationPass()"; 21 let dependentDialects = ["memref::MemRefDialect"]; 22 let options = [ 23 Option<"fastMemoryCapacity", "fast-mem-capacity", "uint64_t", 24 /*default=*/"std::numeric_limits<uint64_t>::max()", 25 "Set fast memory space capacity in KiB (default: unlimited)">, 26 Option<"fastMemorySpace", "fast-mem-space", "unsigned", 27 /*default=*/"1", 28 "Fast memory space identifier for copy generation (default: 1)">, 29 Option<"generateDma", "generate-dma", "bool", 30 /*default=*/"true", "Generate DMA instead of point-wise copy">, 31 Option<"minDmaTransferSize", "min-dma-transfer", "int", 32 /*default=*/"1024", 33 "Minimum DMA transfer size supported by the target in bytes">, 34 Option<"slowMemorySpace", "slow-mem-space", "unsigned", 35 /*default=*/"0", 36 "Slow memory space identifier for copy generation (default: 0)">, 37 Option<"skipNonUnitStrideLoops", "skip-non-unit-stride-loops", "bool", 38 /*default=*/"false", "Testing purposes: avoid non-unit stride loop " 39 "choice depths for copy placement">, 40 Option<"tagMemorySpace", "tag-mem-space", "unsigned", 41 /*default=*/"0", 42 "Tag memory space identifier for copy generation (default: 0)">, 43 ]; 44} 45 46def AffineLoopFusion : Pass<"affine-loop-fusion"> { 47 let summary = "Fuse affine loop nests"; 48 let description = [{ 49 This pass performs fusion of loop nests using a slicing-based approach. The 50 transformation works on an MLIR `Block` granularity and applies to all 51 blocks of the pass is run on. It combines two fusion strategies: 52 producer-consumer fusion and sibling fusion. Producer-consumer fusion is 53 aimed at fusing pairs of loops where the first one writes to a memref that 54 the second reads. Sibling fusion targets pairs of loops that share no 55 dependences between them but that load from the same memref. The fused loop 56 nests, when possible, are rewritten to access significantly smaller local 57 buffers instead of the original memref's, and the latter are often either 58 completely optimized away or contracted. This transformation leads to 59 enhanced locality and lower memory footprint through the elimination or 60 contraction of temporaries/intermediate memref's. These benefits are 61 sometimes achieved at the expense of redundant computation through a cost 62 model that evaluates available choices such as the depth at which a source 63 slice should be materialized in the designation slice. 64 65 Example 1: Producer-consumer fusion. 66 Input: 67 ```mlir 68 func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) { 69 %0 = memref.alloc() : memref<10xf32> 70 %1 = memref.alloc() : memref<10xf32> 71 %cst = arith.constant 0.000000e+00 : f32 72 affine.for %arg2 = 0 to 10 { 73 affine.store %cst, %0[%arg2] : memref<10xf32> 74 affine.store %cst, %1[%arg2] : memref<10xf32> 75 } 76 affine.for %arg2 = 0 to 10 { 77 %2 = affine.load %0[%arg2] : memref<10xf32> 78 %3 = arith.addf %2, %2 : f32 79 affine.store %3, %arg0[%arg2] : memref<10xf32> 80 } 81 affine.for %arg2 = 0 to 10 { 82 %2 = affine.load %1[%arg2] : memref<10xf32> 83 %3 = arith.mulf %2, %2 : f32 84 affine.store %3, %arg1[%arg2] : memref<10xf32> 85 } 86 return 87 } 88 ``` 89 Output: 90 ```mlir 91 func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) { 92 %0 = memref.alloc() : memref<1xf32> 93 %1 = memref.alloc() : memref<1xf32> 94 %cst = arith.constant 0.000000e+00 : f32 95 affine.for %arg2 = 0 to 10 { 96 affine.store %cst, %0[0] : memref<1xf32> 97 affine.store %cst, %1[0] : memref<1xf32> 98 %2 = affine.load %1[0] : memref<1xf32> 99 %3 = arith.mulf %2, %2 : f32 100 affine.store %3, %arg1[%arg2] : memref<10xf32> 101 %4 = affine.load %0[0] : memref<1xf32> 102 %5 = arith.addf %4, %4 : f32 103 affine.store %5, %arg0[%arg2] : memref<10xf32> 104 } 105 return 106 } 107 ``` 108 109 Example 2: Sibling fusion. 110 Input: 111 ```mlir 112 func.func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>, 113 %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>, 114 %arg4: memref<10x10xf32>) { 115 affine.for %arg5 = 0 to 3 { 116 affine.for %arg6 = 0 to 3 { 117 %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32> 118 %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32> 119 %2 = arith.mulf %0, %1 : f32 120 affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32> 121 } 122 } 123 affine.for %arg5 = 0 to 3 { 124 affine.for %arg6 = 0 to 3 { 125 %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32> 126 %1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32> 127 %2 = arith.addf %0, %1 : f32 128 affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32> 129 } 130 } 131 return 132 } 133 ``` 134 Output: 135 ```mlir 136 func.func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>, 137 %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>, 138 %arg4: memref<10x10xf32>) { 139 affine.for %arg5 = 0 to 3 { 140 affine.for %arg6 = 0 to 3 { 141 %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32> 142 %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32> 143 %2 = arith.mulf %0, %1 : f32 144 affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32> 145 %3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32> 146 %4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32> 147 %5 = arith.addf %3, %4 : f32 148 affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32> 149 } 150 } 151 return 152 } 153 ``` 154 }]; 155 let constructor = "mlir::affine::createLoopFusionPass()"; 156 let options = [ 157 Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double", 158 /*default=*/"0.30f", "Fractional increase in additional computation " 159 "tolerated while fusing">, 160 Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned", 161 /*default=*/"0", 162 "Faster memory space number to promote fusion buffers to">, 163 Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t", 164 /*default=*/"0", "Threshold size (KiB) for promoting local buffers " 165 "to fast memory space">, 166 Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false", 167 "Enables maximal loop fusion">, 168 Option<"affineFusionMode", "mode", "enum FusionMode", 169 "mlir::affine::FusionMode::Greedy", "fusion mode to attempt", 170 "llvm::cl::values(clEnumValN(mlir::affine::FusionMode::Greedy," 171 " \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), " 172 "clEnumValN( mlir::affine::FusionMode::ProducerConsumer, " 173 "\"producer\", \"Perform only producer-consumer fusion\"), " 174 "clEnumValN( mlir::affine::FusionMode::Sibling, " 175 "\"sibling\", \"Perform only sibling fusion\"))">, 176 ]; 177 let dependentDialects = ["memref::MemRefDialect"]; 178} 179 180def AffineLoopInvariantCodeMotion 181 : Pass<"affine-loop-invariant-code-motion", "func::FuncOp"> { 182 let summary = "Hoist loop invariant instructions outside of affine loops"; 183 let constructor = "mlir::affine::createAffineLoopInvariantCodeMotionPass()"; 184} 185 186def AffineLoopTiling : Pass<"affine-loop-tile", "func::FuncOp"> { 187 let summary = "Tile affine loop nests"; 188 let constructor = "mlir::affine::createLoopTilingPass()"; 189 let options = [ 190 Option<"cacheSizeInKiB", "cache-size", "uint64_t", /*default=*/"512", 191 "Set size of cache to tile for in KiB (default: 512)">, 192 Option<"separate", "separate", "bool", /*default=*/"false", 193 "Separate full and partial tiles (default: false)">, 194 Option<"tileSize", "tile-size", "unsigned", /*default=*/"", 195 "Use this tile size for all loops">, 196 ListOption<"tileSizes", "tile-sizes", "unsigned", 197 "List of tile sizes for each perfect nest " 198 "(overridden by -tile-size)">, 199 ]; 200} 201 202def AffineLoopUnroll : Pass<"affine-loop-unroll", "func::FuncOp"> { 203 let summary = "Unroll affine loops"; 204 let constructor = "mlir::affine::createLoopUnrollPass()"; 205 let options = [ 206 Option<"unrollFactor", "unroll-factor", "unsigned", /*default=*/"4", 207 "Use this unroll factor for all loops being unrolled">, 208 Option<"unrollUpToFactor", "unroll-up-to-factor", "bool", 209 /*default=*/"false", "Allow unrolling up to the factor specified">, 210 Option<"unrollFull", "unroll-full", "bool", /*default=*/"false", 211 "Fully unroll loops">, 212 Option<"numRepetitions", "unroll-num-reps", "unsigned", /*default=*/"1", 213 "Unroll innermost loops repeatedly this many times">, 214 Option<"unrollFullThreshold", "unroll-full-threshold", "unsigned", 215 /*default=*/"1", 216 "Unroll all loops with trip count less than or equal to this">, 217 Option<"cleanUpUnroll", "cleanup-unroll", "bool", /*default=*/"false", 218 "Fully unroll the cleanup loop when possible.">, 219 ]; 220} 221 222def AffineLoopUnrollAndJam : Pass<"affine-loop-unroll-jam", "func::FuncOp"> { 223 let summary = "Unroll and jam affine loops"; 224 let constructor = "mlir::affine::createLoopUnrollAndJamPass()"; 225 let options = [ 226 Option<"unrollJamFactor", "unroll-jam-factor", "unsigned", 227 /*default=*/"4", 228 "Use this unroll jam factor for all loops (default 4)">, 229 ]; 230} 231 232def AffinePipelineDataTransfer 233 : Pass<"affine-pipeline-data-transfer", "func::FuncOp"> { 234 let summary = "Pipeline non-blocking data transfers between explicitly " 235 "managed levels of the memory hierarchy"; 236 let description = [{ 237 This pass performs a transformation to overlap non-blocking DMA operations 238 in a loop with computations through double buffering. This is achieved by 239 advancing dma_start operations with respect to other operations. 240 241 Input 242 243 ```mlir 244 func.func @pipelinedatatransfer() { 245 %0 = memref.alloc() : memref<256xf32> 246 %1 = memref.alloc() : memref<32xf32, 1> 247 %2 = memref.alloc() : memref<1xf32> 248 %c0 = arith.constant 0 : index 249 %c128 = arith.constant 128 : index 250 affine.for %i0 = 0 to 8 { 251 affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32> 252 affine.dma_wait %2[%c0], %c128 : memref<1xf32> 253 %3 = affine.load %1[%i0] : memref<32xf32, 1> 254 %4 = "compute"(%3) : (f32) -> f32 255 affine.store %4, %1[%i0] : memref<32xf32, 1> 256 } 257 return 258 } 259 ``` 260 261 Output 262 263 ```mlir 264 module { 265 func.func @pipelinedatatransfer() { 266 %c8 = arith.constant 8 : index 267 %c0 = arith.constant 0 : index 268 %0 = memref.alloc() : memref<256xf32> 269 %c0_0 = arith.constant 0 : index 270 %c128 = arith.constant 128 : index 271 %1 = memref.alloc() : memref<2x32xf32, 1> 272 %2 = memref.alloc() : memref<2x1xf32> 273 affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> 274 affine.for %arg0 = 1 to 8 { 275 affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> 276 %8 = affine.apply #map3(%arg0) 277 %9 = affine.apply #map4(%8) 278 %10 = affine.apply #map4(%8) 279 affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> 280 %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1> 281 %12 = "compute"(%11) : (f32) -> f32 282 affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1> 283 } 284 %3 = affine.apply #map3(%c8) 285 %4 = affine.apply #map4(%3) 286 %5 = affine.apply #map4(%3) 287 affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> 288 %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1> 289 %7 = "compute"(%6) : (f32) -> f32 290 affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1> 291 memref.dealloc %2 : memref<2x1xf32> 292 memref.dealloc %1 : memref<2x32xf32, 1> 293 return 294 } 295 } 296 ``` 297 }]; 298 let constructor = "mlir::affine::createPipelineDataTransferPass()"; 299} 300 301def AffineScalarReplacement : Pass<"affine-scalrep", "func::FuncOp"> { 302 let summary = "Replace affine memref accesses by scalars by forwarding stores " 303 "to loads and eliminating redundant loads"; 304 let description = [{ 305 This pass performs store to load forwarding and redundant load elimination 306 for affine memref accesses and potentially eliminates the entire memref 307 if all its accesses are forwarded. 308 309 Input 310 311 ```mlir 312 func.func @store_load_affine_apply() -> memref<10x10xf32> { 313 %cf7 = arith.constant 7.0 : f32 314 %m = memref.alloc() : memref<10x10xf32> 315 affine.for %i0 = 0 to 10 { 316 affine.for %i1 = 0 to 10 { 317 affine.store %cf7, %m[%i0, %i1] : memref<10x10xf32> 318 %v0 = affine.load %m[%i0, %i1] : memref<10x10xf32> 319 %v1 = arith.addf %v0, %v0 : f32 320 } 321 } 322 return %m : memref<10x10xf32> 323 } 324 ``` 325 326 Output 327 328 ```mlir 329 module { 330 func.func @store_load_affine_apply() -> memref<10x10xf32> { 331 %cst = arith.constant 7.000000e+00 : f32 332 %0 = memref.alloc() : memref<10x10xf32> 333 affine.for %arg0 = 0 to 10 { 334 affine.for %arg1 = 0 to 10 { 335 affine.store %cst, %0[%arg0, %arg1] : memref<10x10xf32> 336 %1 = arith.addf %cst, %cst : f32 337 } 338 } 339 return %0 : memref<10x10xf32> 340 } 341 } 342 ``` 343 }]; 344 let constructor = "mlir::affine::createAffineScalarReplacementPass()"; 345} 346 347def AffineVectorize : Pass<"affine-super-vectorize", "func::FuncOp"> { 348 let summary = "Vectorize to a target independent n-D vector abstraction"; 349 let dependentDialects = ["vector::VectorDialect"]; 350 let options = [ 351 ListOption<"vectorSizes", "virtual-vector-size", "int64_t", 352 "Specify an n-D virtual vector size for vectorization. " 353 "This must be greater than zero.">, 354 // Optionally, the fixed mapping from loop to fastest varying MemRef 355 // dimension for all the MemRefs within a loop pattern: 356 // the index represents the loop depth, the value represents the k^th 357 // fastest varying memory dimension. 358 // This is voluntarily restrictive and is meant to precisely target a 359 // particular loop/op pair, for testing purposes. 360 ListOption<"fastestVaryingPattern", "test-fastest-varying", "int64_t", 361 "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory " 362 "dimensions to match. See defaultPatterns in Vectorize.cpp for " 363 "a description and examples. This is used for testing purposes">, 364 Option<"vectorizeReductions", "vectorize-reductions", "bool", 365 /*default=*/"false", 366 "Vectorize known reductions expressed via iter_args. " 367 "Switched off by default."> 368 ]; 369} 370 371def AffineParallelize : Pass<"affine-parallelize", "func::FuncOp"> { 372 let summary = "Convert affine.for ops into 1-D affine.parallel"; 373 let constructor = "mlir::affine::createAffineParallelizePass()"; 374 let options = [ 375 Option<"maxNested", "max-nested", "unsigned", /*default=*/"-1u", 376 "Maximum number of nested parallel loops to produce. " 377 "Defaults to unlimited (UINT_MAX).">, 378 Option<"parallelReductions", "parallel-reductions", "bool", 379 /*default=*/"false", 380 "Whether to parallelize reduction loops. Defaults to false."> 381 ]; 382} 383 384def AffineLoopNormalize : Pass<"affine-loop-normalize", "func::FuncOp"> { 385 let summary = "Apply normalization transformations to affine loop-like ops"; 386 let constructor = "mlir::affine::createAffineLoopNormalizePass()"; 387 let options = [ 388 Option<"promoteSingleIter", "promote-single-iter", "bool", 389 /*default=*/"true", "Promote single iteration loops">, 390 ]; 391} 392 393def LoopCoalescing : Pass<"affine-loop-coalescing", "func::FuncOp"> { 394 let summary = "Coalesce nested loops with independent bounds into a single " 395 "loop"; 396 let constructor = "mlir::affine::createLoopCoalescingPass()"; 397 let dependentDialects = ["affine::AffineDialect","arith::ArithDialect"]; 398} 399 400def SimplifyAffineStructures : Pass<"affine-simplify-structures", "func::FuncOp"> { 401 let summary = "Simplify affine expressions in maps/sets and normalize " 402 "memrefs"; 403 let constructor = "mlir::affine::createSimplifyAffineStructuresPass()"; 404} 405 406def AffineExpandIndexOps : Pass<"affine-expand-index-ops"> { 407 let summary = "Lower affine operations operating on indices into more fundamental operations"; 408 let constructor = "mlir::affine::createAffineExpandIndexOpsPass()"; 409} 410 411def AffineExpandIndexOpsAsAffine : Pass<"affine-expand-index-ops-as-affine"> { 412 let summary = "Lower affine operations operating on indices into affine.apply operations"; 413 let constructor = "mlir::affine::createAffineExpandIndexOpsAsAffinePass()"; 414} 415 416#endif // MLIR_DIALECT_AFFINE_PASSES 417