//===- Utils.h - Utilities to support the Linalg dialect --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_LINALG_UTILS_UTILS_H
#define MLIR_DIALECT_LINALG_UTILS_UTILS_H

#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "llvm/ADT/StringSet.h"
#include <optional>

namespace mlir {
class AffineExpr;
class AffineMap;
class PatternRewriter;

namespace affine {
class AffineForOp;
} // namespace affine

namespace tensor {
class ExtractSliceOp;
} // namespace tensor

namespace linalg {

//===----------------------------------------------------------------------===//
// Utilities for inferring various semantics properties of Linalg ops.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// General utilities
//===----------------------------------------------------------------------===//

/// Check if all indexing maps are projected permutations.
bool allIndexingsAreProjectedPermutation(LinalgOp op);

/// Detect whether `r` contains only ConstantOp, ElementwiseMappable ops, and
/// YieldOp.
bool hasOnlyScalarElementwiseOp(Region &r);

/// Check if a LinalgOp is an element-wise operation.
bool isElementwise(LinalgOp op);

/// Check if the iterator type has "parallel" semantics.
bool isParallelIterator(utils::IteratorType iteratorType);

/// Check if the iterator type has "reduction" semantics.
bool isReductionIterator(utils::IteratorType iteratorType);

/// Create a tensor::PadOp that pads `source` to the size of the statically
/// sized `type`, whose static sizes are assumed to be greater than the
/// dynamic `source` sizes. The padding introduces trailing `pad` values until
/// the target size is met. If `source` is defined by one or more LinalgOps
/// that have been padded with the same value and sizes, return their padded
/// result instead of creating a tensor::PadOp.
///
/// Example:
/// ```
/// %0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
/// %1 = tensor.pad %0 low[0, 0] high[...] { tensor.yield %cst }
/// %2 = linalg.matmul ins(...) outs(%1)
/// %3 = tensor.extract_slice %2 [0, 0] [%sz0, %sz1]
/// ```
/// makeComposedPadHighOp(source=%3, pad=%cst) returns %2
/// makeComposedPadHighOp(source=%3, pad=%other_cst) returns %4
/// ```
/// %4 = tensor.pad %3 low[0, 0] high[...] { tensor.yield %other_cst }
/// ```
Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
                            Value source, Value pad, bool nofold);

/// Returns a GenericOp that copies an n-D memref. Unlike the current
/// implementation of memref::CopyOp, this op can be further tiled, lowered to
/// loops, or vectorized.
GenericOp makeMemRefCopyOp(OpBuilder &b, Location loc, Value from, Value to);

/// Get the reassociation maps to fold the result of an extract_slice (or the
/// source of an insert_slice) operation with the given offsets and sizes to
/// its rank-reduced version. This is only done for the cases where the size
/// is 1 and the offset is 0. Strictly speaking, the offset 0 is not required
/// in general, but non-zero offsets are not handled by the SPIR-V backend at
/// this point (and potentially cannot be handled).
std::optional<SmallVector<ReassociationIndices>>
getReassociationMapForFoldingUnitDims(ArrayRef<OpFoldResult> mixedSizes);
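
// Example: a minimal sketch of folding unit dimensions away from a slice.
// `sliceOp` is a hypothetical tensor::ExtractSliceOp, and the exact grouping
// of unit dimensions shown below is illustrative.
// ```
// SmallVector<OpFoldResult> mixedSizes = sliceOp.getMixedSizes();
// // For sizes such as [1, %sz0, 1, %sz1], a reassociation like
// // [[0, 1], [2, 3]] is returned; std::nullopt means no folding applies.
// if (std::optional<SmallVector<ReassociationIndices>> reassociation =
//         getReassociationMapForFoldingUnitDims(mixedSizes)) {
//   // Use `reassociation` to build a rank-reducing collapse_shape op.
// }
// ```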
//===----------------------------------------------------------------------===//
// Fusion / Tiling utilities
//===----------------------------------------------------------------------===//

/// The type of loops to be generated during tiling.
enum class LinalgTilingLoopType {
  Loops = 0,
  AffineLoops = 1,
  ParallelLoops = 2
};

/// Computes tile offsets, given a list of loop `ivs` and `tileSizes`. In case
/// a tile size is zero (i.e., no tiling), the corresponding offset is also
/// zero.
SmallVector<OpFoldResult> computeTileOffsets(OpBuilder &b, Location loc,
                                             ArrayRef<OpFoldResult> ivs,
                                             ArrayRef<OpFoldResult> tileSizes);

/// Computes tile sizes, given a list of `tileSizes` and dimension
/// sizes (`sizeBounds`). In case a tile size is zero (i.e., no tiling), the
/// corresponding result size is the corresponding value from `sizeBounds`.
/// Note: The returned tile sizes are closed intervals.
SmallVector<OpFoldResult> computeTileSizes(OpBuilder &b, Location loc,
                                           ArrayRef<OpFoldResult> tileSizes,
                                           ArrayRef<OpFoldResult> sizeBounds);
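
// Example: computing per-tile offsets and sizes for a 2-D iteration space
// where only the first dimension is tiled. A sketch, assuming `iv0`, `ts0`,
// `d0`, and `d1` are OpFoldResults already available in the caller:
// ```
// // One IV, because `tileSizes` contains exactly one non-zero entry.
// SmallVector<OpFoldResult> ivs = {iv0};
// SmallVector<OpFoldResult> tileSizes = {ts0, b.getIndexAttr(0)};
// SmallVector<OpFoldResult> sizeBounds = {d0, d1};
// // Offsets are {iv0, 0}: untiled dimensions start at offset 0.
// SmallVector<OpFoldResult> offsets =
//     computeTileOffsets(b, loc, ivs, tileSizes);
// // Sizes are derived from `ts0` and `d1` and, per the note above, are
// // returned as closed intervals.
// SmallVector<OpFoldResult> sizes =
//     computeTileSizes(b, loc, tileSizes, sizeBounds);
// ```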
/// Returns the list of tensor output types produced when the given structured
/// operation `op` is applied to the given `operands`. Note that `operands`
/// are not necessarily the actual operands of `op`.
SmallVector<Type> getTensorOutputTypes(LinalgOp op, ValueRange operands);

/// Creates `insert_slice` ops that insert `results` back into the larger
/// tensors they were originally extracted from with `extract_slice` before
/// being passed as `operands` to the given structured operation `op` or its
/// clone. Note that `operands` are not necessarily the actual operands of
/// `op`; the operation serves only as a metadata container for operand types
/// and positions.
SmallVector<Value> insertSlicesBack(OpBuilder &builder, Location loc,
                                    LinalgOp op, ValueRange operands,
                                    ValueRange results);

/// A struct containing the offsets-sizes-strides arguments of the tiled
/// shape.
struct SliceParameters {
  SmallVector<OpFoldResult> offsets;
  SmallVector<OpFoldResult> sizes;
  SmallVector<OpFoldResult> strides;
};

/// Computes the SliceParameters for a single `valueToTile`, assuming that its
/// user is being tiled with the given loop bounds `lbs` and `ubs` and the
/// tile sizes `tileSizes`.
///
/// `omitPartialTileCheck` controls whether to omit the partial/boundary tile
/// condition check in cases where we statically know that it is unnecessary.
SliceParameters
computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
                       ArrayRef<OpFoldResult> tileSizes, AffineMap map,
                       ArrayRef<OpFoldResult> lbs, ArrayRef<OpFoldResult> ubs,
                       ArrayRef<OpFoldResult> subShapeSizes,
                       bool omitPartialTileCheck);

/// Computes the SliceParameters for all `valuesToTile` of the given
/// `linalgOp`, assuming `linalgOp` is being fused into a loop nest. Calls
/// `computeSliceParameters` for every individual value.
///
/// Note that a constant zero in `tileSizes` means no tiling at that implicit
/// loop. The number of non-zero values in `tileSizes` should be equal to the
/// number of values in `ivs`.
///
/// Some of the `valuesToTile` won't be affected by tiling. For these values,
/// std::nullopt will be returned.
SmallVector<std::optional<SliceParameters>>
computeAllSliceParameters(OpBuilder &builder, Location loc, LinalgOp linalgOp,
                          ValueRange valuesToTile, ArrayRef<OpFoldResult> ivs,
                          ArrayRef<OpFoldResult> tileSizes,
                          ArrayRef<OpFoldResult> sizeBounds,
                          bool omitPartialTileCheck);

/// Creates an extract_slice/subview op for a single `valueToTile` with
/// `builder`. This new operation extracts a tile of `valueToTile`, starting
/// at offsets `lbs` and with sizes `subShapeSizes`. `omitPartialTileCheck`
/// controls whether to omit the partial/boundary tile condition check in
/// cases where we statically know that it is unnecessary.
Operation *makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
                          ArrayRef<OpFoldResult> tileSizes, AffineMap map,
                          ArrayRef<OpFoldResult> lbs,
                          ArrayRef<OpFoldResult> ubs,
                          ArrayRef<OpFoldResult> subShapeSizes,
                          bool omitPartialTileCheck);

/// Creates extract_slice/subview ops for all `valuesToTile` of the given
/// `linalgOp` with `builder`, assuming `linalgOp` is being fused into a loop
/// nest for tiling with the given induction variables `ivs` and tile sizes
/// `tileSizes`. `sizeBounds` are the iteration space bounds for *all* the
/// implicit loops in `linalgOp`. `omitPartialTileCheck` controls whether to
/// omit the partial/boundary tile condition check in cases where we
/// statically know that it is unnecessary.
///
/// Note that a constant zero in `tileSizes` means no tiling at that implicit
/// loop. The number of non-zero values in `tileSizes` should be equal to the
/// number of values in `ivs`.
SmallVector<Value> makeTiledShapes(OpBuilder &builder, Location loc,
                                   LinalgOp linalgOp, ValueRange valuesToTile,
                                   ArrayRef<OpFoldResult> ivs,
                                   ArrayRef<OpFoldResult> tileSizes,
                                   ArrayRef<OpFoldResult> sizeBounds,
                                   bool omitPartialTileCheck);

/// Add the specified `offsets` to any `linalg.index` ops contained in the
/// given `linalgOp`. The offsets are provided in the same order as the
/// iteration space dimensions. Null offsets are assumed to be zero.
void offsetIndices(OpBuilder &b, LinalgOp linalgOp,
                   ArrayRef<OpFoldResult> offsets);
void offsetIndices(RewriterBase &b, LinalgOp linalgOp,
                   ArrayRef<OpFoldResult> offsets);
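
// Example: conceptually, offsetting the indices of a tiled op with two
// offsets, where the first entry is some value %off0 and the second is a
// null OpFoldResult (treated as zero), turns a body like
// ```
//   %i = linalg.index 0 : index
//   // ... uses of %i ...
// ```
// into
// ```
//   %i = linalg.index 0 : index
//   %oi = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%i, %off0)
//   // ... uses of %oi ...
// ```
// This is a sketch; the exact form of the generated offset arithmetic may
// differ (e.g., constant offsets may be folded away).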
/// A struct containing the Linalg producer before and after fusion.
/// When operating on tensors, `fusedProducer` may feed into a `tensor.cast`
/// op before the consumer Linalg op, until enough canonicalizations have
/// been applied.
struct FusionInfo {
  LinalgOp originalProducer;
  LinalgOp fusedProducer;
};

/// This implements the fusion part of the "tileAndFuse on tensors"
/// transformation and thus requires the `consumerOpOperand` to be produced by
/// an `extract_slice` op (generally obtained by applying the tiling
/// transformation).
FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
                                           OpOperand &consumerOpOperand);

/// This implements the fusion part of the "tileAndFuse on tensors"
/// transformation and thus requires the `consumerOpOperand` to be produced by
/// an `extract_slice` op (generally obtained by applying the tiling
/// transformation). Assumes `producerOpResult` is a result of a Linalg op
/// that feeds `consumerOpOperand`.
FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
                                           OpResult producerOpResult,
                                           OpOperand &consumerOpOperand);

//===----------------------------------------------------------------------===//
// Distribution utilities
//===----------------------------------------------------------------------===//

/// Scheme used to distribute loops to processors.
enum class DistributionMethod {
  /// Cyclic distribution where no assumption is made about the dynamic
  /// relationship between the number of processors and the number of
  /// iterations of the distributed loop. Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// scf.parallel (%iv) = (%lb + %procId * %step) to (%ub)
  ///     step (%step * %nprocs)
  Cyclic = 0,

  /// Cyclic distribution where the number of processors can be assumed to be
  /// greater than or equal to the number of iterations of the distributed
  /// loop. In such cases, a simple in-bounds check is enough (instead of
  /// materializing a loop). Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// %iv = %lb + %procId * %step
  /// %cond = arith.cmpi "slt", %iv, %ub
  /// scf.if %cond {
  ///   ...
  /// }
  CyclicNumProcsGeNumIters = 1,

  /// Cyclic distribution where the number of processors can be assumed to be
  /// equal to the number of iterations of the distributed loop. In such
  /// cases, no bounds check is needed. Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// %iv = %lb + %procId * %step
  CyclicNumProcsEqNumIters = 2,

  /// No distribution.
  None = 3
};

/// Utility encoding the processor ID (`procId`) and the number of processors
/// (`nprocs`) used to distribute a loop, together with the distribution
/// method.
struct ProcInfo {
  Value procId;
  Value nprocs;
  DistributionMethod distributionMethod;
};

/// Callback function type used to get the processor ID and the number of
/// processors used for distribution of all generated parallel loops.
using ProcInfoCallBackFn = std::function<SmallVector<ProcInfo>(
    OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges)>;

/// Options that allow distribution of loops generated in Linalg transforms to
/// processors while generating the loops.
struct LinalgLoopDistributionOptions {
  /// Callback function that returns the Values for the processor ID
  /// (`procId`) and the number of processors (`nprocs`) used to execute the
  /// parallel loops. The number of `{procId, nprocs}` pairs returned must be
  /// equal to the number of `parallelLoopRanges` passed into the callback.
  /// The `parallelLoopRanges` are the ranges of the outer parallel loops of
  /// the operation that have non-zero tile sizes specified.
  ProcInfoCallBackFn procInfo;
};
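
// Example: a minimal sketch of LinalgLoopDistributionOptions that cyclically
// distributes every parallel loop, assuming `procIdVal` and `nprocsVal` are
// hypothetical index-typed Values already materialized by the caller:
// ```
// LinalgLoopDistributionOptions options;
// options.procInfo = [=](OpBuilder &b, Location loc,
//                        ArrayRef<Range> parallelLoopRanges) {
//   // One {procId, nprocs} pair per parallel loop range, as required.
//   SmallVector<ProcInfo> infos(
//       parallelLoopRanges.size(),
//       ProcInfo{procIdVal, nprocsVal, DistributionMethod::Cyclic});
//   return infos;
// };
// ```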
/// Update `lb`, `ub` and `step` to get the per-processor `lb`, `ub` and
/// `step`.
void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc,
                                       Value procId, Value nprocs, Value &lb,
                                       Value &ub, Value &step);

//===----------------------------------------------------------------------===//
// Generic op region utilities
//===----------------------------------------------------------------------===//

/// A struct containing common matchers over a linalg op's region.
struct RegionMatcher {
  enum class BinaryOpKind {
    IAdd,
  };

  /// Matches the given linalg op if its body is performing a binary operation
  /// on int or float scalar values and returns the binary op kind.
  ///
  /// The linalg op's region is expected to be
  /// ```
  /// {
  ///   ^bb(%a: <scalar-type>, %b: <scalar-type>):
  ///     %0 = <binary-op> %a, %b: <scalar-type>
  ///     linalg.yield %0: <scalar-type>
  /// }
  /// ```
  static std::optional<BinaryOpKind> matchAsScalarBinaryOp(GenericOp op);
};

//===----------------------------------------------------------------------===//
// Loop nest utilities
//===----------------------------------------------------------------------===//

/// Utility class used to generate nested loops with ranges described by
/// `loopRanges` and loop types described by `iteratorTypes`. `bodyBuilderFn`
/// is used to generate the body of the innermost loop. It is passed a range
/// of loop induction variables and a range of operand values to use.
template <typename LoopTy>
struct GenerateLoopNest {
  static void doit(OpBuilder &b, Location loc, ArrayRef<Range> loopRanges,
                   LinalgOp linalgOp,
                   ArrayRef<utils::IteratorType> iteratorTypes,
                   function_ref<scf::ValueVector(OpBuilder &, Location,
                                                 ValueRange, ValueRange)>
                       bodyBuilderFn,
                   ArrayRef<linalg::ProcInfo> procInfo = {});
};

/// Returns an attribute list that excludes pre-defined attributes.
template <typename OpTy>
SmallVector<NamedAttribute> getPrunedAttributeList(OpTy op) {
  auto elidedAttrs = llvm::to_vector(op.getAttributeNames());
  if (isa<linalg::LinalgOp>(op.getOperation()))
    elidedAttrs.push_back(LinalgDialect::kMemoizedIndexingMapsAttrName);
  return getPrunedAttributeList(op, elidedAttrs);
}

} // namespace linalg
} // namespace mlir

#endif // MLIR_DIALECT_LINALG_UTILS_UTILS_H