//===- Utils.h - Utilities to support the Linalg dialect --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_LINALG_UTILS_UTILS_H
#define MLIR_DIALECT_LINALG_UTILS_UTILS_H

#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "llvm/ADT/StringSet.h"
#include <optional>

namespace mlir {
class AffineExpr;
class AffineMap;
class PatternRewriter;

namespace affine {
class AffineForOp;
} // namespace affine

namespace tensor {
class ExtractSliceOp;
} // namespace tensor

namespace linalg {

//===----------------------------------------------------------------------===//
// Utilities for inferring various semantics properties of Linalg ops.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// General utilities
//===----------------------------------------------------------------------===//

/// Check if all indexing maps are projected permutations.
bool allIndexingsAreProjectedPermutation(LinalgOp op);

/// Detect whether `r` contains only ConstantOp, ElementwiseMappable ops, and
/// YieldOp.
bool hasOnlyScalarElementwiseOp(Region &r);

/// Check if a LinalgOp is an element-wise operation.
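/// For example (an illustrative sketch; all SSA names are hypothetical), the
/// following generic op is elementwise: all indexing maps are identity
/// projected permutations and all iterators are parallel.
/// ```
/// %r = linalg.generic {
///        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
///                         affine_map<(d0, d1) -> (d0, d1)>],
///        iterator_types = ["parallel", "parallel"]}
///      ins(%in : tensor<?x?xf32>) outs(%init : tensor<?x?xf32>) {
///   ^bb0(%a: f32, %b: f32):
///     %neg = arith.negf %a : f32
///     linalg.yield %neg : f32
/// } -> tensor<?x?xf32>
/// ```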
bool isElementwise(LinalgOp op);

/// Check if iterator type has "parallel" semantics.
bool isParallelIterator(utils::IteratorType iteratorType);

/// Check if iterator type has "reduction" semantics.
bool isReductionIterator(utils::IteratorType iteratorType);

/// Create a tensor::PadOp that pads `source` to the size of the statically
/// sized `type` whose static sizes are assumed to be greater than the dynamic
/// `source` size. The padding introduces trailing `pad` values until the
/// target size is met. If `source` is defined by one or more LinalgOps that
/// have been padded with the same value and sizes, return their padded result
/// instead of creating a tensor::PadOp.
///
/// Example:
/// ```
/// %0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
/// %1 = tensor.pad %0 low[0, 0] high[...] { tensor.yield %cst }
/// %2 = linalg.matmul ins(...) outs(%1)
/// %3 = tensor.extract_slice %2 [0, 0] [%sz0, %sz1]
/// ```
/// makeComposedPadHighOp(source=%3, pad=%cst) returns %2
/// makeComposedPadHighOp(source=%3, pad=%other_cst) returns %4
/// ```
/// %4 = tensor.pad %3 low[0, 0] high[...] { tensor.yield %other_cst }
/// ```
Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
                            Value source, Value pad, bool nofold);

/// Returns a GenericOp that copies an n-D memref. Unlike the current
/// implementation of memref::CopyOp, this op can be further tiled, lowered to
/// loops, or vectorized.
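///
/// A 2-D copy created this way is morally equivalent to the following generic
/// op (a sketch; the exact attributes are an implementation detail):
/// ```
/// linalg.generic {
///     indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
///                      affine_map<(d0, d1) -> (d0, d1)>],
///     iterator_types = ["parallel", "parallel"]}
///   ins(%from : memref<?x?xf32>) outs(%to : memref<?x?xf32>) {
///   ^bb0(%in: f32, %out: f32):
///     linalg.yield %in : f32
/// }
/// ```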
GenericOp makeMemRefCopyOp(OpBuilder &b, Location loc, Value from, Value to);

/// Get the reassociation maps to fold the result of an extract_slice (or the
/// source of an insert_slice) operation with the given offsets and sizes to
/// its rank-reduced version. This is only done for the cases where the size
/// is 1 and the offset is 0. Strictly speaking, offset 0 is not required in
/// general, but non-zero offsets are not handled by the SPIR-V backend at
/// this point (and potentially cannot be handled).
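///
/// For example (an illustrative sketch), for `mixedSizes = [1, %sz0, 1, %sz1]`
/// the returned reassociation folds each unit dimension into its non-unit
/// neighbor, yielding a rank-2 shape:
/// ```
/// mixedSizes    = [1, %sz0, 1, %sz1]
/// reassociation = [[0, 1], [2, 3]]  // rank-reduced sizes: [%sz0, %sz1]
/// ```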
std::optional<SmallVector<ReassociationIndices>>
getReassociationMapForFoldingUnitDims(ArrayRef<OpFoldResult> mixedSizes);

//===----------------------------------------------------------------------===//
// Fusion / Tiling utilities
//===----------------------------------------------------------------------===//

/// The type of loops to be generated during tiling.
enum class LinalgTilingLoopType {
  Loops = 0,
  AffineLoops = 1,
  ParallelLoops = 2
};

/// Computes tile offsets, given a list of loop `ivs` and `tileSizes`. In case
/// a tile size is zero (i.e., no tiling), the corresponding offset is also
/// zero.
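///
/// For example (a sketch): the `ivs` are consumed in order by the tiled
/// dimensions, and untiled dimensions get a zero offset.
/// ```
/// tileSizes = [4, 0, 8], ivs = [%iv0, %iv1]
/// offsets   = [%iv0, 0, %iv1]
/// ```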
SmallVector<OpFoldResult> computeTileOffsets(OpBuilder &b, Location loc,
                                             ArrayRef<OpFoldResult> ivs,
                                             ArrayRef<OpFoldResult> tileSizes);

/// Computes tile sizes, given a list of `tileSizes` and dimension
/// sizes (`sizeBounds`). In case a tile size is zero (i.e., no tiling), the
/// corresponding result size is the corresponding value from `sizeBounds`.
/// Note: The returned tile sizes are closed intervals.
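///
/// For example (a sketch; per the closed-interval convention, one is
/// subtracted from each size):
/// ```
/// tileSizes = [4, 0], sizeBounds = [%d0, %d1]
/// sizes     = [3, %d1 - 1]
/// ```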
SmallVector<OpFoldResult> computeTileSizes(OpBuilder &b, Location loc,
                                           ArrayRef<OpFoldResult> tileSizes,
                                           ArrayRef<OpFoldResult> sizeBounds);

/// Returns the list of tensor output types produced when the given structured
/// operation `op` is applied to the given `operands`. Note that `operands`
/// are not necessarily the actual operands of `op`.
SmallVector<Type> getTensorOutputTypes(LinalgOp op, ValueRange operands);

/// Creates `insert_slice` ops that insert `results` back into the larger
/// tensors they were originally extracted from with `extract_slice` before
/// being passed as `operands` to the given structured operation `op` or its
/// clone. Note that `operands` are not necessarily the actual operands of
/// `op`; the operation serves only as a metadata container for operand types
/// and positions.
SmallVector<Value> insertSlicesBack(OpBuilder &builder, Location loc,
                                    LinalgOp op, ValueRange operands,
                                    ValueRange results);

/// A struct containing offsets-sizes-strides arguments of the tiled shape.
struct SliceParameters {
  SmallVector<OpFoldResult> offsets;
  SmallVector<OpFoldResult> sizes;
  SmallVector<OpFoldResult> strides;
};

/// Computes SliceParameters for a single `valueToTile` assuming that its user
/// is being tiled with the given loop bounds `lbs` and `ubs` and the tile
/// sizes `tileSizes`.
///
/// `omitPartialTileCheck` controls whether to omit the partial/boundary tile
/// condition check in cases where we statically know that it is unnecessary.
SliceParameters
computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
                       ArrayRef<OpFoldResult> tileSizes, AffineMap map,
                       ArrayRef<OpFoldResult> lbs, ArrayRef<OpFoldResult> ubs,
                       ArrayRef<OpFoldResult> subShapeSizes,
                       bool omitPartialTileCheck);

/// Computes SliceParameters for all `valuesToTile` of the given `linalgOp`,
/// assuming `linalgOp` is being fused into a loop nest. Calls
/// `computeSliceParameters` for every individual value.
///
/// Note that a constant zero in `tileSizes` means no tiling at that implicit
/// loop. The number of non-zero values in `tileSizes` should be equal to the
/// number of values in `ivs`.
///
/// Some of the `valuesToTile` won't be affected by tiling. For these values,
/// std::nullopt will be returned.
SmallVector<std::optional<SliceParameters>>
computeAllSliceParameters(OpBuilder &builder, Location loc, LinalgOp linalgOp,
                          ValueRange valuesToTile, ArrayRef<OpFoldResult> ivs,
                          ArrayRef<OpFoldResult> tileSizes,
                          ArrayRef<OpFoldResult> sizeBounds,
                          bool omitPartialTileCheck);

/// Creates an extract_slice/subview op for a single `valueToTile` with
/// `builder`. This new operation extracts a tile of `valueToTile`, starting
/// at offsets `lbs` and with sizes `subShapeSizes`. `omitPartialTileCheck`
/// controls whether to omit the partial/boundary tile condition check in
/// cases where we statically know that it is unnecessary.
Operation *makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
                          ArrayRef<OpFoldResult> tileSizes, AffineMap map,
                          ArrayRef<OpFoldResult> lbs,
                          ArrayRef<OpFoldResult> ubs,
                          ArrayRef<OpFoldResult> subShapeSizes,
                          bool omitPartialTileCheck);

/// Creates extract_slice/subview ops for all `valuesToTile` of the given
/// `linalgOp` with `builder`, assuming `linalgOp` is being fused into a loop
/// nest for tiling with the given induction variables `ivs` and tile sizes
/// `tileSizes`. `sizeBounds` are the iteration space bounds for *all* the
/// implicit loops in `linalgOp`. `omitPartialTileCheck` controls whether to
/// omit the partial/boundary tile condition check in cases where we
/// statically know that it is unnecessary.
///
/// Note that a constant zero in `tileSizes` means no tiling at that implicit
/// loop. The number of non-zero values in `tileSizes` should be equal to the
/// number of values in `ivs`.
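///
/// For example (an illustrative sketch; SSA names are hypothetical), tiling a
/// `memref<?x?xf32>` operand read through an identity map produces roughly:
/// ```
/// %tile = memref.subview %operand[%iv0, %iv1] [%size0, %size1] [1, 1]
///     : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
/// ```
/// where `%size0` and `%size1` are clamped by `min(tileSize, bound - iv)`
/// affine expressions unless `omitPartialTileCheck` is set.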
SmallVector<Value> makeTiledShapes(OpBuilder &builder, Location loc,
                                   LinalgOp linalgOp, ValueRange valuesToTile,
                                   ArrayRef<OpFoldResult> ivs,
                                   ArrayRef<OpFoldResult> tileSizes,
                                   ArrayRef<OpFoldResult> sizeBounds,
                                   bool omitPartialTileCheck);

/// Add the specified offsets to any `linalg.index` ops contained in the given
/// `linalgOp`. The offsets are provided in the same order as iteration space
/// dimensions. Null offsets are assumed to be zero.
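///
/// For example (a sketch), offsetting dimension 0 by `%off0` rewrites
/// ```
/// %i = linalg.index 0 : index
/// ```
/// into roughly
/// ```
/// %i  = linalg.index 0 : index
/// %oi = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%i, %off0)
/// ```
/// with all other uses of `%i` replaced by `%oi`.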
void offsetIndices(OpBuilder &b, LinalgOp linalgOp,
                   ArrayRef<OpFoldResult> offsets);
void offsetIndices(RewriterBase &b, LinalgOp linalgOp,
                   ArrayRef<OpFoldResult> offsets);

/// A struct containing the Linalg producer before and after fusion.
/// When operating on tensors, `fusedProducer` may feed into a `tensor.cast`
/// op before the consumer Linalg op, until enough canonicalizations have been
/// applied.
struct FusionInfo {
  LinalgOp originalProducer;
  LinalgOp fusedProducer;
};

/// This implements the fusion part of the "tileAndFuse on tensors"
/// transformation and thus requires `consumerOpOperand` to be produced by an
/// `extract_slice` op (generally obtained by applying the tiling
/// transformation).
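///
/// For example (an illustrative sketch; SSA names are hypothetical): given
/// ```
/// %p = linalg.fill ins(%cst : f32) outs(%t : tensor<?x?xf32>)
/// %s = tensor.extract_slice %p [%o0, %o1] [%sz0, %sz1]
/// %c = linalg.matmul ins(%s, ...) outs(...)
/// ```
/// fusing the producer of the first matmul operand clones the fill onto a
/// slice of `%t`, so that only the tile consumed by the matmul is computed.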
FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
                                           OpOperand &consumerOpOperand);

/// This implements the fusion part of the "tileAndFuse on tensors"
/// transformation and thus requires `consumerOpOperand` to be produced by an
/// `extract_slice` op (generally obtained by applying the tiling
/// transformation). Assumes `producerOpResult` is a result of a Linalg op
/// that defines the value consumed by `consumerOpOperand`.
FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
                                           OpResult producerOpResult,
                                           OpOperand &consumerOpOperand);

//===----------------------------------------------------------------------===//
// Distribution utilities
//===----------------------------------------------------------------------===//

/// Scheme used to distribute loops to processors.
enum class DistributionMethod {
  /// Cyclic distribution where no assumption is made about the dynamic
  /// relationship between the number of processors and the number of
  /// iterations of the distributed loop. Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// scf.parallel (%iv) = (%lb + %procId * %step) to (%ub)
  ///     step (%step * %nprocs)
  Cyclic = 0,

  /// Cyclic distribution where the number of processors can be assumed to be
  /// more than or equal to the number of iterations of the distributed loop.
  /// In such cases, a simple in-bounds check is enough (instead of
  /// materializing a loop). Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// %iv = %lb + %procId * %step
  /// %cond = arith.cmpi "slt", %iv, %ub
  /// scf.if %cond {
  ///   ...
  /// }
  CyclicNumProcsGeNumIters = 1,

  /// Cyclic distribution where the number of processors can be assumed to be
  /// equal to the number of iterations of the distributed loop. In such
  /// cases, no bounds check is needed. Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// %iv = %lb + %procId * %step
  CyclicNumProcsEqNumIters = 2,

  /// No distribution.
  None = 3
};

/// Holds the processor ID and the number of processors used to distribute a
/// parallel loop, together with the distribution method.
struct ProcInfo {
  Value procId;
  Value nprocs;
  DistributionMethod distributionMethod;
};

/// Callback function type used to get the processor ID and the number of
/// processors used for distribution for all parallel loops generated.
using ProcInfoCallBackFn = std::function<SmallVector<ProcInfo>(
    OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges)>;

/// Options that allow distribution of loops generated in Linalg transforms to
/// processors while generating the loops.
struct LinalgLoopDistributionOptions {
  /// Callback function that returns the Values for processor ID (`procId`)
  /// and number of processors (`nprocs`) used to execute the parallel loops.
  /// The number of `{procId, nprocs}` pairs returned must be equal to the
  /// number of `parallelLoopRanges` passed into the callback. The
  /// `parallelLoopRanges` are ranges of the outer parallel loops of the
  /// operation that have non-zero tile sizes specified.
  ProcInfoCallBackFn procInfo;
};

/// Update `lb`, `ub` and `step` to get the per-processor `lb`, `ub` and
/// `step`.
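///
/// For cyclic distribution this amounts to (a sketch):
/// ```
/// lb   <- lb + procId * step
/// ub   <- ub               // unchanged
/// step <- step * nprocs
/// ```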
void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc,
                                       Value procId, Value nprocs, Value &lb,
                                       Value &ub, Value &step);

//===----------------------------------------------------------------------===//
// Fusion on tensor utilities
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Generic op region utilities
//===----------------------------------------------------------------------===//

/// A struct containing common matchers over a linalg op's region.
struct RegionMatcher {
  enum class BinaryOpKind {
    IAdd,
  };

  /// Matches the given linalg op if its body is performing a binary operation
  /// on int or float scalar values and returns the binary op kind.
  ///
  /// The linalg op's region is expected to be
  /// ```
  /// {
  ///   ^bb(%a: <scalar-type>, %b: <scalar-type>):
  ///     %0 = <binary-op> %a, %b: <scalar-type>
  ///     linalg.yield %0: <scalar-type>
  /// }
  /// ```
  static std::optional<BinaryOpKind> matchAsScalarBinaryOp(GenericOp op);
};

//===----------------------------------------------------------------------===//
// Loop nest utilities
//===----------------------------------------------------------------------===//

/// Utility class used to generate nested loops with ranges described by
/// `loopRanges` and loop type described by the `iteratorTypes`.
/// `bodyBuilderFn` is used to generate the body of the innermost loop. It is
/// passed a range of loop induction variables and a range of operand values
/// to use.
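///
/// For example (a sketch), with `LoopTy = scf::ForOp` and two loop ranges,
/// the generated nest is roughly:
/// ```
/// scf.for %i = %lb0 to %ub0 step %s0 {
///   scf.for %j = %lb1 to %ub1 step %s1 {
///     // Body emitted by `bodyBuilderFn`.
///   }
/// }
/// ```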
template <typename LoopTy>
struct GenerateLoopNest {
  static void doit(OpBuilder &b, Location loc, ArrayRef<Range> loopRanges,
                   LinalgOp linalgOp,
                   ArrayRef<utils::IteratorType> iteratorTypes,
                   function_ref<scf::ValueVector(OpBuilder &, Location,
                                                 ValueRange, ValueRange)>
                       bodyBuilderFn,
                   ArrayRef<linalg::ProcInfo> procInfo = {});
};

/// Returns an attribute list that excludes pre-defined attributes.
template <typename OpTy>
SmallVector<NamedAttribute> getPrunedAttributeList(OpTy op) {
  auto elidedAttrs = llvm::to_vector(op.getAttributeNames());
  if (isa<linalg::LinalgOp>(op.getOperation()))
    elidedAttrs.push_back(LinalgDialect::kMemoizedIndexingMapsAttrName);
  return getPrunedAttributeList(op, elidedAttrs);
}

} // namespace linalg
} // namespace mlir

#endif // MLIR_DIALECT_LINALG_UTILS_UTILS_H