//===- Utils.h - Affine dialect utilities -----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file declares a set of utilities for the affine dialect ops.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AFFINE_UTILS_H
#define MLIR_DIALECT_AFFINE_UTILS_H

#include "mlir/Analysis/AliasAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/OpDefinition.h"
#include <optional>

namespace mlir {
class DominanceInfo;
class Operation;
class PostDominanceInfo;
class ImplicitLocOpBuilder;

namespace func {
class FuncOp;
} // namespace func

namespace memref {
class AllocOp;
class AllocaOp;
} // namespace memref

namespace affine {
class AffineForOp;
class AffineIfOp;
class AffineParallelOp;

using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;

/// Replaces a parallel affine.for op with a 1-d affine.parallel op. `forOp`'s
/// body is taken by the affine.parallel op and the former is erased.
/// (mlir::affine::isLoopParallel can be used to detect a parallel affine.for
/// op.) The reductions specified in `parallelReductions` are also parallelized.
/// Parallelization will fail in the presence of loop iteration arguments that
/// are not listed in `parallelReductions`. `resOp`, if non-null, is set to the
/// newly created affine.parallel op.
LogicalResult affineParallelize(AffineForOp forOp,
                                ArrayRef<LoopReduction> parallelReductions = {},
                                AffineParallelOp *resOp = nullptr);
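
// A minimal usage sketch (hedged): walk a function and parallelize every
// affine.for detected as parallel. `funcOp` is illustrative, and
// isLoopParallel is assumed to come from Affine/Analysis/LoopAnalysis.h.
//
//   funcOp.walk([](AffineForOp forOp) {
//     SmallVector<LoopReduction> reductions;
//     if (isLoopParallel(forOp, &reductions))
//       (void)affineParallelize(forOp, reductions);
//   });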

/// Hoists out affine.if/else to as high as possible, i.e., past all invariant
/// affine.fors/parallel's. Returns success if any hoisting happened; `folded`
/// is set to true if the op was folded or erased. This hoisting could lead to
/// significant code expansion in some cases.
LogicalResult hoistAffineIfOp(AffineIfOp ifOp, bool *folded = nullptr);
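
// A minimal usage sketch (hedged; `ifOp` is an illustrative nested affine.if):
//
//   bool folded = false;
//   if (succeeded(hoistAffineIfOp(ifOp, &folded)) && !folded) {
//     // The affine.if was hoisted past its invariant enclosing loops.
//   }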

/// Holds parameters to perform n-D vectorization on a single loop nest.
/// For example, for the following loop nest:
///
/// func @vec2d(%in: memref<64x128x512xf32>, %out: memref<64x128x512xf32>) {
///   affine.for %i0 = 0 to 64 {
///     affine.for %i1 = 0 to 128 {
///       affine.for %i2 = 0 to 512 {
///         %ld = affine.load %in[%i0, %i1, %i2] : memref<64x128x512xf32>
///         affine.store %ld, %out[%i0, %i1, %i2] : memref<64x128x512xf32>
///       }
///     }
///   }
///   return
/// }
///
/// and VectorizationStrategy = 'vectorSizes = {8, 4}', 'loopToVectorDim =
/// {{i1->0}, {i2->1}}', SuperVectorizer will generate:
///
///  func @vec2d(%arg0: memref<64x128x512xf32>, %arg1: memref<64x128x512xf32>) {
///    affine.for %arg2 = 0 to 64 {
///      affine.for %arg3 = 0 to 128 step 8 {
///        affine.for %arg4 = 0 to 512 step 4 {
///          %cst = arith.constant 0.000000e+00 : f32
///          %0 = vector.transfer_read %arg0[%arg2, %arg3, %arg4], %cst : ...
///          vector.transfer_write %0, %arg1[%arg2, %arg3, %arg4] : ...
///        }
///      }
///    }
///    return
///  }
// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
struct VectorizationStrategy {
  // Vectorization factors to apply to each target vector dimension.
  // Each factor will be applied to a different loop.
  SmallVector<int64_t, 8> vectorSizes;
  // Maps each AffineForOp vectorization candidate to its vector dimension.
  // The candidate will be vectorized using the vectorization factor in
  // 'vectorSizes' for that dimension.
  DenseMap<Operation *, unsigned> loopToVectorDim;
  // Maps loops that implement vectorizable reductions to the corresponding
  // reduction descriptors.
  ReductionLoopMap reductionLoops;
};

/// Replaces affine store and load accesses with scalars by forwarding stores
/// to loads and eliminating invariant affine loads; consequently, also
/// eliminates dead allocs.
void affineScalarReplace(func::FuncOp f, DominanceInfo &domInfo,
                         PostDominanceInfo &postDomInfo,
                         AliasAnalysis &analysis);
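
// A minimal usage sketch (hedged): the analyses are constructed on the
// function being rewritten; `funcOp` is illustrative.
//
//   DominanceInfo domInfo(funcOp);
//   PostDominanceInfo postDomInfo(funcOp);
//   AliasAnalysis aliasAnalysis(funcOp);
//   affineScalarReplace(funcOp, domInfo, postDomInfo, aliasAnalysis);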

/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
/// 'vectorSizes'. By default, each vectorization factor is applied
/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
/// be optionally used to provide a different loop vectorization order.
/// If `reductionLoops` is not empty, the given reduction loops may be
/// vectorized along the reduction dimension.
/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
void vectorizeAffineLoops(
    Operation *parentOp,
    llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
    ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern,
    const ReductionLoopMap &reductionLoops = ReductionLoopMap());
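
// A minimal usage sketch (hedged): vectorize every affine.for under
// `parentOp` with a factor of 8 on the innermost dimension.
//
//   llvm::DenseSet<Operation *> loops;
//   parentOp->walk([&](AffineForOp forOp) { loops.insert(forOp); });
//   vectorizeAffineLoops(parentOp, loops, /*vectorSizes=*/{8},
//                        /*fastestVaryingPattern=*/{});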

/// External utility to vectorize affine loops from a single loop nest using an
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
/// Loops are provided in a 2D vector container. The first dimension represents
/// the nesting level relative to the loops to be vectorized. The second
/// dimension contains the loops. This means that:
///   a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
///   b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
///
/// For example, for the following loop nest:
///
///   func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
///               %out0: memref<64x128x512xf32>,
///               %out1: memref<64x128x128xf32>) {
///     affine.for %i0 = 0 to 64 {
///       affine.for %i1 = 0 to 128 {
///         affine.for %i2 = 0 to 512 {
///           %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
///           affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
///         }
///         affine.for %i3 = 0 to 128 {
///           %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
///           affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
///         }
///       }
///     }
///     return
///   }
///
/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
/// innermost loops;
/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
/// loops;
/// loops = {{%i2}}, to vectorize only the first innermost loop;
/// loops = {{%i3}}, to vectorize only the second innermost loop;
/// loops = {{%i1}}, to vectorize only the middle loop.
LogicalResult
vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
                        const VectorizationStrategy &strategy);
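
// A minimal usage sketch (hedged), mirroring the 2-D example in the
// VectorizationStrategy documentation above; `i1Loop` and `i2Loop` are the
// illustrative AffineForOps of %i1 and %i2.
//
//   VectorizationStrategy strategy;
//   strategy.vectorSizes = {8, 4};
//   strategy.loopToVectorDim[i1Loop] = 0;
//   strategy.loopToVectorDim[i2Loop] = 1;
//   std::vector<SmallVector<AffineForOp, 2>> loops = {{i1Loop}, {i2Loop}};
//   if (failed(vectorizeAffineLoopNest(loops, strategy)))
//     return failure();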

/// Normalize an affine.parallel op so that lower bounds are 0 and steps are 1.
/// As currently implemented, this transformation cannot fail and will return
/// early if the op is already in a normalized form.
void normalizeAffineParallel(AffineParallelOp op);

/// Normalize an affine.for op. An affine.for op is normalized by converting the
/// lower bound to zero and loop step to one. The upper bound is set to the trip
/// count of the loop. Original loops must have a lower bound with only a single
/// result. There is no such restriction on upper bounds. Returns success if the
/// loop has been normalized (or is already in the normal form). If
/// `promoteSingleIter` is true, the loop is simply promoted if it has a single
/// iteration.
LogicalResult normalizeAffineFor(AffineForOp op,
                                 bool promoteSingleIter = false);

/// Traverse `e` and return an AffineExpr where all occurrences of `dim` have
/// been replaced by either:
///  - `min` if `positivePath` is true when we reach an occurrence of `dim`,
///  - `max` if `positivePath` is false when we reach an occurrence of `dim`.
/// `positivePath` is negated each time we hit a multiplicative or divisive
/// binary op with a constant negative coefficient.
AffineExpr substWithMin(AffineExpr e, AffineExpr dim, AffineExpr min,
                        AffineExpr max, bool positivePath = true);
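
// A hedged worked example (assuming subtraction is modeled as multiplication
// by the constant -1, which negates `positivePath`); `ctx` is an illustrative
// MLIRContext *.
//
//   AffineExpr d0, d1, s0, s1;
//   bindDims(ctx, d0, d1);
//   bindSymbols(ctx, s0, s1);
//   // d1 is reached on a negative path, so it is replaced by `max` (s1).
//   AffineExpr res = substWithMin(d0 - d1, /*dim=*/d1, /*min=*/s0, /*max=*/s1);
//   // res is d0 - s1.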

/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
/// optionally remapping the old memref's indices using the supplied affine map,
/// `indexRemap`. The new memref could be of a different shape or rank.
/// `extraIndices` provides any additional access indices to be added to the
/// start.
///
/// `indexRemap` remaps indices of the old memref access to a new set of indices
/// that are used to index the memref. Additional input operands to indexRemap
/// can be optionally provided in `extraOperands`, and they occupy the start
/// of its input list. `indexRemap`'s dimensional inputs are expected to
/// correspond to the memref's indices, and its symbolic inputs, if any, should
/// be provided in `symbolOperands`.
///
/// `domOpFilter`, if non-null, restricts the replacement to only those
/// operations that are dominated by it; similarly, `postDomOpFilter` restricts
/// replacement to only those operations that are postdominated by it.
///
/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
/// uses of a memref without any requirement for access index rewrites as long
/// as the user operation has the MemRefsNormalizable trait. The default value
/// of this flag is false.
///
/// 'replaceInDeallocOp', if set, allows DeallocOp, a non-dereferencing user, to
/// also be a candidate for replacement. The default value of this flag is
/// false.
///
/// Returns success if the replacement was possible and failure otherwise, e.g.,
/// whenever a memref is used as an operand in a non-dereferencing context and
/// 'allowNonDereferencingOps' is false; dealloc's on the memref are an
/// exception and are left untouched. See comments at function definition for an
/// example.
//
//  Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
//  The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
//  index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
//  d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
//  extra operands, note that 'indexRemap' would just be applied to existing
//  indices (%i, %j).
//  TODO: allow extraIndices to be added at any position.
LogicalResult replaceAllMemRefUsesWith(
    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
    AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
    ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
    Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
    bool replaceInDeallocOp = false);
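
// A minimal sketch of the Ex above (hedged; `oldMemRef`, `newMemRef`, `tMod2`,
// and `ii` are illustrative Values and `ctx` an MLIRContext *):
//
//   AffineExpr d0, d1, d2;
//   bindDims(ctx, d0, d1, d2);
//   // (d0, d1, d2) -> (d0 - d1, d2), where d0 is the extra operand %ii.
//   AffineMap indexRemap = AffineMap::get(/*dimCount=*/3, /*symbolCount=*/0,
//                                         {d0 - d1, d2}, ctx);
//   if (failed(replaceAllMemRefUsesWith(oldMemRef, newMemRef,
//                                       /*extraIndices=*/{tMod2}, indexRemap,
//                                       /*extraOperands=*/{ii})))
//     return failure();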

/// Performs the same replacement as the other version above but only for the
/// dereferencing uses of `oldMemRef` in `op`, except in cases where
/// 'allowNonDereferencingOps' is set to true, in which case the
/// non-dereferencing uses are replaced as well.
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
                                       Operation *op,
                                       ArrayRef<Value> extraIndices = {},
                                       AffineMap indexRemap = AffineMap(),
                                       ArrayRef<Value> extraOperands = {},
                                       ArrayRef<Value> symbolOperands = {},
                                       bool allowNonDereferencingOps = false);

/// Rewrites the memref defined by this alloc op to have an identity layout map
/// and updates all its indexing uses. Returns failure if any of its uses
/// escape (while leaving the IR in a valid state).
template <typename AllocLikeOp>
LogicalResult normalizeMemRef(AllocLikeOp *op);
extern template LogicalResult
normalizeMemRef<memref::AllocaOp>(memref::AllocaOp *op);
extern template LogicalResult
normalizeMemRef<memref::AllocOp>(memref::AllocOp *op);

/// Normalizes `memrefType` so that the affine layout map of the memref is
/// transformed to an identity map with a new shape being computed for the
/// normalized memref type and returns it. The old memref type is simply
/// returned if the normalization failed.
MemRefType normalizeMemRefType(MemRefType memrefType);

/// Given an operation, inserts one or more single-result affine apply
/// operations whose results are exclusively used by this operation.
/// The operands of these newly created affine apply ops are
/// guaranteed to be loop iterators or terminal symbols of a function.
///
/// Before
///
/// affine.for %i = 0 to #map(%N)
///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
///   send %A[%idx], ...
///   %v = "compute"(%idx, ...)
///
/// After
///
/// affine.for %i = 0 to #map(%N)
///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
///   send %A[%idx], ...
///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
///   %v = "compute"(%idx_, ...)
///
/// This allows the application of different transformations on send and
/// compute (e.g., different shifts/delays).
///
/// Fills `sliceOps` with the list of affine.apply operations.
/// In the following cases, `sliceOps` remains empty:
///   1. If none of opInst's operands were the result of an affine.apply
///      (i.e., there was no affine computation slice to create).
///   2. If all the affine.apply ops supplying operands to this opInst did not
///      have any uses other than those in this opInst.
void createAffineComputationSlice(Operation *opInst,
                                  SmallVectorImpl<AffineApplyOp> *sliceOps);

/// Emit code that computes the given affine expression using standard
/// arithmetic operations applied to the provided dimension and symbol values.
Value expandAffineExpr(OpBuilder &builder, Location loc, AffineExpr expr,
                       ValueRange dimValues, ValueRange symbolValues);

/// Create a sequence of operations that implement the `affineMap` applied to
/// the given `operands` (as if it were an AffineApplyOp).
std::optional<SmallVector<Value, 8>> expandAffineMap(OpBuilder &builder,
                                                     Location loc,
                                                     AffineMap affineMap,
                                                     ValueRange operands);
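
// A minimal usage sketch (hedged; `builder`, `loc`, `map`, and `operands`
// are illustrative):
//
//   std::optional<SmallVector<Value, 8>> expanded =
//       expandAffineMap(builder, loc, map, operands);
//   if (!expanded)
//     return failure();
//   Value firstResult = (*expanded)[0];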

/// Holds the result of (div a, b) and (mod a, b).
struct DivModValue {
  Value quotient;
  Value remainder;
};

/// Create IR to calculate (div lhs, rhs) and (mod lhs, rhs).
DivModValue getDivMod(OpBuilder &b, Location loc, Value lhs, Value rhs);
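
// A minimal usage sketch (hedged; `b`, `loc`, `linearIdx`, and `stride` are
// illustrative index-typed values):
//
//   DivModValue divMod = getDivMod(b, loc, linearIdx, stride);
//   Value outer = divMod.quotient;
//   Value inner = divMod.remainder;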

/// Generate the IR to delinearize `linearIndex` given the `basis` and return
/// the multi-index. `hasOuterBound` indicates whether `basis` includes an entry
/// giving the size of the outermost (first) multi-index result: if it is true,
/// the function will return `basis.size()` values, otherwise it will return
/// `basis.size() + 1`.
FailureOr<SmallVector<Value>> delinearizeIndex(OpBuilder &b, Location loc,
                                               Value linearIndex,
                                               ArrayRef<Value> basis,
                                               bool hasOuterBound = true);

FailureOr<SmallVector<Value>> delinearizeIndex(OpBuilder &b, Location loc,
                                               Value linearIndex,
                                               ArrayRef<OpFoldResult> basis,
                                               bool hasOuterBound = true);

/// Generate IR that extracts the linear index from a multi-index according to
/// a basis/shape. The basis may contain either `multiIndex.size()` or
/// `multiIndex.size() - 1` elements.
OpFoldResult linearizeIndex(ArrayRef<OpFoldResult> multiIndex,
                            ArrayRef<OpFoldResult> basis,
                            ImplicitLocOpBuilder &builder);

OpFoldResult linearizeIndex(OpBuilder &builder, Location loc,
                            ArrayRef<OpFoldResult> multiIndex,
                            ArrayRef<OpFoldResult> basis);
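
// A minimal usage sketch (hedged): delinearize a flat index over an 8x16
// inner shape without an outer bound and linearize it back. `b`, `loc`, and
// `linearIdx` are illustrative; getAsOpFoldResult is assumed to come from
// mlir/Dialect/Utils/StaticValueUtils.h.
//
//   SmallVector<OpFoldResult> basis = {b.getIndexAttr(8), b.getIndexAttr(16)};
//   FailureOr<SmallVector<Value>> multiIdx =
//       delinearizeIndex(b, loc, linearIdx, basis, /*hasOuterBound=*/false);
//   if (failed(multiIdx))
//     return failure();
//   // `multiIdx` holds basis.size() + 1 == 3 values here.
//   OpFoldResult flat =
//       linearizeIndex(b, loc, getAsOpFoldResult(*multiIdx), basis);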

/// Ensure that all operations that could be executed after `start`
/// (noninclusive) and prior to `memOp` (e.g. on a control flow/op path
/// between the operations) do not have the potential memory effect
/// `EffectType` on `memOp`. `memOp` is an operation that reads or writes to
/// a memref. For example, if `EffectType` is MemoryEffects::Write, this method
/// will check that there is no write to the memory between `start` and `memOp`
/// that would change the read within `memOp`.
template <typename EffectType, typename T>
bool hasNoInterveningEffect(Operation *start, T memOp,
                            llvm::function_ref<bool(Value, Value)> mayAlias);
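
// A minimal usage sketch (hedged): check that no write may intervene between
// `storeOp` and `readOp`. It assumes `readOp` is an AffineReadOpInterface and
// that an instantiation for these template arguments is available.
//
//   auto mayAlias = [](Value, Value) { return true; }; // conservative stub
//   bool safe =
//       hasNoInterveningEffect<MemoryEffects::Write>(storeOp, readOp, mayAlias);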

/// An AffineExpr paired with the Value or OpFoldResult operand it binds to;
/// used as the operand type of AffineBuilder below.
struct AffineValueExpr {
  explicit AffineValueExpr(AffineExpr e) : e(e) {}
  AffineValueExpr bind(Value v) {
    this->v = v;
    return *this;
  }
  AffineValueExpr bind(OpFoldResult v) {
    this->v = v;
    return *this;
  }
  operator AffineExpr() const { return e; }
  operator OpFoldResult() const { return v; }
  AffineExpr e;
  OpFoldResult v;
};

/// Helper struct to build simple AffineValueExprs with minimal type inference
/// support.
struct AffineBuilder {
  AffineBuilder(OpBuilder &b, Location loc) : b(b), loc(loc) {}
  OpFoldResult add(AffineValueExpr lhs, AffineValueExpr rhs) {
    return makeComposedFoldedAffineApply(b, loc, {lhs.e + rhs.e}, {lhs, rhs});
  }
  OpFoldResult sub(AffineValueExpr lhs, AffineValueExpr rhs) {
    return makeComposedFoldedAffineApply(b, loc, {lhs.e - rhs.e}, {lhs, rhs});
  }
  OpFoldResult mul(AffineValueExpr lhs, AffineValueExpr rhs) {
    return makeComposedFoldedAffineApply(b, loc, {lhs.e * rhs.e}, {lhs, rhs});
  }
  OpFoldResult floor(AffineValueExpr lhs, AffineValueExpr rhs) {
    return makeComposedFoldedAffineApply(b, loc, {lhs.e.floorDiv(rhs.e)},
                                         {lhs, rhs});
  }
  OpFoldResult ceil(AffineValueExpr lhs, AffineValueExpr rhs) {
    return makeComposedFoldedAffineApply(b, loc, {lhs.e.ceilDiv(rhs.e)},
                                         {lhs, rhs});
  }
  OpFoldResult min(ArrayRef<OpFoldResult> vals) {
    return makeComposedFoldedAffineMin(
        b, loc, AffineMap::getMultiDimIdentityMap(vals.size(), b.getContext()),
        vals);
  }
  OpFoldResult max(ArrayRef<OpFoldResult> vals) {
    return makeComposedFoldedAffineMax(
        b, loc, AffineMap::getMultiDimIdentityMap(vals.size(), b.getContext()),
        vals);
  }

private:
  OpBuilder &b;
  Location loc;
};
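
// A minimal usage sketch (hedged; `b`, `loc`, `v0`, and `v1` are illustrative):
//
//   AffineExpr d0, d1;
//   bindDims(b.getContext(), d0, d1);
//   AffineBuilder ab(b, loc);
//   // Emits a folded affine.apply computing v0 + v1.
//   OpFoldResult sum =
//       ab.add(AffineValueExpr(d0).bind(v0), AffineValueExpr(d1).bind(v1));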

} // namespace affine
} // namespace mlir

#endif // MLIR_DIALECT_AFFINE_UTILS_H