1 //===- VectorToSCF.cpp - Convert vector to SCF dialect ----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements lowering of vector transfer operations to SCF.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <numeric>
14 #include <optional>
15 #include <type_traits>
16 
17 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
18 
19 #include "mlir/Dialect/Affine/IR/AffineOps.h"
20 #include "mlir/Dialect/Arith/IR/Arith.h"
21 #include "mlir/Dialect/MemRef/IR/MemRef.h"
22 #include "mlir/Dialect/SCF/IR/SCF.h"
23 #include "mlir/Dialect/Tensor/IR/Tensor.h"
24 #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
25 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
26 #include "mlir/IR/Builders.h"
27 #include "mlir/IR/ImplicitLocOpBuilder.h"
28 #include "mlir/Pass/Pass.h"
29 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
30 #include "mlir/Transforms/Passes.h"
31 
32 namespace mlir {
33 #define GEN_PASS_DEF_CONVERTVECTORTOSCF
34 #include "mlir/Conversion/Passes.h.inc"
35 } // namespace mlir
36 
37 using namespace mlir;
38 using vector::TransferReadOp;
39 using vector::TransferWriteOp;
40 
41 namespace {
42 
43 /// Attribute name used for labeling transfer ops during progressive lowering.
44 static const char kPassLabel[] = "__vector_to_scf_lowering__";
45 
46 /// Patterns that inherit from this struct have access to
47 /// VectorTransferToSCFOptions.
48 template <typename OpTy>
49 struct VectorToSCFPattern : public OpRewritePattern<OpTy> {
50   explicit VectorToSCFPattern(MLIRContext *context,
51                               VectorTransferToSCFOptions opt)
52       : OpRewritePattern<OpTy>(context), options(opt) {}
53 
54   VectorTransferToSCFOptions options;
55 };
56 
57 /// Given a vector transfer op, calculate which dimension of the `source`
58 /// memref should be unpacked in the next application of TransferOpConversion.
59 /// A return value of std::nullopt indicates a broadcast.
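/// E.g. (illustrative): for a permutation map (d0, d1, d2) -> (d2, d1), the
/// unpacked memref dimension is 2; for (d0, d1, d2) -> (0, d1), the leading
/// result is a broadcast and std::nullopt is returned.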
60 template <typename OpTy>
61 static std::optional<int64_t> unpackedDim(OpTy xferOp) {
62   // TODO: support 0-d corner case.
63   assert(xferOp.getTransferRank() > 0 && "unexpected 0-d transfer");
64   auto map = xferOp.getPermutationMap();
65   if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
66     return expr.getPosition();
67   }
68   assert(xferOp.isBroadcastDim(0) &&
69          "Expected AffineDimExpr or AffineConstantExpr");
70   return std::nullopt;
71 }
72 
73 /// Compute the permutation map for the new (N-1)-D vector transfer op. This
74 /// map is identical to the current permutation map, but the first result is
75 /// omitted.
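/// E.g. (illustrative): (d0, d1, d2) -> (d2, d1) becomes (d0, d1, d2) -> (d1).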
76 template <typename OpTy>
77 static AffineMap unpackedPermutationMap(OpBuilder &b, OpTy xferOp) {
78   // TODO: support 0-d corner case.
79   assert(xferOp.getTransferRank() > 0 && "unexpected 0-d transfer");
80   auto map = xferOp.getPermutationMap();
81   return AffineMap::get(map.getNumDims(), 0, map.getResults().drop_front(),
82                         b.getContext());
83 }
84 
85 /// Calculate the indices for the new vector transfer op.
86 ///
87 /// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ...
88 ///       --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3xf32>
89 ///                                 ^^^^^^
90 ///              `iv` is the iteration variable of the (new) surrounding loop.
91 template <typename OpTy>
92 static void getXferIndices(OpBuilder &b, OpTy xferOp, Value iv,
93                            SmallVector<Value, 8> &indices) {
94   typename OpTy::Adaptor adaptor(xferOp);
95   // Corresponding memref dim of the vector dim that is unpacked.
96   auto dim = unpackedDim(xferOp);
97   auto prevIndices = adaptor.getIndices();
98   indices.append(prevIndices.begin(), prevIndices.end());
99 
100   Location loc = xferOp.getLoc();
101   bool isBroadcast = !dim.has_value();
102   if (!isBroadcast) {
103     AffineExpr d0, d1;
104     bindDims(xferOp.getContext(), d0, d1);
105     Value offset = adaptor.getIndices()[*dim];
106     indices[*dim] =
107         affine::makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
108   }
109 }
110 
111 static void maybeYieldValue(OpBuilder &b, Location loc, bool hasRetVal,
112                             Value value) {
113   if (hasRetVal) {
114     assert(value && "Expected non-empty value");
115     b.create<scf::YieldOp>(loc, value);
116   } else {
117     b.create<scf::YieldOp>(loc);
118   }
119 }
120 
121 /// Generates a boolean Value that is true if the iv-th bit in xferOp's mask
122 /// is set to true. No such check is generated in the following cases:
123 /// * xferOp does not have a mask.
124 /// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is
125 ///   computed and attached to the new transfer op in the pattern.)
126 /// * The to-be-unpacked dim of xferOp is a broadcast.
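/// Otherwise, the generated check is a single element extract (illustrative
/// sketch; the mask shape is made up):
/// ```
/// %is_masked_in = vector.extractelement %mask[%iv : index] : vector<16xi1>
/// ```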
127 template <typename OpTy>
128 static Value generateMaskCheck(OpBuilder &b, OpTy xferOp, Value iv) {
129   if (!xferOp.getMask())
130     return Value();
131   if (xferOp.getMaskType().getRank() != 1)
132     return Value();
133   if (xferOp.isBroadcastDim(0))
134     return Value();
135 
136   Location loc = xferOp.getLoc();
137   return b.create<vector::ExtractElementOp>(loc, xferOp.getMask(), iv);
138 }
139 
140 /// Helper function for TransferOpConversion and TransferOp1dConversion.
141 /// Generate an in-bounds check if the transfer op may go out-of-bounds on the
142 /// specified dimension `dim` with the loop iteration variable `iv`.
143 /// E.g., when unpacking dimension 0 from:
144 /// ```
145 /// %vec = vector.transfer_read %A[%a, %b] %cst
146 ///     : vector<5x4xf32>, memref<?x?xf32>
147 /// ```
148 /// An if check similar to this will be generated inside the loop:
149 /// ```
150 /// %d = memref.dim %A, %c0 : memref<?x?xf32>
151 /// if (%a + iv < %d) {
152 ///   (in-bounds case)
153 /// } else {
154 ///   (out-of-bounds case)
155 /// }
156 /// ```
157 ///
158 /// If the transfer is 1D and has a mask, this function generates a more
159 /// complex check that also accounts for potentially masked-out elements.
160 ///
161 /// This function variant returns the value returned by `inBoundsCase` or
162 /// `outOfBoundsCase`. The MLIR type of the return value must be specified in
163 /// `resultTypes`.
164 template <typename OpTy>
165 static Value generateInBoundsCheck(
166     OpBuilder &b, OpTy xferOp, Value iv, std::optional<int64_t> dim,
167     TypeRange resultTypes,
168     function_ref<Value(OpBuilder &, Location)> inBoundsCase,
169     function_ref<Value(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
170   bool hasRetVal = !resultTypes.empty();
171   Value cond; // Condition to be built...
172 
173   // Condition check 1: Access in-bounds?
174   bool isBroadcast = !dim; // No in-bounds check for broadcasts.
175   Location loc = xferOp.getLoc();
176   ImplicitLocOpBuilder lb(xferOp.getLoc(), b);
177   if (!xferOp.isDimInBounds(0) && !isBroadcast) {
178     Value memrefDim =
179         vector::createOrFoldDimOp(b, loc, xferOp.getSource(), *dim);
180     AffineExpr d0, d1;
181     bindDims(xferOp.getContext(), d0, d1);
182     Value base = xferOp.getIndices()[*dim];
183     Value memrefIdx =
184         affine::makeComposedAffineApply(b, loc, d0 + d1, {base, iv});
185     cond = lb.create<arith::CmpIOp>(arith::CmpIPredicate::sgt, memrefDim,
186                                     memrefIdx);
187   }
188 
189   // Condition check 2: Masked in?
190   if (auto maskCond = generateMaskCheck(b, xferOp, iv)) {
191     if (cond)
192       cond = lb.create<arith::AndIOp>(cond, maskCond);
193     else
194       cond = maskCond;
195   }
196 
197   // If the condition is non-empty, generate an SCF::IfOp.
198   if (cond) {
199     auto check = lb.create<scf::IfOp>(
200         cond,
201         /*thenBuilder=*/
202         [&](OpBuilder &b, Location loc) {
203           maybeYieldValue(b, loc, hasRetVal, inBoundsCase(b, loc));
204         },
205         /*elseBuilder=*/
206         [&](OpBuilder &b, Location loc) {
207           if (outOfBoundsCase) {
208             maybeYieldValue(b, loc, hasRetVal, outOfBoundsCase(b, loc));
209           } else {
210             b.create<scf::YieldOp>(loc);
211           }
212         });
213 
214     return hasRetVal ? check.getResult(0) : Value();
215   }
216 
217   // Condition is empty, no need for an SCF::IfOp.
218   return inBoundsCase(b, loc);
219 }
220 
221 /// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have
222 /// a return value. Consequently, this function does not have a return value.
223 template <typename OpTy>
224 static void generateInBoundsCheck(
225     OpBuilder &b, OpTy xferOp, Value iv, std::optional<int64_t> dim,
226     function_ref<void(OpBuilder &, Location)> inBoundsCase,
227     function_ref<void(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
228   generateInBoundsCheck(
229       b, xferOp, iv, dim, /*resultTypes=*/TypeRange(),
230       /*inBoundsCase=*/
231       [&](OpBuilder &b, Location loc) {
232         inBoundsCase(b, loc);
233         return Value();
234       },
235       /*outOfBoundsCase=*/
236       [&](OpBuilder &b, Location loc) {
237         if (outOfBoundsCase)
238           outOfBoundsCase(b, loc);
239         return Value();
240       });
241 }
242 
243 /// Given an ArrayAttr, return a copy where the first element is dropped.
244 static ArrayAttr dropFirstElem(OpBuilder &b, ArrayAttr attr) {
245   if (!attr)
246     return attr;
247   return ArrayAttr::get(b.getContext(), attr.getValue().drop_front());
248 }
249 
250 /// Add the pass label to a vector transfer op if its rank is still greater
251 /// than the target rank, i.e., if it requires further unpacking.
252 template <typename OpTy>
253 static void maybeApplyPassLabel(OpBuilder &b, OpTy newXferOp,
254                                 unsigned targetRank) {
255   if (newXferOp.getVectorType().getRank() > targetRank)
256     newXferOp->setAttr(kPassLabel, b.getUnitAttr());
257 }
258 
259 /// Return true if this transfer op operates on a source tensor.
260 template <typename OpTy>
261 static bool isTensorOp(OpTy xferOp) {
262   if (isa<RankedTensorType>(xferOp.getShapedType())) {
263     if (xferOp.getOperationName().equals(TransferWriteOp::getOperationName())) {
264       // TransferWriteOps on tensors have a result.
265       assert(xferOp->getNumResults() > 0);
266     }
267     return true;
268   }
269   return false;
270 }
271 
272 namespace lowering_n_d {
273 
274 /// Helper data structure for data and mask buffers.
275 struct BufferAllocs {
276   Value dataBuffer;
277   Value maskBuffer;
278 };
279 
280 // TODO: Parallelism and threadlocal considerations with a ParallelScope trait.
281 static Operation *getAutomaticAllocationScope(Operation *op) {
282   Operation *scope =
283       op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
284   assert(scope && "Expected op to be inside automatic allocation scope");
285   return scope;
286 }
287 
288 /// Allocate temporary buffers for data (vector) and mask (if present).
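/// E.g. (illustrative sketch for a transfer on vector<5x4xf32> with a
/// vector<5x4xi1> mask; SSA names are made up):
/// ```
/// %data_buf = memref.alloca() : memref<vector<5x4xf32>>
/// %mask_buf = memref.alloca() : memref<vector<5x4xi1>>
/// ```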
289 template <typename OpTy>
290 static BufferAllocs allocBuffers(OpBuilder &b, OpTy xferOp) {
291   Location loc = xferOp.getLoc();
292   OpBuilder::InsertionGuard guard(b);
293   Operation *scope = getAutomaticAllocationScope(xferOp);
294   assert(scope->getNumRegions() == 1 &&
295          "AutomaticAllocationScope with >1 regions");
296   b.setInsertionPointToStart(&scope->getRegion(0).front());
297 
298   BufferAllocs result;
299   auto bufferType = MemRefType::get({}, xferOp.getVectorType());
300   result.dataBuffer = b.create<memref::AllocaOp>(loc, bufferType);
301 
302   if (xferOp.getMask()) {
303     auto maskType = MemRefType::get({}, xferOp.getMask().getType());
304     auto maskBuffer = b.create<memref::AllocaOp>(loc, maskType);
305     b.setInsertionPoint(xferOp);
306     b.create<memref::StoreOp>(loc, xferOp.getMask(), maskBuffer);
307     result.maskBuffer = b.create<memref::LoadOp>(loc, maskBuffer, ValueRange());
308   }
309 
310   return result;
311 }
312 
313 /// Given a MemRefType with VectorType element type, unpack one dimension from
314 /// the VectorType into the MemRefType.
315 ///
316 /// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>>
317 static FailureOr<MemRefType> unpackOneDim(MemRefType type) {
318   auto vectorType = dyn_cast<VectorType>(type.getElementType());
319   // Vectors with leading scalable dims are not supported.
320   // It may be possible to support these in future by using dynamic memref dims.
321   if (vectorType.getScalableDims().front())
322     return failure();
323   auto memrefShape = type.getShape();
324   SmallVector<int64_t, 8> newMemrefShape;
325   newMemrefShape.append(memrefShape.begin(), memrefShape.end());
326   newMemrefShape.push_back(vectorType.getDimSize(0));
327   return MemRefType::get(newMemrefShape,
328                          VectorType::Builder(vectorType).dropDim(0));
329 }
330 
331 /// Given a transfer op, find the memref from which the mask is loaded. This
332 /// is similar to Strategy<TransferWriteOp>::getBuffer.
333 template <typename OpTy>
334 static Value getMaskBuffer(OpTy xferOp) {
335   assert(xferOp.getMask() && "Expected that transfer op has mask");
336   auto loadOp = xferOp.getMask().template getDefiningOp<memref::LoadOp>();
337   assert(loadOp && "Expected transfer op mask produced by LoadOp");
338   return loadOp.getMemRef();
339 }
340 
341 /// Codegen strategy, depending on the operation.
342 template <typename OpTy>
343 struct Strategy;
344 
345 /// Codegen strategy for vector TransferReadOp.
346 template <>
347 struct Strategy<TransferReadOp> {
348   /// Find the StoreOp that is used for writing the current TransferReadOp's
349   /// result to the temporary buffer allocation.
350   static memref::StoreOp getStoreOp(TransferReadOp xferOp) {
351     assert(xferOp->hasOneUse() && "Expected exactly one use of TransferReadOp");
352     auto storeOp = dyn_cast<memref::StoreOp>((*xferOp->use_begin()).getOwner());
353     assert(storeOp && "Expected TransferReadOp result used by StoreOp");
354     return storeOp;
355   }
356 
357   /// Find the temporary buffer allocation. All labeled TransferReadOps are
358   /// used like this, where %buf is either the buffer allocation or a type cast
359   /// of the buffer allocation:
360   /// ```
361   /// %vec = vector.transfer_read ... { __vector_to_scf_lowering__ } ...
362   /// memref.store %vec, %buf[...] ...
363   /// ```
364   static Value getBuffer(TransferReadOp xferOp) {
365     return getStoreOp(xferOp).getMemRef();
366   }
367 
368   /// Retrieve the indices of the current StoreOp that stores into the buffer.
369   static void getBufferIndices(TransferReadOp xferOp,
370                                SmallVector<Value, 8> &indices) {
371     auto storeOp = getStoreOp(xferOp);
372     auto prevIndices = memref::StoreOpAdaptor(storeOp).getIndices();
373     indices.append(prevIndices.begin(), prevIndices.end());
374   }
375 
376   /// Rewrite the TransferReadOp, assuming that there are no out-of-bounds
377   /// accesses on the to-be-unpacked dimension.
378   ///
379   /// 1. Generate a new (N-1)-d TransferReadOp using the loop iteration
380   ///    variable `iv`.
381   /// 2. Store the result into the (already `vector.type_cast`ed) buffer.
382   ///
383   /// E.g.:
384   /// ```
385   /// %vec = vector.transfer_read %A[%a+%i, %b, %c], %cst
386   ///     : memref<?x?x?xf32>, vector<4x3xf32>
387   /// memref.store %vec, %buf[%i] : memref<5xvector<4x3xf32>>
388   /// ```
389   /// Is rewritten to:
390   /// ```
391   /// %casted = vector.type_cast %buf
392   ///     : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
393   /// for %j = 0 to 4 {
394   ///   %vec = vector.transfer_read %A[%a+%i, %b+%j, %c], %cst
395   ///       : memref<?x?x?xf32>, vector<3xf32>
396   ///   memref.store %vec, %casted[%i, %j] : memref<5x4xvector<3xf32>>
397   /// }
398   /// ```
399   ///
400   /// Note: The loop and type cast are generated in TransferOpConversion.
401   ///       The original TransferReadOp and store op are deleted in `cleanup`.
402   /// Note: The `mask` operand is set in TransferOpConversion.
403   static TransferReadOp rewriteOp(OpBuilder &b,
404                                   VectorTransferToSCFOptions options,
405                                   TransferReadOp xferOp, Value buffer, Value iv,
406                                   ValueRange /*loopState*/) {
407     SmallVector<Value, 8> storeIndices;
408     getBufferIndices(xferOp, storeIndices);
409     storeIndices.push_back(iv);
410 
411     SmallVector<Value, 8> xferIndices;
412     getXferIndices(b, xferOp, iv, xferIndices);
413 
414     Location loc = xferOp.getLoc();
415     auto bufferType = dyn_cast<ShapedType>(buffer.getType());
416     auto vecType = dyn_cast<VectorType>(bufferType.getElementType());
417     auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr());
418     auto newXferOp = b.create<vector::TransferReadOp>(
419         loc, vecType, xferOp.getSource(), xferIndices,
420         AffineMapAttr::get(unpackedPermutationMap(b, xferOp)),
421         xferOp.getPadding(), Value(), inBoundsAttr);
422 
423     maybeApplyPassLabel(b, newXferOp, options.targetRank);
424 
425     b.create<memref::StoreOp>(loc, newXferOp.getVector(), buffer, storeIndices);
426     return newXferOp;
427   }
428 
429   /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write
430   /// padding value to the temporary buffer.
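  /// E.g. (illustrative, continuing the example from `rewriteOp` above):
  /// ```
  /// %pad = vector.splat %cst : vector<3xf32>
  /// memref.store %pad, %casted[%i, %j] : memref<5x4xvector<3xf32>>
  /// ```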
431   static Value handleOutOfBoundsDim(OpBuilder &b, TransferReadOp xferOp,
432                                     Value buffer, Value iv,
433                                     ValueRange /*loopState*/) {
434     SmallVector<Value, 8> storeIndices;
435     getBufferIndices(xferOp, storeIndices);
436     storeIndices.push_back(iv);
437 
438     Location loc = xferOp.getLoc();
439     auto bufferType = dyn_cast<ShapedType>(buffer.getType());
440     auto vecType = dyn_cast<VectorType>(bufferType.getElementType());
441     auto vec = b.create<vector::SplatOp>(loc, vecType, xferOp.getPadding());
442     b.create<memref::StoreOp>(loc, vec, buffer, storeIndices);
443 
444     return Value();
445   }
446 
447   /// Cleanup after rewriting the op.
448   static void cleanup(PatternRewriter &rewriter, TransferReadOp xferOp,
449                       scf::ForOp /*forOp*/) {
450     rewriter.eraseOp(getStoreOp(xferOp));
451     rewriter.eraseOp(xferOp);
452   }
453 
454   /// Return the initial loop state for the generated scf.for loop.
455   static Value initialLoopState(TransferReadOp xferOp) { return Value(); }
456 };
457 
458 /// Codegen strategy for vector TransferWriteOp.
459 template <>
460 struct Strategy<TransferWriteOp> {
461   /// Find the temporary buffer allocation. All labeled TransferWriteOps are
462   /// used like this, where %buf is either the buffer allocation or a type cast
463   /// of the buffer allocation:
464   /// ```
465   /// %vec = memref.load %buf[...] ...
466   /// vector.transfer_write %vec ... { __vector_to_scf_lowering__ } ...
467   /// ```
468   static Value getBuffer(TransferWriteOp xferOp) {
469     auto loadOp = xferOp.getVector().getDefiningOp<memref::LoadOp>();
470     assert(loadOp && "Expected transfer op vector produced by LoadOp");
471     return loadOp.getMemRef();
472   }
473 
474   /// Retrieve the indices of the current LoadOp that loads from the buffer.
475   static void getBufferIndices(TransferWriteOp xferOp,
476                                SmallVector<Value, 8> &indices) {
477     auto loadOp = xferOp.getVector().getDefiningOp<memref::LoadOp>();
478     auto prevIndices = memref::LoadOpAdaptor(loadOp).getIndices();
479     indices.append(prevIndices.begin(), prevIndices.end());
480   }
481 
482   /// Rewrite the TransferWriteOp, assuming that there are no out-of-bounds
483   /// accesses on the to-be-unpacked dimension.
484   ///
485   /// 1. Load an (N-1)-d vector from the (already `vector.type_cast`ed) buffer,
486   ///    using the loop iteration variable `iv`.
487   /// 2. Generate a new (N-1)-d TransferWriteOp, writing the loaded vector back
488   ///    to memory.
489   ///
490   /// Note: For more details, see comments on Strategy<TransferReadOp>.
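  ///
  /// E.g. (illustrative sketch, mirroring the TransferReadOp example):
  /// ```
  /// %vec = memref.load %casted[%i, %j] : memref<5x4xvector<3xf32>>
  /// vector.transfer_write %vec, %A[%a + %i, %b + %j, %c]
  ///     : vector<3xf32>, memref<?x?x?xf32>
  /// ```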
491   static TransferWriteOp rewriteOp(OpBuilder &b,
492                                    VectorTransferToSCFOptions options,
493                                    TransferWriteOp xferOp, Value buffer,
494                                    Value iv, ValueRange loopState) {
495     SmallVector<Value, 8> loadIndices;
496     getBufferIndices(xferOp, loadIndices);
497     loadIndices.push_back(iv);
498 
499     SmallVector<Value, 8> xferIndices;
500     getXferIndices(b, xferOp, iv, xferIndices);
501 
502     Location loc = xferOp.getLoc();
503     auto vec = b.create<memref::LoadOp>(loc, buffer, loadIndices);
504     auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr());
505     auto source = loopState.empty() ? xferOp.getSource() : loopState[0];
506     Type type = isTensorOp(xferOp) ? xferOp.getShapedType() : Type();
507     auto newXferOp = b.create<vector::TransferWriteOp>(
508         loc, type, vec, source, xferIndices,
509         AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
510         inBoundsAttr);
511 
512     maybeApplyPassLabel(b, newXferOp, options.targetRank);
513 
514     return newXferOp;
515   }
516 
517   /// Handle out-of-bounds accesses on the to-be-unpacked dimension.
518   static Value handleOutOfBoundsDim(OpBuilder &b, TransferWriteOp xferOp,
519                                     Value buffer, Value iv,
520                                     ValueRange loopState) {
521     return isTensorOp(xferOp) ? loopState[0] : Value();
522   }
523 
524   /// Cleanup after rewriting the op.
525   static void cleanup(PatternRewriter &rewriter, TransferWriteOp xferOp,
526                       scf::ForOp forOp) {
527     if (isTensorOp(xferOp)) {
528       assert(forOp->getNumResults() == 1 && "Expected one for loop result");
529       rewriter.replaceOp(xferOp, forOp->getResult(0));
530     } else {
531       rewriter.eraseOp(xferOp);
532     }
533   }
534 
535   /// Return the initial loop state for the generated scf.for loop.
536   static Value initialLoopState(TransferWriteOp xferOp) {
537     return isTensorOp(xferOp) ? xferOp.getSource() : Value();
538   }
539 };
540 
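/// Return "success" if the given transfer op is eligible for preparation: it
/// is not yet labeled, its vector rank exceeds the target rank, its leading
/// vector dimension is not scalable, it does not change the element type, and
/// (for tensor transfers) lowering of tensor transfers is enabled.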
541 template <typename OpTy>
542 LogicalResult checkPrepareXferOp(OpTy xferOp,
543                                  VectorTransferToSCFOptions options) {
544   if (xferOp->hasAttr(kPassLabel))
545     return failure();
546   if (xferOp.getVectorType().getRank() <= options.targetRank)
547     return failure();
548   // Currently the unpacking of the leading dimension into the memref is not
549   // supported for scalable dimensions.
550   if (xferOp.getVectorType().getScalableDims().front())
551     return failure();
552   if (isTensorOp(xferOp) && !options.lowerTensors)
553     return failure();
554   // Transfer ops that modify the element type are not supported atm.
555   if (xferOp.getVectorType().getElementType() !=
556       xferOp.getShapedType().getElementType())
557     return failure();
558   return success();
559 }
560 
561 /// Prepare a TransferReadOp for progressive lowering.
562 ///
563 /// 1. Allocate a temporary buffer.
564 /// 2. Label the TransferReadOp, marking it eligible for progressive lowering.
565 /// 3. Store the result of the TransferReadOp into the temporary buffer.
566 /// 4. Load the result from the temporary buffer and replace all uses of the
567 ///    original TransferReadOp with this load.
568 ///
569 /// E.g.:
570 /// ```
571 /// %vec = vector.transfer_read %A[%a, %b, %c], %cst
572 ///     : vector<5x4xf32>, memref<?x?x?xf32>
573 /// ```
574 /// is rewritten to:
575 /// ```
576 /// %0 = memref.alloca() : memref<vector<5x4xf32>>
577 /// %1 = vector.transfer_read %A[%a, %b, %c], %cst
578 ///     { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref<?x?x?xf32>
579 /// memref.store %1, %0[] : memref<vector<5x4xf32>>
580 /// %vec = memref.load %0[] : memref<vector<5x4xf32>>
581 /// ```
582 ///
583 /// Note: A second temporary buffer may be allocated for the `mask` operand.
584 struct PrepareTransferReadConversion
585     : public VectorToSCFPattern<TransferReadOp> {
586   using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;
587 
588   LogicalResult matchAndRewrite(TransferReadOp xferOp,
589                                 PatternRewriter &rewriter) const override {
590     if (checkPrepareXferOp(xferOp, options).failed())
591       return failure();
592 
593     auto buffers = allocBuffers(rewriter, xferOp);
594     auto *newXfer = rewriter.clone(*xferOp.getOperation());
595     newXfer->setAttr(kPassLabel, rewriter.getUnitAttr());
596     if (xferOp.getMask()) {
597       dyn_cast<TransferReadOp>(newXfer).getMaskMutable().assign(
598           buffers.maskBuffer);
599     }
600 
601     Location loc = xferOp.getLoc();
602     rewriter.create<memref::StoreOp>(loc, newXfer->getResult(0),
603                                      buffers.dataBuffer);
604     rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffers.dataBuffer);
605 
606     return success();
607   }
608 };
609 
610 /// Prepare a TransferWriteOp for progressive lowering.
611 ///
612 /// 1. Allocate a temporary buffer.
613 /// 2. Store the vector into the buffer.
614 /// 3. Load the vector from the buffer again.
615 /// 4. Use the loaded vector as a TransferWriteOp operand and label the op,
616 ///    marking it eligible for progressive lowering via TransferOpConversion.
617 ///
618 /// E.g.:
619 /// ```
620 /// vector.transfer_write %vec, %A[%a, %b, %c]
621 ///     : vector<5x4xf32>, memref<?x?x?xf32>
622 /// ```
623 /// is rewritten to:
624 /// ```
625 /// %0 = memref.alloca() : memref<vector<5x4xf32>>
626 /// memref.store %vec, %0[] : memref<vector<5x4xf32>>
627 /// %1 = memref.load %0[] : memref<vector<5x4xf32>>
628 /// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
629 ///     : vector<5x4xf32>, memref<?x?x?xf32>
630 /// ```
631 ///
632 /// Note: A second temporary buffer may be allocated for the `mask` operand.
633 struct PrepareTransferWriteConversion
634     : public VectorToSCFPattern<TransferWriteOp> {
635   using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;
636 
637   LogicalResult matchAndRewrite(TransferWriteOp xferOp,
638                                 PatternRewriter &rewriter) const override {
639     if (checkPrepareXferOp(xferOp, options).failed())
640       return failure();
641 
642     Location loc = xferOp.getLoc();
643     auto buffers = allocBuffers(rewriter, xferOp);
644     rewriter.create<memref::StoreOp>(loc, xferOp.getVector(),
645                                      buffers.dataBuffer);
646     auto loadedVec = rewriter.create<memref::LoadOp>(loc, buffers.dataBuffer);
647     rewriter.updateRootInPlace(xferOp, [&]() {
648       xferOp.getVectorMutable().assign(loadedVec);
649       xferOp->setAttr(kPassLabel, rewriter.getUnitAttr());
650     });
651 
652     if (xferOp.getMask()) {
653       rewriter.updateRootInPlace(xferOp, [&]() {
654         xferOp.getMaskMutable().assign(buffers.maskBuffer);
655       });
656     }
657 
658     return success();
659   }
660 };
661 
662 /// Decompose an n-D PrintOp into a loop of elementary/scalar prints. This allows
663 /// printing both 1D scalable vectors and n-D fixed size vectors.
664 ///
665 /// E.g.:
666 /// ```
667 /// vector.print %v : vector<[4]xi32>
668 /// ```
669 /// is rewritten to:
670 /// ```
671 /// %c0 = arith.constant 0 : index
672 /// %c4 = arith.constant 4 : index
673 /// %c1 = arith.constant 1 : index
674 /// %vscale = vector.vscale
675 /// %length = arith.muli %vscale, %c4 : index
676 /// %lastIndex = arith.subi %length, %c1 : index
677 /// vector.print punctuation <open>
678 /// scf.for %i = %c0 to %length step %c1 {
679 ///   %el = vector.extractelement %v[%i : index] : vector<[4]xi32>
680 ///   vector.print %el : i32 punctuation <no_punctuation>
681 ///   %notLastIndex = arith.cmpi ult, %i, %lastIndex : index
682 ///   scf.if %notLastIndex {
683 ///     vector.print punctuation <comma>
684 ///   }
685 /// }
686 /// vector.print punctuation <close>
687 /// vector.print
688 /// ```
689 struct DecomposePrintOpConversion : public VectorToSCFPattern<vector::PrintOp> {
690   using VectorToSCFPattern<vector::PrintOp>::VectorToSCFPattern;
691   LogicalResult matchAndRewrite(vector::PrintOp printOp,
692                                 PatternRewriter &rewriter) const override {
693     if (!printOp.getSource())
694       return failure();
695 
696     VectorType vectorType = dyn_cast<VectorType>(printOp.getPrintType());
697     if (!vectorType)
698       return failure();
699 
700     // Currently >= 2D scalable vectors are not supported.
701     // These can't be lowered to LLVM (as LLVM does not support scalable vectors
702     // of scalable vectors), and due to limitations of current ops can't be
703     // indexed with SSA values or flattened. This may change after
704     // https://reviews.llvm.org/D155034, though there still needs to be a path
705     // for lowering to LLVM.
706     if (vectorType.getRank() > 1 && vectorType.isScalable())
707       return failure();
708 
709     auto loc = printOp.getLoc();
710     auto value = printOp.getSource();
711 
712     if (auto intTy = dyn_cast<IntegerType>(vectorType.getElementType())) {
713       // Oddly sized integers are (somewhat) buggy on a lot of backends, so
714       // extend them to a more standard size to avoid issues.
715       // https://github.com/llvm/llvm-project/issues/30613
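      // E.g. (illustrative): a vector<4xui3> is bitcast to the signless
      // vector<4xi3>, zero-extended to vector<4xi8>, and bitcast back to
      // vector<4xui8> before printing.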
716       auto width = intTy.getWidth();
717       auto legalWidth = llvm::NextPowerOf2(std::max(8u, width) - 1);
718       auto legalIntTy = IntegerType::get(rewriter.getContext(), legalWidth,
719                                          intTy.getSignedness());
720       // arith can only take signless integers, so we must cast back and forth.
721       auto signlessSourceVectorType =
722           vectorType.cloneWith({}, getIntTypeWithSignlessSemantics(intTy));
723       auto signlessTargetVectorType =
724           vectorType.cloneWith({}, getIntTypeWithSignlessSemantics(legalIntTy));
725       auto targetVectorType = vectorType.cloneWith({}, legalIntTy);
726       value = rewriter.create<vector::BitCastOp>(loc, signlessSourceVectorType,
727                                                  value);
728       if (width == 1 || intTy.isUnsigned())
729         value = rewriter.create<arith::ExtUIOp>(loc, signlessTargetVectorType,
730                                                 value);
731       else
732         value = rewriter.create<arith::ExtSIOp>(loc, signlessTargetVectorType,
733                                                 value);
734       value = rewriter.create<vector::BitCastOp>(loc, targetVectorType, value);
735       vectorType = targetVectorType;
736     }
737 
738     auto scalableDimensions = vectorType.getScalableDims();
739     auto shape = vectorType.getShape();
740     constexpr int64_t singletonShape[] = {1};
741     if (vectorType.getRank() == 0)
742       shape = singletonShape;
743 
744     if (vectorType.getRank() != 1) {
745       // Flatten n-D vectors to 1D. This is done to allow indexing with a
746       // non-constant value (which can currently only be done via
747       // vector.extractelement for 1D vectors).
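      // E.g. (illustrative): a vector<3x4xi32> is reshaped to vector<12xi32>
      // with vector.shape_cast; elements are then addressed by a flat index.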
748       auto flatLength = std::accumulate(shape.begin(), shape.end(), 1,
749                                         std::multiplies<int64_t>());
750       auto flatVectorType =
751           VectorType::get({flatLength}, vectorType.getElementType());
752       value = rewriter.create<vector::ShapeCastOp>(loc, flatVectorType, value);
753     }
754 
755     vector::PrintOp firstClose;
756     SmallVector<Value, 8> loopIndices;
757     for (unsigned d = 0; d < shape.size(); d++) {
758       // Set up loop bounds and step.
759       Value lowerBound = rewriter.create<arith::ConstantIndexOp>(loc, 0);
760       Value upperBound = rewriter.create<arith::ConstantIndexOp>(loc, shape[d]);
761       Value step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
762       if (!scalableDimensions.empty() && scalableDimensions[d]) {
763         auto vscale = rewriter.create<vector::VectorScaleOp>(
764             loc, rewriter.getIndexType());
765         upperBound = rewriter.create<arith::MulIOp>(loc, upperBound, vscale);
766       }
767       auto lastIndex = rewriter.create<arith::SubIOp>(loc, upperBound, step);
768 
769       // Create a loop to print the elements surrounded by parentheses.
770       rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Open);
771       auto loop =
772           rewriter.create<scf::ForOp>(loc, lowerBound, upperBound, step);
773       auto printClose = rewriter.create<vector::PrintOp>(
774           loc, vector::PrintPunctuation::Close);
775       if (!firstClose)
776         firstClose = printClose;
777 
778       auto loopIdx = loop.getInductionVar();
779       loopIndices.push_back(loopIdx);
780 
781       // Print a comma after all but the last element.
782       rewriter.setInsertionPointToStart(loop.getBody());
783       auto notLastIndex = rewriter.create<arith::CmpIOp>(
784           loc, arith::CmpIPredicate::ult, loopIdx, lastIndex);
785       rewriter.create<scf::IfOp>(loc, notLastIndex,
786                                  [&](OpBuilder &builder, Location loc) {
787                                    builder.create<vector::PrintOp>(
788                                        loc, vector::PrintPunctuation::Comma);
789                                    builder.create<scf::YieldOp>(loc);
790                                  });
791 
792       rewriter.setInsertionPointToStart(loop.getBody());
793     }
794 
795     // Compute the flattened index.
796     // Note: For vectors of rank > 1, this assumes non-scalable dimensions.
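    // E.g. (illustrative): for shape [3, 4] and loop indices (%i, %j), the
    // flat index is computed as %i * 4 + %j * 1.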
797     Value flatIndex;
798     auto currentStride = 1;
799     for (int d = shape.size() - 1; d >= 0; d--) {
800       auto stride = rewriter.create<arith::ConstantIndexOp>(loc, currentStride);
801       auto index = rewriter.create<arith::MulIOp>(loc, stride, loopIndices[d]);
802       if (flatIndex)
803         flatIndex = rewriter.create<arith::AddIOp>(loc, flatIndex, index);
804       else
805         flatIndex = index;
806       currentStride *= shape[d];
807     }
808 
809     // Print the scalar elements in the innermost loop.
810     auto element =
811         rewriter.create<vector::ExtractElementOp>(loc, value, flatIndex);
812     rewriter.create<vector::PrintOp>(loc, element,
813                                      vector::PrintPunctuation::NoPunctuation);
814 
815     rewriter.setInsertionPointAfter(firstClose);
816     rewriter.create<vector::PrintOp>(loc, printOp.getPunctuation());
817     rewriter.eraseOp(printOp);
818     return success();
819   }
820 
821   static IntegerType getIntTypeWithSignlessSemantics(IntegerType intTy) {
822     return IntegerType::get(intTy.getContext(), intTy.getWidth(),
823                             IntegerType::Signless);
824   };
825 };
826 
827 /// Progressive lowering of vector transfer ops: Unpack one dimension.
828 ///
829 /// 1. Unpack one dimension from the current buffer type and cast the buffer
830 ///    to that new type. E.g.:
831 ///    ```
832 ///    %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>>
833 ///    vector.transfer_write %vec ...
834 ///    ```
835 ///    The following cast is generated:
836 ///    ```
837 ///    %casted = vector.type_cast %0
838 ///        : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
839 ///    ```
840 /// 2. Generate a for loop and rewrite the transfer op according to the
841 ///    corresponding Strategy<OpTy>. If the to-be-unpacked dimension can be
842 ///    out-of-bounds, generate an if-check and handle both cases separately.
843 /// 3. Clean up according to the corresponding Strategy<OpTy>.
844 ///
845 /// Note: If the transfer op is a TransferWriteOp and operates on a tensor
846 /// source (as opposed to a memref source), then each iteration of the generated
847 /// scf.for loop yields the new tensor value. E.g.:
848 /// ```
849 /// %result = scf.for i = 0 to 5 {
850 ///   %0 = memref.load %buffer[i] : memref<5xvector<4x3xf32>>
851 ///   %1 = vector.transfer_write %0, %source[...]
852 ///       : vector<4x3xf32>, tensor<5x4x3xf32>
853 ///   scf.yield %1 : tensor<5x4x3xf32>
854 /// }
855 /// ```
856 template <typename OpTy>
857 struct TransferOpConversion : public VectorToSCFPattern<OpTy> {
858   using VectorToSCFPattern<OpTy>::VectorToSCFPattern;
859 
860   void initialize() {
861     // This pattern recursively unpacks one dimension at a time. The
862     // recursion is bounded because the rank is strictly decreasing.
863     this->setHasBoundedRewriteRecursion();
864   }
865 
866   LogicalResult matchAndRewrite(OpTy xferOp,
867                                 PatternRewriter &rewriter) const override {
868     if (!xferOp->hasAttr(kPassLabel))
869       return failure();
870 
871     // Find and cast data buffer. How the buffer can be found depends on OpTy.
872     ImplicitLocOpBuilder locB(xferOp.getLoc(), rewriter);
873     auto dataBuffer = Strategy<OpTy>::getBuffer(xferOp);
874     auto dataBufferType = dyn_cast<MemRefType>(dataBuffer.getType());
875     auto castedDataType = unpackOneDim(dataBufferType);
876     if (failed(castedDataType))
877       return failure();
878 
879     auto castedDataBuffer =
880         locB.create<vector::TypeCastOp>(*castedDataType, dataBuffer);
881 
882     // If the xferOp has a mask: Find and cast mask buffer.
883     Value castedMaskBuffer;
884     if (xferOp.getMask()) {
885       auto maskBuffer = getMaskBuffer(xferOp);
886       auto maskBufferType = dyn_cast<MemRefType>(maskBuffer.getType());
887       if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) {
888         // Do not unpack a dimension of the mask, if:
889         // * To-be-unpacked transfer op dimension is a broadcast.
890         // * Mask is 1D, i.e., the mask cannot be further unpacked.
891         //   (That means that all remaining dimensions of the transfer op must
892         //   be broadcasted.)
893         castedMaskBuffer = maskBuffer;
894       } else {
895         // It's safe to assume the mask buffer can be unpacked if the data
896         // buffer was unpacked.
897         auto castedMaskType = *unpackOneDim(maskBufferType);
898         castedMaskBuffer =
899             locB.create<vector::TypeCastOp>(castedMaskType, maskBuffer);
900       }
901     }
902 
903     // Loop bounds and step.
904     auto lb = locB.create<arith::ConstantIndexOp>(0);
905     auto ub = locB.create<arith::ConstantIndexOp>(
906         castedDataType->getDimSize(castedDataType->getRank() - 1));
907     auto step = locB.create<arith::ConstantIndexOp>(1);
908     // TransferWriteOps that operate on tensors return the modified tensor and
909     // require a loop state.
910     auto loopState = Strategy<OpTy>::initialLoopState(xferOp);
911 
912     // Generate for loop.
913     auto result = locB.create<scf::ForOp>(
914         lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(),
915         [&](OpBuilder &b, Location loc, Value iv, ValueRange loopState) {
916           Type stateType = loopState.empty() ? Type() : loopState[0].getType();
917 
918           auto result = generateInBoundsCheck(
919               b, xferOp, iv, unpackedDim(xferOp),
920               stateType ? TypeRange(stateType) : TypeRange(),
921               /*inBoundsCase=*/
922               [&](OpBuilder &b, Location loc) {
923                 // Create new transfer op.
924                 OpTy newXfer = Strategy<OpTy>::rewriteOp(
925                     b, this->options, xferOp, castedDataBuffer, iv, loopState);
926 
927                 // If the old transfer op has a mask: Set the mask on the
928                 // new transfer op. Special case: If the mask of the old
929                 // transfer op is 1D and the unpacked dim is not a
930                 // broadcast, no mask is needed on the new transfer op
931                 // (generateInBoundsCheck has already evaluated the mask).
932                 if (xferOp.getMask() && (xferOp.isBroadcastDim(0) ||
933                                          xferOp.getMaskType().getRank() > 1)) {
934                   OpBuilder::InsertionGuard guard(b);
935                   b.setInsertionPoint(newXfer); // Insert load before newXfer.
936 
937                   SmallVector<Value, 8> loadIndices;
938                   Strategy<OpTy>::getBufferIndices(xferOp, loadIndices);
939                   // In case of broadcast: Use same indices to load from memref
940                   // as before.
941                   if (!xferOp.isBroadcastDim(0))
942                     loadIndices.push_back(iv);
943 
944                   auto mask = b.create<memref::LoadOp>(loc, castedMaskBuffer,
945                                                        loadIndices);
946                   rewriter.updateRootInPlace(newXfer, [&]() {
947                     newXfer.getMaskMutable().assign(mask);
948                   });
949                 }
950 
951                 return loopState.empty() ? Value() : newXfer->getResult(0);
952               },
953               /*outOfBoundsCase=*/
954               [&](OpBuilder &b, Location /*loc*/) {
955                 return Strategy<OpTy>::handleOutOfBoundsDim(
956                     b, xferOp, castedDataBuffer, iv, loopState);
957               });
958 
959           maybeYieldValue(b, loc, !loopState.empty(), result);
960         });
961 
962     Strategy<OpTy>::cleanup(rewriter, xferOp, result);
963     return success();
964   }
965 };
966 
967 } // namespace lowering_n_d
968 
969 namespace lowering_n_d_unrolled {
970 
971 /// If the original transfer op has a mask, compute the mask of the new transfer
972 /// op (for the current iteration `i`) and assign it.
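/// E.g. (illustrative): for a vector<5x4xi1> mask and i = 2, the new mask is
/// the third row of the old mask, extracted as a vector<4xi1>; for a 1-D,
/// non-broadcast mask, nothing is assigned because generateInBoundsCheck has
/// already evaluated the mask.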
973 template <typename OpTy>
974 static void maybeAssignMask(OpBuilder &b, OpTy xferOp, OpTy newXferOp,
975                             int64_t i) {
976   if (!xferOp.getMask())
977     return;
978 
979   if (xferOp.isBroadcastDim(0)) {
980     // To-be-unpacked dimension is a broadcast, which does not have a
981     // corresponding mask dimension. Mask attribute remains unchanged.
982     newXferOp.getMaskMutable().assign(xferOp.getMask());
983     return;
984   }
985 
986   if (xferOp.getMaskType().getRank() > 1) {
987     // Unpack one dimension of the mask.
988     OpBuilder::InsertionGuard guard(b);
989     b.setInsertionPoint(newXferOp); // Insert load before newXfer.
990 
991     llvm::SmallVector<int64_t, 1> indices({i});
992     Location loc = xferOp.getLoc();
993     auto newMask = b.create<vector::ExtractOp>(loc, xferOp.getMask(), indices);
994     newXferOp.getMaskMutable().assign(newMask);
995   }
996 
997   // If we end up here: The mask of the old transfer op is 1D and the unpacked
998   // dim is not a broadcast, so no mask is needed on the new transfer op.
999   // `generateInBoundsCheck` will have evaluated the mask already.
1000 }
1001 
1002 /// Progressive lowering of vector TransferReadOp with unrolling: Unpack one
1003 /// dimension. This is similar to TransferOpConversion<TransferReadOp>, but no
1004 /// memref buffer is allocated and the SCF loop is fully unrolled.
1005 ///
1006 ///
1007 /// E.g.:
1008 /// ```
1009 /// %vec = vector.transfer_read %A[%a, %b, %c], %padding
1010 ///     : memref<?x?x?xf32>, vector<5x4xf32>
1011 /// ```
1012 /// is rewritten to IR such as (simplified):
1013 /// ```
1014 /// %v_init = splat %padding : vector<5x4xf32>
1015 /// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding
1016 ///     : memref<?x?x?xf32>, vector<4xf32>
1017 /// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32>
1018 /// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding
1019 ///     : memref<?x?x?xf32>, vector<4xf32>
1020 /// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32>
1021 /// ...
1022 /// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding
1023 ///     : memref<?x?x?xf32>, vector<4xf32>
1024 /// %vec = vector.insert %tmp4, %v3[4] : vector<4xf32> into vector<5x4xf32>
1025 /// ```
1026 ///
1027 /// Note: As an optimization, if the result of the original TransferReadOp
1028 /// was directly inserted into another vector, no new %v_init vector is created.
1029 /// Instead, the new TransferReadOp results are inserted into that vector.
1030 struct UnrollTransferReadConversion
1031     : public VectorToSCFPattern<TransferReadOp> {
1032   using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;
1033 
1034   void initialize() {
1035     // This pattern recursively unpacks one dimension at a time. The
1036     // recursion is bounded because the rank is strictly decreasing.
1037     setHasBoundedRewriteRecursion();
1038   }
1039 
1040   /// Return the vector into which the newly created TransferReadOp results
1041   /// are inserted.
1042   Value getResultVector(TransferReadOp xferOp,
1043                         PatternRewriter &rewriter) const {
1044     if (auto insertOp = getInsertOp(xferOp))
1045       return insertOp.getDest();
1046     Location loc = xferOp.getLoc();
1047     return rewriter.create<vector::SplatOp>(loc, xferOp.getVectorType(),
1048                                             xferOp.getPadding());
1049   }
1050 
1051   /// If the result of the TransferReadOp has exactly one user, which is a
1052   /// vector::InsertOp, return that operation.
1053   vector::InsertOp getInsertOp(TransferReadOp xferOp) const {
1054     if (xferOp->hasOneUse()) {
1055       Operation *xferOpUser = *xferOp->getUsers().begin();
1056       if (auto insertOp = dyn_cast<vector::InsertOp>(xferOpUser))
1057         return insertOp;
1058     }
1059 
1060     return vector::InsertOp();
1061   }
1062 
1063   /// If the result of the TransferReadOp has exactly one user, which is a
1064   /// vector::InsertOp, return that operation's indices.
1065   void getInsertionIndices(TransferReadOp xferOp,
1066                            SmallVector<int64_t, 8> &indices) const {
1067     if (auto insertOp = getInsertOp(xferOp))
1068       indices.assign(insertOp.getPosition().begin(),
1069                      insertOp.getPosition().end());
1070   }
1071 
1072   /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
1073   /// accesses, and broadcasts and transposes in permutation maps.
1074   LogicalResult matchAndRewrite(TransferReadOp xferOp,
1075                                 PatternRewriter &rewriter) const override {
1076     if (xferOp.getVectorType().getRank() <= options.targetRank)
1077       return failure();
1078     if (isTensorOp(xferOp) && !options.lowerTensors)
1079       return failure();
1080     // Transfer ops that modify the element type are not supported atm.
1081     if (xferOp.getVectorType().getElementType() !=
1082         xferOp.getShapedType().getElementType())
1083       return failure();
1084 
1085     auto insertOp = getInsertOp(xferOp);
1086     auto vec = getResultVector(xferOp, rewriter);
1087     auto vecType = dyn_cast<VectorType>(vec.getType());
1088     auto xferVecType = xferOp.getVectorType();
1089 
1090     if (xferVecType.getScalableDims()[0]) {
1091       // Cannot unroll a scalable dimension at compile time.
1092       return failure();
1093     }
1094 
1095     VectorType newXferVecType = VectorType::Builder(xferVecType).dropDim(0);
1096 
1097     int64_t dimSize = xferVecType.getShape()[0];
1098 
1099     // Generate fully unrolled loop of transfer ops.
1100     Location loc = xferOp.getLoc();
1101     for (int64_t i = 0; i < dimSize; ++i) {
1102       Value iv = rewriter.create<arith::ConstantIndexOp>(loc, i);
1103 
1104       vec = generateInBoundsCheck(
1105           rewriter, xferOp, iv, unpackedDim(xferOp), TypeRange(vecType),
1106           /*inBoundsCase=*/
1107           [&](OpBuilder &b, Location loc) {
1108             // Indices for the new transfer op.
1109             SmallVector<Value, 8> xferIndices;
1110             getXferIndices(b, xferOp, iv, xferIndices);
1111 
1112             // Indices for the new vector.insert op.
1113             SmallVector<int64_t, 8> insertionIndices;
1114             getInsertionIndices(xferOp, insertionIndices);
1115             insertionIndices.push_back(i);
1116 
1117             auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr());
1118             auto newXferOp = b.create<vector::TransferReadOp>(
1119                 loc, newXferVecType, xferOp.getSource(), xferIndices,
1120                 AffineMapAttr::get(unpackedPermutationMap(b, xferOp)),
1121                 xferOp.getPadding(), Value(), inBoundsAttr);
1122             maybeAssignMask(b, xferOp, newXferOp, i);
1123             return b.create<vector::InsertOp>(loc, newXferOp, vec,
1124                                               insertionIndices);
1125           },
1126           /*outOfBoundsCase=*/
1127           [&](OpBuilder &b, Location loc) {
1128             // Out of bounds: Forward the original vector unmodified.
1129             return vec;
1130           });
1131     }
1132 
1133     if (insertOp) {
1134       // Rewrite single user of the old TransferReadOp, which was an InsertOp.
1135       rewriter.replaceOp(insertOp, vec);
1136       rewriter.eraseOp(xferOp);
1137     } else {
1138       rewriter.replaceOp(xferOp, vec);
1139     }
1140 
1141     return success();
1142   }
1143 };
1144 
1145 /// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one
1146 /// dimension. This is similar to TransferOpConversion<TransferWriteOp>, but no
1147 /// memref buffer is allocated and the SCF loop is fully unrolled.
1148 ///
1149 ///
1150 /// E.g.:
1151 /// ```
1152 /// vector.transfer_write %vec, %A[%a, %b, %c]
1153 ///     : vector<5x4xf32>, memref<?x?x?xf32>
1154 /// ```
1155 /// is rewritten to IR such as (simplified):
1156 /// ```
1157 /// %v0 = vector.extract %vec[0] : vector<5x4xf32>
1158 /// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...>
1159 /// %v1 = vector.extract %vec[1] : vector<5x4xf32>
1160 /// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...>
1161 /// ...
1162 /// %v4 = vector.extract %vec[4] : vector<5x4xf32>
1163 /// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...>
1164 /// ```
1165 ///
1166 /// Note: As an optimization, if the vector of the original TransferWriteOp
1167 /// was directly extracted from another vector via an ExtractOp `a`, extract
1168 /// the vectors for the newly generated TransferWriteOps from `a`'s input. By
1169 /// doing so, `a` may become dead, and the number of ExtractOps generated during
1170 /// recursive application of this pattern will be minimal.
1171 struct UnrollTransferWriteConversion
1172     : public VectorToSCFPattern<TransferWriteOp> {
1173   using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;
1174 
1175   void initialize() {
1176     // This pattern recursively unpacks one dimension at a time. The
1177     // recursion is bounded because the rank is strictly decreasing.
1178     setHasBoundedRewriteRecursion();
1179   }
1180 
1181   /// Return the vector from which newly generated ExtractOps will extract.
1182   Value getDataVector(TransferWriteOp xferOp) const {
1183     if (auto extractOp = getExtractOp(xferOp))
1184       return extractOp.getVector();
1185     return xferOp.getVector();
1186   }
1187 
1188   /// If the input of the given TransferWriteOp is an ExtractOp, return it.
1189   vector::ExtractOp getExtractOp(TransferWriteOp xferOp) const {
1190     if (auto *op = xferOp.getVector().getDefiningOp())
1191       return dyn_cast<vector::ExtractOp>(op);
1192     return vector::ExtractOp();
1193   }
1194 
1195   /// If the input of the given TransferWriteOp is an ExtractOp, return its
1196   /// indices.
1197   void getExtractionIndices(TransferWriteOp xferOp,
1198                             SmallVector<int64_t, 8> &indices) const {
1199     if (auto extractOp = getExtractOp(xferOp))
1200       indices.assign(extractOp.getPosition().begin(),
1201                      extractOp.getPosition().end());
1202   }
1203 
1204   /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
1205   /// accesses, and broadcasts and transposes in permutation maps.
1206   LogicalResult matchAndRewrite(TransferWriteOp xferOp,
1207                                 PatternRewriter &rewriter) const override {
1208     if (xferOp.getVectorType().getRank() <= options.targetRank)
1209       return failure();
1210     if (isTensorOp(xferOp) && !options.lowerTensors)
1211       return failure();
1212     // Transfer ops that modify the element type are not supported atm.
1213     if (xferOp.getVectorType().getElementType() !=
1214         xferOp.getShapedType().getElementType())
1215       return failure();
1216 
1217     auto vec = getDataVector(xferOp);
1218     auto xferVecType = xferOp.getVectorType();
1219     int64_t dimSize = xferVecType.getShape()[0];
1220     Value source = xferOp.getSource(); // memref or tensor to be written to.
1221     auto sourceType = isTensorOp(xferOp) ? xferOp.getShapedType() : Type();
1222 
1223     // Generate fully unrolled loop of transfer ops.
1224     Location loc = xferOp.getLoc();
1225     for (int64_t i = 0; i < dimSize; ++i) {
1226       Value iv = rewriter.create<arith::ConstantIndexOp>(loc, i);
1227 
1228       auto updatedSource = generateInBoundsCheck(
1229           rewriter, xferOp, iv, unpackedDim(xferOp),
1230           isTensorOp(xferOp) ? TypeRange(sourceType) : TypeRange(),
1231           /*inBoundsCase=*/
1232           [&](OpBuilder &b, Location loc) {
1233             // Indices for the new transfer op.
1234             SmallVector<Value, 8> xferIndices;
1235             getXferIndices(b, xferOp, iv, xferIndices);
1236 
1237             // Indices for the new vector.extract op.
1238             SmallVector<int64_t, 8> extractionIndices;
1239             getExtractionIndices(xferOp, extractionIndices);
1240             extractionIndices.push_back(i);
1241 
1242             auto extracted =
1243                 b.create<vector::ExtractOp>(loc, vec, extractionIndices);
1244             auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr());
1245             auto newXferOp = b.create<vector::TransferWriteOp>(
1246                 loc, sourceType, extracted, source, xferIndices,
1247                 AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
1248                 inBoundsAttr);
1249 
1250             maybeAssignMask(b, xferOp, newXferOp, i);
1251 
1252             return isTensorOp(xferOp) ? newXferOp->getResult(0) : Value();
1253           },
1254           /*outOfBoundsCase=*/
1255           [&](OpBuilder &b, Location loc) {
1256             return isTensorOp(xferOp) ? source : Value();
1257           });
1258 
1259       if (isTensorOp(xferOp))
1260         source = updatedSource;
1261     }
1262 
1263     if (isTensorOp(xferOp))
1264       rewriter.replaceOp(xferOp, source);
1265     else
1266       rewriter.eraseOp(xferOp);
1267 
1268     return success();
1269   }
1270 };
1271 
1272 } // namespace lowering_n_d_unrolled
1273 
1274 namespace lowering_1_d {
1275 
1276 /// Compute the indices into the memref for the LoadOp/StoreOp generated as
1277 /// part of TransferOp1dConversion. Return the memref dimension on which
1278 /// the transfer is operating. A return value of std::nullopt indicates a
1279 /// broadcast.
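/// E.g. (illustrative): for indices [%a, %b] and permutation map
/// (d0, d1) -> (d0), the indices become [%a + iv, %b] and dim 0 is returned;
/// for (d0, d1) -> (0), the indices are left unchanged and std::nullopt is
/// returned.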
1280 template <typename OpTy>
1281 static std::optional<int64_t>
1282 get1dMemrefIndices(OpBuilder &b, OpTy xferOp, Value iv,
1283                    SmallVector<Value, 8> &memrefIndices) {
1284   auto indices = xferOp.getIndices();
1285   auto map = xferOp.getPermutationMap();
1286   assert(xferOp.getTransferRank() > 0 && "unexpected 0-d transfer");
1287 
1288   memrefIndices.append(indices.begin(), indices.end());
1289   assert(map.getNumResults() == 1 &&
1290          "Expected 1 permutation map result for 1D transfer");
1291   if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
1292     Location loc = xferOp.getLoc();
1293     auto dim = expr.getPosition();
1294     AffineExpr d0, d1;
1295     bindDims(xferOp.getContext(), d0, d1);
1296     Value offset = memrefIndices[dim];
1297     memrefIndices[dim] =
1298         affine::makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
1299     return dim;
1300   }
1301 
1302   assert(xferOp.isBroadcastDim(0) &&
1303          "Expected AffineDimExpr or AffineConstantExpr");
1304   return std::nullopt;
1305 }
1306 
1307 /// Codegen strategy for TransferOp1dConversion, specialized for the kind of
1308 /// transfer op (read or write).
1309 template <typename OpTy>
1310 struct Strategy1d;
1311 
1312 /// Codegen strategy for TransferReadOp.
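///
/// The loop body emitted per element is approximately (a sketch; the scf.if is
/// only generated when the access may be out of bounds):
/// ```
/// %updated = scf.if %in_bounds -> (vector<...>) {
///   %t = memref.load %src[...]
///   %v = vector.insertelement %t, %vec[%iv : index]
///   scf.yield %v
/// } else {
///   scf.yield %vec  // keep the padding value
/// }
/// ```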
1313 template <>
1314 struct Strategy1d<TransferReadOp> {
1315   static void generateForLoopBody(OpBuilder &b, Location loc,
1316                                   TransferReadOp xferOp, Value iv,
1317                                   ValueRange loopState) {
1318     SmallVector<Value, 8> indices;
1319     auto dim = get1dMemrefIndices(b, xferOp, iv, indices);
1320     auto vec = loopState[0];
1321 
1322     // In case of an out-of-bounds access, leave `vec` as is (it was initialized
1323     // with the padding value).
1324     auto nextVec = generateInBoundsCheck(
1325         b, xferOp, iv, dim, TypeRange(xferOp.getVectorType()),
1326         /*inBoundsCase=*/
1327         [&](OpBuilder &b, Location loc) {
1328           Value val =
1329               b.create<memref::LoadOp>(loc, xferOp.getSource(), indices);
1330           return b.create<vector::InsertElementOp>(loc, val, vec, iv);
1331         },
1332         /*outOfBoundsCase=*/
1333         [&](OpBuilder & /*b*/, Location loc) { return vec; });
1334     b.create<scf::YieldOp>(loc, nextVec);
1335   }
1336 
1337   static Value initialLoopState(OpBuilder &b, TransferReadOp xferOp) {
1338     // Initialize the vector with the padding value.
1339     Location loc = xferOp.getLoc();
1340     return b.create<vector::SplatOp>(loc, xferOp.getVectorType(),
1341                                      xferOp.getPadding());
1342   }
1343 };
1344 
1345 /// Codegen strategy for TransferWriteOp.
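///
/// The loop body emitted per element is approximately (a sketch; the scf.if is
/// only generated when the access may be out of bounds):
/// ```
/// scf.if %in_bounds {
///   %t = vector.extractelement %vec[%iv : index]
///   memref.store %t, %src[...]
/// }
/// ```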
1346 template <>
1347 struct Strategy1d<TransferWriteOp> {
1348   static void generateForLoopBody(OpBuilder &b, Location loc,
1349                                   TransferWriteOp xferOp, Value iv,
1350                                   ValueRange /*loopState*/) {
1351     SmallVector<Value, 8> indices;
1352     auto dim = get1dMemrefIndices(b, xferOp, iv, indices);
1353 
1354     // Nothing to do in case of out-of-bounds access.
1355     generateInBoundsCheck(
1356         b, xferOp, iv, dim,
1357         /*inBoundsCase=*/[&](OpBuilder &b, Location loc) {
1358           auto val =
1359               b.create<vector::ExtractElementOp>(loc, xferOp.getVector(), iv);
1360           b.create<memref::StoreOp>(loc, val, xferOp.getSource(), indices);
1361         });
1362     b.create<scf::YieldOp>(loc);
1363   }
1364 
1365   static Value initialLoopState(OpBuilder &b, TransferWriteOp xferOp) {
1366     return Value();
1367   }
1368 };
1369 
1370 /// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is
1371 /// necessary in cases where a 1D vector transfer op cannot be lowered into
1372 /// vector loads/stores due to non-unit strides or broadcasts:
1373 ///
1374 /// * Transfer dimension is not the last memref dimension
1375 /// * Transfer dimension is a broadcast (i.e., scalar load + broadcast)
1376 /// * Memref has a layout map with non-unit stride on the last dimension
1377 ///
1378 /// This pattern generates IR as follows:
1379 ///
1380 /// 1. Generate a for loop iterating over each vector element.
1381 /// 2. Inside the loop, generate an InsertElementOp or ExtractElementOp,
1382 ///    depending on OpTy.
1383 ///
1384 /// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp
1385 ///       can be generated instead of TransferOp1dConversion. Add such a pattern
1386 ///       to ConvertVectorToLLVM.
1387 ///
1388 /// E.g.:
1389 /// ```
1390 /// vector.transfer_write %vec, %A[%a, %b]
1391 ///    {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
1392 ///    : vector<9xf32>, memref<?x?xf32>
1393 /// ```
1394 /// Is rewritten to approximately the following pseudo-IR:
1395 /// ```
1396 /// for i = 0 to 9 {
1397 ///   %t = vector.extractelement %vec[i] : vector<9xf32>
1398 ///   memref.store %t, %arg0[%a + i, %b] : memref<?x?xf32>
1399 /// }
1400 /// ```
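///
/// A `vector.transfer_read` is handled analogously (a sketch): the loop carries
/// a vector initialized with the padding value and inserts one loaded element
/// per iteration:
/// ```
/// %init = vector.splat %padding : vector<9xf32>
/// %vec = for i = 0 to 9 iter_args(%v = %init) {
///   %t = memref.load %A[%a + i, %b] : memref<?x?xf32>
///   %v2 = vector.insertelement %t, %v[i] : vector<9xf32>
///   yield %v2
/// }
/// ```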
1401 template <typename OpTy>
1402 struct TransferOp1dConversion : public VectorToSCFPattern<OpTy> {
1403   using VectorToSCFPattern<OpTy>::VectorToSCFPattern;
1404 
1405   LogicalResult matchAndRewrite(OpTy xferOp,
1406                                 PatternRewriter &rewriter) const override {
1407     // TODO: support 0-d corner case.
1408     if (xferOp.getTransferRank() == 0)
1409       return failure();
1410     auto map = xferOp.getPermutationMap();
1411     auto memRefType = dyn_cast<MemRefType>(xferOp.getShapedType());
1412 
1413     if (!memRefType)
1414       return failure();
1415     if (xferOp.getVectorType().getRank() != 1)
1416       return failure();
1417     if (map.isMinorIdentity() && isLastMemrefDimUnitStride(memRefType))
1418       return failure(); // Handled by ConvertVectorToLLVM
1419 
1420     // Loop bounds, step, and initial loop state.
1421     Location loc = xferOp.getLoc();
1422     auto vecType = xferOp.getVectorType();
1423     auto lb = rewriter.create<arith::ConstantIndexOp>(loc, 0);
1424     Value ub =
1425         rewriter.create<arith::ConstantIndexOp>(loc, vecType.getDimSize(0));
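    // For a scalable vector, the static dim size is only the known minimum;
    // the runtime trip count is dimSize * vscale.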
1426     if (vecType.isScalable()) {
1427       Value vscale =
1428           rewriter.create<vector::VectorScaleOp>(loc, rewriter.getIndexType());
1429       ub = rewriter.create<arith::MulIOp>(loc, ub, vscale);
1430     }
1431     auto step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
1432     auto loopState = Strategy1d<OpTy>::initialLoopState(rewriter, xferOp);
1433 
1434     // Generate for loop.
1435     rewriter.replaceOpWithNewOp<scf::ForOp>(
1436         xferOp, lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(),
1437         [&](OpBuilder &b, Location loc, Value iv, ValueRange loopState) {
1438           Strategy1d<OpTy>::generateForLoopBody(b, loc, xferOp, iv, loopState);
1439         });
1440 
1441     return success();
1442   }
1443 };
1444 
1445 } // namespace lowering_1_d
1446 } // namespace
1447 
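// Example usage (a sketch; the pass at the end of this file follows the same
// recipe):
//
//   RewritePatternSet patterns(ctx);
//   VectorTransferToSCFOptions options;
//   options.targetRank = 1;
//   populateVectorToSCFConversionPatterns(patterns, options);
//   (void)applyPatternsAndFoldGreedily(op, std::move(patterns));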
1448 void mlir::populateVectorToSCFConversionPatterns(
1449     RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) {
1450   if (options.unroll) {
1451     patterns.add<lowering_n_d_unrolled::UnrollTransferReadConversion,
1452                  lowering_n_d_unrolled::UnrollTransferWriteConversion>(
1453         patterns.getContext(), options);
1454   } else {
1455     patterns.add<lowering_n_d::PrepareTransferReadConversion,
1456                  lowering_n_d::PrepareTransferWriteConversion,
1457                  lowering_n_d::TransferOpConversion<TransferReadOp>,
1458                  lowering_n_d::TransferOpConversion<TransferWriteOp>>(
1459         patterns.getContext(), options);
1460   }
1461 
1462   if (options.targetRank == 1) {
1463     patterns.add<lowering_1_d::TransferOp1dConversion<TransferReadOp>,
1464                  lowering_1_d::TransferOp1dConversion<TransferWriteOp>>(
1465         patterns.getContext(), options);
1466   }
1467   patterns.add<lowering_n_d::DecomposePrintOpConversion>(patterns.getContext(),
1468                                                          options);
1469 }
1470 
1471 namespace {
1472 
1473 struct ConvertVectorToSCFPass
1474     : public impl::ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
1475   ConvertVectorToSCFPass() = default;
1476   ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
1477     this->fullUnroll = options.unroll;
1478     this->targetRank = options.targetRank;
1479     this->lowerTensors = options.lowerTensors;
1480   }
1481 
1482   void runOnOperation() override {
1483     VectorTransferToSCFOptions options;
1484     options.unroll = fullUnroll;
1485     options.targetRank = targetRank;
1486     options.lowerTensors = lowerTensors;
1487 
1488     // Lower permutation maps first.
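    // (These patterns rewrite transfers with non-minor-identity permutation
    // maps into minor-identity transfers combined with vector.transpose /
    // vector.broadcast, so the VectorToSCF patterns below mostly only need to
    // handle minor identities and broadcast dimensions.)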
1489     RewritePatternSet lowerTransferPatterns(&getContext());
1490     mlir::vector::populateVectorTransferPermutationMapLoweringPatterns(
1491         lowerTransferPatterns);
1492     (void)applyPatternsAndFoldGreedily(getOperation(),
1493                                        std::move(lowerTransferPatterns));
1494 
1495     RewritePatternSet patterns(&getContext());
1496     populateVectorToSCFConversionPatterns(patterns, options);
1497     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
1498   }
1499 };
1500 
1501 } // namespace
1502 
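// Example (a sketch): adding the conversion to a pass pipeline:
//
//   PassManager pm(ctx);
//   pm.addPass(createConvertVectorToSCFPass(options));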
1503 std::unique_ptr<Pass>
1504 mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
1505   return std::make_unique<ConvertVectorToSCFPass>(options);
1506 }
1507