//===- VectorToSCF.cpp - Convert vector to SCF dialect ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements lowering of vector transfer operations to SCF.
//
//===----------------------------------------------------------------------===//

#include <numeric>
#include <optional>
#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTVECTORTOSCF
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using vector::TransferReadOp;
using vector::TransferWriteOp;

namespace {

/// Attribute name used for labeling transfer ops during progressive lowering.
static const char kPassLabel[] = "__vector_to_scf_lowering__";

/// Patterns that inherit from this struct have access to
/// VectorTransferToSCFOptions.
template <typename OpTy>
struct VectorToSCFPattern : public OpRewritePattern<OpTy> {
  explicit VectorToSCFPattern(MLIRContext *context,
                              VectorTransferToSCFOptions opt)
      : OpRewritePattern<OpTy>(context), options(opt) {}

  VectorTransferToSCFOptions options;
};

/// Given a vector transfer op, calculate which dimension of the `source`
/// memref should be unpacked in the next application of TransferOpConversion.
/// A return value of std::nullopt indicates a broadcast.
template <typename OpTy>
static std::optional<int64_t> unpackedDim(OpTy xferOp) {
  // TODO: support 0-d corner case.
  assert(xferOp.getTransferRank() > 0 && "unexpected 0-d transfer");
  auto map = xferOp.getPermutationMap();
  if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
    return expr.getPosition();
  }
  assert(xferOp.isBroadcastDim(0) &&
         "Expected AffineDimExpr or AffineConstantExpr");
  return std::nullopt;
}

/// Compute the permutation map for the new (N-1)-D vector transfer op. This
/// map is identical to the current permutation map, but the first result is
/// omitted.
template <typename OpTy>
static AffineMap unpackedPermutationMap(OpBuilder &b, OpTy xferOp) {
  // TODO: support 0-d corner case.
  assert(xferOp.getTransferRank() > 0 && "unexpected 0-d transfer");
  auto map = xferOp.getPermutationMap();
  return AffineMap::get(map.getNumDims(), 0, map.getResults().drop_front(),
                        b.getContext());
}

/// Calculate the indices for the new vector transfer op.
///
/// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ...
///       --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3xf32>
///                             ^^^^^^^
///       `iv` is the iteration variable of the (new) surrounding loop.
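/// If the unpacked dim is a broadcast (i.e., it has no corresponding memref
/// dim), the original indices are returned unchanged.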
template <typename OpTy>
static void getXferIndices(OpBuilder &b, OpTy xferOp, Value iv,
                           SmallVector<Value, 8> &indices) {
  typename OpTy::Adaptor adaptor(xferOp);
  // Corresponding memref dim of the vector dim that is unpacked.
  auto dim = unpackedDim(xferOp);
  auto prevIndices = adaptor.getIndices();
  indices.append(prevIndices.begin(), prevIndices.end());

  Location loc = xferOp.getLoc();
  bool isBroadcast = !dim.has_value();
  if (!isBroadcast) {
    AffineExpr d0, d1;
    bindDims(xferOp.getContext(), d0, d1);
    Value offset = adaptor.getIndices()[*dim];
    indices[*dim] =
        affine::makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
  }
}

static void maybeYieldValue(OpBuilder &b, Location loc, bool hasRetVal,
                            Value value) {
  if (hasRetVal) {
    assert(value && "Expected non-empty value");
    b.create<scf::YieldOp>(loc, value);
  } else {
    b.create<scf::YieldOp>(loc);
  }
}

/// Generates a boolean Value that is true if the iv-th bit in xferOp's mask
/// is set to true. No such check is generated under the following
/// circumstances:
/// * xferOp does not have a mask.
/// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is
///   computed and attached to the new transfer op in the pattern.)
/// * The to-be-unpacked dim of xferOp is a broadcast.
template <typename OpTy>
static Value generateMaskCheck(OpBuilder &b, OpTy xferOp, Value iv) {
  if (!xferOp.getMask())
    return Value();
  if (xferOp.getMaskType().getRank() != 1)
    return Value();
  if (xferOp.isBroadcastDim(0))
    return Value();

  Location loc = xferOp.getLoc();
  return b.create<vector::ExtractElementOp>(loc, xferOp.getMask(), iv);
}

/// Helper function for TransferOpConversion and TransferOp1dConversion.
/// Generate an in-bounds check if the transfer op may go out-of-bounds on the
/// specified dimension `dim` with the loop iteration variable `iv`.
/// E.g., when unpacking dimension 0 from:
/// ```
/// %vec = vector.transfer_read %A[%a, %b] %cst
///     : vector<5x4xf32>, memref<?x?xf32>
/// ```
/// An if check similar to this will be generated inside the loop:
/// ```
/// %d = memref.dim %A, %c0 : memref<?x?xf32>
/// if (%a + iv < %d) {
///   (in-bounds case)
/// } else {
///   (out-of-bounds case)
/// }
/// ```
///
/// If the transfer is 1D and has a mask, this function generates a more
/// complex check that also accounts for potentially masked-out elements.
///
/// This function variant returns the value returned by `inBoundsCase` or
/// `outOfBoundsCase`. The MLIR type of the return value must be specified in
/// `resultTypes`.
template <typename OpTy>
static Value generateInBoundsCheck(
    OpBuilder &b, OpTy xferOp, Value iv, std::optional<int64_t> dim,
    TypeRange resultTypes,
    function_ref<Value(OpBuilder &, Location)> inBoundsCase,
    function_ref<Value(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
  bool hasRetVal = !resultTypes.empty();
  Value cond; // Condition to be built...

  // Condition check 1: Access in-bounds?
  bool isBroadcast = !dim; // No in-bounds check for broadcasts.
  Location loc = xferOp.getLoc();
  ImplicitLocOpBuilder lb(xferOp.getLoc(), b);
  if (!xferOp.isDimInBounds(0) && !isBroadcast) {
    Value memrefDim =
        vector::createOrFoldDimOp(b, loc, xferOp.getSource(), *dim);
    AffineExpr d0, d1;
    bindDims(xferOp.getContext(), d0, d1);
    Value base = xferOp.getIndices()[*dim];
    Value memrefIdx =
        affine::makeComposedAffineApply(b, loc, d0 + d1, {base, iv});
    cond = lb.create<arith::CmpIOp>(arith::CmpIPredicate::sgt, memrefDim,
                                    memrefIdx);
  }

  // Condition check 2: Masked in?
  if (auto maskCond = generateMaskCheck(b, xferOp, iv)) {
    if (cond)
      cond = lb.create<arith::AndIOp>(cond, maskCond);
    else
      cond = maskCond;
  }

  // If the condition is non-empty, generate an SCF::IfOp.
  if (cond) {
    auto check = lb.create<scf::IfOp>(
        cond,
        /*thenBuilder=*/
        [&](OpBuilder &b, Location loc) {
          maybeYieldValue(b, loc, hasRetVal, inBoundsCase(b, loc));
        },
        /*elseBuilder=*/
        [&](OpBuilder &b, Location loc) {
          if (outOfBoundsCase) {
            maybeYieldValue(b, loc, hasRetVal, outOfBoundsCase(b, loc));
          } else {
            b.create<scf::YieldOp>(loc);
          }
        });

    return hasRetVal ? check.getResult(0) : Value();
  }

  // Condition is empty, no need for an SCF::IfOp.
  return inBoundsCase(b, loc);
}

/// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have
/// a return value. Consequently, this function does not have a return value.
template <typename OpTy>
static void generateInBoundsCheck(
    OpBuilder &b, OpTy xferOp, Value iv, std::optional<int64_t> dim,
    function_ref<void(OpBuilder &, Location)> inBoundsCase,
    function_ref<void(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
  generateInBoundsCheck(
      b, xferOp, iv, dim, /*resultTypes=*/TypeRange(),
      /*inBoundsCase=*/
      [&](OpBuilder &b, Location loc) {
        inBoundsCase(b, loc);
        return Value();
      },
      /*outOfBoundsCase=*/
      [&](OpBuilder &b, Location loc) {
        if (outOfBoundsCase)
          outOfBoundsCase(b, loc);
        return Value();
      });
}

/// Given an ArrayAttr, return a copy where the first element is dropped.
static ArrayAttr dropFirstElem(OpBuilder &b, ArrayAttr attr) {
  if (!attr)
    return attr;
  return ArrayAttr::get(b.getContext(), attr.getValue().drop_front());
}

/// Add the pass label to a vector transfer op if its rank is not the target
/// rank.
template <typename OpTy>
static void maybeApplyPassLabel(OpBuilder &b, OpTy newXferOp,
                                unsigned targetRank) {
  if (newXferOp.getVectorType().getRank() > targetRank)
    newXferOp->setAttr(kPassLabel, b.getUnitAttr());
}

/// Return true if this transfer op operates on a source tensor.
template <typename OpTy>
static bool isTensorOp(OpTy xferOp) {
  if (isa<RankedTensorType>(xferOp.getShapedType())) {
    if (xferOp.getOperationName().equals(TransferWriteOp::getOperationName())) {
      // TransferWriteOps on tensors have a result.
      assert(xferOp->getNumResults() > 0);
    }
    return true;
  }
  return false;
}

namespace lowering_n_d {

/// Helper data structure for data and mask buffers.
struct BufferAllocs {
  Value dataBuffer;
  Value maskBuffer;
};

// TODO: Parallelism and threadlocal considerations with a ParallelScope trait.
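/// Return the closest surrounding op that has the AutomaticAllocationScope
/// trait; the temporary data and mask buffers are alloca'd at the start of
/// that scope.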
static Operation *getAutomaticAllocationScope(Operation *op) {
  Operation *scope =
      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  return scope;
}

/// Allocate temporary buffers for data (vector) and mask (if present).
template <typename OpTy>
static BufferAllocs allocBuffers(OpBuilder &b, OpTy xferOp) {
  Location loc = xferOp.getLoc();
  OpBuilder::InsertionGuard guard(b);
  Operation *scope = getAutomaticAllocationScope(xferOp);
  assert(scope->getNumRegions() == 1 &&
         "AutomaticAllocationScope with >1 regions");
  b.setInsertionPointToStart(&scope->getRegion(0).front());

  BufferAllocs result;
  auto bufferType = MemRefType::get({}, xferOp.getVectorType());
  result.dataBuffer = b.create<memref::AllocaOp>(loc, bufferType);

  if (xferOp.getMask()) {
    auto maskType = MemRefType::get({}, xferOp.getMask().getType());
    auto maskBuffer = b.create<memref::AllocaOp>(loc, maskType);
    b.setInsertionPoint(xferOp);
    b.create<memref::StoreOp>(loc, xferOp.getMask(), maskBuffer);
    result.maskBuffer = b.create<memref::LoadOp>(loc, maskBuffer, ValueRange());
  }

  return result;
}

/// Given a MemRefType with VectorType element type, unpack one dimension from
/// the VectorType into the MemRefType.
///
/// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>>
static FailureOr<MemRefType> unpackOneDim(MemRefType type) {
  auto vectorType = dyn_cast<VectorType>(type.getElementType());
  // Vectors with leading scalable dims are not supported. It may be possible
  // to support these in the future by using dynamic memref dims.
  if (vectorType.getScalableDims().front())
    return failure();
  auto memrefShape = type.getShape();
  SmallVector<int64_t, 8> newMemrefShape;
  newMemrefShape.append(memrefShape.begin(), memrefShape.end());
  newMemrefShape.push_back(vectorType.getDimSize(0));
  return MemRefType::get(newMemrefShape,
                         VectorType::Builder(vectorType).dropDim(0));
}

/// Given a transfer op, find the memref from which the mask is loaded. This
/// is similar to Strategy<TransferWriteOp>::getBuffer.
template <typename OpTy>
static Value getMaskBuffer(OpTy xferOp) {
  assert(xferOp.getMask() && "Expected that transfer op has mask");
  auto loadOp = xferOp.getMask().template getDefiningOp<memref::LoadOp>();
  assert(loadOp && "Expected transfer op mask produced by LoadOp");
  return loadOp.getMemRef();
}

/// Codegen strategy, depending on the operation.
template <typename OpTy>
struct Strategy;

/// Codegen strategy for vector TransferReadOp.
template <>
struct Strategy<TransferReadOp> {
  /// Find the StoreOp that is used for writing the current TransferReadOp's
  /// result to the temporary buffer allocation.
  static memref::StoreOp getStoreOp(TransferReadOp xferOp) {
    assert(xferOp->hasOneUse() && "Expected exactly one use of TransferReadOp");
    auto storeOp = dyn_cast<memref::StoreOp>((*xferOp->use_begin()).getOwner());
    assert(storeOp && "Expected TransferReadOp result used by StoreOp");
    return storeOp;
  }

  /// Find the temporary buffer allocation. All labeled TransferReadOps are
  /// used like this, where %buf is either the buffer allocation or a type cast
  /// of the buffer allocation:
  /// ```
  /// %vec = vector.transfer_read ... { __vector_to_scf_lowering__ } ...
  /// memref.store %vec, %buf[...] ...
  /// ```
  static Value getBuffer(TransferReadOp xferOp) {
    return getStoreOp(xferOp).getMemRef();
  }

  /// Retrieve the indices of the current StoreOp that stores into the buffer.
  static void getBufferIndices(TransferReadOp xferOp,
                               SmallVector<Value, 8> &indices) {
    auto storeOp = getStoreOp(xferOp);
    auto prevIndices = memref::StoreOpAdaptor(storeOp).getIndices();
    indices.append(prevIndices.begin(), prevIndices.end());
  }

  /// Rewrite the TransferReadOp, assuming that there are no out-of-bounds
  /// accesses on the to-be-unpacked dimension.
  ///
  /// 1. Generate a new (N-1)-d TransferReadOp using the loop iteration
  ///    variable `iv`.
  /// 2. Store the result into the (already `vector.type_cast`ed) buffer.
  ///
  /// E.g.:
  /// ```
  /// %vec = vector.transfer_read %A[%a+%i, %b, %c], %cst
  ///     : memref<?x?x?xf32>, vector<4x3xf32>
  /// memref.store %vec, %buf[%i] : memref<5xvector<4x3xf32>>
  /// ```
  /// Is rewritten to:
  /// ```
  /// %casted = vector.type_cast %buf
  ///     : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
  /// for %j = 0 to 4 {
  ///   %vec = vector.transfer_read %A[%a+%i, %b+%j, %c], %cst
  ///       : memref<?x?x?xf32>, vector<3xf32>
  ///   memref.store %vec, %casted[%i, %j] : memref<5x4xvector<3xf32>>
  /// }
  /// ```
  ///
  /// Note: The loop and type cast are generated in TransferOpConversion.
  ///       The original TransferReadOp and store op are deleted in `cleanup`.
  /// Note: The `mask` operand is set in TransferOpConversion.
  static TransferReadOp rewriteOp(OpBuilder &b,
                                  VectorTransferToSCFOptions options,
                                  TransferReadOp xferOp, Value buffer, Value iv,
                                  ValueRange /*loopState*/) {
    SmallVector<Value, 8> storeIndices;
    getBufferIndices(xferOp, storeIndices);
    storeIndices.push_back(iv);

    SmallVector<Value, 8> xferIndices;
    getXferIndices(b, xferOp, iv, xferIndices);

    Location loc = xferOp.getLoc();
    auto bufferType = dyn_cast<ShapedType>(buffer.getType());
    auto vecType = dyn_cast<VectorType>(bufferType.getElementType());
    auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr());
    auto newXferOp = b.create<vector::TransferReadOp>(
        loc, vecType, xferOp.getSource(), xferIndices,
        AffineMapAttr::get(unpackedPermutationMap(b, xferOp)),
        xferOp.getPadding(), Value(), inBoundsAttr);

    maybeApplyPassLabel(b, newXferOp, options.targetRank);

    b.create<memref::StoreOp>(loc, newXferOp.getVector(), buffer, storeIndices);
    return newXferOp;
  }

  /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write
  /// padding value to the temporary buffer.
  static Value handleOutOfBoundsDim(OpBuilder &b, TransferReadOp xferOp,
                                    Value buffer, Value iv,
                                    ValueRange /*loopState*/) {
    SmallVector<Value, 8> storeIndices;
    getBufferIndices(xferOp, storeIndices);
    storeIndices.push_back(iv);

    Location loc = xferOp.getLoc();
    auto bufferType = dyn_cast<ShapedType>(buffer.getType());
    auto vecType = dyn_cast<VectorType>(bufferType.getElementType());
    auto vec = b.create<vector::SplatOp>(loc, vecType, xferOp.getPadding());
    b.create<memref::StoreOp>(loc, vec, buffer, storeIndices);

    return Value();
  }

  /// Cleanup after rewriting the op.
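  /// This erases the original TransferReadOp together with the StoreOp into
  /// the temporary buffer; both have been superseded by the generated loop.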
  static void cleanup(PatternRewriter &rewriter, TransferReadOp xferOp,
                      scf::ForOp /*forOp*/) {
    rewriter.eraseOp(getStoreOp(xferOp));
    rewriter.eraseOp(xferOp);
  }

  /// Return the initial loop state for the generated scf.for loop.
  static Value initialLoopState(TransferReadOp xferOp) { return Value(); }
};

/// Codegen strategy for vector TransferWriteOp.
template <>
struct Strategy<TransferWriteOp> {
  /// Find the temporary buffer allocation. All labeled TransferWriteOps are
  /// used like this, where %buf is either the buffer allocation or a type cast
  /// of the buffer allocation:
  /// ```
  /// %vec = memref.load %buf[...] ...
  /// vector.transfer_write %vec ... { __vector_to_scf_lowering__ } ...
  /// ```
  static Value getBuffer(TransferWriteOp xferOp) {
    auto loadOp = xferOp.getVector().getDefiningOp<memref::LoadOp>();
    assert(loadOp && "Expected transfer op vector produced by LoadOp");
    return loadOp.getMemRef();
  }

  /// Retrieve the indices of the current LoadOp that loads from the buffer.
  static void getBufferIndices(TransferWriteOp xferOp,
                               SmallVector<Value, 8> &indices) {
    auto loadOp = xferOp.getVector().getDefiningOp<memref::LoadOp>();
    auto prevIndices = memref::LoadOpAdaptor(loadOp).getIndices();
    indices.append(prevIndices.begin(), prevIndices.end());
  }

  /// Rewrite the TransferWriteOp, assuming that there are no out-of-bounds
  /// accesses on the to-be-unpacked dimension.
  ///
  /// 1. Load an (N-1)-d vector from the (already `vector.type_cast`ed) buffer,
  ///    using the loop iteration variable `iv`.
  /// 2. Generate a new (N-1)-d TransferWriteOp, writing the loaded vector back
  ///    to memory.
  ///
  /// Note: For more details, see comments on Strategy<TransferReadOp>.
  static TransferWriteOp rewriteOp(OpBuilder &b,
                                   VectorTransferToSCFOptions options,
                                   TransferWriteOp xferOp, Value buffer,
                                   Value iv, ValueRange loopState) {
    SmallVector<Value, 8> loadIndices;
    getBufferIndices(xferOp, loadIndices);
    loadIndices.push_back(iv);

    SmallVector<Value, 8> xferIndices;
    getXferIndices(b, xferOp, iv, xferIndices);

    Location loc = xferOp.getLoc();
    auto vec = b.create<memref::LoadOp>(loc, buffer, loadIndices);
    auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr());
    auto source = loopState.empty() ? xferOp.getSource() : loopState[0];
    Type type = isTensorOp(xferOp) ? xferOp.getShapedType() : Type();
    auto newXferOp = b.create<vector::TransferWriteOp>(
        loc, type, vec, source, xferIndices,
        AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
        inBoundsAttr);

    maybeApplyPassLabel(b, newXferOp, options.targetRank);

    return newXferOp;
  }

  /// Handle out-of-bounds accesses on the to-be-unpacked dimension.
  static Value handleOutOfBoundsDim(OpBuilder &b, TransferWriteOp xferOp,
                                    Value buffer, Value iv,
                                    ValueRange loopState) {
    return isTensorOp(xferOp) ? loopState[0] : Value();
  }

  /// Cleanup after rewriting the op.
  static void cleanup(PatternRewriter &rewriter, TransferWriteOp xferOp,
                      scf::ForOp forOp) {
    if (isTensorOp(xferOp)) {
      assert(forOp->getNumResults() == 1 && "Expected one for loop result");
      rewriter.replaceOp(xferOp, forOp->getResult(0));
    } else {
      rewriter.eraseOp(xferOp);
    }
  }

  /// Return the initial loop state for the generated scf.for loop.
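  /// For a tensor source this is the source tensor itself, which is threaded
  /// through the loop as an iter_arg; for a memref source there is no loop
  /// state.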
  static Value initialLoopState(TransferWriteOp xferOp) {
    return isTensorOp(xferOp) ? xferOp.getSource() : Value();
  }
};

template <typename OpTy>
LogicalResult checkPrepareXferOp(OpTy xferOp,
                                 VectorTransferToSCFOptions options) {
  if (xferOp->hasAttr(kPassLabel))
    return failure();
  if (xferOp.getVectorType().getRank() <= options.targetRank)
    return failure();
  // Currently the unpacking of the leading dimension into the memref is not
  // supported for scalable dimensions.
  if (xferOp.getVectorType().getScalableDims().front())
    return failure();
  if (isTensorOp(xferOp) && !options.lowerTensors)
    return failure();
  // Transfer ops that modify the element type are not supported atm.
  if (xferOp.getVectorType().getElementType() !=
      xferOp.getShapedType().getElementType())
    return failure();
  return success();
}

/// Prepare a TransferReadOp for progressive lowering.
///
/// 1. Allocate a temporary buffer.
/// 2. Label the TransferReadOp, marking it eligible for progressive lowering.
/// 3. Store the result of the TransferReadOp into the temporary buffer.
/// 4. Load the result from the temporary buffer and replace all uses of the
///    original TransferReadOp with this load.
///
/// E.g.:
/// ```
/// %vec = vector.transfer_read %A[%a, %b, %c], %cst
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = memref.alloca() : memref<vector<5x4xf32>>
/// %1 = vector.transfer_read %A[%a, %b, %c], %cst
///     { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref<?x?x?xf32>
/// memref.store %1, %0[] : memref<vector<5x4xf32>>
/// %vec = memref.load %0[] : memref<vector<5x4xf32>>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferReadConversion
    : public VectorToSCFPattern<TransferReadOp> {
  using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(TransferReadOp xferOp,
                                PatternRewriter &rewriter) const override {
    if (checkPrepareXferOp(xferOp, options).failed())
      return failure();

    auto buffers = allocBuffers(rewriter, xferOp);
    auto *newXfer = rewriter.clone(*xferOp.getOperation());
    newXfer->setAttr(kPassLabel, rewriter.getUnitAttr());
    if (xferOp.getMask()) {
      dyn_cast<TransferReadOp>(newXfer).getMaskMutable().assign(
          buffers.maskBuffer);
    }

    Location loc = xferOp.getLoc();
    rewriter.create<memref::StoreOp>(loc, newXfer->getResult(0),
                                     buffers.dataBuffer);
    rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffers.dataBuffer);

    return success();
  }
};

/// Prepare a TransferWriteOp for progressive lowering.
///
/// 1. Allocate a temporary buffer.
/// 2. Store the vector into the buffer.
/// 3. Load the vector from the buffer again.
/// 4. Use the loaded vector as a TransferWriteOp operand and label the op,
///    marking it eligible for progressive lowering via TransferOpConversion.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b, %c]
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = memref.alloca() : memref<vector<5x4xf32>>
/// memref.store %vec, %0[] : memref<vector<5x4xf32>>
/// %1 = memref.load %0[] : memref<vector<5x4xf32>>
/// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> {
  using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
                                PatternRewriter &rewriter) const override {
    if (checkPrepareXferOp(xferOp, options).failed())
      return failure();

    Location loc = xferOp.getLoc();
    auto buffers = allocBuffers(rewriter, xferOp);
    rewriter.create<memref::StoreOp>(loc, xferOp.getVector(),
                                     buffers.dataBuffer);
    auto loadedVec = rewriter.create<memref::LoadOp>(loc, buffers.dataBuffer);
    rewriter.updateRootInPlace(xferOp, [&]() {
      xferOp.getVectorMutable().assign(loadedVec);
      xferOp->setAttr(kPassLabel, rewriter.getUnitAttr());
    });

    if (xferOp.getMask()) {
      rewriter.updateRootInPlace(xferOp, [&]() {
        xferOp.getMaskMutable().assign(buffers.maskBuffer);
      });
    }

    return success();
  }
};

/// Decompose an n-D PrintOp into a loop of elementary/scalar prints. This
/// allows printing both 1D scalable vectors and n-D fixed-size vectors.
///
/// E.g.:
/// ```
/// vector.print %v : vector<[4]xi32>
/// ```
/// is rewritten to:
/// ```
/// %c0 = arith.constant 0 : index
/// %c4 = arith.constant 4 : index
/// %c1 = arith.constant 1 : index
/// %vscale = vector.vscale
/// %length = arith.muli %vscale, %c4 : index
/// %lastIndex = arith.subi %length, %c1 : index
/// vector.print punctuation <open>
/// scf.for %i = %c0 to %length step %c1 {
///   %el = vector.extractelement %v[%i : index] : vector<[4]xi32>
///   vector.print %el : i32 punctuation <no_punctuation>
///   %notLastIndex = arith.cmpi ult, %i, %lastIndex : index
///   scf.if %notLastIndex {
///     vector.print punctuation <comma>
///   }
/// }
/// vector.print punctuation <close>
/// vector.print
/// ```
struct DecomposePrintOpConversion : public VectorToSCFPattern<vector::PrintOp> {
  using VectorToSCFPattern<vector::PrintOp>::VectorToSCFPattern;
  LogicalResult matchAndRewrite(vector::PrintOp printOp,
                                PatternRewriter &rewriter) const override {
    if (!printOp.getSource())
      return failure();

    VectorType vectorType = dyn_cast<VectorType>(printOp.getPrintType());
    if (!vectorType)
      return failure();

    // Currently >= 2D scalable vectors are not supported.
    // These can't be lowered to LLVM (as LLVM does not support scalable
    // vectors of scalable vectors), and due to limitations of current ops
    // can't be indexed with SSA values or flattened. This may change after
    // https://reviews.llvm.org/D155034, though there still needs to be a path
    // for lowering to LLVM.
    if (vectorType.getRank() > 1 && vectorType.isScalable())
      return failure();

    auto loc = printOp.getLoc();
    auto value = printOp.getSource();

    if (auto intTy = dyn_cast<IntegerType>(vectorType.getElementType())) {
      // Oddly sized integers are (somewhat) buggy on a lot of backends, so to
      // avoid issues extend them to a more standard size.
      // https://github.com/llvm/llvm-project/issues/30613
      auto width = intTy.getWidth();
      auto legalWidth = llvm::NextPowerOf2(std::max(8u, width) - 1);
      auto legalIntTy = IntegerType::get(rewriter.getContext(), legalWidth,
                                         intTy.getSignedness());
      // arith can only take signless integers, so we must cast back and forth.
      auto signlessSourceVectorType =
          vectorType.cloneWith({}, getIntTypeWithSignlessSemantics(intTy));
      auto signlessTargetVectorType =
          vectorType.cloneWith({}, getIntTypeWithSignlessSemantics(legalIntTy));
      auto targetVectorType = vectorType.cloneWith({}, legalIntTy);
      value = rewriter.create<vector::BitCastOp>(loc, signlessSourceVectorType,
                                                 value);
      if (width == 1 || intTy.isUnsigned())
        value = rewriter.create<arith::ExtUIOp>(loc, signlessTargetVectorType,
                                                value);
      else
        value = rewriter.create<arith::ExtSIOp>(loc, signlessTargetVectorType,
                                                value);
      value = rewriter.create<vector::BitCastOp>(loc, targetVectorType, value);
      vectorType = targetVectorType;
    }

    auto scalableDimensions = vectorType.getScalableDims();
    auto shape = vectorType.getShape();
    constexpr int64_t singletonShape[] = {1};
    if (vectorType.getRank() == 0)
      shape = singletonShape;

    if (vectorType.getRank() != 1) {
      // Flatten n-D vectors to 1D. This is done to allow indexing with a
      // non-constant value (which can currently only be done via
      // vector.extractelement for 1D vectors).
      auto flatLength = std::accumulate(shape.begin(), shape.end(), 1,
                                        std::multiplies<int64_t>());
      auto flatVectorType =
          VectorType::get({flatLength}, vectorType.getElementType());
      value = rewriter.create<vector::ShapeCastOp>(loc, flatVectorType, value);
    }

    vector::PrintOp firstClose;
    SmallVector<Value, 8> loopIndices;
    for (unsigned d = 0; d < shape.size(); d++) {
      // Setup loop bounds and step.
      Value lowerBound = rewriter.create<arith::ConstantIndexOp>(loc, 0);
      Value upperBound = rewriter.create<arith::ConstantIndexOp>(loc, shape[d]);
      Value step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
      if (!scalableDimensions.empty() && scalableDimensions[d]) {
        auto vscale = rewriter.create<vector::VectorScaleOp>(
            loc, rewriter.getIndexType());
        upperBound = rewriter.create<arith::MulIOp>(loc, upperBound, vscale);
      }
      auto lastIndex = rewriter.create<arith::SubIOp>(loc, upperBound, step);

      // Create a loop to print the elements surrounded by parentheses.
      rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Open);
      auto loop =
          rewriter.create<scf::ForOp>(loc, lowerBound, upperBound, step);
      auto printClose = rewriter.create<vector::PrintOp>(
          loc, vector::PrintPunctuation::Close);
      if (!firstClose)
        firstClose = printClose;

      auto loopIdx = loop.getInductionVar();
      loopIndices.push_back(loopIdx);

      // Print a comma after all but the last element.
      rewriter.setInsertionPointToStart(loop.getBody());
      auto notLastIndex = rewriter.create<arith::CmpIOp>(
          loc, arith::CmpIPredicate::ult, loopIdx, lastIndex);
      rewriter.create<scf::IfOp>(loc, notLastIndex,
                                 [&](OpBuilder &builder, Location loc) {
                                   builder.create<vector::PrintOp>(
                                       loc, vector::PrintPunctuation::Comma);
                                   builder.create<scf::YieldOp>(loc);
                                 });

      rewriter.setInsertionPointToStart(loop.getBody());
    }

    // Compute the flattened index.
    // Note: For rank > 1 vectors, this assumes non-scalable dimensions.
    Value flatIndex;
    auto currentStride = 1;
    for (int d = shape.size() - 1; d >= 0; d--) {
      auto stride = rewriter.create<arith::ConstantIndexOp>(loc, currentStride);
      auto index = rewriter.create<arith::MulIOp>(loc, stride, loopIndices[d]);
      if (flatIndex)
        flatIndex = rewriter.create<arith::AddIOp>(loc, flatIndex, index);
      else
        flatIndex = index;
      currentStride *= shape[d];
    }

    // Print the scalar elements in the innermost loop.
    auto element =
        rewriter.create<vector::ExtractElementOp>(loc, value, flatIndex);
    rewriter.create<vector::PrintOp>(loc, element,
                                     vector::PrintPunctuation::NoPunctuation);

    rewriter.setInsertionPointAfter(firstClose);
    rewriter.create<vector::PrintOp>(loc, printOp.getPunctuation());
    rewriter.eraseOp(printOp);
    return success();
  }

  static IntegerType getIntTypeWithSignlessSemantics(IntegerType intTy) {
    return IntegerType::get(intTy.getContext(), intTy.getWidth(),
                            IntegerType::Signless);
  }
};

/// Progressive lowering of vector transfer ops: Unpack one dimension.
///
/// 1. Unpack one dimension from the current buffer type and cast the buffer
///    to that new type. E.g.:
///    ```
///    %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>>
///    vector.transfer_write %vec ...
///    ```
///    The following cast is generated:
///    ```
///    %casted = vector.type_cast %0
///        : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
///    ```
/// 2. Generate a for loop and rewrite the transfer op according to the
///    corresponding Strategy<OpTy>. If the to-be-unpacked dimension can be
///    out-of-bounds, generate an if-check and handle both cases separately.
/// 3. Clean up according to the corresponding Strategy<OpTy>.
///
/// Note: If the transfer op is a TransferWriteOp and operates on a tensor
/// source (as opposed to a memref source), then each iteration of the
/// generated scf.for loop yields the new tensor value. E.g.:
/// ```
/// %result = scf.for i = 0 to 5 {
///   %0 = memref.load %buffer[i] : memref<5xvector<4x3xf32>>
///   %1 = vector.transfer_write %0, %source[...]
///       : vector<4x3xf32>, tensor<5x4x3xf32>
///   scf.yield %1 : tensor<5x4x3xf32>
/// }
/// ```
template <typename OpTy>
struct TransferOpConversion : public VectorToSCFPattern<OpTy> {
  using VectorToSCFPattern<OpTy>::VectorToSCFPattern;

  void initialize() {
    // This pattern recursively unpacks one dimension at a time. The recursion
    // is bounded as the rank is strictly decreasing.
    this->setHasBoundedRewriteRecursion();
  }

  LogicalResult matchAndRewrite(OpTy xferOp,
                                PatternRewriter &rewriter) const override {
    if (!xferOp->hasAttr(kPassLabel))
      return failure();

    // Find and cast data buffer. How the buffer can be found depends on OpTy.
    ImplicitLocOpBuilder locB(xferOp.getLoc(), rewriter);
    auto dataBuffer = Strategy<OpTy>::getBuffer(xferOp);
    auto dataBufferType = dyn_cast<MemRefType>(dataBuffer.getType());
    auto castedDataType = unpackOneDim(dataBufferType);
    if (failed(castedDataType))
      return failure();

    auto castedDataBuffer =
        locB.create<vector::TypeCastOp>(*castedDataType, dataBuffer);

    // If the xferOp has a mask: Find and cast mask buffer.
    Value castedMaskBuffer;
    if (xferOp.getMask()) {
      auto maskBuffer = getMaskBuffer(xferOp);
      auto maskBufferType = dyn_cast<MemRefType>(maskBuffer.getType());
      if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) {
        // Do not unpack a dimension of the mask, if:
        // * To-be-unpacked transfer op dimension is a broadcast.
        // * Mask is 1D, i.e., the mask cannot be further unpacked.
        //   (That means that all remaining dimensions of the transfer op must
        //   be broadcasted.)
        castedMaskBuffer = maskBuffer;
      } else {
        // It's safe to assume the mask buffer can be unpacked if the data
        // buffer was unpacked.
        auto castedMaskType = *unpackOneDim(maskBufferType);
        castedMaskBuffer =
            locB.create<vector::TypeCastOp>(castedMaskType, maskBuffer);
      }
    }

    // Loop bounds and step.
    auto lb = locB.create<arith::ConstantIndexOp>(0);
    auto ub = locB.create<arith::ConstantIndexOp>(
        castedDataType->getDimSize(castedDataType->getRank() - 1));
    auto step = locB.create<arith::ConstantIndexOp>(1);
    // TransferWriteOps that operate on tensors return the modified tensor and
    // require a loop state.
    auto loopState = Strategy<OpTy>::initialLoopState(xferOp);

    // Generate for loop.
    auto result = locB.create<scf::ForOp>(
        lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(),
        [&](OpBuilder &b, Location loc, Value iv, ValueRange loopState) {
          Type stateType = loopState.empty() ? Type() : loopState[0].getType();

          auto result = generateInBoundsCheck(
              b, xferOp, iv, unpackedDim(xferOp),
              stateType ? TypeRange(stateType) : TypeRange(),
              /*inBoundsCase=*/
              [&](OpBuilder &b, Location loc) {
                // Create new transfer op.
                OpTy newXfer = Strategy<OpTy>::rewriteOp(
                    b, this->options, xferOp, castedDataBuffer, iv, loopState);

                // If the old transfer op has a mask: Set mask on new transfer
                // op. Special case: If the mask of the old transfer op is 1D
                // and the unpacked dim is not a broadcast, no mask is needed
                // on the new transfer op.
                if (xferOp.getMask() && (xferOp.isBroadcastDim(0) ||
                                         xferOp.getMaskType().getRank() > 1)) {
                  OpBuilder::InsertionGuard guard(b);
                  b.setInsertionPoint(newXfer); // Insert load before newXfer.

                  SmallVector<Value, 8> loadIndices;
                  Strategy<OpTy>::getBufferIndices(xferOp, loadIndices);
                  // In case of broadcast: Use same indices to load from memref
                  // as before.
                  if (!xferOp.isBroadcastDim(0))
                    loadIndices.push_back(iv);

                  auto mask = b.create<memref::LoadOp>(loc, castedMaskBuffer,
                                                       loadIndices);
                  rewriter.updateRootInPlace(newXfer, [&]() {
                    newXfer.getMaskMutable().assign(mask);
                  });
                }

                return loopState.empty() ?
                                         Value() : newXfer->getResult(0);
              },
              /*outOfBoundsCase=*/
              [&](OpBuilder &b, Location /*loc*/) {
                return Strategy<OpTy>::handleOutOfBoundsDim(
                    b, xferOp, castedDataBuffer, iv, loopState);
              });

          maybeYieldValue(b, loc, !loopState.empty(), result);
        });

    Strategy<OpTy>::cleanup(rewriter, xferOp, result);
    return success();
  }
};

} // namespace lowering_n_d

namespace lowering_n_d_unrolled {

/// If the original transfer op has a mask, compute the mask of the new
/// transfer op (for the current iteration `i`) and assign it.
template <typename OpTy>
static void maybeAssignMask(OpBuilder &b, OpTy xferOp, OpTy newXferOp,
                            int64_t i) {
  if (!xferOp.getMask())
    return;

  if (xferOp.isBroadcastDim(0)) {
    // To-be-unpacked dimension is a broadcast, which does not have a
    // corresponding mask dimension. Mask attribute remains unchanged.
    newXferOp.getMaskMutable().assign(xferOp.getMask());
    return;
  }

  if (xferOp.getMaskType().getRank() > 1) {
    // Unpack one dimension of the mask.
    OpBuilder::InsertionGuard guard(b);
    b.setInsertionPoint(newXferOp); // Insert load before newXfer.

    llvm::SmallVector<int64_t, 1> indices({i});
    Location loc = xferOp.getLoc();
    auto newMask = b.create<vector::ExtractOp>(loc, xferOp.getMask(), indices);
    newXferOp.getMaskMutable().assign(newMask);
  }

  // If we end up here: The mask of the old transfer op is 1D and the unpacked
  // dim is not a broadcast, so no mask is needed on the new transfer op.
  // `generateInBoundsCheck` will have evaluated the mask already.
}

/// Progressive lowering of vector TransferReadOp with unrolling: Unpack one
/// dimension. This is similar to TransferOpConversion<TransferReadOp>, but no
/// memref buffer is allocated and the SCF loop is fully unrolled.
///
/// E.g.:
/// ```
/// %vec = vector.transfer_read %A[%a, %b, %c], %padding
///     : memref<?x?x?xf32>, vector<5x4xf32>
/// ```
/// is rewritten to IR such as (simplified):
/// ```
/// %v_init = splat %padding : vector<5x4xf32>
/// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding
///     : memref<?x?x?xf32>, vector<4xf32>
/// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32>
/// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding
///     : memref<?x?x?xf32>, vector<4xf32>
/// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32>
/// ...
/// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding
///     : memref<?x?x?xf32>, vector<4xf32>
/// %vec = vector.insert %tmp4, %v3[4] : vector<4xf32> into vector<5x4xf32>
/// ```
///
/// Note: As an optimization, if the result of the original TransferReadOp
/// was directly inserted into another vector, no new %v_init vector is
/// created. Instead, the new TransferReadOp results are inserted into that
/// vector.
struct UnrollTransferReadConversion
    : public VectorToSCFPattern<TransferReadOp> {
  using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;

  void initialize() {
    // This pattern recursively unpacks one dimension at a time. The recursion
    // is bounded as the rank is strictly decreasing.
    setHasBoundedRewriteRecursion();
  }

  /// Return the vector into which the newly created TransferReadOp results
  /// are inserted.
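  /// This is either the destination of the single vector.insert user, or a
  /// newly created vector.splat of the padding value.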
  Value getResultVector(TransferReadOp xferOp,
                        PatternRewriter &rewriter) const {
    if (auto insertOp = getInsertOp(xferOp))
      return insertOp.getDest();
    Location loc = xferOp.getLoc();
    return rewriter.create<vector::SplatOp>(loc, xferOp.getVectorType(),
                                            xferOp.getPadding());
  }

  /// If the result of the TransferReadOp has exactly one user, which is a
  /// vector::InsertOp, return that operation.
  vector::InsertOp getInsertOp(TransferReadOp xferOp) const {
    if (xferOp->hasOneUse()) {
      Operation *xferOpUser = *xferOp->getUsers().begin();
      if (auto insertOp = dyn_cast<vector::InsertOp>(xferOpUser))
        return insertOp;
    }

    return vector::InsertOp();
  }

  /// If the result of the TransferReadOp has exactly one user, which is a
  /// vector::InsertOp, return that operation's indices.
  void getInsertionIndices(TransferReadOp xferOp,
                           SmallVector<int64_t, 8> &indices) const {
    if (auto insertOp = getInsertOp(xferOp))
      indices.assign(insertOp.getPosition().begin(),
                     insertOp.getPosition().end());
  }

  /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
  /// accesses, and broadcasts and transposes in permutation maps.
  LogicalResult matchAndRewrite(TransferReadOp xferOp,
                                PatternRewriter &rewriter) const override {
    if (xferOp.getVectorType().getRank() <= options.targetRank)
      return failure();
    if (isTensorOp(xferOp) && !options.lowerTensors)
      return failure();
    // Transfer ops that modify the element type are not supported atm.
    if (xferOp.getVectorType().getElementType() !=
        xferOp.getShapedType().getElementType())
      return failure();

    auto insertOp = getInsertOp(xferOp);
    auto vec = getResultVector(xferOp, rewriter);
    auto vecType = dyn_cast<VectorType>(vec.getType());
    auto xferVecType = xferOp.getVectorType();

    if (xferVecType.getScalableDims()[0]) {
      // Cannot unroll a scalable dimension at compile time.
      return failure();
    }

    VectorType newXferVecType = VectorType::Builder(xferVecType).dropDim(0);

    int64_t dimSize = xferVecType.getShape()[0];

    // Generate fully unrolled loop of transfer ops.
    Location loc = xferOp.getLoc();
    for (int64_t i = 0; i < dimSize; ++i) {
      Value iv = rewriter.create<arith::ConstantIndexOp>(loc, i);

      vec = generateInBoundsCheck(
          rewriter, xferOp, iv, unpackedDim(xferOp), TypeRange(vecType),
          /*inBoundsCase=*/
          [&](OpBuilder &b, Location loc) {
            // Indices for the new transfer op.
            SmallVector<Value, 8> xferIndices;
            getXferIndices(b, xferOp, iv, xferIndices);

            // Indices for the new vector.insert op.
            SmallVector<int64_t, 8> insertionIndices;
            getInsertionIndices(xferOp, insertionIndices);
            insertionIndices.push_back(i);

            auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr());
            auto newXferOp = b.create<vector::TransferReadOp>(
                loc, newXferVecType, xferOp.getSource(), xferIndices,
                AffineMapAttr::get(unpackedPermutationMap(b, xferOp)),
                xferOp.getPadding(), Value(), inBoundsAttr);
            maybeAssignMask(b, xferOp, newXferOp, i);
            return b.create<vector::InsertOp>(loc, newXferOp, vec,
                                              insertionIndices);
          },
          /*outOfBoundsCase=*/
          [&](OpBuilder &b, Location loc) {
            // Loop through original (unmodified) vector.
            return vec;
          });
    }

    if (insertOp) {
      // Rewrite single user of the old TransferReadOp, which was an InsertOp.
      rewriter.replaceOp(insertOp, vec);
      rewriter.eraseOp(xferOp);
    } else {
      rewriter.replaceOp(xferOp, vec);
    }

    return success();
  }
};

/// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one
/// dimension. This is similar to TransferOpConversion<TransferWriteOp>, but no
/// memref buffer is allocated and the SCF loop is fully unrolled.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b, %c]
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to IR such as (simplified):
/// ```
/// %v0 = vector.extract %vec[0] : vector<5x4xf32>
/// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...>
/// %v1 = vector.extract %vec[1] : vector<5x4xf32>
/// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...>
/// ...
/// %v4 = vector.extract %vec[4] : vector<5x4xf32>
/// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...>
/// ```
///
/// Note: As an optimization, if the vector of the original TransferWriteOp
/// was directly extracted from another vector via an ExtractOp `a`, extract
/// the vectors for the newly generated TransferWriteOps from `a`'s input. By
/// doing so, `a` may become dead, and the number of ExtractOps generated
/// during recursive application of this pattern will be minimal.
struct UnrollTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> {
  using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;

  void initialize() {
    // This pattern recursively unpacks one dimension at a time. The recursion
    // is bounded as the rank is strictly decreasing.
    setHasBoundedRewriteRecursion();
  }

  /// Return the vector from which the newly generated ExtractOps will extract.
  Value getDataVector(TransferWriteOp xferOp) const {
    if (auto extractOp = getExtractOp(xferOp))
      return extractOp.getVector();
    return xferOp.getVector();
  }

  /// If the input of the given TransferWriteOp is an ExtractOp, return it.
  vector::ExtractOp getExtractOp(TransferWriteOp xferOp) const {
    if (auto *op = xferOp.getVector().getDefiningOp())
      return dyn_cast<vector::ExtractOp>(op);
    return vector::ExtractOp();
  }

  /// If the input of the given TransferWriteOp is an ExtractOp, return its
  /// indices.
  void getExtractionIndices(TransferWriteOp xferOp,
                            SmallVector<int64_t, 8> &indices) const {
    if (auto extractOp = getExtractOp(xferOp))
      indices.assign(extractOp.getPosition().begin(),
                     extractOp.getPosition().end());
  }

  /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
  /// accesses, and broadcasts and transposes in permutation maps.
  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
                                PatternRewriter &rewriter) const override {
    if (xferOp.getVectorType().getRank() <= options.targetRank)
      return failure();
    if (isTensorOp(xferOp) && !options.lowerTensors)
      return failure();
    // Transfer ops that modify the element type are not supported atm.
    if (xferOp.getVectorType().getElementType() !=
        xferOp.getShapedType().getElementType())
      return failure();

    auto vec = getDataVector(xferOp);
    auto xferVecType = xferOp.getVectorType();
    int64_t dimSize = xferVecType.getShape()[0];
    Value source = xferOp.getSource(); // memref or tensor to be written to.
    auto sourceType = isTensorOp(xferOp) ? xferOp.getShapedType() : Type();

    // Generate fully unrolled loop of transfer ops.
    Location loc = xferOp.getLoc();
    for (int64_t i = 0; i < dimSize; ++i) {
      Value iv = rewriter.create<arith::ConstantIndexOp>(loc, i);

      auto updatedSource = generateInBoundsCheck(
          rewriter, xferOp, iv, unpackedDim(xferOp),
          isTensorOp(xferOp) ? TypeRange(sourceType) : TypeRange(),
          /*inBoundsCase=*/
          [&](OpBuilder &b, Location loc) {
            // Indices for the new transfer op.
            SmallVector<Value, 8> xferIndices;
            getXferIndices(b, xferOp, iv, xferIndices);

            // Indices for the new vector.extract op.
            SmallVector<int64_t, 8> extractionIndices;
            getExtractionIndices(xferOp, extractionIndices);
            extractionIndices.push_back(i);

            auto extracted =
                b.create<vector::ExtractOp>(loc, vec, extractionIndices);
            auto inBoundsAttr = dropFirstElem(b, xferOp.getInBoundsAttr());
            auto newXferOp = b.create<vector::TransferWriteOp>(
                loc, sourceType, extracted, source, xferIndices,
                AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
                inBoundsAttr);

            maybeAssignMask(b, xferOp, newXferOp, i);

            return isTensorOp(xferOp) ? newXferOp->getResult(0) : Value();
          },
          /*outOfBoundsCase=*/
          [&](OpBuilder &b, Location loc) {
            return isTensorOp(xferOp) ? source : Value();
          });

      if (isTensorOp(xferOp))
        source = updatedSource;
    }

    if (isTensorOp(xferOp))
      rewriter.replaceOp(xferOp, source);
    else
      rewriter.eraseOp(xferOp);

    return success();
  }
};

} // namespace lowering_n_d_unrolled

namespace lowering_1_d {

/// Compute the indices into the memref for the LoadOp/StoreOp generated as
/// part of TransferOp1dConversion. Return the memref dimension on which
/// the transfer is operating. A return value of std::nullopt indicates a
/// broadcast.
template <typename OpTy>
static std::optional<int64_t>
get1dMemrefIndices(OpBuilder &b, OpTy xferOp, Value iv,
                   SmallVector<Value, 8> &memrefIndices) {
  auto indices = xferOp.getIndices();
  auto map = xferOp.getPermutationMap();
  assert(xferOp.getTransferRank() > 0 && "unexpected 0-d transfer");

  memrefIndices.append(indices.begin(), indices.end());
  assert(map.getNumResults() == 1 &&
         "Expected 1 permutation map result for 1D transfer");
  if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
    Location loc = xferOp.getLoc();
    auto dim = expr.getPosition();
    AffineExpr d0, d1;
    bindDims(xferOp.getContext(), d0, d1);
    Value offset = memrefIndices[dim];
    memrefIndices[dim] =
        affine::makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
    return dim;
  }

  assert(xferOp.isBroadcastDim(0) &&
         "Expected AffineDimExpr or AffineConstantExpr");
  return std::nullopt;
}

/// Codegen strategy for TransferOp1dConversion, depending on the
/// operation.
template <typename OpTy>
struct Strategy1d;

/// Codegen strategy for TransferReadOp.
template <>
struct Strategy1d<TransferReadOp> {
  static void generateForLoopBody(OpBuilder &b, Location loc,
                                  TransferReadOp xferOp, Value iv,
                                  ValueRange loopState) {
    SmallVector<Value, 8> indices;
    auto dim = get1dMemrefIndices(b, xferOp, iv, indices);
    auto vec = loopState[0];

    // In case of out-of-bounds access, leave `vec` as is (was initialized with
    // padding value).
    auto nextVec = generateInBoundsCheck(
        b, xferOp, iv, dim, TypeRange(xferOp.getVectorType()),
        /*inBoundsCase=*/
        [&](OpBuilder &b, Location loc) {
          Value val =
              b.create<memref::LoadOp>(loc, xferOp.getSource(), indices);
          return b.create<vector::InsertElementOp>(loc, val, vec, iv);
        },
        /*outOfBoundsCase=*/
        [&](OpBuilder & /*b*/, Location loc) { return vec; });
    b.create<scf::YieldOp>(loc, nextVec);
  }

  static Value initialLoopState(OpBuilder &b, TransferReadOp xferOp) {
    // Initialize vector with padding value.
    Location loc = xferOp.getLoc();
    return b.create<vector::SplatOp>(loc, xferOp.getVectorType(),
                                     xferOp.getPadding());
  }
};

/// Codegen strategy for TransferWriteOp.
template <>
struct Strategy1d<TransferWriteOp> {
  static void generateForLoopBody(OpBuilder &b, Location loc,
                                  TransferWriteOp xferOp, Value iv,
                                  ValueRange /*loopState*/) {
    SmallVector<Value, 8> indices;
    auto dim = get1dMemrefIndices(b, xferOp, iv, indices);

    // Nothing to do in case of out-of-bounds access.
    generateInBoundsCheck(
        b, xferOp, iv, dim,
        /*inBoundsCase=*/[&](OpBuilder &b, Location loc) {
          auto val =
              b.create<vector::ExtractElementOp>(loc, xferOp.getVector(), iv);
          b.create<memref::StoreOp>(loc, val, xferOp.getSource(), indices);
        });
    b.create<scf::YieldOp>(loc);
  }

  static Value initialLoopState(OpBuilder &b, TransferWriteOp xferOp) {
    return Value();
  }
};

/// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is
/// necessary in cases where a 1D vector transfer op cannot be lowered into
/// vector load/stores due to non-unit strides or broadcasts:
///
/// * Transfer dimension is not the last memref dimension
/// * Transfer dimension is a broadcast (i.e., scalar load + broadcast)
/// * Memref has a layout map with non-unit stride on the last dimension
///
/// This pattern generates IR as follows:
///
/// 1. Generate a for loop iterating over each vector element.
/// 2. Inside the loop, generate an InsertElementOp or ExtractElementOp,
///    depending on OpTy.
///
/// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp
///       can be generated instead of TransferOp1dConversion. Add such a pattern
///       to ConvertVectorToLLVM.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b]
///     {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
///     : vector<9xf32>, memref<?x?xf32>
/// ```
/// Is rewritten to approximately the following pseudo-IR:
/// ```
/// for i = 0 to 9 {
///   %t = vector.extractelement %vec[i] : vector<9xf32>
///   memref.store %t, %arg0[%a + i, %b] : memref<?x?xf32>
/// }
/// ```
template <typename OpTy>
struct TransferOp1dConversion : public VectorToSCFPattern<OpTy> {
  using VectorToSCFPattern<OpTy>::VectorToSCFPattern;

  LogicalResult matchAndRewrite(OpTy xferOp,
                                PatternRewriter &rewriter) const override {
    // TODO: support 0-d corner case.
    if (xferOp.getTransferRank() == 0)
      return failure();
    auto map = xferOp.getPermutationMap();
    auto memRefType = dyn_cast<MemRefType>(xferOp.getShapedType());

    if (!memRefType)
      return failure();
    if (xferOp.getVectorType().getRank() != 1)
      return failure();
    if (map.isMinorIdentity() && isLastMemrefDimUnitStride(memRefType))
      return failure(); // Handled by ConvertVectorToLLVM

    // Loop bounds, step, state...
    Location loc = xferOp.getLoc();
    auto vecType = xferOp.getVectorType();
    auto lb = rewriter.create<arith::ConstantIndexOp>(loc, 0);
    Value ub =
        rewriter.create<arith::ConstantIndexOp>(loc, vecType.getDimSize(0));
    if (vecType.isScalable()) {
      Value vscale =
          rewriter.create<vector::VectorScaleOp>(loc, rewriter.getIndexType());
      ub = rewriter.create<arith::MulIOp>(loc, ub, vscale);
    }
    auto step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
    auto loopState = Strategy1d<OpTy>::initialLoopState(rewriter, xferOp);

    // Generate for loop.
    rewriter.replaceOpWithNewOp<scf::ForOp>(
        xferOp, lb, ub, step, loopState ?
                                          ValueRange(loopState) : ValueRange(),
        [&](OpBuilder &b, Location loc, Value iv, ValueRange loopState) {
          Strategy1d<OpTy>::generateForLoopBody(b, loc, xferOp, iv, loopState);
        });

    return success();
  }
};

} // namespace lowering_1_d
} // namespace

void mlir::populateVectorToSCFConversionPatterns(
    RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) {
  if (options.unroll) {
    patterns.add<lowering_n_d_unrolled::UnrollTransferReadConversion,
                 lowering_n_d_unrolled::UnrollTransferWriteConversion>(
        patterns.getContext(), options);
  } else {
    patterns.add<lowering_n_d::PrepareTransferReadConversion,
                 lowering_n_d::PrepareTransferWriteConversion,
                 lowering_n_d::TransferOpConversion<TransferReadOp>,
                 lowering_n_d::TransferOpConversion<TransferWriteOp>>(
        patterns.getContext(), options);
  }

  if (options.targetRank == 1) {
    patterns.add<lowering_1_d::TransferOp1dConversion<TransferReadOp>,
                 lowering_1_d::TransferOp1dConversion<TransferWriteOp>>(
        patterns.getContext(), options);
  }
  patterns.add<lowering_n_d::DecomposePrintOpConversion>(patterns.getContext(),
                                                         options);
}

namespace {

struct ConvertVectorToSCFPass
    : public impl::ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
  ConvertVectorToSCFPass() = default;
  ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
    this->fullUnroll = options.unroll;
    this->targetRank = options.targetRank;
    this->lowerTensors = options.lowerTensors;
  }

  void runOnOperation() override {
    VectorTransferToSCFOptions options;
    options.unroll = fullUnroll;
    options.targetRank = targetRank;
    options.lowerTensors = lowerTensors;

    // Lower permutation maps first.
    RewritePatternSet lowerTransferPatterns(&getContext());
    mlir::vector::populateVectorTransferPermutationMapLoweringPatterns(
        lowerTransferPatterns);
    (void)applyPatternsAndFoldGreedily(getOperation(),
                                       std::move(lowerTransferPatterns));

    RewritePatternSet patterns(&getContext());
    populateVectorToSCFConversionPatterns(patterns, options);
    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
  }
};

} // namespace

std::unique_ptr<Pass>
mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
  return std::make_unique<ConvertVectorToSCFPass>(options);
}
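
// Usage sketch (illustrative only, not part of this file): a client pipeline
// could register this lowering as follows, where `pm` is an assumed
// mlir::PassManager. The option members are assigned directly, mirroring how
// ConvertVectorToSCFPass::runOnOperation populates them above.
// ```
// VectorTransferToSCFOptions options;
// options.unroll = true;  // Fully unroll instead of generating scf.for loops.
// options.targetRank = 1; // Keep lowering until transfers have rank <= 1.
// pm.addPass(mlir::createConvertVectorToSCFPass(options));
// ```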