ArmSME/Transforms/VectorLegalization.cpp

042800a4SBenjamin Maxwell//===- VectorLegalization.cpp - Legalize vectors for lowering to ArmSME ---===//
042800a4SBenjamin Maxwell//
042800a4SBenjamin Maxwell// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
042800a4SBenjamin Maxwell// See https://llvm.org/LICENSE.txt for license information.
042800a4SBenjamin Maxwell// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
042800a4SBenjamin Maxwell//
042800a4SBenjamin Maxwell//===----------------------------------------------------------------------===//
042800a4SBenjamin Maxwell//
042800a4SBenjamin Maxwell// This pass legalizes vector operations so they can be lowered to ArmSME.
042800a4SBenjamin Maxwell//
042800a4SBenjamin Maxwell// Note: In the context of this pass 'tile' always refers to an SME tile.
042800a4SBenjamin Maxwell//
042800a4SBenjamin Maxwell//===----------------------------------------------------------------------===//
042800a4SBenjamin Maxwell
c2dea712SBenjamin Maxwell#include "mlir/Dialect/Arith/Utils/Utils.h"
042800a4SBenjamin Maxwell#include "mlir/Dialect/ArmSME/IR/ArmSME.h"
042800a4SBenjamin Maxwell#include "mlir/Dialect/ArmSME/Transforms/Passes.h"
042800a4SBenjamin Maxwell#include "mlir/Dialect/ArmSME/Utils/Utils.h"
042800a4SBenjamin Maxwell#include "mlir/Dialect/Func/IR/FuncOps.h"
31613de9SMatthias Springer#include "mlir/Dialect/Func/Transforms/FuncConversions.h"
c194bc77SBenjamin Maxwell#include "mlir/Dialect/Index/IR/IndexDialect.h"
c194bc77SBenjamin Maxwell#include "mlir/Dialect/Index/IR/IndexOps.h"
0473e322SBenjamin Maxwell#include "mlir/Dialect/MemRef/IR/MemRef.h"
5ed5d723SBenjamin Maxwell#include "mlir/Dialect/SCF/IR/SCF.h"
042800a4SBenjamin Maxwell#include "mlir/Dialect/SCF/Transforms/Patterns.h"
042800a4SBenjamin Maxwell#include "mlir/Dialect/Utils/IndexingUtils.h"
c194bc77SBenjamin Maxwell#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
31613de9SMatthias Springer#include "mlir/Transforms/DialectConversion.h"
31613de9SMatthias Springer#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell#define DEBUG_TYPE "arm-sme-vector-legalization"
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwellnamespace mlir::arm_sme {
042800a4SBenjamin Maxwell#define GEN_PASS_DEF_VECTORLEGALIZATION
042800a4SBenjamin Maxwell#include "mlir/Dialect/ArmSME/Transforms/Passes.h.inc"
042800a4SBenjamin Maxwell} // namespace mlir::arm_sme
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwellusing namespace mlir;
042800a4SBenjamin Maxwellusing namespace mlir::arm_sme;
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwellnamespace {
042800a4SBenjamin Maxwell
c2dea712SBenjamin Maxwell//===----------------------------------------------------------------------===//
c2dea712SBenjamin Maxwell// Decomposition of vector operations larger than an SME tile
c2dea712SBenjamin Maxwell//===----------------------------------------------------------------------===//
c2dea712SBenjamin Maxwell
042800a4SBenjamin Maxwell// Common match failure reasons.
1408667fSBenjamin Maxwellstatic constexpr StringLiteral kMatchFailureNotSMETileTypeMultiple(
042800a4SBenjamin Maxwell    "op vector size is not multiple of SME tiles");
1408667fSBenjamin Maxwellstatic constexpr StringLiteral kMatchFailureUnsupportedMaskOp(
042800a4SBenjamin Maxwell    "op mask is unsupported for legalization/decomposition");
042800a4SBenjamin Maxwellstatic constexpr StringLiteral
1408667fSBenjamin Maxwell    kMatchFailureNonPermutationMap("op affine map is not a permutation");
d1fc59c3SBenjamin Maxwellstatic constexpr StringLiteral kMatchFailureNotIllegalToLegal(
d1fc59c3SBenjamin Maxwell    "expected transpose from illegal type to legal type");
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell/// An SMESubTile represents a single SME-sized sub-tile from decomposing a
042800a4SBenjamin Maxwell/// larger vector type. The (`row`, `col`) are the position of the tile in the
042800a4SBenjamin Maxwell/// original vector type. For example for an [8]x[8] tile with four [4]x[4]
042800a4SBenjamin Maxwell/// sub-tiles, we would have:
042800a4SBenjamin Maxwell///
042800a4SBenjamin Maxwell///           8 x vscale
042800a4SBenjamin Maxwell/// ┌─────────────┬─────────────┐
042800a4SBenjamin Maxwell/// │(0,0)        │(0,4)        │
042800a4SBenjamin Maxwell/// │             │             │
042800a4SBenjamin Maxwell/// ├─────────────┼─────────────┤ 8 x vscale
042800a4SBenjamin Maxwell/// │(4,0)        │(4,4)        │
042800a4SBenjamin Maxwell/// │             │             │
042800a4SBenjamin Maxwell/// └─────────────┴─────────────┘
042800a4SBenjamin Maxwellstruct SMESubTile {
042800a4SBenjamin Maxwell  // Note: The units of (row, col) are vscale (as SME tiles are scalable).
042800a4SBenjamin Maxwell  int row{0};
042800a4SBenjamin Maxwell  int col{0};
042800a4SBenjamin Maxwell  // The SME tile type.
042800a4SBenjamin Maxwell  VectorType type;
042800a4SBenjamin Maxwell};
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell/// Adds a constant elementwise scalable offset to `indices` (which are of equal
042800a4SBenjamin Maxwell/// length). For example, in the 2D case this would return:
042800a4SBenjamin Maxwell// { indices[0] + offset[0] * vscale, indices[1] + offset[1] *  vscale }
042800a4SBenjamin MaxwellSmallVector<Value, 2> addConstantScalableOffset(OpBuilder &builder,
042800a4SBenjamin Maxwell                                                Location loc,
042800a4SBenjamin Maxwell                                                ValueRange indices,
042800a4SBenjamin Maxwell                                                ArrayRef<int> scalableOffsets) {
042800a4SBenjamin Maxwell  auto vscale = builder.create<vector::VectorScaleOp>(loc);
042800a4SBenjamin Maxwell  return llvm::map_to_vector(
042800a4SBenjamin Maxwell      llvm::zip_equal(indices, scalableOffsets), [&](auto pair) -> Value {
042800a4SBenjamin Maxwell        auto [index, base] = pair;
042800a4SBenjamin Maxwell        auto offset = builder.create<arith::MulIOp>(
042800a4SBenjamin Maxwell            loc, builder.create<arith::ConstantIndexOp>(loc, base), vscale);
042800a4SBenjamin Maxwell        return builder.create<arith::AddIOp>(loc, index, offset);
042800a4SBenjamin Maxwell      });
042800a4SBenjamin Maxwell}
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell/// Adjusts `indices` (e.g. from a load/store) for a larger vector type to
042800a4SBenjamin Maxwell/// indices for one of the SME sub-tiles it will decompose into.
042800a4SBenjamin Maxwell///
042800a4SBenjamin Maxwell/// For example, if you were to decompose an 8x8 load into four 4x4 tiles, the
042800a4SBenjamin Maxwell/// indices for each tile would need to be adjusted as follows:
042800a4SBenjamin Maxwell///
042800a4SBenjamin Maxwell/// initial indices = [a,b], inital size = 8x8, target size = 4x4
042800a4SBenjamin Maxwell/// ┌─────────────┬─────────────┐
042800a4SBenjamin Maxwell/// │[a,b]        │[a,b+4]      │
042800a4SBenjamin Maxwell/// │             │             │
042800a4SBenjamin Maxwell/// ├─────────────┼─────────────┤
042800a4SBenjamin Maxwell/// │[a+4,b]      │[a+4,b+4]    │
042800a4SBenjamin Maxwell/// │             │             │
042800a4SBenjamin Maxwell/// └─────────────┴─────────────┘
042800a4SBenjamin MaxwellSmallVector<Value, 2> getSMESubTileIndices(OpBuilder &builder, Location loc,
042800a4SBenjamin Maxwell                                           ValueRange indices,
042800a4SBenjamin Maxwell                                           SMESubTile smeTile) {
042800a4SBenjamin Maxwell  return addConstantScalableOffset(builder, loc, indices,
042800a4SBenjamin Maxwell                                   {smeTile.row, smeTile.col});
042800a4SBenjamin Maxwell}
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell/// Returns true if `mask` is generated by an operation that can be decomposed
042800a4SBenjamin Maxwell/// for SME. Currently, that is just no mask, or vector.create_mask.
042800a4SBenjamin Maxwell/// TODO: Add support for vector.constant_mask once required for SME.
042800a4SBenjamin Maxwellbool isSupportedMaskOp(Value mask) {
042800a4SBenjamin Maxwell  return !mask || mask.getDefiningOp<vector::CreateMaskOp>();
042800a4SBenjamin Maxwell}
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell/// Extracts a mask for an SME sub-tile from the mask of a larger vector type.
042800a4SBenjamin MaxwellValue extractSMEMask(OpBuilder &builder, Location loc, Value mask,
042800a4SBenjamin Maxwell                     SMESubTile smeTile) {
042800a4SBenjamin Maxwell  assert(isSupportedMaskOp(mask));
042800a4SBenjamin Maxwell  if (!mask)
042800a4SBenjamin Maxwell    return Value{};
042800a4SBenjamin Maxwell  auto createMask = mask.getDefiningOp<vector::CreateMaskOp>();
042800a4SBenjamin Maxwell  // The operands of `vector.create_mask` (from a 2D perspective) are the
042800a4SBenjamin Maxwell  // coordinates where the mask ends. So we subtract where this tile starts,
042800a4SBenjamin Maxwell  // from the mask operands to get the parameters for this sub-tile.
042800a4SBenjamin Maxwell  auto smeTileMaskDims = addConstantScalableOffset(
042800a4SBenjamin Maxwell      builder, loc, createMask.getOperands(), {-smeTile.row, -smeTile.col});
042800a4SBenjamin Maxwell  auto smeTileCreateMask = builder.create<vector::CreateMaskOp>(
042800a4SBenjamin Maxwell      loc, smeTile.type.clone(builder.getI1Type()), smeTileMaskDims);
042800a4SBenjamin Maxwell  return smeTileCreateMask.getResult();
042800a4SBenjamin Maxwell}
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell/// Constructs an iterator that returns each SME tile (with coordinates)
042800a4SBenjamin Maxwell/// contained within a VectorType. For example, if decomposing an [8]x[8] into
042800a4SBenjamin Maxwell/// [4]x[4] tiles, the iterator would yield the tiles: (0, 0), (0, 4), (4, 0),
042800a4SBenjamin Maxwell/// (4, 4).
042800a4SBenjamin Maxwellauto decomposeToSMETiles(OpBuilder &builder, VectorType type,
042800a4SBenjamin Maxwell                         VectorType smeTileType,
042800a4SBenjamin Maxwell                         bool transposeIndices = false) {
042800a4SBenjamin Maxwell  return llvm::map_range(
c194bc77SBenjamin Maxwell      StaticTileOffsetRange(
c194bc77SBenjamin Maxwell          type.getShape(),
c194bc77SBenjamin Maxwell          {std::min(type.getDimSize(0), smeTileType.getDimSize(0)),
c194bc77SBenjamin Maxwell           std::min(type.getDimSize(1), smeTileType.getDimSize(1))}),
042800a4SBenjamin Maxwell      [=](auto indices) {
042800a4SBenjamin Maxwell        int row = int(indices[0]);
042800a4SBenjamin Maxwell        int col = int(indices[1]);
042800a4SBenjamin Maxwell        if (transposeIndices)
042800a4SBenjamin Maxwell          std::swap(row, col);
042800a4SBenjamin Maxwell        return SMESubTile{row, col, smeTileType};
042800a4SBenjamin Maxwell      });
042800a4SBenjamin Maxwell}
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell/// Returns the number of SME tiles that fit into the (2D-scalable) vector type
042800a4SBenjamin Maxwell/// `type`.
042800a4SBenjamin Maxwellint getNumberOfSMETilesForVectorType(VectorType type) {
042800a4SBenjamin Maxwell  assert(isMultipleOfSMETileVectorType(type) &&
042800a4SBenjamin Maxwell         "`type` not multiple of SME tiles");
042800a4SBenjamin Maxwell  int64_t vectorRows = type.getDimSize(0);
042800a4SBenjamin Maxwell  int64_t vectorCols = type.getDimSize(1);
042800a4SBenjamin Maxwell  auto elementType = type.getElementType();
042800a4SBenjamin Maxwell  unsigned minNumElts = getSMETileSliceMinNumElts(elementType);
042800a4SBenjamin Maxwell  return (vectorRows * vectorCols) / (minNumElts * minNumElts);
042800a4SBenjamin Maxwell}
042800a4SBenjamin Maxwell
dadcaf82SBenjamin Maxwell/// Legalize `arith.constant dense<value>` splat operations to fit within SME
dadcaf82SBenjamin Maxwell/// tiles by decomposing them into tile-sized operations.
dadcaf82SBenjamin Maxwellstruct LegalizeArithConstantOpsByDecomposition
31613de9SMatthias Springer    : public OpConversionPattern<arith::ConstantOp> {
31613de9SMatthias Springer  using OpConversionPattern::OpConversionPattern;
dadcaf82SBenjamin Maxwell
dadcaf82SBenjamin Maxwell  LogicalResult
dadcaf82SBenjamin Maxwell  matchAndRewrite(arith::ConstantOp constantOp, OpAdaptor adaptor,
31613de9SMatthias Springer                  ConversionPatternRewriter &rewriter) const override {
dadcaf82SBenjamin Maxwell    auto vectorType = dyn_cast<VectorType>(constantOp.getType());
dadcaf82SBenjamin Maxwell    auto denseAttr = dyn_cast<DenseElementsAttr>(constantOp.getValueAttr());
dadcaf82SBenjamin Maxwell    if (!vectorType || !denseAttr || !denseAttr.isSplat())
dadcaf82SBenjamin Maxwell      return failure();
dadcaf82SBenjamin Maxwell
dadcaf82SBenjamin Maxwell    if (!isMultipleOfSMETileVectorType(vectorType))
dadcaf82SBenjamin Maxwell      return rewriter.notifyMatchFailure(constantOp,
dadcaf82SBenjamin Maxwell                                         kMatchFailureNotSMETileTypeMultiple);
dadcaf82SBenjamin Maxwell
dadcaf82SBenjamin Maxwell    auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
dadcaf82SBenjamin Maxwell    auto tileCount = getNumberOfSMETilesForVectorType(vectorType);
dadcaf82SBenjamin Maxwell    auto tileSplat = rewriter.create<arith::ConstantOp>(
dadcaf82SBenjamin Maxwell        constantOp.getLoc(), denseAttr.resizeSplat(smeTileType));
31613de9SMatthias Springer    SmallVector<Value> repl(tileCount, tileSplat);
31613de9SMatthias Springer    rewriter.replaceOpWithMultiple(constantOp, {repl});
dadcaf82SBenjamin Maxwell
dadcaf82SBenjamin Maxwell    return success();
dadcaf82SBenjamin Maxwell  }
dadcaf82SBenjamin Maxwell};
dadcaf82SBenjamin Maxwell
042800a4SBenjamin Maxwell/// Legalize `vector.outerproduct` operations to fit within SME tiles by
042800a4SBenjamin Maxwell/// decomposing them into tile-sized operations.
042800a4SBenjamin Maxwellstruct LegalizeVectorOuterProductOpsByDecomposition
31613de9SMatthias Springer    : public OpConversionPattern<vector::OuterProductOp> {
31613de9SMatthias Springer  using OpConversionPattern::OpConversionPattern;
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell  LogicalResult
31613de9SMatthias Springer  matchAndRewrite(vector::OuterProductOp outerProductOp,
31613de9SMatthias Springer                  OneToNOpAdaptor adaptor,
31613de9SMatthias Springer                  ConversionPatternRewriter &rewriter) const override {
042800a4SBenjamin Maxwell    auto vectorType = outerProductOp.getResultVectorType();
042800a4SBenjamin Maxwell    if (!isMultipleOfSMETileVectorType(vectorType))
1408667fSBenjamin Maxwell      return rewriter.notifyMatchFailure(outerProductOp,
1408667fSBenjamin Maxwell                                         kMatchFailureNotSMETileTypeMultiple);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    Value mask;
042800a4SBenjamin Maxwell    Operation *rootOp = outerProductOp;
042800a4SBenjamin Maxwell    auto loc = outerProductOp.getLoc();
042800a4SBenjamin Maxwell    if (outerProductOp.isMasked()) {
042800a4SBenjamin Maxwell      auto maskOp = outerProductOp.getMaskingOp();
042800a4SBenjamin Maxwell      mask = maskOp.getMask();
042800a4SBenjamin Maxwell      rootOp = maskOp;
31613de9SMatthias Springer      rewriter.setInsertionPoint(rootOp);
042800a4SBenjamin Maxwell    }
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    if (!isSupportedMaskOp(mask))
042800a4SBenjamin Maxwell      return rewriter.notifyMatchFailure(outerProductOp,
1408667fSBenjamin Maxwell                                         kMatchFailureUnsupportedMaskOp);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    ValueRange accSMETiles = adaptor.getAcc();
042800a4SBenjamin Maxwell    auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
042800a4SBenjamin Maxwell    VectorType sliceType = VectorType::Builder(smeTileType).dropDim(0);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    SmallVector<Value> resultSMETiles;
042800a4SBenjamin Maxwell    for (auto [index, smeTile] : llvm::enumerate(
042800a4SBenjamin Maxwell             decomposeToSMETiles(rewriter, vectorType, smeTileType))) {
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell      auto smeMask = extractSMEMask(rewriter, loc, mask, smeTile);
042800a4SBenjamin Maxwell      auto lhs = rewriter.create<vector::ScalableExtractOp>(
042800a4SBenjamin Maxwell          loc, sliceType, outerProductOp.getLhs(), smeTile.row);
042800a4SBenjamin Maxwell      auto rhs = rewriter.create<vector::ScalableExtractOp>(
042800a4SBenjamin Maxwell          loc, sliceType, outerProductOp.getRhs(), smeTile.col);
042800a4SBenjamin Maxwell      auto smeOuterProduct = rewriter.create<vector::OuterProductOp>(
042800a4SBenjamin Maxwell          loc, smeTileType, lhs, rhs,
042800a4SBenjamin Maxwell          !accSMETiles.empty() ? accSMETiles[index] : Value{},
042800a4SBenjamin Maxwell          outerProductOp.getKind());
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell      auto maskedOuterProduct =
042800a4SBenjamin Maxwell          vector::maskOperation(rewriter, smeOuterProduct, smeMask);
042800a4SBenjamin Maxwell      resultSMETiles.push_back(maskedOuterProduct->getResult(0));
042800a4SBenjamin Maxwell    }
042800a4SBenjamin Maxwell
31613de9SMatthias Springer    rewriter.replaceOpWithMultiple(rootOp, {resultSMETiles});
042800a4SBenjamin Maxwell    return success();
042800a4SBenjamin Maxwell  }
042800a4SBenjamin Maxwell};
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell// Workaround for `vector.mask`. We want to match on `vector.outerproduct` (to
042800a4SBenjamin Maxwell// get the help of the type conversion), but doing so results in the type
042800a4SBenjamin Maxwell// conversion adding target materializations in the `vector.mask` region
042800a4SBenjamin Maxwell// (invalid). This pattern matches on `vector.mask` then calls into the
042800a4SBenjamin Maxwell// `vector.outerproduct` pattern to work around this issue.
042800a4SBenjamin Maxwellstruct LegalizeMaskedVectorOuterProductOpsByDecomposition
31613de9SMatthias Springer    : public OpConversionPattern<vector::MaskOp> {
31613de9SMatthias Springer  using OpConversionPattern::OpConversionPattern;
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell  LogicalResult
31613de9SMatthias Springer  matchAndRewrite(vector::MaskOp maskOp, OneToNOpAdaptor adaptor,
31613de9SMatthias Springer                  ConversionPatternRewriter &rewriter) const override {
a9eb8f0eSBenjamin Maxwell    if (auto outerProductOp = llvm::dyn_cast_or_null<vector::OuterProductOp>(
a9eb8f0eSBenjamin Maxwell            maskOp.getMaskableOp())) {
042800a4SBenjamin Maxwell      LegalizeVectorOuterProductOpsByDecomposition pattern(*getTypeConverter(),
042800a4SBenjamin Maxwell                                                           getContext());
042800a4SBenjamin Maxwell      return static_cast<RewritePattern &>(pattern).matchAndRewrite(
042800a4SBenjamin Maxwell          outerProductOp, rewriter);
042800a4SBenjamin Maxwell    }
042800a4SBenjamin Maxwell    return failure();
042800a4SBenjamin Maxwell  }
042800a4SBenjamin Maxwell};
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell/// Legalize `vector.transfer_read` operations to fit within SME tiles by
042800a4SBenjamin Maxwell/// decomposing them into tile-sized operations.
042800a4SBenjamin Maxwellstruct LegalizeTransferReadOpsByDecomposition
31613de9SMatthias Springer    : public OpConversionPattern<vector::TransferReadOp> {
31613de9SMatthias Springer  using OpConversionPattern::OpConversionPattern;
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell  LogicalResult
31613de9SMatthias Springer  matchAndRewrite(vector::TransferReadOp readOp, OneToNOpAdaptor adaptor,
31613de9SMatthias Springer                  ConversionPatternRewriter &rewriter) const override {
042800a4SBenjamin Maxwell    auto vectorType = readOp.getVectorType();
042800a4SBenjamin Maxwell    if (!isMultipleOfSMETileVectorType(vectorType))
1408667fSBenjamin Maxwell      return rewriter.notifyMatchFailure(readOp,
1408667fSBenjamin Maxwell                                         kMatchFailureNotSMETileTypeMultiple);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    auto mask = readOp.getMask();
042800a4SBenjamin Maxwell    if (!isSupportedMaskOp(mask))
042800a4SBenjamin Maxwell      return rewriter.notifyMatchFailure(readOp,
1408667fSBenjamin Maxwell                                         kMatchFailureUnsupportedMaskOp);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    auto permutationMap = readOp.getPermutationMap();
042800a4SBenjamin Maxwell    if (!permutationMap.isPermutation())
042800a4SBenjamin Maxwell      return rewriter.notifyMatchFailure(readOp,
1408667fSBenjamin Maxwell                                         kMatchFailureNonPermutationMap);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    // Note: For 2D vector types the only non-identity permutation is a simple
*aa295216SJay Foad    // transpose [1, 0].
042800a4SBenjamin Maxwell    bool transposed = !permutationMap.isIdentity();
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    auto loc = readOp.getLoc();
042800a4SBenjamin Maxwell    auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    SmallVector<Value> resultSMETiles;
042800a4SBenjamin Maxwell    for (SMESubTile smeTile :
042800a4SBenjamin Maxwell         decomposeToSMETiles(rewriter, vectorType, smeTileType, transposed)) {
042800a4SBenjamin Maxwell      auto smeMask = extractSMEMask(rewriter, loc, mask, smeTile);
042800a4SBenjamin Maxwell      auto smeRead = rewriter.create<vector::TransferReadOp>(
042800a4SBenjamin Maxwell          loc, smeTileType, readOp.getSource(),
042800a4SBenjamin Maxwell          getSMESubTileIndices(rewriter, loc, readOp.getIndices(), smeTile),
042800a4SBenjamin Maxwell          readOp.getPermutationMapAttr(), readOp.getPadding(), smeMask,
042800a4SBenjamin Maxwell          readOp.getInBoundsAttr());
042800a4SBenjamin Maxwell      resultSMETiles.push_back(smeRead);
042800a4SBenjamin Maxwell    }
042800a4SBenjamin Maxwell
31613de9SMatthias Springer    rewriter.replaceOpWithMultiple(readOp, {resultSMETiles});
042800a4SBenjamin Maxwell    return success();
042800a4SBenjamin Maxwell  }
042800a4SBenjamin Maxwell};
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell/// Legalize `vector.transfer_write` operations to fit within SME tiles by
042800a4SBenjamin Maxwell/// decomposing them into tile-sized operations.
042800a4SBenjamin Maxwellstruct LegalizeTransferWriteOpsByDecomposition
31613de9SMatthias Springer    : public OpConversionPattern<vector::TransferWriteOp> {
31613de9SMatthias Springer  using OpConversionPattern::OpConversionPattern;
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell  LogicalResult
31613de9SMatthias Springer  matchAndRewrite(vector::TransferWriteOp writeOp, OneToNOpAdaptor adaptor,
31613de9SMatthias Springer                  ConversionPatternRewriter &rewriter) const override {
042800a4SBenjamin Maxwell    auto vectorType = writeOp.getVectorType();
042800a4SBenjamin Maxwell    if (!isMultipleOfSMETileVectorType(vectorType))
1408667fSBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp,
1408667fSBenjamin Maxwell                                         kMatchFailureNotSMETileTypeMultiple);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    auto mask = writeOp.getMask();
042800a4SBenjamin Maxwell    if (!isSupportedMaskOp(mask))
042800a4SBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp,
1408667fSBenjamin Maxwell                                         kMatchFailureUnsupportedMaskOp);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    auto permutationMap = writeOp.getPermutationMap();
042800a4SBenjamin Maxwell    if (!permutationMap.isPermutation())
042800a4SBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp,
1408667fSBenjamin Maxwell                                         kMatchFailureNonPermutationMap);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    // Note: For 2D vector types the only non-identity permutation is a simple
*aa295216SJay Foad    // transpose [1, 0].
042800a4SBenjamin Maxwell    bool transposed = !permutationMap.isIdentity();
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    auto loc = writeOp.getLoc();
042800a4SBenjamin Maxwell    auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
042800a4SBenjamin Maxwell    auto inputSMETiles = adaptor.getVector();
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    Value destTensorOrMemref = writeOp.getSource();
042800a4SBenjamin Maxwell    for (auto [index, smeTile] : llvm::enumerate(decomposeToSMETiles(
042800a4SBenjamin Maxwell             rewriter, vectorType, smeTileType, transposed))) {
042800a4SBenjamin Maxwell      auto smeMask = extractSMEMask(rewriter, loc, mask, smeTile);
042800a4SBenjamin Maxwell      auto smeWrite = rewriter.create<vector::TransferWriteOp>(
042800a4SBenjamin Maxwell          loc, inputSMETiles[index], destTensorOrMemref,
042800a4SBenjamin Maxwell          getSMESubTileIndices(rewriter, loc, writeOp.getIndices(), smeTile),
042800a4SBenjamin Maxwell          writeOp.getPermutationMapAttr(), smeMask, writeOp.getInBoundsAttr());
042800a4SBenjamin Maxwell      if (writeOp.hasPureTensorSemantics())
042800a4SBenjamin Maxwell        destTensorOrMemref = smeWrite.getResult();
042800a4SBenjamin Maxwell    }
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    if (writeOp.hasPureTensorSemantics())
042800a4SBenjamin Maxwell      rewriter.replaceOp(writeOp, destTensorOrMemref);
042800a4SBenjamin Maxwell    else
042800a4SBenjamin Maxwell      rewriter.eraseOp(writeOp);
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell    return success();
042800a4SBenjamin Maxwell  }
042800a4SBenjamin Maxwell};
042800a4SBenjamin Maxwell
5ed5d723SBenjamin Maxwell/// Legalize a multi-tile transfer_write as a single store loop. This is done as
5ed5d723SBenjamin Maxwell/// part of type decomposition as at this level we know each tile write is
5ed5d723SBenjamin Maxwell/// disjoint, but that information is lost after decomposition (without analysis
5ed5d723SBenjamin Maxwell/// to reconstruct it).
5ed5d723SBenjamin Maxwell///
5ed5d723SBenjamin Maxwell/// Example (pseudo-MLIR):
5ed5d723SBenjamin Maxwell///
5ed5d723SBenjamin Maxwell/// ```
5ed5d723SBenjamin Maxwell/// vector.transfer_write %vector, %dest[%y, %x], %mask
5ed5d723SBenjamin Maxwell///   : vector<[16]x[8]xi16>, memref<?x?xi16>
5ed5d723SBenjamin Maxwell/// ```
5ed5d723SBenjamin Maxwell/// Is rewritten to:
5ed5d723SBenjamin Maxwell/// ```
5ed5d723SBenjamin Maxwell/// scf.for %slice_idx = %c0 to %c8_vscale step %c1 {
5ed5d723SBenjamin Maxwell///   %upper_slice_mask = vector.extract %mask[%slice_idx] ─┐
5ed5d723SBenjamin Maxwell///     : vector<[8]xi1> from vector<[16]x[8]xi1>           |
5ed5d723SBenjamin Maxwell///   %upper_slice = vector.extract %upper_tile[%slice_idx] |- Store upper tile
5ed5d723SBenjamin Maxwell///     : vector<[8]xi16> from vector<[8]x[8]xi16>          |
5ed5d723SBenjamin Maxwell///   vector.transfer_write %upper_slice,                   |
5ed5d723SBenjamin Maxwell///     %dest[%slice_idx + %y, %x], %upper_slice_mask       |
5ed5d723SBenjamin Maxwell///     : vector<[8]xi16>, memref<?x?xi16>                  ┘
5ed5d723SBenjamin Maxwell///   %lower_slice_idx = %slice_idx + %c8_vscale                 ─┐
5ed5d723SBenjamin Maxwell///   %lower_slice_mask = vector.extract %mask[%lower_slice_idx]  |
5ed5d723SBenjamin Maxwell///     : vector<[8]xi1> from vector<[16]x[8]xi1>                 |
5ed5d723SBenjamin Maxwell///   %lower_slice = vector.extract %lower_tile[%slice_idx]       |- Store lower
5ed5d723SBenjamin Maxwell///     : vector<[8]xi16> from vector<[8]x[8]xi16>                |  tile
5ed5d723SBenjamin Maxwell///   vector.transfer_write %lower_slice,                         |
5ed5d723SBenjamin Maxwell///     %dest[%lower_slice_idx + %y, %x], %lower_slice_mask       |
5ed5d723SBenjamin Maxwell///     : vector<[8]xi16>, memref<?x?xi16>                        ┘
5ed5d723SBenjamin Maxwell/// }
5ed5d723SBenjamin Maxwell/// ```
5ed5d723SBenjamin Maxwellstruct LegalizeMultiTileTransferWriteAsStoreLoop
31613de9SMatthias Springer    : public OpConversionPattern<vector::TransferWriteOp> {
31613de9SMatthias Springer  using OpConversionPattern::OpConversionPattern;
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell  LogicalResult
31613de9SMatthias Springer  matchAndRewrite(vector::TransferWriteOp writeOp, OneToNOpAdaptor adaptor,
31613de9SMatthias Springer                  ConversionPatternRewriter &rewriter) const override {
5ed5d723SBenjamin Maxwell    if (writeOp.hasPureTensorSemantics())
5ed5d723SBenjamin Maxwell      return rewriter.notifyMatchFailure(
5ed5d723SBenjamin Maxwell          writeOp, "TODO: tensor semantics are unsupported");
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell    auto permutationMap = writeOp.getPermutationMap();
5ed5d723SBenjamin Maxwell    if (!permutationMap.isPermutation())
5ed5d723SBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp,
5ed5d723SBenjamin Maxwell                                         kMatchFailureNonPermutationMap);
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell    bool transposed = !permutationMap.isIdentity();
5ed5d723SBenjamin Maxwell    if (transposed)
5ed5d723SBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp,
5ed5d723SBenjamin Maxwell                                         "TODO: transpose unsupported");
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell    auto vectorType = writeOp.getVectorType();
5ed5d723SBenjamin Maxwell    if (!isMultipleOfSMETileVectorType(vectorType))
5ed5d723SBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp,
5ed5d723SBenjamin Maxwell                                         kMatchFailureNotSMETileTypeMultiple);
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell    // Note: We also disallow masks where any dimension is > 16 because that
5ed5d723SBenjamin Maxwell    // prevents the masking from being lowered to use arm_sve.psel.
5ed5d723SBenjamin Maxwell    auto mask = writeOp.getMask();
5ed5d723SBenjamin Maxwell    if (!isSupportedMaskOp(mask) || (mask && (vectorType.getDimSize(0) > 16 ||
5ed5d723SBenjamin Maxwell                                              vectorType.getDimSize(1) > 16)))
5ed5d723SBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp,
5ed5d723SBenjamin Maxwell                                         kMatchFailureUnsupportedMaskOp);
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell    auto loc = writeOp.getLoc();
c194bc77SBenjamin Maxwell    auto createVscaleMultiple =
c194bc77SBenjamin Maxwell        vector::makeVscaleConstantBuilder(rewriter, loc);
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell    // Get SME tile and slice types.
5ed5d723SBenjamin Maxwell    auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
5ed5d723SBenjamin Maxwell    auto minTileSlices = smeTileType.getDimSize(0);
5ed5d723SBenjamin Maxwell    VectorType sliceMaskType =
5ed5d723SBenjamin Maxwell        VectorType::get(minTileSlices, rewriter.getI1Type(), true);
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell    // Create loop over all tile slices.
5ed5d723SBenjamin Maxwell    auto lowerBound = rewriter.create<arith::ConstantIndexOp>(loc, 0);
5ed5d723SBenjamin Maxwell    auto upperBound = createVscaleMultiple(minTileSlices);
5ed5d723SBenjamin Maxwell    auto step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
5ed5d723SBenjamin Maxwell    auto storeLoop =
5ed5d723SBenjamin Maxwell        rewriter.create<scf::ForOp>(loc, lowerBound, upperBound, step);
5ed5d723SBenjamin Maxwell    rewriter.setInsertionPointToStart(storeLoop.getBody());
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell    // For each sub-tile of the multi-tile `vectorType`.
5ed5d723SBenjamin Maxwell    auto inputSMETiles = adaptor.getVector();
5ed5d723SBenjamin Maxwell    auto tileSliceIndex = storeLoop.getInductionVar();
5ed5d723SBenjamin Maxwell    for (auto [index, smeTile] : llvm::enumerate(
5ed5d723SBenjamin Maxwell             decomposeToSMETiles(rewriter, vectorType, smeTileType))) {
5ed5d723SBenjamin Maxwell      // The coordinates of the tile within `vectorType`.
5ed5d723SBenjamin Maxwell      auto tileRow = createVscaleMultiple(smeTile.row);
5ed5d723SBenjamin Maxwell      auto tileCol = createVscaleMultiple(smeTile.col);
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell      // The current slice of `vectorType` we are processing.
5ed5d723SBenjamin Maxwell      auto sliceIndex =
5ed5d723SBenjamin Maxwell          rewriter.create<arith::AddIOp>(loc, tileRow, tileSliceIndex);
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell      // Where in the destination memref the current slice will be stored.
5ed5d723SBenjamin Maxwell      auto storeRow = rewriter.create<arith::AddIOp>(loc, sliceIndex,
5ed5d723SBenjamin Maxwell                                                     writeOp.getIndices()[0]);
5ed5d723SBenjamin Maxwell      auto storeCol =
5ed5d723SBenjamin Maxwell          rewriter.create<arith::AddIOp>(loc, tileCol, writeOp.getIndices()[1]);
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell      // Extract the mask for the current slice.
5ed5d723SBenjamin Maxwell      Value sliceMask = nullptr;
5ed5d723SBenjamin Maxwell      if (mask) {
5ed5d723SBenjamin Maxwell        sliceMask = rewriter.create<vector::ExtractOp>(
5ed5d723SBenjamin Maxwell            loc, mask, OpFoldResult(sliceIndex));
5ed5d723SBenjamin Maxwell        if (sliceMaskType != sliceMask.getType())
5ed5d723SBenjamin Maxwell          sliceMask = rewriter.create<vector::ScalableExtractOp>(
5ed5d723SBenjamin Maxwell              loc, sliceMaskType, sliceMask, smeTile.col);
5ed5d723SBenjamin Maxwell      }
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell      // Extract and store the current slice.
5ed5d723SBenjamin Maxwell      Value tile = inputSMETiles[index];
5ed5d723SBenjamin Maxwell      auto slice =
5ed5d723SBenjamin Maxwell          rewriter.create<vector::ExtractOp>(loc, tile, tileSliceIndex);
5ed5d723SBenjamin Maxwell      rewriter.create<vector::TransferWriteOp>(
5ed5d723SBenjamin Maxwell          loc, slice, writeOp.getSource(), ValueRange{storeRow, storeCol},
5ed5d723SBenjamin Maxwell          AffineMapAttr::get(writeOp.getPermutationMap().dropResult(0)),
5ed5d723SBenjamin Maxwell          sliceMask,
5ed5d723SBenjamin Maxwell          rewriter.getBoolArrayAttr(
5ed5d723SBenjamin Maxwell              ArrayRef<bool>(writeOp.getInBoundsValues()).drop_front()));
5ed5d723SBenjamin Maxwell    }
5ed5d723SBenjamin Maxwell
5ed5d723SBenjamin Maxwell    rewriter.eraseOp(writeOp);
5ed5d723SBenjamin Maxwell    return success();
5ed5d723SBenjamin Maxwell  }
5ed5d723SBenjamin Maxwell};
5ed5d723SBenjamin Maxwell
c2dea712SBenjamin Maxwell//===----------------------------------------------------------------------===//
c2dea712SBenjamin Maxwell// ArmSME-specific fixup canonicalizations/folds
c2dea712SBenjamin Maxwell//===----------------------------------------------------------------------===//
c2dea712SBenjamin Maxwell
c2dea712SBenjamin Maxwell/// Folds an extract from a 3D `vector.create_mask` (which is a vector of
c2dea712SBenjamin Maxwell/// SME-like masks), into a compare and a 2D `vector.create_mask`. This is
c2dea712SBenjamin Maxwell/// necessary for the mask to be lowered to ArmSME.
c2dea712SBenjamin Maxwell///
c2dea712SBenjamin Maxwell/// Example:
c2dea712SBenjamin Maxwell///
c2dea712SBenjamin Maxwell///  BEFORE:
c2dea712SBenjamin Maxwell///  ```mlir
c2dea712SBenjamin Maxwell///  %mask = vector.create_mask %nonConstantDim, %a, %b : vector<4x[4]x[4]xi1>
c2dea712SBenjamin Maxwell///  %subMask = vector.extract %mask[2]
c2dea712SBenjamin Maxwell///          : vector<[4]x[4]xi1> from vector<4x[4]x[4]xi1>
c2dea712SBenjamin Maxwell///  ```
c2dea712SBenjamin Maxwell///
c2dea712SBenjamin Maxwell///  AFTER:
c2dea712SBenjamin Maxwell///  ```mlir
c2dea712SBenjamin Maxwell///  %extractionInTrueRegion = arith.cmpi slt, %c2, %nonConstantDim : index
c2dea712SBenjamin Maxwell///  %newMaskFrontDim = arith.select %extractionInTrueRegion, %a, %c0 : index
c2dea712SBenjamin Maxwell///  %subMask = vector.create_mask %newMaskFrontDim, %b : vector<[4]x[4]xi1>
c2dea712SBenjamin Maxwell///  ```
c2dea712SBenjamin Maxwellstruct FoldExtractFromVectorOfSMELikeCreateMasks
c2dea712SBenjamin Maxwell    : public OpRewritePattern<vector::ExtractOp> {
c2dea712SBenjamin Maxwell  using OpRewritePattern<vector::ExtractOp>::OpRewritePattern;
c2dea712SBenjamin Maxwell
c2dea712SBenjamin Maxwell  LogicalResult matchAndRewrite(vector::ExtractOp extractOp,
c2dea712SBenjamin Maxwell                                PatternRewriter &rewriter) const override {
c2dea712SBenjamin Maxwell    auto loc = extractOp.getLoc();
c2dea712SBenjamin Maxwell    auto createMaskOp =
c2dea712SBenjamin Maxwell        extractOp.getVector().getDefiningOp<vector::CreateMaskOp>();
c2dea712SBenjamin Maxwell    if (!createMaskOp)
c2dea712SBenjamin Maxwell      return rewriter.notifyMatchFailure(
c2dea712SBenjamin Maxwell          extractOp, "extract not from vector.create_mask op");
c2dea712SBenjamin Maxwell
c2dea712SBenjamin Maxwell    VectorType extractedMaskType =
c2dea712SBenjamin Maxwell        llvm::dyn_cast<VectorType>(extractOp.getResult().getType());
c2dea712SBenjamin Maxwell    if (!extractedMaskType)
c2dea712SBenjamin Maxwell      return rewriter.notifyMatchFailure(extractOp,
c2dea712SBenjamin Maxwell                                         "extracted type is not a vector type");
c2dea712SBenjamin Maxwell
fe07d9aaSAndrzej Warzyński    auto numScalable = extractedMaskType.getNumScalableDims();
c2dea712SBenjamin Maxwell    if (numScalable != 2)
c2dea712SBenjamin Maxwell      return rewriter.notifyMatchFailure(
c2dea712SBenjamin Maxwell          extractOp, "expected extracted type to be an SME-like mask");
c2dea712SBenjamin Maxwell
c2dea712SBenjamin Maxwell    // TODO: Support multiple extraction indices.
c2dea712SBenjamin Maxwell    if (extractOp.getStaticPosition().size() != 1)
c2dea712SBenjamin Maxwell      return rewriter.notifyMatchFailure(
c2dea712SBenjamin Maxwell          extractOp, "only a single extraction index is supported");
c2dea712SBenjamin Maxwell
c2dea712SBenjamin Maxwell    auto frontMaskDim = createMaskOp.getOperand(0);
c2dea712SBenjamin Maxwell    if (frontMaskDim.getDefiningOp<arith::ConstantOp>())
c2dea712SBenjamin Maxwell      return rewriter.notifyMatchFailure(
c2dea712SBenjamin Maxwell          extractOp,
c2dea712SBenjamin Maxwell          "constant vector.create_masks dims should be folded elsewhere");
c2dea712SBenjamin Maxwell
c2dea712SBenjamin Maxwell    auto zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
c2dea712SBenjamin Maxwell    auto extractionIndex = getValueOrCreateConstantIndexOp(
c2dea712SBenjamin Maxwell        rewriter, loc, extractOp.getMixedPosition()[0]);
c2dea712SBenjamin Maxwell    auto extractionInTrueRegion = rewriter.create<arith::CmpIOp>(
c2dea712SBenjamin Maxwell        loc, rewriter.getI1Type(), arith::CmpIPredicate::slt, extractionIndex,
c2dea712SBenjamin Maxwell        frontMaskDim);
c2dea712SBenjamin Maxwell    auto newMaskFrontDim = rewriter.create<arith::SelectOp>(
c2dea712SBenjamin Maxwell        loc, extractionInTrueRegion, createMaskOp.getOperand(1), zero);
c2dea712SBenjamin Maxwell
c2dea712SBenjamin Maxwell    rewriter.replaceOpWithNewOp<vector::CreateMaskOp>(
c2dea712SBenjamin Maxwell        extractOp, extractedMaskType,
c2dea712SBenjamin Maxwell        ValueRange{newMaskFrontDim, createMaskOp.getOperand(2)});
c2dea712SBenjamin Maxwell    return success();
c2dea712SBenjamin Maxwell  }
c2dea712SBenjamin Maxwell};
c2dea712SBenjamin Maxwell
d1fc59c3SBenjamin Maxwell/// A vector type where no fixed dimension comes after a scalable dimension.
d1fc59c3SBenjamin Maxwellbool isLegalVectorType(VectorType vType) {
d1fc59c3SBenjamin Maxwell  bool seenFixedDim = false;
d1fc59c3SBenjamin Maxwell  for (bool scalableFlag : llvm::reverse(vType.getScalableDims())) {
d1fc59c3SBenjamin Maxwell    seenFixedDim |= !scalableFlag;
d1fc59c3SBenjamin Maxwell    if (seenFixedDim && scalableFlag)
d1fc59c3SBenjamin Maxwell      return false;
d1fc59c3SBenjamin Maxwell  }
d1fc59c3SBenjamin Maxwell  return true;
d1fc59c3SBenjamin Maxwell}
d1fc59c3SBenjamin Maxwell
0473e322SBenjamin Maxwell/// Lifts an illegal vector.transpose and vector.transfer_read to a
0473e322SBenjamin Maxwell/// memref.subview + memref.transpose, followed by a legal read.
0473e322SBenjamin Maxwell///
0473e322SBenjamin Maxwell/// 'Illegal' here means a leading scalable dimension and a fixed trailing
0473e322SBenjamin Maxwell/// dimension, which has no valid lowering.
0473e322SBenjamin Maxwell///
0473e322SBenjamin Maxwell/// The memref.transpose is metadata-only transpose that produces a strided
0473e322SBenjamin Maxwell/// memref, which eventually becomes a loop reading individual elements.
0473e322SBenjamin Maxwell///
0473e322SBenjamin Maxwell/// Example:
0473e322SBenjamin Maxwell///
0473e322SBenjamin Maxwell///  BEFORE:
0473e322SBenjamin Maxwell///  ```mlir
0473e322SBenjamin Maxwell///  %illegalRead = vector.transfer_read %memref[%a, %b]
0473e322SBenjamin Maxwell///                  : memref<?x?xf32>, vector<[8]x4xf32>
0473e322SBenjamin Maxwell///  %legalType = vector.transpose %illegalRead, [1, 0]
0473e322SBenjamin Maxwell///                  : vector<[8]x4xf32> to vector<4x[8]xf32>
0473e322SBenjamin Maxwell///  ```
0473e322SBenjamin Maxwell///
0473e322SBenjamin Maxwell///  AFTER:
0473e322SBenjamin Maxwell///  ```mlir
0473e322SBenjamin Maxwell///  %readSubview = memref.subview %memref[%a, %b] [%c8_vscale, %c4] [%c1, %c1]
0473e322SBenjamin Maxwell///                  : memref<?x?xf32> to memref<?x?xf32>
0473e322SBenjamin Maxwell///  %transpose = memref.transpose %readSubview (d0, d1) -> (d1, d0)
0473e322SBenjamin Maxwell///                  : memref<?x?xf32> to memref<?x?xf32>
0473e322SBenjamin Maxwell///  %legalType = vector.transfer_read %transpose[%c0, %c0]
0473e322SBenjamin Maxwell///                  : memref<?x?xf32>, vector<4x[8]xf32>
0473e322SBenjamin Maxwell///  ```
0473e322SBenjamin Maxwellstruct LiftIllegalVectorTransposeToMemory
0473e322SBenjamin Maxwell    : public OpRewritePattern<vector::TransposeOp> {
0473e322SBenjamin Maxwell  using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell  static Value getExtensionSource(Operation *op) {
8cfb7161SBenjamin Maxwell    if (isa_and_present<arith::ExtSIOp, arith::ExtUIOp, arith::ExtFOp>(op))
0473e322SBenjamin Maxwell      return op->getOperand(0);
0473e322SBenjamin Maxwell    return {};
0473e322SBenjamin Maxwell  }
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell  LogicalResult matchAndRewrite(vector::TransposeOp transposeOp,
0473e322SBenjamin Maxwell                                PatternRewriter &rewriter) const override {
0473e322SBenjamin Maxwell    auto sourceType = transposeOp.getSourceVectorType();
0473e322SBenjamin Maxwell    auto resultType = transposeOp.getResultVectorType();
d1fc59c3SBenjamin Maxwell    if (isLegalVectorType(sourceType) || !isLegalVectorType(resultType))
d1fc59c3SBenjamin Maxwell      return rewriter.notifyMatchFailure(transposeOp,
d1fc59c3SBenjamin Maxwell                                         kMatchFailureNotIllegalToLegal);
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell    // Look through extend for transfer_read.
0473e322SBenjamin Maxwell    Value maybeRead = transposeOp.getVector();
0473e322SBenjamin Maxwell    auto *transposeSourceOp = maybeRead.getDefiningOp();
0473e322SBenjamin Maxwell    Operation *extendOp = nullptr;
0473e322SBenjamin Maxwell    if (Value extendSource = getExtensionSource(transposeSourceOp)) {
0473e322SBenjamin Maxwell      maybeRead = extendSource;
0473e322SBenjamin Maxwell      extendOp = transposeSourceOp;
0473e322SBenjamin Maxwell    }
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell    auto illegalRead = maybeRead.getDefiningOp<vector::TransferReadOp>();
0473e322SBenjamin Maxwell    if (!illegalRead)
0473e322SBenjamin Maxwell      return rewriter.notifyMatchFailure(
0473e322SBenjamin Maxwell          transposeOp,
0473e322SBenjamin Maxwell          "expected source to be (possibly extended) transfer_read");
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell    if (!illegalRead.getPermutationMap().isIdentity())
0473e322SBenjamin Maxwell      return rewriter.notifyMatchFailure(
0473e322SBenjamin Maxwell          illegalRead, "expected read to have identity permutation map");
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell    auto loc = transposeOp.getLoc();
0473e322SBenjamin Maxwell    auto zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
0473e322SBenjamin Maxwell    auto one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell    // Create a subview that matches the size of the illegal read vector type.
0473e322SBenjamin Maxwell    auto readType = illegalRead.getVectorType();
0473e322SBenjamin Maxwell    auto readSizes = llvm::map_to_vector(
0473e322SBenjamin Maxwell        llvm::zip_equal(readType.getShape(), readType.getScalableDims()),
0473e322SBenjamin Maxwell        [&](auto dim) -> Value {
0473e322SBenjamin Maxwell          auto [size, isScalable] = dim;
0473e322SBenjamin Maxwell          auto dimSize = rewriter.create<arith::ConstantIndexOp>(loc, size);
0473e322SBenjamin Maxwell          if (!isScalable)
0473e322SBenjamin Maxwell            return dimSize;
0473e322SBenjamin Maxwell          auto vscale = rewriter.create<vector::VectorScaleOp>(loc);
0473e322SBenjamin Maxwell          return rewriter.create<arith::MulIOp>(loc, vscale, dimSize);
0473e322SBenjamin Maxwell        });
0473e322SBenjamin Maxwell    SmallVector<Value> strides(readType.getRank(), Value(one));
0473e322SBenjamin Maxwell    auto readSubview = rewriter.create<memref::SubViewOp>(
0473e322SBenjamin Maxwell        loc, illegalRead.getSource(), illegalRead.getIndices(), readSizes,
0473e322SBenjamin Maxwell        strides);
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell    // Apply the transpose to all values/attributes of the transfer_read:
0473e322SBenjamin Maxwell    // - The mask
0473e322SBenjamin Maxwell    Value mask = illegalRead.getMask();
0473e322SBenjamin Maxwell    if (mask) {
0473e322SBenjamin Maxwell      // Note: The transpose for the mask should fold into the
0473e322SBenjamin Maxwell      // vector.create_mask/constant_mask op, which will then become legal.
0473e322SBenjamin Maxwell      mask = rewriter.create<vector::TransposeOp>(loc, mask,
0473e322SBenjamin Maxwell                                                  transposeOp.getPermutation());
0473e322SBenjamin Maxwell    }
0473e322SBenjamin Maxwell    // - The source memref
0473e322SBenjamin Maxwell    mlir::AffineMap transposeMap = AffineMap::getPermutationMap(
0473e322SBenjamin Maxwell        transposeOp.getPermutation(), getContext());
0473e322SBenjamin Maxwell    auto transposedSubview = rewriter.create<memref::TransposeOp>(
0473e322SBenjamin Maxwell        loc, readSubview, AffineMapAttr::get(transposeMap));
0473e322SBenjamin Maxwell    ArrayAttr inBoundsAttr = illegalRead.getInBoundsAttr();
0473e322SBenjamin Maxwell    // - The `in_bounds` attribute
0473e322SBenjamin Maxwell    if (inBoundsAttr) {
0473e322SBenjamin Maxwell      SmallVector<Attribute> inBoundsValues(inBoundsAttr.begin(),
0473e322SBenjamin Maxwell                                            inBoundsAttr.end());
0473e322SBenjamin Maxwell      applyPermutationToVector(inBoundsValues, transposeOp.getPermutation());
0473e322SBenjamin Maxwell      inBoundsAttr = rewriter.getArrayAttr(inBoundsValues);
0473e322SBenjamin Maxwell    }
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell    VectorType legalReadType = resultType.clone(readType.getElementType());
0473e322SBenjamin Maxwell    // Note: The indices are all zero as the subview is already offset.
0473e322SBenjamin Maxwell    SmallVector<Value> readIndices(illegalRead.getIndices().size(), zero);
0473e322SBenjamin Maxwell    auto legalRead = rewriter.create<vector::TransferReadOp>(
0473e322SBenjamin Maxwell        loc, legalReadType, transposedSubview, readIndices,
0473e322SBenjamin Maxwell        illegalRead.getPermutationMapAttr(), illegalRead.getPadding(), mask,
0473e322SBenjamin Maxwell        inBoundsAttr);
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell    // Replace the transpose with the new read, extending the result if
0473e322SBenjamin Maxwell    // necessary.
0473e322SBenjamin Maxwell    rewriter.replaceOp(transposeOp, [&]() -> Operation * {
0473e322SBenjamin Maxwell      if (extendOp)
0473e322SBenjamin Maxwell        return rewriter.create(loc, extendOp->getName().getIdentifier(),
0473e322SBenjamin Maxwell                               Value(legalRead), resultType);
0473e322SBenjamin Maxwell      return legalRead;
0473e322SBenjamin Maxwell    }());
0473e322SBenjamin Maxwell
0473e322SBenjamin Maxwell    return success();
0473e322SBenjamin Maxwell  }
0473e322SBenjamin Maxwell};
0473e322SBenjamin Maxwell
d1fc59c3SBenjamin Maxwell/// A rewrite to turn unit dim transpose-like vector.shape_casts into
d1fc59c3SBenjamin Maxwell/// vector.transposes. The shape_cast has to be from an illegal vector type to a
d1fc59c3SBenjamin Maxwell/// legal one (as defined by isLegalVectorType).
d1fc59c3SBenjamin Maxwell///
d1fc59c3SBenjamin Maxwell/// The reasoning for this is if we've got to this pass and we still have
d1fc59c3SBenjamin Maxwell/// shape_casts of illegal types, then they likely will not cancel out. Turning
d1fc59c3SBenjamin Maxwell/// them into transposes gives LiftIllegalVectorTransposeToMemory a chance to
d1fc59c3SBenjamin Maxwell/// eliminate them.
d1fc59c3SBenjamin Maxwell///
d1fc59c3SBenjamin Maxwell/// Example:
d1fc59c3SBenjamin Maxwell///
d1fc59c3SBenjamin Maxwell///  BEFORE:
d1fc59c3SBenjamin Maxwell///  ```mlir
d1fc59c3SBenjamin Maxwell///  %0 = vector.shape_cast %a : vector<[4]x1xf32> to vector<1x[4]xf32>
d1fc59c3SBenjamin Maxwell///  ```
d1fc59c3SBenjamin Maxwell///
d1fc59c3SBenjamin Maxwell///  AFTER:
d1fc59c3SBenjamin Maxwell///  ```mlir
d1fc59c3SBenjamin Maxwell///  %0 = vector.transpose %0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32>
d1fc59c3SBenjamin Maxwell///  ```
d1fc59c3SBenjamin Maxwellstruct ConvertIllegalShapeCastOpsToTransposes
d1fc59c3SBenjamin Maxwell    : public OpRewritePattern<vector::ShapeCastOp> {
d1fc59c3SBenjamin Maxwell  using OpRewritePattern<vector::ShapeCastOp>::OpRewritePattern;
d1fc59c3SBenjamin Maxwell
d1fc59c3SBenjamin Maxwell  LogicalResult matchAndRewrite(vector::ShapeCastOp shapeCastOp,
d1fc59c3SBenjamin Maxwell                                PatternRewriter &rewriter) const override {
d1fc59c3SBenjamin Maxwell    auto sourceType = shapeCastOp.getSourceVectorType();
d1fc59c3SBenjamin Maxwell    auto resultType = shapeCastOp.getResultVectorType();
d1fc59c3SBenjamin Maxwell    if (isLegalVectorType(sourceType) || !isLegalVectorType(resultType))
d1fc59c3SBenjamin Maxwell      return rewriter.notifyMatchFailure(shapeCastOp,
d1fc59c3SBenjamin Maxwell                                         kMatchFailureNotIllegalToLegal);
d1fc59c3SBenjamin Maxwell
d1fc59c3SBenjamin Maxwell    // Note: If we know that `sourceType` is an illegal vector type (and 2D)
d1fc59c3SBenjamin Maxwell    // then dim 0 is scalable and dim 1 is fixed.
d1fc59c3SBenjamin Maxwell    if (sourceType.getRank() != 2 || sourceType.getDimSize(1) != 1)
d1fc59c3SBenjamin Maxwell      return rewriter.notifyMatchFailure(
d1fc59c3SBenjamin Maxwell          shapeCastOp, "expected source to be a 2D scalable vector with a "
d1fc59c3SBenjamin Maxwell                       "trailing unit dim");
d1fc59c3SBenjamin Maxwell
d1fc59c3SBenjamin Maxwell    auto loc = shapeCastOp.getLoc();
d1fc59c3SBenjamin Maxwell    auto transpose = rewriter.create<vector::TransposeOp>(
d1fc59c3SBenjamin Maxwell        loc, shapeCastOp.getSource(), ArrayRef<int64_t>{1, 0});
d1fc59c3SBenjamin Maxwell
d1fc59c3SBenjamin Maxwell    if (resultType.getRank() == 1)
d1fc59c3SBenjamin Maxwell      rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(shapeCastOp, resultType,
d1fc59c3SBenjamin Maxwell                                                       transpose);
d1fc59c3SBenjamin Maxwell    else
d1fc59c3SBenjamin Maxwell      rewriter.replaceOp(shapeCastOp, transpose);
d1fc59c3SBenjamin Maxwell
d1fc59c3SBenjamin Maxwell    return success();
d1fc59c3SBenjamin Maxwell  }
d1fc59c3SBenjamin Maxwell};
d1fc59c3SBenjamin Maxwell
c194bc77SBenjamin Maxwell/// Rewrites an illegal/unsupported SVE transfer_write(transpose) to instead use
c194bc77SBenjamin Maxwell/// the ZA state. This workaround rewrite to support these transposes when ZA is
c194bc77SBenjamin Maxwell/// available.
c194bc77SBenjamin Maxwell///
c194bc77SBenjamin Maxwell/// Example:
c194bc77SBenjamin Maxwell///
c194bc77SBenjamin Maxwell///  BEFORE:
c194bc77SBenjamin Maxwell///  ```mlir
c194bc77SBenjamin Maxwell///  %transpose = vector.transpose %vec, [1, 0]
c194bc77SBenjamin Maxwell///     : vector<2x[4]xf32> to vector<[4]x2xf32>
c194bc77SBenjamin Maxwell///  vector.transfer_write %transpose, %dest[%y, %x]
c194bc77SBenjamin Maxwell///     : vector<[4]x2xf32>,  memref<?x?xf32>
c194bc77SBenjamin Maxwell///  ```
c194bc77SBenjamin Maxwell///
c194bc77SBenjamin Maxwell///  AFTER:
c194bc77SBenjamin Maxwell///  ```mlir
c194bc77SBenjamin Maxwell///   %0 = arm_sme.get_tile : vector<[4]x[4]xf32>
c194bc77SBenjamin Maxwell///   %1 = vector.extract %vec[0] : vector<[4]xf32> from vector<2x[4]xf32>
c194bc77SBenjamin Maxwell///   %2 = vector.insert %1, %0 [0] : vector<[4]xf32> into vector<[4]x[4]xf32>
c194bc77SBenjamin Maxwell///   %3 = vector.extract %vec[1] : vector<[4]xf32> from vector<2x[4]xf32>
c194bc77SBenjamin Maxwell///   %4 = vector.insert %3, %2 [1] : vector<[4]xf32> into vector<[4]x[4]xf32>
c194bc77SBenjamin Maxwell///   %c4_vscale = arith.muli %vscale, %c4 : index
c194bc77SBenjamin Maxwell///   %mask = vector.create_mask %c4_vscale, %c2 : vector<[4]x[4]xi1>
c194bc77SBenjamin Maxwell///   vector.transfer_write %4, %dest[%y, %x], %mask
c194bc77SBenjamin Maxwell///      {permutation_map = affine_map<(d0, d1) -> (d1, d0)>}
c194bc77SBenjamin Maxwell///      : vector<[4]x[4]xf32>, memref<?x?xf32>
c194bc77SBenjamin Maxwell///  ```
c194bc77SBenjamin Maxwell///
c194bc77SBenjamin Maxwell/// Values larger than a single tile are supported via decomposition.
c194bc77SBenjamin Maxwellstruct LowerIllegalTransposeStoreViaZA
c194bc77SBenjamin Maxwell    : public OpRewritePattern<vector::TransferWriteOp> {
c194bc77SBenjamin Maxwell  using OpRewritePattern::OpRewritePattern;
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell  LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
c194bc77SBenjamin Maxwell                                PatternRewriter &rewriter) const override {
c194bc77SBenjamin Maxwell    if (!isSupportedMaskOp(writeOp.getMask()))
c194bc77SBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp,
c194bc77SBenjamin Maxwell                                         kMatchFailureUnsupportedMaskOp);
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    auto permutationMap = writeOp.getPermutationMap();
c194bc77SBenjamin Maxwell    if (!permutationMap.isIdentity())
c194bc77SBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp,
c194bc77SBenjamin Maxwell                                         kMatchFailureNonPermutationMap);
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    auto transposeOp = writeOp.getVector().getDefiningOp<vector::TransposeOp>();
c194bc77SBenjamin Maxwell    if (!transposeOp)
c194bc77SBenjamin Maxwell      return failure();
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    auto sourceType = transposeOp.getSourceVectorType();
c194bc77SBenjamin Maxwell    auto resultType = transposeOp.getResultVectorType();
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    if (resultType.getRank() != 2)
c194bc77SBenjamin Maxwell      return rewriter.notifyMatchFailure(transposeOp, "TransposeOp not rank 2");
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    if (!isLegalVectorType(sourceType) || isLegalVectorType(resultType))
c194bc77SBenjamin Maxwell      return rewriter.notifyMatchFailure(
c194bc77SBenjamin Maxwell          transposeOp, "not illegal/unsupported SVE transpose");
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    auto smeTileType = getSMETileTypeForElement(resultType.getElementType());
c194bc77SBenjamin Maxwell    VectorType smeSliceType = VectorType::Builder(smeTileType).dropDim(0);
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    if (sourceType.getDimSize(0) <= 1 ||
c194bc77SBenjamin Maxwell        sourceType.getDimSize(1) % smeSliceType.getDimSize(0) != 0)
c194bc77SBenjamin Maxwell      return rewriter.notifyMatchFailure(writeOp, "unsupported source shape");
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    auto loc = writeOp.getLoc();
c194bc77SBenjamin Maxwell    auto createVscaleMultiple =
c194bc77SBenjamin Maxwell        vector::makeVscaleConstantBuilder(rewriter, loc);
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    auto transposeMap = AffineMapAttr::get(
c194bc77SBenjamin Maxwell        AffineMap::getPermutationMap(ArrayRef<int64_t>{1, 0}, getContext()));
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    // Note: We need to use `get_tile` as there's no vector-level `undef`.
c194bc77SBenjamin Maxwell    Value undefTile = rewriter.create<arm_sme::GetTileOp>(loc, smeTileType);
c194bc77SBenjamin Maxwell    Value destTensorOrMemref = writeOp.getSource();
c194bc77SBenjamin Maxwell    auto numSlicesPerTile =
c194bc77SBenjamin Maxwell        std::min(sourceType.getDimSize(0), smeTileType.getDimSize(0));
c194bc77SBenjamin Maxwell    auto numSlices =
c194bc77SBenjamin Maxwell        rewriter.create<arith::ConstantIndexOp>(loc, numSlicesPerTile);
c194bc77SBenjamin Maxwell    for (auto [index, smeTile] : llvm::enumerate(
c194bc77SBenjamin Maxwell             decomposeToSMETiles(rewriter, sourceType, smeTileType))) {
c194bc77SBenjamin Maxwell      // 1. _Deliberately_ drop a scalable dimension and insert a fixed number
c194bc77SBenjamin Maxwell      // of slices from the source type into the SME tile. Without checking
c194bc77SBenjamin Maxwell      // vscale (and emitting multiple implementations) we can't make use of the
c194bc77SBenjamin Maxwell      // rows of the tile after 1*vscale rows.
c194bc77SBenjamin Maxwell      Value tile = undefTile;
c194bc77SBenjamin Maxwell      for (int d = 0; d < numSlicesPerTile; ++d) {
c194bc77SBenjamin Maxwell        Value vector = rewriter.create<vector::ExtractOp>(
c194bc77SBenjamin Maxwell            loc, transposeOp.getVector(),
c194bc77SBenjamin Maxwell            rewriter.getIndexAttr(d + smeTile.row));
c194bc77SBenjamin Maxwell        if (vector.getType() != smeSliceType) {
c194bc77SBenjamin Maxwell          vector = rewriter.create<vector::ScalableExtractOp>(
c194bc77SBenjamin Maxwell              loc, smeSliceType, vector, smeTile.col);
c194bc77SBenjamin Maxwell        }
c194bc77SBenjamin Maxwell        tile = rewriter.create<vector::InsertOp>(loc, vector, tile, d);
c194bc77SBenjamin Maxwell      }
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell      // 2. Transpose the tile position.
c194bc77SBenjamin Maxwell      auto transposedRow = createVscaleMultiple(smeTile.col);
c194bc77SBenjamin Maxwell      auto transposedCol =
c194bc77SBenjamin Maxwell          rewriter.create<arith::ConstantIndexOp>(loc, smeTile.row);
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell      // 3. Compute mask for tile store.
c194bc77SBenjamin Maxwell      Value maskRows;
c194bc77SBenjamin Maxwell      Value maskCols;
c194bc77SBenjamin Maxwell      if (auto mask = writeOp.getMask()) {
c194bc77SBenjamin Maxwell        auto createMask = mask.getDefiningOp<vector::CreateMaskOp>();
c194bc77SBenjamin Maxwell        maskRows = rewriter.create<arith::SubIOp>(loc, createMask.getOperand(0),
c194bc77SBenjamin Maxwell                                                  transposedRow);
c194bc77SBenjamin Maxwell        maskCols = rewriter.create<arith::SubIOp>(loc, createMask.getOperand(1),
c194bc77SBenjamin Maxwell                                                  transposedCol);
c194bc77SBenjamin Maxwell        maskCols = rewriter.create<index::MinSOp>(loc, maskCols, numSlices);
c194bc77SBenjamin Maxwell      } else {
c194bc77SBenjamin Maxwell        maskRows = createVscaleMultiple(smeTileType.getDimSize(0));
c194bc77SBenjamin Maxwell        maskCols = numSlices;
c194bc77SBenjamin Maxwell      }
c194bc77SBenjamin Maxwell      auto subMask = rewriter.create<vector::CreateMaskOp>(
c194bc77SBenjamin Maxwell          loc, smeTileType.clone(rewriter.getI1Type()),
c194bc77SBenjamin Maxwell          ValueRange{maskRows, maskCols});
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell      // 4. Emit a transposed tile write.
c194bc77SBenjamin Maxwell      auto writeIndices = writeOp.getIndices();
c194bc77SBenjamin Maxwell      Value destRow =
c194bc77SBenjamin Maxwell          rewriter.create<arith::AddIOp>(loc, transposedRow, writeIndices[0]);
c194bc77SBenjamin Maxwell      Value destCol =
c194bc77SBenjamin Maxwell          rewriter.create<arith::AddIOp>(loc, transposedCol, writeIndices[1]);
c194bc77SBenjamin Maxwell      auto smeWrite = rewriter.create<vector::TransferWriteOp>(
c194bc77SBenjamin Maxwell          loc, tile, destTensorOrMemref, ValueRange{destRow, destCol},
c194bc77SBenjamin Maxwell          transposeMap, subMask, writeOp.getInBounds());
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell      if (writeOp.hasPureTensorSemantics())
c194bc77SBenjamin Maxwell        destTensorOrMemref = smeWrite.getResult();
c194bc77SBenjamin Maxwell    }
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    if (writeOp.hasPureTensorSemantics())
c194bc77SBenjamin Maxwell      rewriter.replaceOp(writeOp, destTensorOrMemref);
c194bc77SBenjamin Maxwell    else
c194bc77SBenjamin Maxwell      rewriter.eraseOp(writeOp);
c194bc77SBenjamin Maxwell
c194bc77SBenjamin Maxwell    return success();
c194bc77SBenjamin Maxwell  }
c194bc77SBenjamin Maxwell};
c194bc77SBenjamin Maxwell
042800a4SBenjamin Maxwellstruct VectorLegalizationPass
042800a4SBenjamin Maxwell    : public arm_sme::impl::VectorLegalizationBase<VectorLegalizationPass> {
042800a4SBenjamin Maxwell  void runOnOperation() override {
042800a4SBenjamin Maxwell    auto *context = &getContext();
8c4bc1e7SMatthias Springer    TypeConverter converter;
042800a4SBenjamin Maxwell    RewritePatternSet patterns(context);
042800a4SBenjamin Maxwell    converter.addConversion([](Type type) { return type; });
042800a4SBenjamin Maxwell    converter.addConversion(
042800a4SBenjamin Maxwell        [](VectorType vectorType,
042800a4SBenjamin Maxwell           SmallVectorImpl<Type> &types) -> std::optional<LogicalResult> {
042800a4SBenjamin Maxwell          if (!isMultipleOfSMETileVectorType(vectorType))
042800a4SBenjamin Maxwell            return std::nullopt;
042800a4SBenjamin Maxwell          auto smeTileCount = getNumberOfSMETilesForVectorType(vectorType);
042800a4SBenjamin Maxwell          auto smeTileType =
042800a4SBenjamin Maxwell              getSMETileTypeForElement(vectorType.getElementType());
042800a4SBenjamin Maxwell          types = SmallVector<Type>(smeTileCount, smeTileType);
042800a4SBenjamin Maxwell          return success();
042800a4SBenjamin Maxwell        });
042800a4SBenjamin Maxwell
31613de9SMatthias Springer    // Apply preprocessing patterns.
31613de9SMatthias Springer    RewritePatternSet rewritePatterns(context);
31613de9SMatthias Springer    rewritePatterns.add<FoldExtractFromVectorOfSMELikeCreateMasks,
d1fc59c3SBenjamin Maxwell                        LiftIllegalVectorTransposeToMemory,
c194bc77SBenjamin Maxwell                        ConvertIllegalShapeCastOpsToTransposes,
fc4485bfSBenjamin Maxwell                        LowerIllegalTransposeStoreViaZA>(context);
31613de9SMatthias Springer    if (failed(
31613de9SMatthias Springer            applyPatternsGreedily(getOperation(), std::move(rewritePatterns))))
31613de9SMatthias Springer      return signalPassFailure();
31613de9SMatthias Springer
5ed5d723SBenjamin Maxwell    // Note: These two patterns are added with a high benefit to ensure:
5ed5d723SBenjamin Maxwell    //  - Masked outer products are handled before unmasked ones
5ed5d723SBenjamin Maxwell    //  - Multi-tile writes are lowered as a store loop (if possible)
5ed5d723SBenjamin Maxwell    patterns.add<LegalizeMaskedVectorOuterProductOpsByDecomposition,
5ed5d723SBenjamin Maxwell                 LegalizeMultiTileTransferWriteAsStoreLoop>(converter, context,
5ed5d723SBenjamin Maxwell                                                            /*benefit=*/1024);
dadcaf82SBenjamin Maxwell    patterns.add<LegalizeArithConstantOpsByDecomposition,
dadcaf82SBenjamin Maxwell                 LegalizeVectorOuterProductOpsByDecomposition,
042800a4SBenjamin Maxwell                 LegalizeTransferReadOpsByDecomposition,
042800a4SBenjamin Maxwell                 LegalizeTransferWriteOpsByDecomposition>(converter, context);
31613de9SMatthias Springer    populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(patterns,
31613de9SMatthias Springer                                                                   converter);
31613de9SMatthias Springer    populateCallOpTypeConversionPattern(patterns, converter);
31613de9SMatthias Springer    populateReturnOpTypeConversionPattern(patterns, converter);
31613de9SMatthias Springer    scf::populateSCFStructuralTypeConversions(converter, patterns);
042800a4SBenjamin Maxwell
31613de9SMatthias Springer    ConversionTarget target(getContext());
31613de9SMatthias Springer    target.markUnknownOpDynamicallyLegal(
31613de9SMatthias Springer        [&](Operation *op) { return converter.isLegal(op); });
31613de9SMatthias Springer    target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) {
31613de9SMatthias Springer      return converter.isSignatureLegal(op.getFunctionType());
31613de9SMatthias Springer    });
31613de9SMatthias Springer    if (failed(applyPartialConversion(getOperation(), target,
042800a4SBenjamin Maxwell                                      std::move(patterns))))
042800a4SBenjamin Maxwell      return signalPassFailure();
042800a4SBenjamin Maxwell  }
042800a4SBenjamin Maxwell};
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwell} // namespace
042800a4SBenjamin Maxwell
042800a4SBenjamin Maxwellstd::unique_ptr<Pass> mlir::arm_sme::createVectorLegalizationPass() {
042800a4SBenjamin Maxwell  return std::make_unique<VectorLegalizationPass>();
042800a4SBenjamin Maxwell}