xref: /llvm-project/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp (revision 165f45354ae51bd00fe9000afbdcc4405e360b02)
//===- DataLayoutPropagation.cpp -----------------------------------------===//
20f297cadSHanhan Wang //
30f297cadSHanhan Wang // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40f297cadSHanhan Wang // See https://llvm.org/LICENSE.txt for license information.
50f297cadSHanhan Wang // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60f297cadSHanhan Wang //
70f297cadSHanhan Wang //===----------------------------------------------------------------------===//
80f297cadSHanhan Wang 
90f297cadSHanhan Wang #include "mlir/Dialect/Linalg/Passes.h"
100f297cadSHanhan Wang 
110f297cadSHanhan Wang #include "mlir/Dialect/Affine/IR/AffineOps.h"
120f297cadSHanhan Wang #include "mlir/Dialect/Linalg/IR/Linalg.h"
130f297cadSHanhan Wang #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
140f297cadSHanhan Wang #include "mlir/Dialect/Linalg/Utils/Utils.h"
150f297cadSHanhan Wang #include "mlir/Dialect/Tensor/IR/Tensor.h"
160f297cadSHanhan Wang #include "mlir/Dialect/Tensor/Utils/Utils.h"
170f297cadSHanhan Wang #include "mlir/Dialect/Utils/IndexingUtils.h"
181c228026SLorenzo Chelini #include "mlir/IR/Dominance.h"
190f297cadSHanhan Wang #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
20a945f55dSAdam Siemieniuk #include "llvm/ADT/SetOperations.h"
21a945f55dSAdam Siemieniuk #include "llvm/ADT/SetVector.h"
220c1c0d53SJerry Wu #include "llvm/ADT/TypeSwitch.h"
23d38d6065SHanhan Wang #include "llvm/Support/Debug.h"
24a1fe1f5fSKazu Hirata #include <optional>
250f297cadSHanhan Wang 
260f297cadSHanhan Wang namespace mlir {
270f297cadSHanhan Wang #define GEN_PASS_DEF_LINALGDATALAYOUTPROPAGATION
280f297cadSHanhan Wang #include "mlir/Dialect/Linalg/Passes.h.inc"
290f297cadSHanhan Wang } // namespace mlir
300f297cadSHanhan Wang 
310f297cadSHanhan Wang using namespace mlir;
320f297cadSHanhan Wang using namespace mlir::linalg;
330f297cadSHanhan Wang 
340f297cadSHanhan Wang #define DEBUG_TYPE "linalg-data-layout-propagation"
350f297cadSHanhan Wang 
360f297cadSHanhan Wang namespace {
370f297cadSHanhan Wang 
38b4563ee1SQuinn Dawkins static bool hasGatherSemantics(linalg::GenericOp genericOp) {
39b4563ee1SQuinn Dawkins   for (Operation &op : genericOp.getBody()->getOperations())
40b4563ee1SQuinn Dawkins     if (isa<tensor::ExtractOp, linalg::IndexOp>(op))
41b4563ee1SQuinn Dawkins       return true;
42b4563ee1SQuinn Dawkins   return false;
43b4563ee1SQuinn Dawkins }
44b4563ee1SQuinn Dawkins 
// The struct contains the information about mapping packing information to
// the iteration domain of Linalg ops.
struct PackInfo {
  // Number of inner (point) loops appended to the iteration domain; one per
  // entry in `tileToPointMapping`.
  int64_t getNumTiledLoops() const { return tileToPointMapping.size(); };
  // InnerDimsPos on iteration domain, which follows the order in pack ops.
  SmallVector<int64_t> tiledDimsPos;
  // The sizes of tiling data dimensions on iteration domain.
  llvm::DenseMap<int64_t, OpFoldResult> domainDimAndTileMapping;
  // The mapping from a dimension of iteration domain to the corresponding inner
  // tiling dimension on iteration domain.
  llvm::DenseMap<int64_t, int64_t> tileToPointMapping;
  // The permutation of outer dims (on domain).
  SmallVector<int64_t> outerDimsOnDomainPerm;
};
59d38d6065SHanhan Wang 
/// Builds a PackInfo describing how the pack/unpack op attached to `opOperand`
/// maps onto the iteration domain of `genericOp`. Returns failure when the
/// packing cannot be expressed on the domain: a tiled result expression is not
/// a plain AffineDimExpr, a tiled loop is not parallel, a tiled dim appears in
/// some indexing map as a non-dim expression, or the outer permutation would
/// move a non-dim expression.
template <typename OpTy>
static FailureOr<PackInfo>
getPackingInfoFromOperand(OpOperand *opOperand, linalg::GenericOp genericOp,
                          OpTy packOrUnPackOp) {
  static_assert(llvm::is_one_of<OpTy, tensor::PackOp, tensor::UnPackOp>::value,
                "applies to only pack or unpack operations");
  LLVM_DEBUG(
      { llvm::dbgs() << "--- Construct PackInfo From an operand ---\n"; });

  AffineMap indexingMap = genericOp.getMatchingIndexingMap(opOperand);
  SmallVector<AffineMap> indexingMaps = genericOp.getIndexingMapsArray();
  SmallVector<utils::IteratorType> iterators =
      genericOp.getIteratorTypesArray();

  PackInfo packInfo;
  int64_t origNumDims = indexingMap.getNumDims();
  SmallVector<AffineExpr> exprs(indexingMap.getResults());
  ArrayRef<int64_t> innerDimsPos = packOrUnPackOp.getInnerDimsPos();
  // For each inner tile, resolve the iteration-domain dim it tiles. The new
  // point loop for tile `index` is appended after the original loops, i.e. at
  // position `origNumDims + index`.
  for (auto [index, innerDimPos, tileSize] :
       llvm::zip_equal(llvm::seq<unsigned>(0, innerDimsPos.size()),
                       innerDimsPos, packOrUnPackOp.getMixedTiles())) {
    auto expr = exprs[innerDimPos];
    if (!isa<AffineDimExpr>(expr))
      return failure();
    int64_t domainDimPos =
        cast<AffineDimExpr>(exprs[innerDimPos]).getPosition();
    // Only parallel loops can be tiled by the pack this way.
    if (!isParallelIterator(iterators[domainDimPos]))
      return failure();
    packInfo.tiledDimsPos.push_back(domainDimPos);
    packInfo.domainDimAndTileMapping[domainDimPos] = tileSize;
    packInfo.tileToPointMapping[domainDimPos] = origNumDims + index;
    LLVM_DEBUG({
      llvm::dbgs() << "map innerDimPos=" << innerDimPos
                   << " to iteration dimension (d" << domainDimPos << ", d"
                   << packInfo.tileToPointMapping[domainDimPos]
                   << "), which has size=("
                   << packInfo.domainDimAndTileMapping[domainDimPos] << ")\n";
    });
  }

  // Bail out if a tiled dimension is present in a map but not as an affine dim
  // expression.
  auto areAllAffineDimExpr = [&](int dim) {
    for (AffineMap map : indexingMaps) {
      if (llvm::any_of(map.getResults(), [dim](AffineExpr expr) {
            return expr.isFunctionOfDim(dim) && !isa<AffineDimExpr>(expr);
          })) {
        return false;
      }
    }
    return true;
  };
  for (int64_t i : packInfo.tiledDimsPos)
    if (!areAllAffineDimExpr(i))
      return failure();

  // Get the outer dims perm on the iteration domain. Start by identifying the
  // set of domain dims affected by the outer permutation along with the
  // permuted ordering for those dims. Then the full outer dims permutation can
  // be constructed by replacing the affected dims with the permuted result in a
  // numLoops-rank identity. e.g.
  //   outerDimsPerm = [1, 2, 0]
  //   indexingMap = (d0, d1, d2, d3, d4) -> (d1, d4, d3)
  //
  //   permutedOuterDims =        [4,    3, 1]
  //   outerDimsOnDomainPerm = [0, 4, 2, 3, 1]
  //
  // Non-affine dim expressions must not be permuted by the outer dims
  // permutation.
  SmallVector<int64_t> permutedOuterDims;
  for (auto [index, dim] : llvm::enumerate(packOrUnPackOp.getOuterDimsPerm())) {
    auto permutedExpr = indexingMap.getResult(dim);
    if (auto dimExpr = dyn_cast<AffineDimExpr>(permutedExpr)) {
      permutedOuterDims.push_back(dimExpr.getPosition());
      continue;
    }

    // TODO: Allow propagation with transposes on non affine dim expressions,
    // e.g. d0 + d1 which implies transposing both dims simultaneously while
    // maintaining the relative position between them.
    if (static_cast<int64_t>(index) != dim)
      return failure();
  }
  if (!permutedOuterDims.empty()) {
    // Fill the identity positions for unaffected dims and splice in the
    // permuted domain dims, consuming `permutedOuterDims` in order.
    int64_t outerDimIndex = 0;
    llvm::DenseSet<int64_t> permutedDomainDims(permutedOuterDims.begin(),
                                               permutedOuterDims.end());
    for (int i = 0, e = indexingMap.getNumDims(); i < e; i++)
      packInfo.outerDimsOnDomainPerm.push_back(
          permutedDomainDims.contains(i) ? permutedOuterDims[outerDimIndex++]
                                         : i);
    LLVM_DEBUG({
      llvm::dbgs() << "map outer dimsDimsPerm to ";
      for (auto dim : packInfo.outerDimsOnDomainPerm)
        llvm::dbgs() << dim << " ";
      llvm::dbgs() << "\n";
    });
  }

  return packInfo;
}
161d38d6065SHanhan Wang 
162d7904a70SLorenzo Chelini static SmallVector<int64_t> computeOuterDims(ArrayRef<int64_t> perm,
163d7904a70SLorenzo Chelini                                              ArrayRef<AffineExpr> exprs) {
164d7904a70SLorenzo Chelini   // Compute `outer_dims_perm`. See example:
165d7904a70SLorenzo Chelini   // current exprs      : (d0, d1, d2, d3) -> (d2, d3)
166d7904a70SLorenzo Chelini   // perm               : [0, 3, 1, 2]
167d7904a70SLorenzo Chelini   // First map d2, d3 with their position in the array as:
168d7904a70SLorenzo Chelini   // currentPositionTileLoops: dim | pos
169d7904a70SLorenzo Chelini   //                           d2  | 0
170d7904a70SLorenzo Chelini   //                           d3  | 1
171d7904a70SLorenzo Chelini   // then scan `perm` in order and get the `outer_dims_perm`
172d7904a70SLorenzo Chelini   // to be used, here it would be [1, 0].
173d7904a70SLorenzo Chelini   assert(!perm.empty() && "expect perm not to be empty");
174d7904a70SLorenzo Chelini   assert(!exprs.empty() && "expect exprs not to be empty");
175d7904a70SLorenzo Chelini   if (exprs.size() == 1)
176d7904a70SLorenzo Chelini     return {};
177d7904a70SLorenzo Chelini   SmallVector<int64_t> outerDimsPerm;
178d7904a70SLorenzo Chelini   DenseMap<int64_t, int64_t> currentPositionTileLoops;
179d7904a70SLorenzo Chelini   for (auto [pos, expr] : llvm::enumerate(exprs)) {
180b4563ee1SQuinn Dawkins     // Here we rely on the assumption that the outer dims permutation
181b4563ee1SQuinn Dawkins     // when propagating currently requires that non-affine dim expressions
182b4563ee1SQuinn Dawkins     // are not permuted, thus allowing the identity assignment below.
1831609f1c2Slong.chen     if (auto dimExpr = dyn_cast<AffineDimExpr>(expr))
184b4563ee1SQuinn Dawkins       currentPositionTileLoops[dimExpr.getPosition()] = pos;
185b4563ee1SQuinn Dawkins     else
186b4563ee1SQuinn Dawkins       currentPositionTileLoops[pos] = pos;
187d7904a70SLorenzo Chelini   }
188d7904a70SLorenzo Chelini   for (int64_t loopIdx : perm) {
189d7904a70SLorenzo Chelini     if (currentPositionTileLoops.count(loopIdx))
190d7904a70SLorenzo Chelini       outerDimsPerm.push_back(currentPositionTileLoops.lookup(loopIdx));
191d7904a70SLorenzo Chelini   }
192d7904a70SLorenzo Chelini   return outerDimsPerm;
193d7904a70SLorenzo Chelini }
194d7904a70SLorenzo Chelini 
/// Returns a tuple for packed operand and indexing_map with the assumptions:
///   1) The generic op is the producer of the pack op.
///   2) The generic op has only one result.
/// If the operand is a scalar or packing dimensions are all irrelevant to the
/// operand, the operand and the updated indexing map will be returned.
/// Otherwise, it returns the packed operand and the updated indexing map. E.g.,
///
///   #map0 = affine_map<(d0, d1) -> (d0, d1)>
///   #map1 = affine_map<(d0, d1) -> (d0)>
///   #map2 = affine_map<(d0, d1) -> (d1)>
///   %0 = linalg.generic {indexing_maps = [#map1, #map2, #map0],
///                        iterator_types = ["parallel", "parallel"]}
///      ins(%arg0, %arg1 : tensor<?xf32>, tensor<?xf32>)
///      outs(%init : tensor<?x?xf32>) {
///    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
///      %4 = arith.addf %arg3, %arg4 : f32
///      linalg.yield %4 : f32
///  } -> tensor<?x?xf32>
///  %1 = tensor.pack %0
///    inner_dims_pos = [0, 1]
///    inner_tiles = [8, 2]
///    into %dest : tensor<?x?xf32> -> tensor<?x?x8x2xf32>
///
///  Taking the first input operand as an example, the inner tile size of d1 is
///  8. Thus, the below operation and `affine_map<(d0, d1, d2, d3)> ->
///  affine_map<(d1, d3)>` will be returned.
///
///  %pack = tensor.pack %arg0
///    inner_dims_pos = [0]
///    inner_tiles = [8]
///    into %init : tensor<?xf32> -> tensor<?x8xf32>
static std::tuple<Value, AffineMap>
getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo,
                               GenericOp genericOp, OpOperand *opOperand) {
  // The packed domain has one extra (point) loop per tiled dim, appended
  // after the original loops.
  int64_t numOrigLoops = genericOp.getNumLoops();
  int64_t numInnerLoops = packInfo.getNumTiledLoops();
  int64_t numLoops = numOrigLoops + numInnerLoops;
  AffineMap origIndexingMap = genericOp.getMatchingIndexingMap(opOperand);
  llvm::DenseMap<int64_t, int64_t> domainDimToOperandDim;
  SmallVector<AffineExpr> exprs(origIndexingMap.getResults());

  // If the OpOperand is a scalar or a zero-rank tensor, no need to pack.
  if (genericOp.isScalar(opOperand) || exprs.empty())
    return std::make_tuple(opOperand->get(),
                           AffineMap::get(numLoops, 0, exprs, b.getContext()));

  // Step 1. Construct the information of packing data dimensions; append inner
  // dimensions to the indexing maps for the operand.
  for (auto [index, expr] : llvm::enumerate(exprs)) {
    if (auto dimExpr = dyn_cast<AffineDimExpr>(expr)) {
      int64_t dimPos = dimExpr.getPosition();
      domainDimToOperandDim[dimPos] = index;
      continue;
    }
  }
  // Only tiled dims actually referenced by this operand get an inner tile;
  // the operand's indexing map grows one point-dim expr per such tile.
  SmallVector<int64_t> innerDimsPos;
  SmallVector<OpFoldResult> innerTileSizes;
  for (auto dimPos : packInfo.tiledDimsPos) {
    if (!domainDimToOperandDim.count(dimPos))
      continue;
    int64_t index = domainDimToOperandDim[dimPos];
    innerTileSizes.push_back(packInfo.domainDimAndTileMapping[dimPos]);
    innerDimsPos.push_back(index);
    exprs.push_back(b.getAffineDimExpr(packInfo.tileToPointMapping[dimPos]));
  }

  // Step 2. Handle outer dim permutations.
  SmallVector<int64_t> outerDimsPerm;
  if (!packInfo.outerDimsOnDomainPerm.empty()) {
    outerDimsPerm = computeOuterDims(packInfo.outerDimsOnDomainPerm, exprs);

    // Step 2.1: Fold transpose into the linalg.generic.
    SmallVector<int64_t> inversedOuterPerm =
        invertPermutationVector(packInfo.outerDimsOnDomainPerm);
    for (auto i : llvm::seq<unsigned>(0, origIndexingMap.getNumResults())) {
      if (auto dimExpr = dyn_cast<AffineDimExpr>(exprs[i])) {
        int64_t dimPos = dimExpr.getPosition();
        exprs[i] = b.getAffineDimExpr(inversedOuterPerm[dimPos]);
        continue;
      }
      assert(isa<AffineConstantExpr>(exprs[i]) &&
             "Attempted to permute non-constant and non-affine dim expression");
    }
    // Step 2.2: Undo the transposition on `exprs` and propagate the
    // transposition on the pack using outerDimsPerm.
    if (!outerDimsPerm.empty()) {
      SmallVector<AffineExpr> auxVec = exprs;
      for (const auto &en : enumerate(outerDimsPerm))
        auxVec[en.index()] = exprs[en.value()];
      exprs = auxVec;
    }
  }
  auto indexingMap = AffineMap::get(numLoops, 0, exprs, b.getContext());

  // The operand does not have dimensions that relates to pack op.
  if (innerDimsPos.empty() && outerDimsPerm.empty())
    return std::make_tuple(opOperand->get(), indexingMap);

  // Materialize the pack of this operand into a fresh destination tensor.
  auto empty = tensor::PackOp::createDestinationTensor(
      b, loc, opOperand->get(), innerTileSizes, innerDimsPos, outerDimsPerm);
  auto packedOperand = b.create<tensor::PackOp>(
      loc, opOperand->get(), empty, innerDimsPos, innerTileSizes,
      /*padding=*/std::nullopt, outerDimsPerm);
  return std::make_tuple(packedOperand, indexingMap);
}
3000f297cadSHanhan Wang 
3019f242404SLorenzo Chelini /// Pack a genericOp and return it.
3029f242404SLorenzo Chelini static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp,
3036bb0ab0dSLorenzo Chelini                                Value dest, AffineMap packedOutIndexingMap,
3046bb0ab0dSLorenzo Chelini                                const PackInfo &packInfo) {
3056bb0ab0dSLorenzo Chelini   Location loc = genericOp.getLoc();
3066bb0ab0dSLorenzo Chelini   SmallVector<Value> inputOperands;
3076bb0ab0dSLorenzo Chelini   SmallVector<AffineMap> indexingMaps;
3086bb0ab0dSLorenzo Chelini   for (OpOperand *inputOperand : genericOp.getDpsInputOperands()) {
3096bb0ab0dSLorenzo Chelini     auto [packedOperand, packedIndexingMap] = getOrCreatePackedViewOfOperand(
3106bb0ab0dSLorenzo Chelini         rewriter, loc, packInfo, genericOp, inputOperand);
3116bb0ab0dSLorenzo Chelini     inputOperands.push_back(packedOperand);
3126bb0ab0dSLorenzo Chelini     indexingMaps.push_back(packedIndexingMap);
3136bb0ab0dSLorenzo Chelini   }
3146bb0ab0dSLorenzo Chelini 
3156bb0ab0dSLorenzo Chelini   int64_t numInnerLoops = packInfo.getNumTiledLoops();
3166bb0ab0dSLorenzo Chelini   SmallVector<utils::IteratorType> iterTypes =
3176bb0ab0dSLorenzo Chelini       genericOp.getIteratorTypesArray();
3186bb0ab0dSLorenzo Chelini   iterTypes.append(numInnerLoops, utils::IteratorType::parallel);
3196bb0ab0dSLorenzo Chelini 
3206bb0ab0dSLorenzo Chelini   indexingMaps.push_back(packedOutIndexingMap);
3216bb0ab0dSLorenzo Chelini 
3226bb0ab0dSLorenzo Chelini   auto newGenericOp = rewriter.create<linalg::GenericOp>(
3236bb0ab0dSLorenzo Chelini       loc, dest.getType(), inputOperands, dest, indexingMaps, iterTypes,
3246bb0ab0dSLorenzo Chelini       /*bodyBuild=*/nullptr, linalg::getPrunedAttributeList(genericOp));
3256bb0ab0dSLorenzo Chelini   rewriter.cloneRegionBefore(genericOp.getRegion(), newGenericOp.getRegion(),
3266bb0ab0dSLorenzo Chelini                              newGenericOp.getRegion().begin());
3276bb0ab0dSLorenzo Chelini   return newGenericOp;
3286bb0ab0dSLorenzo Chelini }
3296bb0ab0dSLorenzo Chelini 
/// Bubbles up tensor.pack op through a producer generic op. This
/// swap pack(generic) to generic(pack). The new generic op works on packed
/// domain; pack ops are created for input and output operands. E.g.,
///
///     #map0 = affine_map<(d0, d1) -> (d0, d1)>
///     %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
///     %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
///     %2 = tensor.empty(%0, %1) : tensor<?x?xf32>
///     %3 = linalg.generic {indexing_maps = [#map0, #map0],
///                          iterator_types = ["parallel", "parallel"]}
///         ins(%arg0 : tensor<?x?xf32>)
///         outs(%2 : tensor<?x?xf32>) {
///       ^bb0(%arg3: f32, %arg4: f32):
///         %4 = arith.addf %arg3, %arg3 : f32
///         linalg.yield %4 : f32
///     } -> tensor<?x?xf32>
///     %4 = tensor.pack %3
///       inner_dims_pos = [0, 1]
///       inner_tiles = [8, 2]
///       into %dest : tensor<?x?xf32> -> tensor<?x?x8x2xf32>
///
/// will be converted to
///
///     #map = affine_map<()[s0] -> (s0 ceildiv 8)>
///     #map1 = affine_map<()[s0] -> (s0 ceildiv 2)>
///     #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
///     %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
///     %dim_0 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
///     %0 = affine.apply #map()[%dim]
///     %1 = affine.apply #map1()[%dim_0]
///     %2 = tensor.empty(%0, %1) : tensor<?x?x8x2xf32>
///     %pack = tensor.pack %arg0
///       inner_dims_pos = [0, 1]
///       inner_tiles = [8, 2]
///       into %2 : tensor<?x?xf32> -> tensor<?x?x8x2xf32>
///     %3 = linalg.generic {indexing_maps = [#map2, #map2],
///       iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
///       ins(%pack : tensor<?x?x8x2xf32>)
///       outs(%arg1 : tensor<?x?x8x2xf32>) {
///     ^bb0(%in: f32, %out: f32):
///       %4 = arith.addf %in, %in : f32
///       linalg.yield %4 : f32
///     } -> tensor<?x?x8x2xf32>
static FailureOr<GenericOp>
bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp,
                               const ControlPropagationFn &controlFn) {
  // Only bubble when the pack's source is produced by a generic.
  auto genericOp = packOp.getSource().getDefiningOp<GenericOp>();
  if (!genericOp)
    return failure();

  // User controlled propagation function.
  if (!controlFn(&packOp.getSourceMutable()))
    return failure();

  // TODO: Enable propagation in the presence of linalg.index and
  // tensor.extract, likely as a separate pattern as the pack information and
  // propagation decision needs to be inferred from the region of the generic.
  if (hasGatherSemantics(genericOp))
    return failure();

  // TODO: Relax the restriction. We are able to bubble up the pack op through
  // multi-result generic op. It just needs more work.
  if (genericOp.getNumResults() != 1)
    return failure();

  // Bail-out if the result of the generic has multiple uses, as bubbling up
  // creates recomputation if the generic has multiple users.
  // TODO: Enable the case where every use is an identical pack op as no
  // recomputation is needed in that case.
  if (!genericOp->getResult(0).hasOneUse())
    return failure();

  // We want to move the pack not the generic.
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(genericOp);

  // We need to handle two cases:
  // 1) The tensor.pack destination is a tensor.empty. If this is the case, we
  // create a new tensor.empty to avoid breaking dominance, as we are moving the
  // tensor.pack above the linalg.generic.
  // 2) The destination is not a tensor.empty. In this case we can replace only
  // if the destination of the tensor.pack dominates the linalg.generic.
  Value packOpDest = packOp.getDest();
  if (!packOpDest.hasOneUse())
    return failure();
  if (auto emptyOp = packOpDest.getDefiningOp<tensor::EmptyOp>()) {
    packOpDest = rewriter.create<tensor::EmptyOp>(
        genericOp->getLoc(), emptyOp.getMixedSizes(),
        emptyOp.getType().getElementType());
  } else {
    DominanceInfo dom(genericOp);
    if (!dom.properlyDominates(packOpDest, genericOp))
      return failure();
  }

  // TODO: Add an option for allowing padding values. It could introduce
  // undefined behavior if we unconditionally propagate pack op through all
  // the ops. E.g., if the padding value is zero and there are division ops in
  // a generic op. Some values of padding area could be NaN (0/0).
  if (packOp.getPaddingValue())
    return failure();

  // Map the pack onto the generic's iteration domain via its init operand.
  OpOperand *opOperand = genericOp.getDpsInitOperand(0);
  auto packInfo = getPackingInfoFromOperand(opOperand, genericOp, packOp);
  if (failed(packInfo))
    return failure();

  // Rebuild the indexing map for the corresponding init operand.
  auto [packedOutOperand, packedOutIndexingMap] =
      getOrCreatePackedViewOfOperand(rewriter, genericOp.getLoc(), *packInfo,
                                     genericOp, opOperand);

  // If the dps init operand of the generic is a tensor.empty forward the pack
  // op destination.
  Value dest = packedOutOperand;
  if (auto initTensor = genericOp.getDpsInitOperand(0)
                            ->get()
                            .getDefiningOp<tensor::EmptyOp>()) {
    dest = packOpDest;
  }
  return packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap,
                       *packInfo);
}
4530f297cadSHanhan Wang 
/// Wrapper pattern that applies bubbleUpPackOpThroughGenericOp method.
struct BubbleUpPackOpThroughGenericOpPattern
    : public OpRewritePattern<tensor::PackOp> {
public:
  BubbleUpPackOpThroughGenericOpPattern(MLIRContext *context,
                                        ControlPropagationFn fun)
      : OpRewritePattern<tensor::PackOp>(context), controlFn(std::move(fun)) {}

  LogicalResult matchAndRewrite(tensor::PackOp packOp,
                                PatternRewriter &rewriter) const override {
    auto genericOp =
        bubbleUpPackOpThroughGenericOp(rewriter, packOp, controlFn);
    if (failed(genericOp))
      return failure();
    // The packed generic now yields the packed result; replace the pack op
    // with it.
    rewriter.replaceOp(packOp, genericOp->getResults());
    return success();
  }

private:
  // User-provided callback deciding whether propagation is applied.
  ControlPropagationFn controlFn;
};
4756bb0ab0dSLorenzo Chelini 
476886294a2SQuinn Dawkins /// Propagate a tensor.pack operation up through a tensor.pad. The idea is to
477886294a2SQuinn Dawkins /// add as many zero padding dimensions in `high` and `low` based on the number
478886294a2SQuinn Dawkins /// of point loops.
479886294a2SQuinn Dawkins class BubbleUpPackThroughPadOp final : public OpRewritePattern<tensor::PackOp> {
480886294a2SQuinn Dawkins public:
481886294a2SQuinn Dawkins   BubbleUpPackThroughPadOp(MLIRContext *context, ControlPropagationFn fun)
482886294a2SQuinn Dawkins       : OpRewritePattern<tensor::PackOp>(context), controlFn(std::move(fun)) {}
483886294a2SQuinn Dawkins 
484886294a2SQuinn Dawkins   LogicalResult matchAndRewrite(tensor::PackOp packOp,
485886294a2SQuinn Dawkins                                 PatternRewriter &rewriter) const override {
486886294a2SQuinn Dawkins     auto padOp = packOp.getSource().getDefiningOp<tensor::PadOp>();
487886294a2SQuinn Dawkins     if (!padOp)
488886294a2SQuinn Dawkins       return failure();
489886294a2SQuinn Dawkins 
490886294a2SQuinn Dawkins     // User controlled propagation function.
49104fc471fSHan-Chung Wang     if (!controlFn(&packOp.getSourceMutable()))
492886294a2SQuinn Dawkins       return failure();
493886294a2SQuinn Dawkins 
494886294a2SQuinn Dawkins     // TODO: Enable padding when the padding values are the same.
495886294a2SQuinn Dawkins     if (packOp.getPaddingValue())
496886294a2SQuinn Dawkins       return failure();
497886294a2SQuinn Dawkins 
498886294a2SQuinn Dawkins     // Fail for non-constant padding values. The body of the pad could
499886294a2SQuinn Dawkins     // depend on the padding indices and/or properties of the padded
500886294a2SQuinn Dawkins     // tensor so for now we fail.
501886294a2SQuinn Dawkins     // TODO: Support non-constant padding values.
502886294a2SQuinn Dawkins     Value paddingVal = padOp.getConstantPaddingValue();
503886294a2SQuinn Dawkins     if (!paddingVal)
504886294a2SQuinn Dawkins       return failure();
505886294a2SQuinn Dawkins 
506886294a2SQuinn Dawkins     if (!packOp.getDest().getDefiningOp<tensor::EmptyOp>())
507886294a2SQuinn Dawkins       return failure();
508886294a2SQuinn Dawkins 
509886294a2SQuinn Dawkins     ArrayRef<int64_t> innerDimsPos = packOp.getInnerDimsPos();
510886294a2SQuinn Dawkins 
511886294a2SQuinn Dawkins     // Bail out if one of the padded dimension is a tiled one.
512886294a2SQuinn Dawkins     llvm::SmallBitVector paddedDims = padOp.getPaddedDims();
513886294a2SQuinn Dawkins     llvm::SmallBitVector innerDims(paddedDims.size());
514886294a2SQuinn Dawkins     for (int64_t dim : innerDimsPos)
515886294a2SQuinn Dawkins       innerDims.flip(dim);
516886294a2SQuinn Dawkins     if (paddedDims.anyCommon(innerDims))
517886294a2SQuinn Dawkins       return failure();
518886294a2SQuinn Dawkins 
519886294a2SQuinn Dawkins     Location loc = padOp->getLoc();
520886294a2SQuinn Dawkins     OpBuilder::InsertionGuard guard(rewriter);
521886294a2SQuinn Dawkins     rewriter.setInsertionPoint(padOp);
522886294a2SQuinn Dawkins 
5234ad96785SQuinn Dawkins     ArrayRef<int64_t> outerDimsPerm = packOp.getOuterDimsPerm();
5244ad96785SQuinn Dawkins     SmallVector<OpFoldResult> mixedTiles = packOp.getMixedTiles();
525886294a2SQuinn Dawkins     auto empty = tensor::PackOp::createDestinationTensor(
5264ad96785SQuinn Dawkins         rewriter, loc, padOp.getSource(), mixedTiles, innerDimsPos,
527886294a2SQuinn Dawkins         outerDimsPerm);
5284ad96785SQuinn Dawkins     auto sourcePack = rewriter.create<tensor::PackOp>(
5294ad96785SQuinn Dawkins         loc, padOp.getSource(), empty, innerDimsPos, mixedTiles,
530886294a2SQuinn Dawkins         /*padding=*/std::nullopt, outerDimsPerm);
531886294a2SQuinn Dawkins 
532886294a2SQuinn Dawkins     // If we have `outer_dims_perms` we need to adjust the padded dimensions.
533886294a2SQuinn Dawkins     SmallVector<OpFoldResult> lowPad = padOp.getMixedLowPad();
534886294a2SQuinn Dawkins     SmallVector<OpFoldResult> highPad = padOp.getMixedHighPad();
535886294a2SQuinn Dawkins     if (!outerDimsPerm.empty()) {
536886294a2SQuinn Dawkins       applyPermutationToVector<OpFoldResult>(lowPad, outerDimsPerm);
537886294a2SQuinn Dawkins       applyPermutationToVector<OpFoldResult>(highPad, outerDimsPerm);
538886294a2SQuinn Dawkins     }
539886294a2SQuinn Dawkins     // The tiled dimensions were verified to be unpadded above, so here we
540886294a2SQuinn Dawkins     // just append 0 for the inner tile dimensions.
541886294a2SQuinn Dawkins     size_t pointLoopsSize = innerDimsPos.size();
542886294a2SQuinn Dawkins     lowPad.append(pointLoopsSize, rewriter.getIndexAttr(0));
543886294a2SQuinn Dawkins     highPad.append(pointLoopsSize, rewriter.getIndexAttr(0));
544886294a2SQuinn Dawkins 
545886294a2SQuinn Dawkins     auto newPadOp = rewriter.create<tensor::PadOp>(
5464ad96785SQuinn Dawkins         loc, /*result=*/Type(), sourcePack, lowPad, highPad, paddingVal,
547886294a2SQuinn Dawkins         padOp.getNofold());
5484ad96785SQuinn Dawkins 
5494ad96785SQuinn Dawkins     // If the pad has more than one user, create an unpack on the new pad to
5504ad96785SQuinn Dawkins     // replace the other uses.
5514ad96785SQuinn Dawkins     if (!padOp->hasOneUse()) {
5524ad96785SQuinn Dawkins       auto unpackEmpty = tensor::UnPackOp::createDestinationTensor(
5534ad96785SQuinn Dawkins           rewriter, loc, newPadOp, mixedTiles, innerDimsPos, outerDimsPerm);
5544ad96785SQuinn Dawkins       Value unpackedPad = rewriter.create<tensor::UnPackOp>(
5554ad96785SQuinn Dawkins           loc, newPadOp, unpackEmpty, innerDimsPos, mixedTiles, outerDimsPerm);
5564ad96785SQuinn Dawkins       rewriter.replaceAllUsesExcept(padOp, unpackedPad, sourcePack);
5574ad96785SQuinn Dawkins     }
5584ad96785SQuinn Dawkins 
5594ad96785SQuinn Dawkins     // Replace the pack with the new pad.
560886294a2SQuinn Dawkins     rewriter.replaceOp(packOp, newPadOp.getResult());
5614ad96785SQuinn Dawkins 
562886294a2SQuinn Dawkins     return success();
563886294a2SQuinn Dawkins   }
564886294a2SQuinn Dawkins 
565886294a2SQuinn Dawkins private:
566886294a2SQuinn Dawkins   ControlPropagationFn controlFn;
567886294a2SQuinn Dawkins };
568886294a2SQuinn Dawkins 
5690c1c0d53SJerry Wu /// Project dimsPos to the inner-most non-unit dim pos with reassocIndices.
5700c1c0d53SJerry Wu ///
5710c1c0d53SJerry Wu /// For example, given dimsPos [0, 2], reassocIndices [[0, 1], [2, 3]], and
5720c1c0d53SJerry Wu /// targetShape [16, 16, 32, 1], it returns [1, 2]. Because for pos 0, the
5730c1c0d53SJerry Wu /// inner-most projected dim in pos [0, 1] is 1. And for pos 2, the inner-most
5740c1c0d53SJerry Wu /// non-unit projected dims in pos [2, 3] is 2.
5750c1c0d53SJerry Wu ///
5760c1c0d53SJerry Wu /// If all candidates in a reassociation are unit dims, it chooses the
5770c1c0d53SJerry Wu /// inner-most dim pos.
5780c1c0d53SJerry Wu static SmallVector<int64_t>
5790c1c0d53SJerry Wu projectToInnerMostNonUnitDimsPos(ArrayRef<int64_t> dimsPos,
5800c1c0d53SJerry Wu                                  ArrayRef<ReassociationIndices> reassocIndices,
5810c1c0d53SJerry Wu                                  ArrayRef<int64_t> targetShape) {
5820c1c0d53SJerry Wu   SmallVector<int64_t> projectedDimsPos;
5830c1c0d53SJerry Wu   for (auto pos : dimsPos) {
5840c1c0d53SJerry Wu     // In the case all dims are unit, this will return the inner-most one.
5850c1c0d53SJerry Wu     int64_t projectedPos = reassocIndices[pos].back();
5860c1c0d53SJerry Wu     for (auto i : llvm::reverse(reassocIndices[pos])) {
5870c1c0d53SJerry Wu       int64_t dim = targetShape[i];
5880c1c0d53SJerry Wu       if (dim > 1 || ShapedType::isDynamic(dim)) {
5890c1c0d53SJerry Wu         projectedPos = i;
5900c1c0d53SJerry Wu         break;
5910c1c0d53SJerry Wu       }
5920c1c0d53SJerry Wu     }
5930c1c0d53SJerry Wu     projectedDimsPos.push_back(projectedPos);
5940c1c0d53SJerry Wu   }
5950c1c0d53SJerry Wu   return projectedDimsPos;
5960c1c0d53SJerry Wu }
5970c1c0d53SJerry Wu 
5980c1c0d53SJerry Wu /// Check if all dims in dimsPos are divisible by the corresponding tile sizes.
5990c1c0d53SJerry Wu static bool isDimsDivisibleByTileSizes(ArrayRef<int64_t> dimsPos,
6000c1c0d53SJerry Wu                                        ArrayRef<int64_t> shape,
6010c1c0d53SJerry Wu                                        ArrayRef<int64_t> tileSizes) {
6020c1c0d53SJerry Wu   for (auto [pos, tileSize] : llvm::zip_equal(dimsPos, tileSizes)) {
6030c1c0d53SJerry Wu     int64_t dim = shape[pos];
6040c1c0d53SJerry Wu     if (ShapedType::isDynamic(dim) || (dim % tileSize) != 0)
6050c1c0d53SJerry Wu       return false;
6060c1c0d53SJerry Wu   }
6070c1c0d53SJerry Wu   return true;
6080c1c0d53SJerry Wu }
6090c1c0d53SJerry Wu 
6100c1c0d53SJerry Wu /// Permutate the reassociation indices and reindex them in the sequence order.
6110c1c0d53SJerry Wu /// Returns the next dim pos in the sequence.
6120c1c0d53SJerry Wu ///
6130c1c0d53SJerry Wu /// For example, given reassocIndices [[0, 1], [2]] and permutation [1, 0], it
6140c1c0d53SJerry Wu /// applies the permutation to get [[2], [0, 1]] and reindexes the indices into
6150c1c0d53SJerry Wu /// [[0], [1, 2]].
6160c1c0d53SJerry Wu static int64_t applyPermutationAndReindexReassoc(
6170c1c0d53SJerry Wu     SmallVector<ReassociationIndices> &reassocIndices,
6180c1c0d53SJerry Wu     ArrayRef<int64_t> permutation) {
619002e8192Syifeizh2   if (!permutation.empty())
6200c1c0d53SJerry Wu     applyPermutationToVector<ReassociationIndices>(reassocIndices, permutation);
6210c1c0d53SJerry Wu   int64_t nextPos = 0;
6220c1c0d53SJerry Wu   for (ReassociationIndices &indices : reassocIndices) {
6230c1c0d53SJerry Wu     for (auto &index : indices) {
6240c1c0d53SJerry Wu       index = nextPos;
6250c1c0d53SJerry Wu       nextPos += 1;
6260c1c0d53SJerry Wu     }
6270c1c0d53SJerry Wu   }
6280c1c0d53SJerry Wu   return nextPos;
6290c1c0d53SJerry Wu }
6300c1c0d53SJerry Wu 
6310c1c0d53SJerry Wu /// Bubble up pack op through collapse shape op when the packed dims can be
6320c1c0d53SJerry Wu /// projected to the dims before collapsing. This is possible when the inner
6330c1c0d53SJerry Wu /// tile sizes can divide the projected dims.
6340c1c0d53SJerry Wu ///
6350c1c0d53SJerry Wu /// For example:
6360c1c0d53SJerry Wu ///
6370c1c0d53SJerry Wu /// %collapsed = tensor.collapse_shape %in [[0, 1], 2]
6380c1c0d53SJerry Wu ///     : tensor<?x16x4xf32> into tensor<?x4xf32>
6390c1c0d53SJerry Wu /// %pack = tensor.pack %collapsed outer_dims_perm = [0, 1]
6400c1c0d53SJerry Wu ///     inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %empty
6410c1c0d53SJerry Wu ///     : tensor<?x4xf32> -> tensor<?x4x8x1xf32>
6420c1c0d53SJerry Wu ///
6430c1c0d53SJerry Wu /// can be transformed into:
6440c1c0d53SJerry Wu ///
6450c1c0d53SJerry Wu /// %pack = tensor.pack %in outer_dims_perm = [1, 2]
6460c1c0d53SJerry Wu ///     inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %empty
6470c1c0d53SJerry Wu ///     : tensor<?x16x4xf32> -> tensor<?x2x4x8x1xf32>
6480c1c0d53SJerry Wu /// %collapsed = tensor.collapse_shape %pack [[0, 1], 2, 3, 4]
6490c1c0d53SJerry Wu ///     : tensor<?x2x4x8x1xf32> into tensor<?x4x8x1>
6500c1c0d53SJerry Wu static LogicalResult
6510c1c0d53SJerry Wu bubbleUpPackOpThroughCollapseShape(tensor::CollapseShapeOp collapseOp,
6520c1c0d53SJerry Wu                                    tensor::PackOp packOp,
6530c1c0d53SJerry Wu                                    PatternRewriter &rewriter) {
6540c1c0d53SJerry Wu   SmallVector<int64_t> innerTileSizes = packOp.getStaticTiles();
6550c1c0d53SJerry Wu   ArrayRef<int64_t> innerDimsPos = packOp.getInnerDimsPos();
6560c1c0d53SJerry Wu   ArrayRef<int64_t> outerDimsPerm = packOp.getOuterDimsPerm();
6570c1c0d53SJerry Wu 
6580c1c0d53SJerry Wu   ArrayRef<int64_t> srcShape = collapseOp.getSrcType().getShape();
6590c1c0d53SJerry Wu   SmallVector<ReassociationIndices> reassocIndices =
6600c1c0d53SJerry Wu       collapseOp.getReassociationIndices();
6610c1c0d53SJerry Wu   // Project inner tile pos to the dim pos before collapsing. For example, if
6620c1c0d53SJerry Wu   // dims [x, y] is collapsed into [z], packing on dim z can be projected back
6630c1c0d53SJerry Wu   // to pack on dim y.
6640c1c0d53SJerry Wu   //
6650c1c0d53SJerry Wu   // Project to inner-most non-unit dims to increase the chance that they can be
6660c1c0d53SJerry Wu   // divided by the inner tile sizes. This is correct because for [..., x, 1],
6670c1c0d53SJerry Wu   // packing on dim 1 is equivalent to packing on dim x.
6680c1c0d53SJerry Wu   SmallVector<int64_t> projectedInnerDimsPos =
6690c1c0d53SJerry Wu       projectToInnerMostNonUnitDimsPos(innerDimsPos, reassocIndices, srcShape);
6700c1c0d53SJerry Wu 
6710c1c0d53SJerry Wu   if (!isDimsDivisibleByTileSizes(projectedInnerDimsPos, srcShape,
6720c1c0d53SJerry Wu                                   innerTileSizes)) {
6730c1c0d53SJerry Wu     return failure();
6740c1c0d53SJerry Wu   }
6750c1c0d53SJerry Wu   // Expand the outer dims permutation with the associated source dims for the
6760c1c0d53SJerry Wu   // new permutation after bubbling. This is because moving a collapsed dim is
6770c1c0d53SJerry Wu   // equivalent to moving the associated source dims together.
6780c1c0d53SJerry Wu   SmallVector<int64_t> newOuterDimsPerm;
6790c1c0d53SJerry Wu   for (auto outerPos : outerDimsPerm) {
6800c1c0d53SJerry Wu     newOuterDimsPerm.insert(newOuterDimsPerm.end(),
6810c1c0d53SJerry Wu                             reassocIndices[outerPos].begin(),
6820c1c0d53SJerry Wu                             reassocIndices[outerPos].end());
6830c1c0d53SJerry Wu   }
6840c1c0d53SJerry Wu 
6850c1c0d53SJerry Wu   auto emptyOp = tensor::PackOp::createDestinationTensor(
6860c1c0d53SJerry Wu       rewriter, packOp.getLoc(), collapseOp.getSrc(), packOp.getMixedTiles(),
6870c1c0d53SJerry Wu       projectedInnerDimsPos, newOuterDimsPerm);
6880c1c0d53SJerry Wu   auto newPackOp = rewriter.create<tensor::PackOp>(
6890c1c0d53SJerry Wu       packOp.getLoc(), collapseOp.getSrc(), emptyOp, projectedInnerDimsPos,
6900c1c0d53SJerry Wu       packOp.getMixedTiles(), packOp.getPaddingValue(), newOuterDimsPerm);
6910c1c0d53SJerry Wu 
6920c1c0d53SJerry Wu   SmallVector<ReassociationIndices> newReassocIndices = reassocIndices;
6930c1c0d53SJerry Wu   // First apply the permutation on the reassociations of the outer dims.
6940c1c0d53SJerry Wu   // For example given the permutation [1, 0], the reassociations [[0, 1], [2]]
6950c1c0d53SJerry Wu   // -> [[0], [1, 2]]
6960c1c0d53SJerry Wu   int64_t nextPos =
6970c1c0d53SJerry Wu       applyPermutationAndReindexReassoc(newReassocIndices, outerDimsPerm);
6980c1c0d53SJerry Wu   // Then add direct mapping for the inner tile dims.
6990c1c0d53SJerry Wu   for (size_t i = 0; i < innerDimsPos.size(); ++i) {
7000c1c0d53SJerry Wu     newReassocIndices.push_back({nextPos});
7010c1c0d53SJerry Wu     nextPos += 1;
7020c1c0d53SJerry Wu   }
7030c1c0d53SJerry Wu 
7040c1c0d53SJerry Wu   auto newCollapseOp = rewriter.create<tensor::CollapseShapeOp>(
7050c1c0d53SJerry Wu       collapseOp.getLoc(), packOp.getType(), newPackOp, newReassocIndices);
7060c1c0d53SJerry Wu   rewriter.replaceOp(packOp, newCollapseOp);
7070c1c0d53SJerry Wu 
7080c1c0d53SJerry Wu   return success();
7090c1c0d53SJerry Wu }
7100c1c0d53SJerry Wu 
711a945f55dSAdam Siemieniuk /// Project dimsPos to their collapsed positions in the reassocIndices.
712a945f55dSAdam Siemieniuk ///
713a945f55dSAdam Siemieniuk /// For example, given dimsPos [0, 1, 2, 4], and matching reassocIndices
714a945f55dSAdam Siemieniuk /// [[0], [1, 2], [3], [4]], it returns [0, 1, 1, 3]. Because for pos 0,
715a945f55dSAdam Siemieniuk /// the reassoc dim [0] is 0. For pos 1 and 2, the reassoc dim in pos
716a945f55dSAdam Siemieniuk /// [1, 2] is 1. And for pos 4, the reassoc dim [4] is 3.
717a945f55dSAdam Siemieniuk static SmallVector<int64_t>
718a945f55dSAdam Siemieniuk projectDimsPosIntoReassocPos(ArrayRef<int64_t> dimsPos,
719a945f55dSAdam Siemieniuk                              ArrayRef<ReassociationIndices> reassocIndices) {
720a945f55dSAdam Siemieniuk   SmallVector<int64_t> projectedPos;
721a945f55dSAdam Siemieniuk 
722a945f55dSAdam Siemieniuk   // Map each dimension to the position of corresponding reassociation index.
723a945f55dSAdam Siemieniuk   for (auto pos : dimsPos) {
724a945f55dSAdam Siemieniuk     for (auto [idx, indices] : llvm::enumerate(reassocIndices)) {
725a945f55dSAdam Siemieniuk       // If the dimension is present in the current indices group, the group
726a945f55dSAdam Siemieniuk       // position within the reassociation map is the desired projected
727a945f55dSAdam Siemieniuk       // dimension position.
728*165f4535SKazu Hirata       if (llvm::is_contained(indices, pos)) {
729a945f55dSAdam Siemieniuk         projectedPos.push_back(idx);
730a945f55dSAdam Siemieniuk         break;
731a945f55dSAdam Siemieniuk       }
732a945f55dSAdam Siemieniuk     }
733a945f55dSAdam Siemieniuk   }
734a945f55dSAdam Siemieniuk   assert(projectedPos.size() == dimsPos.size() && "Invalid dim pos projection");
735a945f55dSAdam Siemieniuk 
736a945f55dSAdam Siemieniuk   return projectedPos;
737a945f55dSAdam Siemieniuk }
738a945f55dSAdam Siemieniuk 
739a945f55dSAdam Siemieniuk /// Bubble up pack op through expand shape op.
740a945f55dSAdam Siemieniuk ///
741a945f55dSAdam Siemieniuk /// For example:
742a945f55dSAdam Siemieniuk ///
743a945f55dSAdam Siemieniuk /// %expand = tensor.expand_shape %in [[0], [1, 2]]
744a945f55dSAdam Siemieniuk ///     : tensor<?x64xf32> into tensor<?x4x16xf32>
745a945f55dSAdam Siemieniuk /// %pack = tensor.pack %expand outer_dims_perm = [0, 1]
746a945f55dSAdam Siemieniuk ///     inner_dims_pos = [2] inner_tiles = [8] into %empty
747a945f55dSAdam Siemieniuk ///     : tensor<?x4x16xf32> -> tensor<?x4x2x8xf32>
748a945f55dSAdam Siemieniuk ///
749a945f55dSAdam Siemieniuk /// can be transformed into:
750a945f55dSAdam Siemieniuk ///
751a945f55dSAdam Siemieniuk /// %pack = tensor.pack %in outer_dims_perm = [1, 2]
752a945f55dSAdam Siemieniuk ///     inner_dims_pos = [1] inner_tiles = [8] into %empty
753a945f55dSAdam Siemieniuk ///     : tensor<?x64xf32> -> tensor<?x8x8xf32>
754a945f55dSAdam Siemieniuk /// %expand = tensor.expand_shape %pack [[0], [1, 2], [3]]
755a945f55dSAdam Siemieniuk ///     : tensor<?x8x8xf32> into tensor<?x4x2x8xf32>
756a945f55dSAdam Siemieniuk static LogicalResult
757a945f55dSAdam Siemieniuk bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp,
758a945f55dSAdam Siemieniuk                                  tensor::PackOp packOp,
759a945f55dSAdam Siemieniuk                                  PatternRewriter &rewriter) {
760a945f55dSAdam Siemieniuk   // Outer dimensions permutation is not supported currently.
761a945f55dSAdam Siemieniuk   // TODO: Handle outer_dims_perm variants.
762a945f55dSAdam Siemieniuk   ArrayRef<int64_t> outerDimsPerm = packOp.getOuterDimsPerm();
763a945f55dSAdam Siemieniuk   if (!outerDimsPerm.empty() && !isIdentityPermutation(outerDimsPerm)) {
764a945f55dSAdam Siemieniuk     return rewriter.notifyMatchFailure(packOp,
765a945f55dSAdam Siemieniuk                                        "non-identity outer dims perm NYI");
766a945f55dSAdam Siemieniuk   }
767a945f55dSAdam Siemieniuk 
768a945f55dSAdam Siemieniuk   // Validate dimensions' relations between shape expansion and packing.
769a945f55dSAdam Siemieniuk   SmallVector<ReassociationIndices, 4> reassoc =
770a945f55dSAdam Siemieniuk       expandOp.getReassociationIndices();
771a945f55dSAdam Siemieniuk   ArrayRef<int64_t> packInnerDims = packOp.getInnerDimsPos();
772a945f55dSAdam Siemieniuk   llvm::SetVector<int64_t> packDimsPos(packInnerDims.begin(),
773a945f55dSAdam Siemieniuk                                        packInnerDims.end());
774a945f55dSAdam Siemieniuk 
775a945f55dSAdam Siemieniuk   for (auto [idx, indices] : llvm::enumerate(reassoc)) {
776a945f55dSAdam Siemieniuk     // For each expand_shape reassociation, figure out which dimensions get
777a945f55dSAdam Siemieniuk     // packed if any.
778a945f55dSAdam Siemieniuk     llvm::SetVector<int64_t> expandDimPos(indices.begin(), indices.end());
779a945f55dSAdam Siemieniuk     llvm::SetVector<int64_t> packedDims =
780a945f55dSAdam Siemieniuk         llvm::set_intersection(packDimsPos, expandDimPos);
781a945f55dSAdam Siemieniuk 
782a945f55dSAdam Siemieniuk     // The expanded dimension is not packed so, it does not affect moving pack
783a945f55dSAdam Siemieniuk     // before shape expansion - simply continue.
784a945f55dSAdam Siemieniuk     if (packedDims.empty())
785a945f55dSAdam Siemieniuk       continue;
786a945f55dSAdam Siemieniuk     // Shape expansion cannot be propagated when multiple expanded dimension are
787a945f55dSAdam Siemieniuk     // packed - in this case operation reordering would affect final element
788a945f55dSAdam Siemieniuk     // positions and/or shapes can no longer be projected.
789a945f55dSAdam Siemieniuk     if (packedDims.size() != 1)
790a945f55dSAdam Siemieniuk       return rewriter.notifyMatchFailure(
791a945f55dSAdam Siemieniuk           packOp, "only one of the expanded dimensions can be packed");
792a945f55dSAdam Siemieniuk     // Only the inner-most expanded dimension should be packed. Otherwise,
793a945f55dSAdam Siemieniuk     // elements order will be affected after operation reordering.
794a945f55dSAdam Siemieniuk     if (packedDims.front() != indices.back())
795a945f55dSAdam Siemieniuk       return rewriter.notifyMatchFailure(
796a945f55dSAdam Siemieniuk           packOp, "can only pack the inner-most expanded dimension");
797a945f55dSAdam Siemieniuk   }
798a945f55dSAdam Siemieniuk 
799a945f55dSAdam Siemieniuk   // Project pack.inner_dims_pos to positions before shape expansion.
800a945f55dSAdam Siemieniuk   SmallVector<int64_t> projectedInnerDimsPos =
801a945f55dSAdam Siemieniuk       projectDimsPosIntoReassocPos(packInnerDims, reassoc);
802a945f55dSAdam Siemieniuk 
803a945f55dSAdam Siemieniuk   // Project the shape expansion to new packed shape.
804a945f55dSAdam Siemieniuk   // The pack.outer_dims_perm is restricted to identity so, the permutation can
805a945f55dSAdam Siemieniuk   // be omitted for simplicity.
806a945f55dSAdam Siemieniuk   // TODO: Account for outer dimensions permutation.
807a945f55dSAdam Siemieniuk   //
808a945f55dSAdam Siemieniuk   // If reassociation is not possible, then reordering cannot happen.
809a945f55dSAdam Siemieniuk   // This can be caused by pack padding affecting previously expanded
810a945f55dSAdam Siemieniuk   // dimensions or packing extending dimensions.
811a945f55dSAdam Siemieniuk   RankedTensorType newPackType = tensor::PackOp::inferPackedType(
812a945f55dSAdam Siemieniuk       expandOp.getSrcType(), packOp.getStaticInnerTiles(),
813a945f55dSAdam Siemieniuk       projectedInnerDimsPos, /*outerDimsPerm=*/SmallVector<int64_t>{});
814a945f55dSAdam Siemieniuk   auto reassocExpand =
815a945f55dSAdam Siemieniuk       getReassociationIndicesForReshape(newPackType, packOp.getDestType());
816a945f55dSAdam Siemieniuk   if (!reassocExpand)
817a945f55dSAdam Siemieniuk     return rewriter.notifyMatchFailure(
818a945f55dSAdam Siemieniuk         packOp, "could not reassociate dims after bubbling up");
819a945f55dSAdam Siemieniuk 
820a945f55dSAdam Siemieniuk   Value destTensor = tensor::PackOp::createDestinationTensor(
821a945f55dSAdam Siemieniuk       rewriter, packOp.getLoc(), expandOp.getSrc(), packOp.getMixedTiles(),
822a945f55dSAdam Siemieniuk       projectedInnerDimsPos, /*outerDimsPerm=*/SmallVector<int64_t>{});
823a945f55dSAdam Siemieniuk   Value packedVal = rewriter.create<tensor::PackOp>(
824a945f55dSAdam Siemieniuk       packOp.getLoc(), expandOp.getSrc(), destTensor, projectedInnerDimsPos,
825a945f55dSAdam Siemieniuk       packOp.getMixedTiles(), packOp.getPaddingValue(),
826a945f55dSAdam Siemieniuk       /*outerDimsPerm=*/SmallVector<int64_t>{});
827a945f55dSAdam Siemieniuk 
828a945f55dSAdam Siemieniuk   Value newExpandOp = rewriter.create<tensor::ExpandShapeOp>(
829a945f55dSAdam Siemieniuk       packOp.getLoc(), packOp.getDestType(), packedVal, *reassocExpand);
830a945f55dSAdam Siemieniuk   rewriter.replaceOp(packOp, newExpandOp);
831a945f55dSAdam Siemieniuk 
832a945f55dSAdam Siemieniuk   return success();
833a945f55dSAdam Siemieniuk }
834a945f55dSAdam Siemieniuk 
8350c1c0d53SJerry Wu class BubbleUpPackOpThroughReshapeOp final
8360c1c0d53SJerry Wu     : public OpRewritePattern<tensor::PackOp> {
8370c1c0d53SJerry Wu public:
8380c1c0d53SJerry Wu   BubbleUpPackOpThroughReshapeOp(MLIRContext *context, ControlPropagationFn fun)
8390c1c0d53SJerry Wu       : OpRewritePattern<tensor::PackOp>(context), controlFn(std::move(fun)) {}
8400c1c0d53SJerry Wu 
8410c1c0d53SJerry Wu   LogicalResult matchAndRewrite(tensor::PackOp packOp,
8420c1c0d53SJerry Wu                                 PatternRewriter &rewriter) const override {
8430c1c0d53SJerry Wu     Operation *srcOp = packOp.getSource().getDefiningOp();
8440c1c0d53SJerry Wu     // Currently only support when the pack op is the only user.
8450c1c0d53SJerry Wu     if (!srcOp || !(srcOp->getNumResults() == 1) ||
8460c1c0d53SJerry Wu         !srcOp->getResult(0).hasOneUse()) {
8470c1c0d53SJerry Wu       return failure();
8480c1c0d53SJerry Wu     }
8490c1c0d53SJerry Wu     // Currently only support static inner tile sizes.
8500c1c0d53SJerry Wu     if (llvm::any_of(packOp.getStaticTiles(), [](int64_t size) {
8510c1c0d53SJerry Wu           return ShapedType::isDynamic(size);
8520c1c0d53SJerry Wu         })) {
8530c1c0d53SJerry Wu       return failure();
8540c1c0d53SJerry Wu     }
8550c1c0d53SJerry Wu 
8560c1c0d53SJerry Wu     // User controlled propagation function.
85704fc471fSHan-Chung Wang     if (!controlFn(&packOp.getSourceMutable()))
8580c1c0d53SJerry Wu       return failure();
8590c1c0d53SJerry Wu 
8600c1c0d53SJerry Wu     return TypeSwitch<Operation *, LogicalResult>(srcOp)
8610c1c0d53SJerry Wu         .Case([&](tensor::CollapseShapeOp op) {
8620c1c0d53SJerry Wu           return bubbleUpPackOpThroughCollapseShape(op, packOp, rewriter);
8630c1c0d53SJerry Wu         })
864a945f55dSAdam Siemieniuk         .Case([&](tensor::ExpandShapeOp op) {
865a945f55dSAdam Siemieniuk           return bubbleUpPackOpThroughExpandShape(op, packOp, rewriter);
866a945f55dSAdam Siemieniuk         })
8670c1c0d53SJerry Wu         .Default([](Operation *) { return failure(); });
8680c1c0d53SJerry Wu   }
8690c1c0d53SJerry Wu 
8700c1c0d53SJerry Wu private:
8710c1c0d53SJerry Wu   ControlPropagationFn controlFn;
8720c1c0d53SJerry Wu };
8730c1c0d53SJerry Wu 
8740c1c0d53SJerry Wu /// Push down unpack op through expand shape op when the packed dims can be
8750c1c0d53SJerry Wu /// projected to the dims after expanding. This is possible when the inner tile
8760c1c0d53SJerry Wu /// sizes can divide the projected dims.
8770c1c0d53SJerry Wu ///
8780c1c0d53SJerry Wu /// For example:
8790c1c0d53SJerry Wu ///
8800c1c0d53SJerry Wu /// %unpack = tensor.unpack %in outer_dims_perm = [0, 1]
8810c1c0d53SJerry Wu ///     inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %empty
8820c1c0d53SJerry Wu ///     : tensor<?x32x8x8xf32> -> tensor<?x256xf32>
8830c1c0d53SJerry Wu /// %expanded = tensor.expand_shape %unpack [[0, 1], [2]]
8840c1c0d53SJerry Wu ///     : tensor<?x256xf32> into tensor<?x256x256xf32>
8850c1c0d53SJerry Wu ///
8860c1c0d53SJerry Wu /// can be transformed into:
8870c1c0d53SJerry Wu ///
8880c1c0d53SJerry Wu /// %expanded = tensor.expand_shape %ain [[0, 1], [2], [3], [4]]
8890c1c0d53SJerry Wu ///     : tensor<?x32x8x8xf32> into tensor<?x32x32x8x8xf32>
8900c1c0d53SJerry Wu /// %unpack = tensor.unpack %expanded outer_dims_perm = [0, 1, 2]
8910c1c0d53SJerry Wu ///     inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %empty
8920c1c0d53SJerry Wu ///     : tensor<?x32x32x8x8xf32> -> tensor<?x256x256xf32>
89304fc471fSHan-Chung Wang static LogicalResult pushDownUnPackOpThroughExpandShape(
89404fc471fSHan-Chung Wang     tensor::UnPackOp unPackOp, tensor::ExpandShapeOp expandOp,
89504fc471fSHan-Chung Wang     PatternRewriter &rewriter, ControlPropagationFn controlFn) {
89604fc471fSHan-Chung Wang   // User controlled propagation function.
89704fc471fSHan-Chung Wang   if (!controlFn(&expandOp.getSrcMutable()))
89804fc471fSHan-Chung Wang     return failure();
89904fc471fSHan-Chung Wang 
9000c1c0d53SJerry Wu   SmallVector<int64_t> innerTileSizes = unPackOp.getStaticTiles();
9010c1c0d53SJerry Wu   ArrayRef<int64_t> innerDimsPos = unPackOp.getInnerDimsPos();
9020c1c0d53SJerry Wu   ArrayRef<int64_t> outerDimsPerm = unPackOp.getOuterDimsPerm();
9030c1c0d53SJerry Wu 
904d2353695SPeiming Liu   auto expandTy = dyn_cast<RankedTensorType>(expandOp.getType());
90597069a86SGaurav Shukla   if (!expandTy)
90697069a86SGaurav Shukla     return failure();
90797069a86SGaurav Shukla   ArrayRef<int64_t> dstShape = expandTy.getShape();
9080c1c0d53SJerry Wu   SmallVector<ReassociationIndices> reassocIndices =
9090c1c0d53SJerry Wu       expandOp.getReassociationIndices();
9100c1c0d53SJerry Wu   // Project inner tile pos to the dim pos after expanding. For example, if dims
9110c1c0d53SJerry Wu   // [z] is expanded into [x, y], unpacking on dim z can be projected to unpack
9120c1c0d53SJerry Wu   // on dim y.
9130c1c0d53SJerry Wu   //
9140c1c0d53SJerry Wu   // Project to inner-most non-unit dims to increase the chance that they can be
9150c1c0d53SJerry Wu   // divided by the inner tile sizes. This is correct because for [..., x, 1],
9160c1c0d53SJerry Wu   // unpacking on dim 1 is equivalent to unpacking on dim x.
9170c1c0d53SJerry Wu   SmallVector<int64_t> projectedInnerDimsPos =
9180c1c0d53SJerry Wu       projectToInnerMostNonUnitDimsPos(innerDimsPos, reassocIndices, dstShape);
9190c1c0d53SJerry Wu 
9200c1c0d53SJerry Wu   if (!isDimsDivisibleByTileSizes(projectedInnerDimsPos, dstShape,
9210c1c0d53SJerry Wu                                   innerTileSizes)) {
9220c1c0d53SJerry Wu     return failure();
9230c1c0d53SJerry Wu   }
9240c1c0d53SJerry Wu   // Expand the outer dims permutation with the associated expanded dims for the
9250c1c0d53SJerry Wu   // new permutation after pushing. This is because moving a source dim is
9260c1c0d53SJerry Wu   // equivalent to moving the associated expanded dims together.
9270c1c0d53SJerry Wu   SmallVector<int64_t> newOuterDimsPerm;
9280c1c0d53SJerry Wu   for (auto outerPos : outerDimsPerm) {
9290c1c0d53SJerry Wu     newOuterDimsPerm.insert(newOuterDimsPerm.end(),
9300c1c0d53SJerry Wu                             reassocIndices[outerPos].begin(),
9310c1c0d53SJerry Wu                             reassocIndices[outerPos].end());
9320c1c0d53SJerry Wu   }
9330c1c0d53SJerry Wu 
9340c1c0d53SJerry Wu   SmallVector<ReassociationIndices> newReassocIndices = reassocIndices;
9350c1c0d53SJerry Wu   // First apply the permutation on the reassociations of the outer dims.
9360c1c0d53SJerry Wu   // For example given the permutation [1, 0], the reassociations [[0, 1], [2]]
9370c1c0d53SJerry Wu   // -> [[0], [1, 2]]
9380c1c0d53SJerry Wu   int64_t nextPos =
9390c1c0d53SJerry Wu       applyPermutationAndReindexReassoc(newReassocIndices, outerDimsPerm);
9400c1c0d53SJerry Wu   // Then add direct mapping for the inner tile dims.
9410c1c0d53SJerry Wu   for (size_t i = 0; i < innerDimsPos.size(); ++i) {
9420c1c0d53SJerry Wu     newReassocIndices.push_back({nextPos});
9430c1c0d53SJerry Wu     nextPos += 1;
9440c1c0d53SJerry Wu   }
9450c1c0d53SJerry Wu 
94697069a86SGaurav Shukla   RankedTensorType newExpandType = tensor::PackOp::inferPackedType(
94797069a86SGaurav Shukla       expandTy, innerTileSizes, projectedInnerDimsPos, newOuterDimsPerm);
9480c1c0d53SJerry Wu   auto newExpandOp = rewriter.create<tensor::ExpandShapeOp>(
9490c1c0d53SJerry Wu       expandOp.getLoc(), newExpandType, unPackOp.getSource(),
9500c1c0d53SJerry Wu       newReassocIndices);
9510c1c0d53SJerry Wu 
9520c1c0d53SJerry Wu   auto emptyOp = tensor::UnPackOp::createDestinationTensor(
9530c1c0d53SJerry Wu       rewriter, unPackOp.getLoc(), newExpandOp, unPackOp.getMixedTiles(),
9540c1c0d53SJerry Wu       projectedInnerDimsPos, newOuterDimsPerm);
9550c1c0d53SJerry Wu   auto newUnPackOp = rewriter.create<tensor::UnPackOp>(
9560c1c0d53SJerry Wu       unPackOp.getLoc(), newExpandOp.getResult(), emptyOp,
9570c1c0d53SJerry Wu       projectedInnerDimsPos, unPackOp.getMixedTiles(), newOuterDimsPerm);
9580c1c0d53SJerry Wu   rewriter.replaceOp(expandOp, newUnPackOp);
9590c1c0d53SJerry Wu 
9600c1c0d53SJerry Wu   return success();
9610c1c0d53SJerry Wu }
9620c1c0d53SJerry Wu 
9630c1c0d53SJerry Wu class PushDownUnPackOpThroughReshapeOp final
9640c1c0d53SJerry Wu     : public OpRewritePattern<tensor::UnPackOp> {
9650c1c0d53SJerry Wu public:
9660c1c0d53SJerry Wu   PushDownUnPackOpThroughReshapeOp(MLIRContext *context,
9670c1c0d53SJerry Wu                                    ControlPropagationFn fun)
9680c1c0d53SJerry Wu       : OpRewritePattern<tensor::UnPackOp>(context), controlFn(std::move(fun)) {
9690c1c0d53SJerry Wu   }
9700c1c0d53SJerry Wu 
9710c1c0d53SJerry Wu   LogicalResult matchAndRewrite(tensor::UnPackOp unPackOp,
9720c1c0d53SJerry Wu                                 PatternRewriter &rewriter) const override {
9730c1c0d53SJerry Wu     Value result = unPackOp.getResult();
9740c1c0d53SJerry Wu     // Currently only support unpack op with the single user.
9750c1c0d53SJerry Wu     if (!result.hasOneUse()) {
9760c1c0d53SJerry Wu       return failure();
9770c1c0d53SJerry Wu     }
9780c1c0d53SJerry Wu     // Currently only support static inner tile sizes.
9790c1c0d53SJerry Wu     if (llvm::any_of(unPackOp.getStaticTiles(), [](int64_t size) {
9800c1c0d53SJerry Wu           return ShapedType::isDynamic(size);
9810c1c0d53SJerry Wu         })) {
9820c1c0d53SJerry Wu       return failure();
9830c1c0d53SJerry Wu     }
9840c1c0d53SJerry Wu 
9850c1c0d53SJerry Wu     Operation *consumerOp = *result.user_begin();
9860c1c0d53SJerry Wu     return TypeSwitch<Operation *, LogicalResult>(consumerOp)
9870c1c0d53SJerry Wu         .Case([&](tensor::ExpandShapeOp op) {
98804fc471fSHan-Chung Wang           return pushDownUnPackOpThroughExpandShape(unPackOp, op, rewriter,
98904fc471fSHan-Chung Wang                                                     controlFn);
9900c1c0d53SJerry Wu         })
9910c1c0d53SJerry Wu         .Default([](Operation *) { return failure(); });
9920c1c0d53SJerry Wu   }
9930c1c0d53SJerry Wu 
9940c1c0d53SJerry Wu private:
9950c1c0d53SJerry Wu   ControlPropagationFn controlFn;
9960c1c0d53SJerry Wu };
9970c1c0d53SJerry Wu 
9989f242404SLorenzo Chelini // TODO: Relax this restriction. We should unpack a generic op also
9996bb0ab0dSLorenzo Chelini // in the presence of multiple unpack ops as producers.
10006bb0ab0dSLorenzo Chelini /// Return the unpacked operand, if present, for the current generic op.
10016bb0ab0dSLorenzo Chelini static FailureOr<OpOperand *> getUnPackedOperand(GenericOp genericOp) {
10026bb0ab0dSLorenzo Chelini   OpOperand *unPackedOperand = nullptr;
10036bb0ab0dSLorenzo Chelini   for (OpOperand &operand : genericOp->getOpOperands()) {
10046bb0ab0dSLorenzo Chelini     auto unPackOp = operand.get().getDefiningOp<tensor::UnPackOp>();
10056bb0ab0dSLorenzo Chelini     if (!unPackOp)
10066bb0ab0dSLorenzo Chelini       continue;
10076bb0ab0dSLorenzo Chelini     if (unPackedOperand)
10086bb0ab0dSLorenzo Chelini       return failure();
10096bb0ab0dSLorenzo Chelini     unPackedOperand = &operand;
10106bb0ab0dSLorenzo Chelini   }
10116bb0ab0dSLorenzo Chelini   if (!unPackedOperand)
10126bb0ab0dSLorenzo Chelini     return failure();
10136bb0ab0dSLorenzo Chelini   return unPackedOperand;
10146bb0ab0dSLorenzo Chelini }
10156bb0ab0dSLorenzo Chelini 
/// Push down a tensor.unpack op through a generic op.
/// The new generic op works on packed domain; pack ops are created for input
/// and output operands. A tensor.unpack op is inserted right after the packed
/// generic. E.g.
///
/// #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
///
/// %arg0 = tensor<12x2x56x56x32xf32> // packed arg.
///
/// %0 = tensor.empty() : tensor<12x56x56x64xf32>
/// %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2]
///                          inner_dims_pos = [3] inner_tiles = [32] into %0
/// %2 = linalg.generic {indexing_maps = [#map],
///      iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
///      outs(%1 : tensor<12x56x56x64xf32>) {
///      ^bb0(%out : f32):
///         linalg.yield %out : f32
///      } -> tensor<12x56x56x64xf32>
///
/// will be converted to
///
/// #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
///
/// %0 = tensor.empty() : tensor<12x56x56x64xf32>
/// %1 = linalg.generic {indexing_maps = [#map],
///      iterator_types = ["parallel", "parallel", "parallel",
///                        "parallel", "parallel"]}
///      outs(%arg0 : tensor<12x2x56x56x32xf32>) {
///      ^bb0(%out : f32):
///         linalg.yield %out : f32
///      } -> tensor<12x2x56x56x32xf32>
/// %2 = tensor.unpack %1 outer_dims_perm = [0, 3, 1, 2]
///                       inner_dims_pos = [3] inner_tiles = [32] into %0
///
/// Returns the new (packed) generic op and the value that should replace the
/// original generic's result (either the new result itself or the result of
/// the trailing unpack), or failure if the propagation does not apply.
static FailureOr<std::tuple<GenericOp, Value>>
pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp,
                                 ControlPropagationFn controlFn) {
  // Only single-result generics are handled: the packed init/result pair is
  // rebuilt for DPS init operand 0 below.
  if (genericOp.getNumResults() != 1)
    return failure();

  // Ops with gather semantics use data-dependent indexing; packing the
  // iteration domain would change which elements are accessed.
  if (hasGatherSemantics(genericOp))
    return failure();

  // Collect the unPacked operand, if present.
  auto maybeUnPackedOperand = getUnPackedOperand(genericOp);
  if (failed(maybeUnPackedOperand))
    return failure();
  OpOperand *unPackedOperand = *(maybeUnPackedOperand);

  // Extract packing information.
  tensor::UnPackOp producerUnPackOp =
      unPackedOperand->get().getDefiningOp<tensor::UnPackOp>();
  assert(producerUnPackOp && "expect a valid UnPackOp");

  // Give the client a chance to veto this propagation.
  if (!controlFn(unPackedOperand))
    return failure();

  auto packInfo =
      getPackingInfoFromOperand(unPackedOperand, genericOp, producerUnPackOp);
  if (failed(packInfo))
    return failure();

  // Rebuild the indexing map for the corresponding init operand.
  auto [packedOutOperand, packedOutIndexingMap] =
      getOrCreatePackedViewOfOperand(rewriter, genericOp.getLoc(), *packInfo,
                                     genericOp, genericOp.getDpsInitOperand(0));
  // destPack is null when the init did not need packing (output unaffected).
  auto destPack = packedOutOperand.getDefiningOp<tensor::PackOp>();

  // If the dps init operand of the generic is a tensor.empty, do not pack it
  // and forward the new tensor.empty as a destination.
  Value dest = packedOutOperand;
  if (auto initTensor = genericOp.getDpsInitOperand(0)
                            ->get()
                            .getDefiningOp<tensor::EmptyOp>()) {
    if (destPack)
      dest = destPack.getDest();
  }

  // Pack the genericOp.
  GenericOp newGenericOp =
      packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap, *packInfo);
  Value newResult =
      newGenericOp.getTiedOpResult(newGenericOp.getDpsInitOperand(0));

  // If the output is unaffected, no need to unpack.
  if (!destPack)
    return std::make_tuple(newGenericOp, newResult);

  // Reuse the layout of the pack that was created for the init operand so
  // that the trailing unpack exactly inverts it.
  auto mixedTiles = destPack.getMixedTiles();
  auto innerDimsPos = destPack.getInnerDimsPos();
  auto outerDimsPerm = destPack.getOuterDimsPerm();

  // Insert an unPackOp right after the packed generic.
  Value unPackOpRes =
      rewriter
          .create<tensor::UnPackOp>(genericOp.getLoc(), newResult,
                                    destPack.getSource(), innerDimsPos,
                                    mixedTiles, outerDimsPerm)
          .getResult();

  return std::make_tuple(newGenericOp, unPackOpRes);
}
11186bb0ab0dSLorenzo Chelini 
1119b4563ee1SQuinn Dawkins // Wrapper pattern that applies pushDownUnPackOpThroughGenericOp method.
1120b4563ee1SQuinn Dawkins struct PushDownUnPackOpThroughGenericOp : public OpRewritePattern<GenericOp> {
1121b4563ee1SQuinn Dawkins public:
1122b4563ee1SQuinn Dawkins   PushDownUnPackOpThroughGenericOp(MLIRContext *context,
1123b4563ee1SQuinn Dawkins                                    ControlPropagationFn fun)
1124b4563ee1SQuinn Dawkins       : OpRewritePattern<GenericOp>(context), controlFn(std::move(fun)) {}
11256bb0ab0dSLorenzo Chelini 
11266bb0ab0dSLorenzo Chelini   LogicalResult matchAndRewrite(GenericOp genericOp,
11276bb0ab0dSLorenzo Chelini                                 PatternRewriter &rewriter) const override {
112804fc471fSHan-Chung Wang     auto genericAndRepl =
112904fc471fSHan-Chung Wang         pushDownUnPackOpThroughGenericOp(rewriter, genericOp, controlFn);
11306bb0ab0dSLorenzo Chelini     if (failed(genericAndRepl))
11316bb0ab0dSLorenzo Chelini       return failure();
11326bb0ab0dSLorenzo Chelini     rewriter.replaceOp(genericOp, std::get<1>(*genericAndRepl));
11336bb0ab0dSLorenzo Chelini     return success();
11346bb0ab0dSLorenzo Chelini   }
1135b4563ee1SQuinn Dawkins 
1136b4563ee1SQuinn Dawkins private:
1137b4563ee1SQuinn Dawkins   ControlPropagationFn controlFn;
11386bb0ab0dSLorenzo Chelini };
11396bb0ab0dSLorenzo Chelini 
114030d542f9SLorenzo Chelini /// Propagate a tensor.unpack operation through a tensor.pad. The idea is to
114130d542f9SLorenzo Chelini /// add as many zero padding dimensions in `high` and `low` based on the number
114230d542f9SLorenzo Chelini /// of point loops.
114330d542f9SLorenzo Chelini struct PushDownUnPackThroughPadOp : public OpRewritePattern<tensor::PadOp> {
1144b4563ee1SQuinn Dawkins   PushDownUnPackThroughPadOp(MLIRContext *context, ControlPropagationFn fun)
1145b4563ee1SQuinn Dawkins       : OpRewritePattern<tensor::PadOp>(context), controlFn(std::move(fun)) {}
114630d542f9SLorenzo Chelini 
114730d542f9SLorenzo Chelini   LogicalResult matchAndRewrite(tensor::PadOp padOp,
114830d542f9SLorenzo Chelini                                 PatternRewriter &rewriter) const override {
114930d542f9SLorenzo Chelini     tensor::UnPackOp unpackOp =
115030d542f9SLorenzo Chelini         padOp.getSource().getDefiningOp<tensor::UnPackOp>();
115130d542f9SLorenzo Chelini     if (!unpackOp)
115230d542f9SLorenzo Chelini       return failure();
115330d542f9SLorenzo Chelini 
115404fc471fSHan-Chung Wang     if (!controlFn(&padOp.getSourceMutable()))
1155b4563ee1SQuinn Dawkins       return failure();
1156b4563ee1SQuinn Dawkins 
115730d542f9SLorenzo Chelini     Location loc = padOp.getLoc();
115830d542f9SLorenzo Chelini     // Bail out if one of the padded dimension is a tiled one.
115930d542f9SLorenzo Chelini     llvm::SmallBitVector paddedDims = padOp.getPaddedDims();
116030d542f9SLorenzo Chelini     ArrayRef<int64_t> innerDimsPos = unpackOp.getInnerDimsPos();
116130d542f9SLorenzo Chelini     llvm::SmallBitVector innerDims(paddedDims.size());
116230d542f9SLorenzo Chelini     for (int64_t dim : innerDimsPos)
116330d542f9SLorenzo Chelini       innerDims.flip(dim);
116430d542f9SLorenzo Chelini     if (paddedDims.anyCommon(innerDims))
116530d542f9SLorenzo Chelini       return failure();
116630d542f9SLorenzo Chelini 
116730d542f9SLorenzo Chelini     Value paddingVal = padOp.getConstantPaddingValue();
116830d542f9SLorenzo Chelini     if (!paddingVal)
116930d542f9SLorenzo Chelini       return failure();
117030d542f9SLorenzo Chelini 
117130d542f9SLorenzo Chelini     // If we have `outer_dims_perms` we need to adjust the padded dimensions.
117230d542f9SLorenzo Chelini     ArrayRef<int64_t> outerDimsPerm = unpackOp.getOuterDimsPerm();
117330d542f9SLorenzo Chelini     SmallVector<OpFoldResult> lowPad = padOp.getMixedLowPad();
117430d542f9SLorenzo Chelini     SmallVector<OpFoldResult> highPad = padOp.getMixedHighPad();
117530d542f9SLorenzo Chelini     if (!outerDimsPerm.empty()) {
117630d542f9SLorenzo Chelini       applyPermutationToVector<OpFoldResult>(lowPad, outerDimsPerm);
117730d542f9SLorenzo Chelini       applyPermutationToVector<OpFoldResult>(highPad, outerDimsPerm);
117830d542f9SLorenzo Chelini     }
117930d542f9SLorenzo Chelini     // Add zero padding for the point loops.
118030d542f9SLorenzo Chelini     size_t pointLoopsSize = innerDimsPos.size();
118130d542f9SLorenzo Chelini     lowPad.append(pointLoopsSize, rewriter.getIndexAttr(0));
118230d542f9SLorenzo Chelini     highPad.append(pointLoopsSize, rewriter.getIndexAttr(0));
118330d542f9SLorenzo Chelini 
118430d542f9SLorenzo Chelini     auto newPadOp = rewriter.create<tensor::PadOp>(
118530d542f9SLorenzo Chelini         loc, /*result=*/Type(), unpackOp.getSource(), lowPad, highPad,
118630d542f9SLorenzo Chelini         paddingVal, padOp.getNofold());
118730d542f9SLorenzo Chelini 
118830d542f9SLorenzo Chelini     // Inject the tensor.unpack right after the packed padOp.
118930d542f9SLorenzo Chelini     Value outputUnPack = rewriter.create<tensor::EmptyOp>(
119030d542f9SLorenzo Chelini         loc, padOp.getResultType().getShape(),
119130d542f9SLorenzo Chelini         padOp.getResultType().getElementType());
119230d542f9SLorenzo Chelini 
119330d542f9SLorenzo Chelini     Value replacement = rewriter.create<tensor::UnPackOp>(
119430d542f9SLorenzo Chelini         loc, newPadOp.getResult(), outputUnPack, innerDimsPos,
119530d542f9SLorenzo Chelini         unpackOp.getMixedTiles(), outerDimsPerm);
119630d542f9SLorenzo Chelini     rewriter.replaceOp(padOp, replacement);
119730d542f9SLorenzo Chelini     return success();
119830d542f9SLorenzo Chelini   }
1199b4563ee1SQuinn Dawkins 
1200b4563ee1SQuinn Dawkins private:
1201b4563ee1SQuinn Dawkins   ControlPropagationFn controlFn;
120230d542f9SLorenzo Chelini };
120330d542f9SLorenzo Chelini 
12040f297cadSHanhan Wang } // namespace
12050f297cadSHanhan Wang 
12060f297cadSHanhan Wang void mlir::linalg::populateDataLayoutPropagationPatterns(
1207b4563ee1SQuinn Dawkins     RewritePatternSet &patterns,
1208b4563ee1SQuinn Dawkins     const ControlPropagationFn &controlPackUnPackPropagation) {
1209886294a2SQuinn Dawkins   patterns
1210886294a2SQuinn Dawkins       .insert<BubbleUpPackOpThroughGenericOpPattern, BubbleUpPackThroughPadOp,
12110c1c0d53SJerry Wu               BubbleUpPackOpThroughReshapeOp, PushDownUnPackOpThroughGenericOp,
12120c1c0d53SJerry Wu               PushDownUnPackThroughPadOp, PushDownUnPackOpThroughReshapeOp>(
1213b4563ee1SQuinn Dawkins           patterns.getContext(), controlPackUnPackPropagation);
12140f297cadSHanhan Wang }
1215