Vector/Transforms/VectorDistribute.cpp

d02f10d9SThomas Raoux//===- VectorDistribute.cpp - patterns to do vector distribution ----------===//
d02f10d9SThomas Raoux//
d02f10d9SThomas Raoux// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
d02f10d9SThomas Raoux// See https://llvm.org/LICENSE.txt for license information.
d02f10d9SThomas Raoux// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
d02f10d9SThomas Raoux//
d02f10d9SThomas Raoux//===----------------------------------------------------------------------===//
d02f10d9SThomas Raoux
ed0288f7SThomas Raoux#include "mlir/Dialect/Affine/IR/AffineOps.h"
abc362a1SJakub Kuderski#include "mlir/Dialect/Arith/IR/Arith.h"
ecaf2c33SPetr Kurapov#include "mlir/Dialect/GPU/IR/GPUDialect.h"
*bc29fc93SPetr Kurapov#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
d02f10d9SThomas Raoux#include "mlir/Dialect/MemRef/IR/MemRef.h"
8b68da2cSAlex Zinenko#include "mlir/Dialect/SCF/IR/SCF.h"
fa8a10a1SNicolas Vasilache#include "mlir/Dialect/Vector/IR/VectorOps.h"
d02f10d9SThomas Raoux#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
fa8a10a1SNicolas Vasilache#include "mlir/IR/AffineExpr.h"
fc367dfaSMahesh Ravishankar#include "mlir/Interfaces/SideEffectInterfaces.h"
91f62f0eSThomas Raoux#include "mlir/Transforms/RegionUtils.h"
d7d6443dSThomas Raoux#include "llvm/ADT/SetVector.h"
80636227SJakub Kuderski#include "llvm/Support/FormatVariadic.h"
08d651d7SMehdi Amini#include <utility>
08d651d7SMehdi Amini
d02f10d9SThomas Raouxusing namespace mlir;
d02f10d9SThomas Raouxusing namespace mlir::vector;
ecaf2c33SPetr Kurapovusing namespace mlir::gpu;
d02f10d9SThomas Raoux
4abb9e5dSThomas Raoux/// Currently the distribution map is implicit based on the vector shape. In the
4abb9e5dSThomas Raoux/// future it will be part of the op.
4abb9e5dSThomas Raoux/// Example:
4abb9e5dSThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1x16x2xf32>) {
4abb9e5dSThomas Raoux///   ...
ecaf2c33SPetr Kurapov///   gpu.yield %3 : vector<32x16x64xf32>
4abb9e5dSThomas Raoux/// }
4abb9e5dSThomas Raoux/// ```
4abb9e5dSThomas Raoux/// Would have an implicit map of:
4abb9e5dSThomas Raoux/// `(d0, d1, d2) -> (d0, d2)`
4abb9e5dSThomas Raouxstatic AffineMap calculateImplicitMap(VectorType sequentialType,
4abb9e5dSThomas Raoux                                      VectorType distributedType) {
4abb9e5dSThomas Raoux  SmallVector<AffineExpr> perm;
4abb9e5dSThomas Raoux  perm.reserve(1);
4abb9e5dSThomas Raoux  // Check which dimensions of the sequential type are different than the
4abb9e5dSThomas Raoux  // dimensions of the distributed type to know the distributed dimensions. Then
4abb9e5dSThomas Raoux  // associate each distributed dimension to an ID in order.
4abb9e5dSThomas Raoux  for (unsigned i = 0, e = sequentialType.getRank(); i < e; i++) {
4abb9e5dSThomas Raoux    if (sequentialType.getDimSize(i) != distributedType.getDimSize(i))
4abb9e5dSThomas Raoux      perm.push_back(getAffineDimExpr(i, distributedType.getContext()));
d02f10d9SThomas Raoux  }
4abb9e5dSThomas Raoux  auto map = AffineMap::get(sequentialType.getRank(), 0, perm,
4abb9e5dSThomas Raoux                            distributedType.getContext());
4abb9e5dSThomas Raoux  return map;
d02f10d9SThomas Raoux}
d02f10d9SThomas Raoux
fa8a10a1SNicolas Vasilachenamespace {
d02f10d9SThomas Raoux
fa8a10a1SNicolas Vasilache/// Helper struct to create the load / store operations that permit transit
fa8a10a1SNicolas Vasilache/// through the parallel / sequential and the sequential / parallel boundaries
fa8a10a1SNicolas Vasilache/// when performing `rewriteWarpOpToScfFor`.
fa8a10a1SNicolas Vasilache///
4abb9e5dSThomas Raoux/// The vector distribution dimension is inferred from the vector types.
fa8a10a1SNicolas Vasilachestruct DistributedLoadStoreHelper {
fa8a10a1SNicolas Vasilache  DistributedLoadStoreHelper(Value sequentialVal, Value distributedVal,
fa8a10a1SNicolas Vasilache                             Value laneId, Value zero)
fa8a10a1SNicolas Vasilache      : sequentialVal(sequentialVal), distributedVal(distributedVal),
fa8a10a1SNicolas Vasilache        laneId(laneId), zero(zero) {
5550c821STres Popp    sequentialVectorType = dyn_cast<VectorType>(sequentialVal.getType());
5550c821STres Popp    distributedVectorType = dyn_cast<VectorType>(distributedVal.getType());
4abb9e5dSThomas Raoux    if (sequentialVectorType && distributedVectorType)
4abb9e5dSThomas Raoux      distributionMap =
4abb9e5dSThomas Raoux          calculateImplicitMap(sequentialVectorType, distributedVectorType);
fa8a10a1SNicolas Vasilache  }
d02f10d9SThomas Raoux
4abb9e5dSThomas Raoux  Value buildDistributedOffset(RewriterBase &b, Location loc, int64_t index) {
4abb9e5dSThomas Raoux    int64_t distributedSize = distributedVectorType.getDimSize(index);
fa8a10a1SNicolas Vasilache    AffineExpr tid = getAffineSymbolExpr(0, b.getContext());
4c48f016SMatthias Springer    return b.createOrFold<affine::AffineApplyOp>(loc, tid * distributedSize,
fa8a10a1SNicolas Vasilache                                                 ArrayRef<Value>{laneId});
fa8a10a1SNicolas Vasilache  }
d02f10d9SThomas Raoux
845dc178SNicolas Vasilache  /// Create a store during the process of distributing the
845dc178SNicolas Vasilache  /// `vector.warp_execute_on_thread_0` op.
845dc178SNicolas Vasilache  /// Vector distribution assumes the following convention regarding the
845dc178SNicolas Vasilache  /// temporary buffers that are created to transition values. This **must**
845dc178SNicolas Vasilache  /// be properly specified in the `options.warpAllocationFn`:
845dc178SNicolas Vasilache  ///   1. scalars of type T transit through a memref<1xT>.
845dc178SNicolas Vasilache  ///   2. vectors of type V<shapexT> transit through a memref<shapexT>
fa8a10a1SNicolas Vasilache  Operation *buildStore(RewriterBase &b, Location loc, Value val,
fa8a10a1SNicolas Vasilache                        Value buffer) {
fa8a10a1SNicolas Vasilache    assert((val == distributedVal || val == sequentialVal) &&
fa8a10a1SNicolas Vasilache           "Must store either the preregistered distributed or the "
fa8a10a1SNicolas Vasilache           "preregistered sequential value.");
4abb9e5dSThomas Raoux    // Scalar case can directly use memref.store.
5550c821STres Popp    if (!isa<VectorType>(val.getType()))
4abb9e5dSThomas Raoux      return b.create<memref::StoreOp>(loc, val, buffer, zero);
4abb9e5dSThomas Raoux
fa8a10a1SNicolas Vasilache    // Vector case must use vector::TransferWriteOp which will later lower to
fa8a10a1SNicolas Vasilache    //   vector.store of memref.store depending on further lowerings.
845dc178SNicolas Vasilache    int64_t rank = sequentialVectorType.getRank();
845dc178SNicolas Vasilache    SmallVector<Value> indices(rank, zero);
4abb9e5dSThomas Raoux    if (val == distributedVal) {
4abb9e5dSThomas Raoux      for (auto dimExpr : distributionMap.getResults()) {
1609f1c2Slong.chen        int64_t index = cast<AffineDimExpr>(dimExpr).getPosition();
4abb9e5dSThomas Raoux        indices[index] = buildDistributedOffset(b, loc, index);
4abb9e5dSThomas Raoux      }
4abb9e5dSThomas Raoux    }
fa8a10a1SNicolas Vasilache    SmallVector<bool> inBounds(indices.size(), true);
fa8a10a1SNicolas Vasilache    return b.create<vector::TransferWriteOp>(
fa8a10a1SNicolas Vasilache        loc, val, buffer, indices,
fa8a10a1SNicolas Vasilache        ArrayRef<bool>(inBounds.begin(), inBounds.end()));
fa8a10a1SNicolas Vasilache  }
fa8a10a1SNicolas Vasilache
845dc178SNicolas Vasilache  /// Create a load during the process of distributing the
845dc178SNicolas Vasilache  /// `vector.warp_execute_on_thread_0` op.
845dc178SNicolas Vasilache  /// Vector distribution assumes the following convention regarding the
845dc178SNicolas Vasilache  /// temporary buffers that are created to transition values. This **must**
845dc178SNicolas Vasilache  /// be properly specified in the `options.warpAllocationFn`:
845dc178SNicolas Vasilache  ///   1. scalars of type T transit through a memref<1xT>.
845dc178SNicolas Vasilache  ///   2. vectors of type V<shapexT> transit through a memref<shapexT>
845dc178SNicolas Vasilache  ///
845dc178SNicolas Vasilache  /// When broadcastMode is true, the load is not distributed to account for
ecaf2c33SPetr Kurapov  /// the broadcast semantics of the `gpu.warp_execute_on_lane_0` op.
845dc178SNicolas Vasilache  ///
845dc178SNicolas Vasilache  /// Example:
845dc178SNicolas Vasilache  ///
845dc178SNicolas Vasilache  /// ```
ecaf2c33SPetr Kurapov  ///   %r = gpu.warp_execute_on_lane_0(...) -> (f32) {
ecaf2c33SPetr Kurapov  ///     gpu.yield %cst : f32
845dc178SNicolas Vasilache  ///   }
845dc178SNicolas Vasilache  ///   // Both types are f32. The constant %cst is broadcasted to all lanes.
845dc178SNicolas Vasilache  /// ```
845dc178SNicolas Vasilache  /// This behavior described in more detail in the documentation of the op.
4abb9e5dSThomas Raoux  Value buildLoad(RewriterBase &b, Location loc, Type type, Value buffer) {
4abb9e5dSThomas Raoux
4abb9e5dSThomas Raoux    // Scalar case can directly use memref.store.
5550c821STres Popp    if (!isa<VectorType>(type))
fa8a10a1SNicolas Vasilache      return b.create<memref::LoadOp>(loc, buffer, zero);
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Other cases must be vector atm.
fa8a10a1SNicolas Vasilache    // Vector case must use vector::TransferReadOp which will later lower to
fa8a10a1SNicolas Vasilache    //   vector.read of memref.read depending on further lowerings.
fa8a10a1SNicolas Vasilache    assert((type == distributedVectorType || type == sequentialVectorType) &&
fa8a10a1SNicolas Vasilache           "Must store either the preregistered distributed or the "
fa8a10a1SNicolas Vasilache           "preregistered sequential type.");
fa8a10a1SNicolas Vasilache    SmallVector<Value> indices(sequentialVectorType.getRank(), zero);
fa8a10a1SNicolas Vasilache    if (type == distributedVectorType) {
4abb9e5dSThomas Raoux      for (auto dimExpr : distributionMap.getResults()) {
1609f1c2Slong.chen        int64_t index = cast<AffineDimExpr>(dimExpr).getPosition();
4abb9e5dSThomas Raoux        indices[index] = buildDistributedOffset(b, loc, index);
4abb9e5dSThomas Raoux      }
d02f10d9SThomas Raoux    }
fa8a10a1SNicolas Vasilache    SmallVector<bool> inBounds(indices.size(), true);
fa8a10a1SNicolas Vasilache    return b.create<vector::TransferReadOp>(
5550c821STres Popp        loc, cast<VectorType>(type), buffer, indices,
fa8a10a1SNicolas Vasilache        ArrayRef<bool>(inBounds.begin(), inBounds.end()));
d02f10d9SThomas Raoux  }
d02f10d9SThomas Raoux
fa8a10a1SNicolas Vasilache  Value sequentialVal, distributedVal, laneId, zero;
fa8a10a1SNicolas Vasilache  VectorType sequentialVectorType, distributedVectorType;
4abb9e5dSThomas Raoux  AffineMap distributionMap;
fa8a10a1SNicolas Vasilache};
d02f10d9SThomas Raoux
fa8a10a1SNicolas Vasilache} // namespace
d02f10d9SThomas Raoux
76cf33daSThomas Raoux// Clones `op` into a new operation that takes `operands` and returns
76cf33daSThomas Raoux// `resultTypes`.
76cf33daSThomas Raouxstatic Operation *cloneOpWithOperandsAndTypes(RewriterBase &rewriter,
76cf33daSThomas Raoux                                              Location loc, Operation *op,
76cf33daSThomas Raoux                                              ArrayRef<Value> operands,
76cf33daSThomas Raoux                                              ArrayRef<Type> resultTypes) {
76cf33daSThomas Raoux  OperationState res(loc, op->getName().getStringRef(), operands, resultTypes,
76cf33daSThomas Raoux                     op->getAttrs());
76cf33daSThomas Raoux  return rewriter.create(res);
76cf33daSThomas Raoux}
76cf33daSThomas Raoux
d02f10d9SThomas Raouxnamespace {
d02f10d9SThomas Raoux
fa8a10a1SNicolas Vasilache/// Rewrite a WarpExecuteOnLane0Op into a predicated scf.if op where the single
fa8a10a1SNicolas Vasilache/// thread `laneId` executes the entirety of the computation.
fa8a10a1SNicolas Vasilache///
fa8a10a1SNicolas Vasilache/// After the transformation:
fa8a10a1SNicolas Vasilache///   - the IR within the scf.if op can be thought of as executing sequentially
fa8a10a1SNicolas Vasilache///     (from the point of view of threads along `laneId`).
fa8a10a1SNicolas Vasilache///   - the IR outside of the scf.if op can be thought of as executing in
fa8a10a1SNicolas Vasilache///     parallel (from the point of view of threads along `laneId`).
fa8a10a1SNicolas Vasilache///
fa8a10a1SNicolas Vasilache/// Values that need to transit through the parallel / sequential and the
fa8a10a1SNicolas Vasilache/// sequential / parallel boundaries do so via reads and writes to a temporary
fa8a10a1SNicolas Vasilache/// memory location.
fa8a10a1SNicolas Vasilache///
fa8a10a1SNicolas Vasilache/// The transformation proceeds in multiple steps:
fa8a10a1SNicolas Vasilache///   1. Create the scf.if op.
fa8a10a1SNicolas Vasilache///   2. Insert appropriate (alloc, write)-pairs before the scf.if and reads
fa8a10a1SNicolas Vasilache///      within the scf.if to transit the values captured from above.
fa8a10a1SNicolas Vasilache///   3. Synchronize before the scf.if to ensure all writes inserted in 2. are
fa8a10a1SNicolas Vasilache///      consistent within the scf.if.
fa8a10a1SNicolas Vasilache///   4. Move the body of the WarpExecuteOnLane0Op inside the scf.if.
fa8a10a1SNicolas Vasilache///   5. Insert appropriate writes within scf.if and reads after the scf.if to
fa8a10a1SNicolas Vasilache///      transit the values returned by the op.
fa8a10a1SNicolas Vasilache///   6. Synchronize after the scf.if to ensure all writes inserted in 5. are
fa8a10a1SNicolas Vasilache///      consistent after the scf.if.
fa8a10a1SNicolas Vasilache///   7. Perform late cleanups.
fa8a10a1SNicolas Vasilache///
fa8a10a1SNicolas Vasilache/// All this assumes the vector distribution occurs along the most minor
fa8a10a1SNicolas Vasilache/// distributed vector dimension.
*bc29fc93SPetr Kurapovstruct WarpOpToScfIfPattern : public WarpDistributionPattern {
4abb9e5dSThomas Raoux  WarpOpToScfIfPattern(MLIRContext *context,
d02f10d9SThomas Raoux                       const WarpExecuteOnLane0LoweringOptions &options,
d02f10d9SThomas Raoux                       PatternBenefit benefit = 1)
*bc29fc93SPetr Kurapov      : WarpDistributionPattern(context, benefit), options(options) {}
d02f10d9SThomas Raoux
d02f10d9SThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
d02f10d9SThomas Raoux                                PatternRewriter &rewriter) const override {
fa8a10a1SNicolas Vasilache    assert(warpOp.getBodyRegion().hasOneBlock() &&
fa8a10a1SNicolas Vasilache           "expected WarpOp with single block");
fa8a10a1SNicolas Vasilache    Block *warpOpBody = &warpOp.getBodyRegion().front();
fa8a10a1SNicolas Vasilache    Location loc = warpOp.getLoc();
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Passed all checks. Start rewriting.
fa8a10a1SNicolas Vasilache    OpBuilder::InsertionGuard g(rewriter);
fa8a10a1SNicolas Vasilache    rewriter.setInsertionPoint(warpOp);
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Step 1: Create scf.if op.
fa8a10a1SNicolas Vasilache    Value c0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
fa8a10a1SNicolas Vasilache    Value isLane0 = rewriter.create<arith::CmpIOp>(
fa8a10a1SNicolas Vasilache        loc, arith::CmpIPredicate::eq, warpOp.getLaneid(), c0);
fa8a10a1SNicolas Vasilache    auto ifOp = rewriter.create<scf::IfOp>(loc, isLane0,
fa8a10a1SNicolas Vasilache                                           /*withElseRegion=*/false);
fa8a10a1SNicolas Vasilache    rewriter.eraseOp(ifOp.thenBlock()->getTerminator());
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Step 2: insert appropriate (alloc, write)-pairs before the scf.if and
fa8a10a1SNicolas Vasilache    // reads within the scf.if to transit the values captured from above.
fa8a10a1SNicolas Vasilache    SmallVector<Value> bbArgReplacements;
fa8a10a1SNicolas Vasilache    for (const auto &it : llvm::enumerate(warpOp.getArgs())) {
fa8a10a1SNicolas Vasilache      Value sequentialVal = warpOpBody->getArgument(it.index());
fa8a10a1SNicolas Vasilache      Value distributedVal = it.value();
fa8a10a1SNicolas Vasilache      DistributedLoadStoreHelper helper(sequentialVal, distributedVal,
fa8a10a1SNicolas Vasilache                                        warpOp.getLaneid(), c0);
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache      // Create buffer before the ifOp.
fa8a10a1SNicolas Vasilache      rewriter.setInsertionPoint(ifOp);
fa8a10a1SNicolas Vasilache      Value buffer = options.warpAllocationFn(loc, rewriter, warpOp,
fa8a10a1SNicolas Vasilache                                              sequentialVal.getType());
fa8a10a1SNicolas Vasilache      // Store distributed vector into buffer, before the ifOp.
fa8a10a1SNicolas Vasilache      helper.buildStore(rewriter, loc, distributedVal, buffer);
fa8a10a1SNicolas Vasilache      // Load sequential vector from buffer, inside the ifOp.
fa8a10a1SNicolas Vasilache      rewriter.setInsertionPointToStart(ifOp.thenBlock());
4abb9e5dSThomas Raoux      bbArgReplacements.push_back(
4abb9e5dSThomas Raoux          helper.buildLoad(rewriter, loc, sequentialVal.getType(), buffer));
fa8a10a1SNicolas Vasilache    }
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Step 3. Insert sync after all the stores and before all the loads.
fa8a10a1SNicolas Vasilache    if (!warpOp.getArgs().empty()) {
fa8a10a1SNicolas Vasilache      rewriter.setInsertionPoint(ifOp);
fa8a10a1SNicolas Vasilache      options.warpSyncronizationFn(loc, rewriter, warpOp);
fa8a10a1SNicolas Vasilache    }
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Step 4. Move body of warpOp to ifOp.
fa8a10a1SNicolas Vasilache    rewriter.mergeBlocks(warpOpBody, ifOp.thenBlock(), bbArgReplacements);
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Step 5. Insert appropriate writes within scf.if and reads after the
fa8a10a1SNicolas Vasilache    // scf.if to transit the values returned by the op.
fa8a10a1SNicolas Vasilache    // TODO: at this point, we can reuse the shared memory from previous
fa8a10a1SNicolas Vasilache    // buffers.
fa8a10a1SNicolas Vasilache    SmallVector<Value> replacements;
ecaf2c33SPetr Kurapov    auto yieldOp = cast<gpu::YieldOp>(ifOp.thenBlock()->getTerminator());
fa8a10a1SNicolas Vasilache    Location yieldLoc = yieldOp.getLoc();
b74192b7SRiver Riddle    for (const auto &it : llvm::enumerate(yieldOp.getOperands())) {
fa8a10a1SNicolas Vasilache      Value sequentialVal = it.value();
fa8a10a1SNicolas Vasilache      Value distributedVal = warpOp->getResult(it.index());
fa8a10a1SNicolas Vasilache      DistributedLoadStoreHelper helper(sequentialVal, distributedVal,
fa8a10a1SNicolas Vasilache                                        warpOp.getLaneid(), c0);
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache      // Create buffer before the ifOp.
fa8a10a1SNicolas Vasilache      rewriter.setInsertionPoint(ifOp);
fa8a10a1SNicolas Vasilache      Value buffer = options.warpAllocationFn(loc, rewriter, warpOp,
fa8a10a1SNicolas Vasilache                                              sequentialVal.getType());
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache      // Store yielded value into buffer, inside the ifOp, before the
fa8a10a1SNicolas Vasilache      // terminator.
fa8a10a1SNicolas Vasilache      rewriter.setInsertionPoint(yieldOp);
fa8a10a1SNicolas Vasilache      helper.buildStore(rewriter, loc, sequentialVal, buffer);
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache      // Load distributed value from buffer, after  the warpOp.
fa8a10a1SNicolas Vasilache      rewriter.setInsertionPointAfter(ifOp);
fa8a10a1SNicolas Vasilache      // Result type and yielded value type are the same. This is a broadcast.
fa8a10a1SNicolas Vasilache      // E.g.:
ecaf2c33SPetr Kurapov      // %r = gpu.warp_execute_on_lane_0(...) -> (f32) {
ecaf2c33SPetr Kurapov      //   gpu.yield %cst : f32
fa8a10a1SNicolas Vasilache      // }
fa8a10a1SNicolas Vasilache      // Both types are f32. The constant %cst is broadcasted to all lanes.
fa8a10a1SNicolas Vasilache      // This is described in more detail in the documentation of the op.
4abb9e5dSThomas Raoux      replacements.push_back(
4abb9e5dSThomas Raoux          helper.buildLoad(rewriter, loc, distributedVal.getType(), buffer));
fa8a10a1SNicolas Vasilache    }
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Step 6. Insert sync after all the stores and before all the loads.
b74192b7SRiver Riddle    if (!yieldOp.getOperands().empty()) {
fa8a10a1SNicolas Vasilache      rewriter.setInsertionPointAfter(ifOp);
fa8a10a1SNicolas Vasilache      options.warpSyncronizationFn(loc, rewriter, warpOp);
fa8a10a1SNicolas Vasilache    }
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Step 7. Delete terminator and add empty scf.yield.
fa8a10a1SNicolas Vasilache    rewriter.eraseOp(yieldOp);
fa8a10a1SNicolas Vasilache    rewriter.setInsertionPointToEnd(ifOp.thenBlock());
fa8a10a1SNicolas Vasilache    rewriter.create<scf::YieldOp>(yieldLoc);
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    // Compute replacements for WarpOp results.
fa8a10a1SNicolas Vasilache    rewriter.replaceOp(warpOp, replacements);
fa8a10a1SNicolas Vasilache
fa8a10a1SNicolas Vasilache    return success();
d02f10d9SThomas Raoux  }
d02f10d9SThomas Raoux
d02f10d9SThomas Raouxprivate:
d02f10d9SThomas Raoux  const WarpExecuteOnLane0LoweringOptions &options;
d02f10d9SThomas Raoux};
d02f10d9SThomas Raoux
91f62f0eSThomas Raoux/// Return the distributed vector type based on the original type and the
91f62f0eSThomas Raoux/// distribution map. The map is expected to have a dimension equal to the
91f62f0eSThomas Raoux/// original type rank and should be a projection where the results are the
91f62f0eSThomas Raoux/// distributed dimensions. The number of results should be equal to the number
91f62f0eSThomas Raoux/// of warp sizes which is currently limited to 1.
91f62f0eSThomas Raoux/// Example: For a vector<16x32x64> distributed with a map(d0, d1, d2) -> (d1)
91f62f0eSThomas Raoux/// and a warp size of 16 would distribute the second dimension (associated to
91f62f0eSThomas Raoux/// d1) and return vector<16x2x64>
91f62f0eSThomas Raouxstatic VectorType getDistributedType(VectorType originalType, AffineMap map,
91f62f0eSThomas Raoux                                     int64_t warpSize) {
5262865aSKazu Hirata  SmallVector<int64_t> targetShape(originalType.getShape());
91f62f0eSThomas Raoux  for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
91f62f0eSThomas Raoux    unsigned position = map.getDimPosition(i);
c2b95292SQuinn Dawkins    if (targetShape[position] % warpSize != 0) {
c2b95292SQuinn Dawkins      if (warpSize % targetShape[position] != 0) {
91f62f0eSThomas Raoux        return VectorType();
c2b95292SQuinn Dawkins      }
c2b95292SQuinn Dawkins      warpSize /= targetShape[position];
c2b95292SQuinn Dawkins      targetShape[position] = 1;
c2b95292SQuinn Dawkins      continue;
c2b95292SQuinn Dawkins    }
91f62f0eSThomas Raoux    targetShape[position] = targetShape[position] / warpSize;
c2b95292SQuinn Dawkins    warpSize = 1;
c2b95292SQuinn Dawkins    break;
c2b95292SQuinn Dawkins  }
c2b95292SQuinn Dawkins  if (warpSize != 1) {
c2b95292SQuinn Dawkins    return VectorType();
91f62f0eSThomas Raoux  }
91f62f0eSThomas Raoux  VectorType targetType =
91f62f0eSThomas Raoux      VectorType::get(targetShape, originalType.getElementType());
91f62f0eSThomas Raoux  return targetType;
91f62f0eSThomas Raoux}
91f62f0eSThomas Raoux
/// Distribute transfer_write ops based on the affine map returned by
/// `distributionMapFn`. Writes of size more than `maxNumElementsToExtract`
/// will not be distributed (it should be less than the warp size).
///
/// Example:
/// ```
/// gpu.warp_execute_on_lane_0(%id){
///   ...
///   vector.transfer_write %v, %A[%c0] : vector<32xf32>, memref<128xf32>
///   gpu.yield
/// }
/// ```
/// To
/// ```
/// %r = gpu.warp_execute_on_lane_0(%id) -> (vector<1xf32>) {
///   ...
///   gpu.yield %v : vector<32xf32>
/// }
/// vector.transfer_write %r, %A[%id] : vector<1xf32>, memref<128xf32>
/// ```
*bc29fc93SPetr Kurapovstruct WarpOpTransferWrite : public WarpDistributionPattern {
ed0288f7SThomas Raoux  WarpOpTransferWrite(MLIRContext *ctx, DistributionMapFn fn,
80636227SJakub Kuderski                      unsigned maxNumElementsToExtract, PatternBenefit b = 1)
*bc29fc93SPetr Kurapov      : WarpDistributionPattern(ctx, b), distributionMapFn(std::move(fn)),
80636227SJakub Kuderski        maxNumElementsToExtract(maxNumElementsToExtract) {}
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux  /// Distribute the TransferWriteOp. Only 1D distributions and vector dims that
ed0288f7SThomas Raoux  /// are multiples of the distribution ratio are supported at the moment.
ed0288f7SThomas Raoux  LogicalResult tryDistributeOp(RewriterBase &rewriter,
ed0288f7SThomas Raoux                                vector::TransferWriteOp writeOp,
ed0288f7SThomas Raoux                                WarpExecuteOnLane0Op warpOp) const {
6a57d8fbSNicolas Vasilache    VectorType writtenVectorType = writeOp.getVectorType();
6a57d8fbSNicolas Vasilache
6a57d8fbSNicolas Vasilache    // 1. If the write is 0-D, we just clone it into a new WarpExecuteOnLane0Op
6a57d8fbSNicolas Vasilache    // to separate it from the rest.
6a57d8fbSNicolas Vasilache    if (writtenVectorType.getRank() == 0)
6a57d8fbSNicolas Vasilache      return failure();
6a57d8fbSNicolas Vasilache
91f62f0eSThomas Raoux    // 2. Compute the distributed type.
91f62f0eSThomas Raoux    AffineMap map = distributionMapFn(writeOp.getVector());
ed0288f7SThomas Raoux    VectorType targetType =
91f62f0eSThomas Raoux        getDistributedType(writtenVectorType, map, warpOp.getWarpSize());
91f62f0eSThomas Raoux    if (!targetType)
91f62f0eSThomas Raoux      return failure();
ed0288f7SThomas Raoux
25ec1fa9SQuinn Dawkins    // 2.5 Compute the distributed type for the new mask;
25ec1fa9SQuinn Dawkins    VectorType maskType;
25ec1fa9SQuinn Dawkins    if (writeOp.getMask()) {
25ec1fa9SQuinn Dawkins      // TODO: Distribution of masked writes with non-trivial permutation maps
25ec1fa9SQuinn Dawkins      // requires the distribution of the mask to elementwise match the
25ec1fa9SQuinn Dawkins      // distribution of the permuted written vector. Currently the details
25ec1fa9SQuinn Dawkins      // of which lane is responsible for which element is captured strictly
25ec1fa9SQuinn Dawkins      // by shape information on the warp op, and thus requires materializing
25ec1fa9SQuinn Dawkins      // the permutation in IR.
25ec1fa9SQuinn Dawkins      if (!writeOp.getPermutationMap().isMinorIdentity())
25ec1fa9SQuinn Dawkins        return failure();
25ec1fa9SQuinn Dawkins      maskType =
25ec1fa9SQuinn Dawkins          getDistributedType(writeOp.getMaskType(), map, warpOp.getWarpSize());
25ec1fa9SQuinn Dawkins    }
25ec1fa9SQuinn Dawkins
91f62f0eSThomas Raoux    // 3. clone the write into a new WarpExecuteOnLane0Op to separate it from
6a57d8fbSNicolas Vasilache    // the rest.
6a57d8fbSNicolas Vasilache    vector::TransferWriteOp newWriteOp =
25ec1fa9SQuinn Dawkins        cloneWriteOp(rewriter, warpOp, writeOp, targetType, maskType);
ed0288f7SThomas Raoux
91f62f0eSThomas Raoux    // 4. Reindex the write using the distribution map.
6a57d8fbSNicolas Vasilache    auto newWarpOp =
6a57d8fbSNicolas Vasilache        newWriteOp.getVector().getDefiningOp<WarpExecuteOnLane0Op>();
c2b95292SQuinn Dawkins
c2b95292SQuinn Dawkins    // Delinearize the lane id based on the way threads are divided across the
c2b95292SQuinn Dawkins    // vector. To get the number of threads per vector dimension, divide the
c2b95292SQuinn Dawkins    // sequential size by the distributed size along each dim.
ed0288f7SThomas Raoux    rewriter.setInsertionPoint(newWriteOp);
c2b95292SQuinn Dawkins    SmallVector<OpFoldResult> delinearizedIdSizes;
c2b95292SQuinn Dawkins    for (auto [seqSize, distSize] :
c2b95292SQuinn Dawkins         llvm::zip_equal(writtenVectorType.getShape(), targetType.getShape())) {
c2b95292SQuinn Dawkins      assert(seqSize % distSize == 0 && "Invalid distributed vector shape");
c2b95292SQuinn Dawkins      delinearizedIdSizes.push_back(rewriter.getIndexAttr(seqSize / distSize));
c2b95292SQuinn Dawkins    }
c2b95292SQuinn Dawkins    SmallVector<Value> delinearized;
c2b95292SQuinn Dawkins    if (map.getNumResults() > 1) {
c2b95292SQuinn Dawkins      delinearized = rewriter
c2b95292SQuinn Dawkins                         .create<mlir::affine::AffineDelinearizeIndexOp>(
c2b95292SQuinn Dawkins                             newWarpOp.getLoc(), newWarpOp.getLaneid(),
c2b95292SQuinn Dawkins                             delinearizedIdSizes)
c2b95292SQuinn Dawkins                         .getResults();
c2b95292SQuinn Dawkins    } else {
c2b95292SQuinn Dawkins      // If there is only one map result, we can elide the delinearization
c2b95292SQuinn Dawkins      // op and use the lane id directly.
c2b95292SQuinn Dawkins      delinearized.append(targetType.getRank(), newWarpOp.getLaneid());
c2b95292SQuinn Dawkins    }
c2b95292SQuinn Dawkins
ed0288f7SThomas Raoux    AffineMap indexMap = map.compose(newWriteOp.getPermutationMap());
ed0288f7SThomas Raoux    Location loc = newWriteOp.getLoc();
ed0288f7SThomas Raoux    SmallVector<Value> indices(newWriteOp.getIndices().begin(),
ed0288f7SThomas Raoux                               newWriteOp.getIndices().end());
ed0288f7SThomas Raoux    for (auto it : llvm::zip(indexMap.getResults(), map.getResults())) {
ed0288f7SThomas Raoux      AffineExpr d0, d1;
ed0288f7SThomas Raoux      bindDims(newWarpOp.getContext(), d0, d1);
1609f1c2Slong.chen      auto indexExpr = dyn_cast<AffineDimExpr>(std::get<0>(it));
ed0288f7SThomas Raoux      if (!indexExpr)
ed0288f7SThomas Raoux        continue;
ed0288f7SThomas Raoux      unsigned indexPos = indexExpr.getPosition();
1609f1c2Slong.chen      unsigned vectorPos = cast<AffineDimExpr>(std::get<1>(it)).getPosition();
c2b95292SQuinn Dawkins      Value laneId = delinearized[vectorPos];
91f62f0eSThomas Raoux      auto scale =
91f62f0eSThomas Raoux          rewriter.getAffineConstantExpr(targetType.getDimSize(vectorPos));
4c48f016SMatthias Springer      indices[indexPos] = affine::makeComposedAffineApply(
c2b95292SQuinn Dawkins          rewriter, loc, d0 + scale * d1, {indices[indexPos], laneId});
ed0288f7SThomas Raoux    }
ed0288f7SThomas Raoux    newWriteOp.getIndicesMutable().assign(indices);
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux    return success();
ed0288f7SThomas Raoux  }
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux  /// Extract TransferWriteOps of vector<1x> into a separate warp op.
ed0288f7SThomas Raoux  LogicalResult tryExtractOp(RewriterBase &rewriter,
ed0288f7SThomas Raoux                             vector::TransferWriteOp writeOp,
ed0288f7SThomas Raoux                             WarpExecuteOnLane0Op warpOp) const {
ed0288f7SThomas Raoux    Location loc = writeOp.getLoc();
ed0288f7SThomas Raoux    VectorType vecType = writeOp.getVectorType();
ed0288f7SThomas Raoux
80636227SJakub Kuderski    if (vecType.getNumElements() > maxNumElementsToExtract) {
80636227SJakub Kuderski      return rewriter.notifyMatchFailure(
80636227SJakub Kuderski          warpOp,
80636227SJakub Kuderski          llvm::formatv(
80636227SJakub Kuderski              "writes more elements ({0}) than allowed to extract ({1})",
80636227SJakub Kuderski              vecType.getNumElements(), maxNumElementsToExtract));
80636227SJakub Kuderski    }
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux    // Do not process warp ops that contain only TransferWriteOps.
971b8525SJakub Kuderski    if (llvm::all_of(warpOp.getOps(),
ecaf2c33SPetr Kurapov                     llvm::IsaPred<vector::TransferWriteOp, gpu::YieldOp>))
ed0288f7SThomas Raoux      return failure();
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux    SmallVector<Value> yieldValues = {writeOp.getVector()};
ed0288f7SThomas Raoux    SmallVector<Type> retTypes = {vecType};
d7d6443dSThomas Raoux    SmallVector<size_t> newRetIndices;
ed0288f7SThomas Raoux    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
d7d6443dSThomas Raoux        rewriter, warpOp, yieldValues, retTypes, newRetIndices);
ed0288f7SThomas Raoux    rewriter.setInsertionPointAfter(newWarpOp);
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux    // Create a second warp op that contains only writeOp.
ed0288f7SThomas Raoux    auto secondWarpOp = rewriter.create<WarpExecuteOnLane0Op>(
ed0288f7SThomas Raoux        loc, TypeRange(), newWarpOp.getLaneid(), newWarpOp.getWarpSize());
ed0288f7SThomas Raoux    Block &body = secondWarpOp.getBodyRegion().front();
ed0288f7SThomas Raoux    rewriter.setInsertionPointToStart(&body);
ed0288f7SThomas Raoux    auto newWriteOp =
ed0288f7SThomas Raoux        cast<vector::TransferWriteOp>(rewriter.clone(*writeOp.getOperation()));
d7d6443dSThomas Raoux    newWriteOp.getVectorMutable().assign(newWarpOp.getResult(newRetIndices[0]));
ed0288f7SThomas Raoux    rewriter.eraseOp(writeOp);
ecaf2c33SPetr Kurapov    rewriter.create<gpu::YieldOp>(newWarpOp.getLoc());
ed0288f7SThomas Raoux    return success();
ed0288f7SThomas Raoux  }
ed0288f7SThomas Raoux
df49a97aSQuinn Dawkins  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
ed0288f7SThomas Raoux                                PatternRewriter &rewriter) const override {
ecaf2c33SPetr Kurapov    auto yield = cast<gpu::YieldOp>(
df49a97aSQuinn Dawkins        warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
df49a97aSQuinn Dawkins    Operation *lastNode = yield->getPrevNode();
df49a97aSQuinn Dawkins    auto writeOp = dyn_cast_or_null<vector::TransferWriteOp>(lastNode);
df49a97aSQuinn Dawkins    if (!writeOp)
ed0288f7SThomas Raoux      return failure();
ed0288f7SThomas Raoux
25ec1fa9SQuinn Dawkins    Value maybeMask = writeOp.getMask();
ed0288f7SThomas Raoux    if (!llvm::all_of(writeOp->getOperands(), [&](Value value) {
ed0288f7SThomas Raoux          return writeOp.getVector() == value ||
25ec1fa9SQuinn Dawkins                 (maybeMask && maybeMask == value) ||
ed0288f7SThomas Raoux                 warpOp.isDefinedOutsideOfRegion(value);
ed0288f7SThomas Raoux        }))
ed0288f7SThomas Raoux      return failure();
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux    if (succeeded(tryDistributeOp(rewriter, writeOp, warpOp)))
ed0288f7SThomas Raoux      return success();
ed0288f7SThomas Raoux
25ec1fa9SQuinn Dawkins    // Masked writes not supported for extraction.
25ec1fa9SQuinn Dawkins    if (writeOp.getMask())
25ec1fa9SQuinn Dawkins      return failure();
25ec1fa9SQuinn Dawkins
ed0288f7SThomas Raoux    if (succeeded(tryExtractOp(rewriter, writeOp, warpOp)))
ed0288f7SThomas Raoux      return success();
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux    return failure();
ed0288f7SThomas Raoux  }
ed0288f7SThomas Raoux
ed0288f7SThomas Raouxprivate:
*bc29fc93SPetr Kurapov  /// Clone `writeOp` assumed to be nested under `warpOp` into a new warp
*bc29fc93SPetr Kurapov  /// execute op with the proper return type. The new write op is updated to
*bc29fc93SPetr Kurapov  /// write the result of the new warp execute op. The old `writeOp` is deleted.
*bc29fc93SPetr Kurapov  vector::TransferWriteOp cloneWriteOp(RewriterBase &rewriter,
*bc29fc93SPetr Kurapov                                       WarpExecuteOnLane0Op warpOp,
*bc29fc93SPetr Kurapov                                       vector::TransferWriteOp writeOp,
*bc29fc93SPetr Kurapov                                       VectorType targetType,
*bc29fc93SPetr Kurapov                                       VectorType maybeMaskType) const {
*bc29fc93SPetr Kurapov    assert(writeOp->getParentOp() == warpOp &&
*bc29fc93SPetr Kurapov           "write must be nested immediately under warp");
*bc29fc93SPetr Kurapov    OpBuilder::InsertionGuard g(rewriter);
*bc29fc93SPetr Kurapov    SmallVector<size_t> newRetIndices;
*bc29fc93SPetr Kurapov    WarpExecuteOnLane0Op newWarpOp;
*bc29fc93SPetr Kurapov    if (maybeMaskType) {
*bc29fc93SPetr Kurapov      newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
*bc29fc93SPetr Kurapov          rewriter, warpOp, ValueRange{writeOp.getVector(), writeOp.getMask()},
*bc29fc93SPetr Kurapov          TypeRange{targetType, maybeMaskType}, newRetIndices);
*bc29fc93SPetr Kurapov    } else {
*bc29fc93SPetr Kurapov      newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
*bc29fc93SPetr Kurapov          rewriter, warpOp, ValueRange{{writeOp.getVector()}},
*bc29fc93SPetr Kurapov          TypeRange{targetType}, newRetIndices);
*bc29fc93SPetr Kurapov    }
*bc29fc93SPetr Kurapov    rewriter.setInsertionPointAfter(newWarpOp);
*bc29fc93SPetr Kurapov    auto newWriteOp =
*bc29fc93SPetr Kurapov        cast<vector::TransferWriteOp>(rewriter.clone(*writeOp.getOperation()));
*bc29fc93SPetr Kurapov    rewriter.eraseOp(writeOp);
*bc29fc93SPetr Kurapov    newWriteOp.getVectorMutable().assign(newWarpOp.getResult(newRetIndices[0]));
*bc29fc93SPetr Kurapov    if (maybeMaskType)
*bc29fc93SPetr Kurapov      newWriteOp.getMaskMutable().assign(newWarpOp.getResult(newRetIndices[1]));
*bc29fc93SPetr Kurapov    return newWriteOp;
*bc29fc93SPetr Kurapov  }
*bc29fc93SPetr Kurapov
ed0288f7SThomas Raoux  DistributionMapFn distributionMapFn;
80636227SJakub Kuderski  unsigned maxNumElementsToExtract = 1;
ed0288f7SThomas Raoux};
ed0288f7SThomas Raoux
76cf33daSThomas Raoux/// Sink out elementwise op feeding into a warp op yield.
76cf33daSThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
76cf33daSThomas Raoux///   ...
76cf33daSThomas Raoux///   %3 = arith.addf %1, %2 : vector<32xf32>
ecaf2c33SPetr Kurapov///   gpu.yield %3 : vector<32xf32>
76cf33daSThomas Raoux/// }
76cf33daSThomas Raoux/// ```
76cf33daSThomas Raoux/// To
76cf33daSThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// %r:3 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
76cf33daSThomas Raoux/// vector<1xf32>, vector<1xf32>) {
76cf33daSThomas Raoux///   ...
76cf33daSThomas Raoux///   %4 = arith.addf %2, %3 : vector<32xf32>
ecaf2c33SPetr Kurapov///   gpu.yield %4, %2, %3 : vector<32xf32>, vector<32xf32>,
76cf33daSThomas Raoux///   vector<32xf32>
76cf33daSThomas Raoux/// }
76cf33daSThomas Raoux/// %0 = arith.addf %r#1, %r#2 : vector<1xf32>
*bc29fc93SPetr Kurapovstruct WarpOpElementwise : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
76cf33daSThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
76cf33daSThomas Raoux                                PatternRewriter &rewriter) const override {
76cf33daSThomas Raoux    OpOperand *yieldOperand = getWarpResult(warpOp, [](Operation *op) {
76cf33daSThomas Raoux      return OpTrait::hasElementwiseMappableTraits(op);
76cf33daSThomas Raoux    });
76cf33daSThomas Raoux    if (!yieldOperand)
76cf33daSThomas Raoux      return failure();
aa2376a0SQuinn Dawkins
76cf33daSThomas Raoux    Operation *elementWise = yieldOperand->get().getDefiningOp();
76cf33daSThomas Raoux    unsigned operandIndex = yieldOperand->getOperandNumber();
76cf33daSThomas Raoux    Value distributedVal = warpOp.getResult(operandIndex);
76cf33daSThomas Raoux    SmallVector<Value> yieldValues;
76cf33daSThomas Raoux    SmallVector<Type> retTypes;
76cf33daSThomas Raoux    Location loc = warpOp.getLoc();
76cf33daSThomas Raoux    for (OpOperand &operand : elementWise->getOpOperands()) {
76cf33daSThomas Raoux      Type targetType;
5550c821STres Popp      if (auto vecType = dyn_cast<VectorType>(distributedVal.getType())) {
76cf33daSThomas Raoux        // If the result type is a vector, the operands must also be vectors.
5550c821STres Popp        auto operandType = cast<VectorType>(operand.get().getType());
76cf33daSThomas Raoux        targetType =
76cf33daSThomas Raoux            VectorType::get(vecType.getShape(), operandType.getElementType());
76cf33daSThomas Raoux      } else {
76cf33daSThomas Raoux        auto operandType = operand.get().getType();
5550c821STres Popp        assert(!isa<VectorType>(operandType) &&
76cf33daSThomas Raoux               "unexpected yield of vector from op with scalar result type");
76cf33daSThomas Raoux        targetType = operandType;
76cf33daSThomas Raoux      }
76cf33daSThomas Raoux      retTypes.push_back(targetType);
76cf33daSThomas Raoux      yieldValues.push_back(operand.get());
76cf33daSThomas Raoux    }
d7d6443dSThomas Raoux    SmallVector<size_t> newRetIndices;
76cf33daSThomas Raoux    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
d7d6443dSThomas Raoux        rewriter, warpOp, yieldValues, retTypes, newRetIndices);
76cf33daSThomas Raoux    rewriter.setInsertionPointAfter(newWarpOp);
76cf33daSThomas Raoux    SmallVector<Value> newOperands(elementWise->getOperands().begin(),
76cf33daSThomas Raoux                                   elementWise->getOperands().end());
76cf33daSThomas Raoux    for (unsigned i : llvm::seq(unsigned(0), elementWise->getNumOperands())) {
d7d6443dSThomas Raoux      newOperands[i] = newWarpOp.getResult(newRetIndices[i]);
76cf33daSThomas Raoux    }
76cf33daSThomas Raoux    OpBuilder::InsertionGuard g(rewriter);
76cf33daSThomas Raoux    rewriter.setInsertionPointAfter(newWarpOp);
76cf33daSThomas Raoux    Operation *newOp = cloneOpWithOperandsAndTypes(
76cf33daSThomas Raoux        rewriter, loc, elementWise, newOperands,
76cf33daSThomas Raoux        {newWarpOp.getResult(operandIndex).getType()});
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIndex),
7ecc921dSMatthias Springer                                newOp->getResult(0));
76cf33daSThomas Raoux    return success();
76cf33daSThomas Raoux  }
76cf33daSThomas Raoux};
76cf33daSThomas Raoux
0af26805SThomas Raoux/// Sink out splat constant op feeding into a warp op yield.
0af26805SThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
0af26805SThomas Raoux///   ...
0af26805SThomas Raoux///   %cst = arith.constant dense<2.0> : vector<32xf32>
ecaf2c33SPetr Kurapov///   gpu.yield %cst : vector<32xf32>
0af26805SThomas Raoux/// }
0af26805SThomas Raoux/// ```
0af26805SThomas Raoux/// To
0af26805SThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// gpu.warp_execute_on_lane_0(%arg0 {
0af26805SThomas Raoux///   ...
0af26805SThomas Raoux/// }
0af26805SThomas Raoux/// %0 = arith.constant dense<2.0> : vector<1xf32>
*bc29fc93SPetr Kurapovstruct WarpOpConstant : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
0af26805SThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
0af26805SThomas Raoux                                PatternRewriter &rewriter) const override {
971b8525SJakub Kuderski    OpOperand *yieldOperand =
971b8525SJakub Kuderski        getWarpResult(warpOp, llvm::IsaPred<arith::ConstantOp>);
0af26805SThomas Raoux    if (!yieldOperand)
0af26805SThomas Raoux      return failure();
0af26805SThomas Raoux    auto constantOp = yieldOperand->get().getDefiningOp<arith::ConstantOp>();
5550c821STres Popp    auto dense = dyn_cast<SplatElementsAttr>(constantOp.getValue());
0af26805SThomas Raoux    if (!dense)
0af26805SThomas Raoux      return failure();
aa2376a0SQuinn Dawkins    // Notify the rewriter that the warp op is changing (see the comment on
aa2376a0SQuinn Dawkins    // the WarpOpTransferRead pattern).
5fcf907bSMatthias Springer    rewriter.startOpModification(warpOp);
0af26805SThomas Raoux    unsigned operandIndex = yieldOperand->getOperandNumber();
0af26805SThomas Raoux    Attribute scalarAttr = dense.getSplatValue<Attribute>();
6089d612SRahul Kayaith    auto newAttr = DenseElementsAttr::get(
6089d612SRahul Kayaith        cast<ShapedType>(warpOp.getResult(operandIndex).getType()), scalarAttr);
0af26805SThomas Raoux    Location loc = warpOp.getLoc();
0af26805SThomas Raoux    rewriter.setInsertionPointAfter(warpOp);
0af26805SThomas Raoux    Value distConstant = rewriter.create<arith::ConstantOp>(loc, newAttr);
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(warpOp.getResult(operandIndex), distConstant);
5fcf907bSMatthias Springer    rewriter.finalizeOpModification(warpOp);
0af26805SThomas Raoux    return success();
0af26805SThomas Raoux  }
0af26805SThomas Raoux};
0af26805SThomas Raoux
76cf33daSThomas Raoux/// Sink out transfer_read op feeding into a warp op yield.
76cf33daSThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
76cf33daSThomas Raoux///   ...
76cf33daSThomas Raoux//    %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>,
76cf33daSThomas Raoux//    vector<32xf32>
ecaf2c33SPetr Kurapov///   gpu.yield %2 : vector<32xf32>
76cf33daSThomas Raoux/// }
76cf33daSThomas Raoux/// ```
76cf33daSThomas Raoux/// To
76cf33daSThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// %dead = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
76cf33daSThomas Raoux/// vector<1xf32>, vector<1xf32>) {
76cf33daSThomas Raoux///   ...
76cf33daSThomas Raoux///   %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>,
ecaf2c33SPetr Kurapov///   vector<32xf32> gpu.yield %2 : vector<32xf32>
76cf33daSThomas Raoux/// }
76cf33daSThomas Raoux/// %0 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<1xf32>
*bc29fc93SPetr Kurapovstruct WarpOpTransferRead : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
76cf33daSThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
76cf33daSThomas Raoux                                PatternRewriter &rewriter) const override {
7360d5d3SQuinn Dawkins    // Try to find a distributable yielded read. Note that this pattern can
7360d5d3SQuinn Dawkins    // still fail at the end after distribution, in which case this might have
7360d5d3SQuinn Dawkins    // missed another distributable read.
7360d5d3SQuinn Dawkins    OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
7360d5d3SQuinn Dawkins      // Don't duplicate transfer_read ops when distributing.
7360d5d3SQuinn Dawkins      return isa<vector::TransferReadOp>(op) && op->hasOneUse();
7360d5d3SQuinn Dawkins    });
76cf33daSThomas Raoux    if (!operand)
35c19fddSMatthias Springer      return rewriter.notifyMatchFailure(
35c19fddSMatthias Springer          warpOp, "warp result is not a vector.transfer_read op");
76cf33daSThomas Raoux    auto read = operand->get().getDefiningOp<vector::TransferReadOp>();
7360d5d3SQuinn Dawkins
35c19fddSMatthias Springer    // Source must be defined outside of the region.
35c19fddSMatthias Springer    if (!warpOp.isDefinedOutsideOfRegion(read.getSource()))
35c19fddSMatthias Springer      return rewriter.notifyMatchFailure(
35c19fddSMatthias Springer          read, "source must be defined outside of the region");
35c19fddSMatthias Springer
76cf33daSThomas Raoux    unsigned operandIndex = operand->getOperandNumber();
76cf33daSThomas Raoux    Value distributedVal = warpOp.getResult(operandIndex);
76cf33daSThomas Raoux
76cf33daSThomas Raoux    SmallVector<Value, 4> indices(read.getIndices().begin(),
76cf33daSThomas Raoux                                  read.getIndices().end());
5550c821STres Popp    auto sequentialType = cast<VectorType>(read.getResult().getType());
5550c821STres Popp    auto distributedType = cast<VectorType>(distributedVal.getType());
4abb9e5dSThomas Raoux    AffineMap map = calculateImplicitMap(sequentialType, distributedType);
76cf33daSThomas Raoux    AffineMap indexMap = map.compose(read.getPermutationMap());
771f5759SQuinn Dawkins
35c19fddSMatthias Springer    // Try to delinearize the lane ID to match the rank expected for
35c19fddSMatthias Springer    // distribution.
35c19fddSMatthias Springer    SmallVector<Value> delinearizedIds;
35c19fddSMatthias Springer    if (!delinearizeLaneId(rewriter, read.getLoc(), sequentialType.getShape(),
35c19fddSMatthias Springer                           distributedType.getShape(), warpOp.getWarpSize(),
35c19fddSMatthias Springer                           warpOp.getLaneid(), delinearizedIds)) {
35c19fddSMatthias Springer      return rewriter.notifyMatchFailure(
35c19fddSMatthias Springer          read, "cannot delinearize lane ID for distribution");
35c19fddSMatthias Springer    }
35c19fddSMatthias Springer    assert(!delinearizedIds.empty() || map.getNumResults() == 0);
35c19fddSMatthias Springer
35c19fddSMatthias Springer    // Distribute indices and the mask (if present).
76cf33daSThomas Raoux    OpBuilder::InsertionGuard g(rewriter);
35c19fddSMatthias Springer    SmallVector<Value> additionalResults(indices.begin(), indices.end());
35c19fddSMatthias Springer    SmallVector<Type> additionalResultTypes(indices.size(),
35c19fddSMatthias Springer                                            rewriter.getIndexType());
35c19fddSMatthias Springer    additionalResults.push_back(read.getPadding());
35c19fddSMatthias Springer    additionalResultTypes.push_back(read.getPadding().getType());
35c19fddSMatthias Springer
aa2376a0SQuinn Dawkins    bool hasMask = false;
771f5759SQuinn Dawkins    if (read.getMask()) {
aa2376a0SQuinn Dawkins      hasMask = true;
771f5759SQuinn Dawkins      // TODO: Distribution of masked reads with non-trivial permutation maps
771f5759SQuinn Dawkins      // requires the distribution of the mask to elementwise match the
771f5759SQuinn Dawkins      // distribution of the permuted written vector. Currently the details
771f5759SQuinn Dawkins      // of which lane is responsible for which element is captured strictly
771f5759SQuinn Dawkins      // by shape information on the warp op, and thus requires materializing
771f5759SQuinn Dawkins      // the permutation in IR.
f385f6c9SQuinn Dawkins      if (!mlir::compressUnusedDims(read.getPermutationMap()).isIdentity())
35c19fddSMatthias Springer        return rewriter.notifyMatchFailure(
35c19fddSMatthias Springer            read, "non-trivial permutation maps not supported");
771f5759SQuinn Dawkins      VectorType maskType =
771f5759SQuinn Dawkins          getDistributedType(read.getMaskType(), map, warpOp.getWarpSize());
35c19fddSMatthias Springer      additionalResults.push_back(read.getMask());
35c19fddSMatthias Springer      additionalResultTypes.push_back(maskType);
771f5759SQuinn Dawkins    }
771f5759SQuinn Dawkins
35c19fddSMatthias Springer    SmallVector<size_t> newRetIndices;
35c19fddSMatthias Springer    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
35c19fddSMatthias Springer        rewriter, warpOp, additionalResults, additionalResultTypes,
35c19fddSMatthias Springer        newRetIndices);
35c19fddSMatthias Springer    distributedVal = newWarpOp.getResult(operandIndex);
35c19fddSMatthias Springer
35c19fddSMatthias Springer    // Distributed indices were appended first.
35c19fddSMatthias Springer    SmallVector<Value> newIndices;
35c19fddSMatthias Springer    for (int64_t i = 0, e = indices.size(); i < e; ++i)
35c19fddSMatthias Springer      newIndices.push_back(newWarpOp.getResult(newRetIndices[i]));
35c19fddSMatthias Springer
771f5759SQuinn Dawkins    rewriter.setInsertionPointAfter(newWarpOp);
199442eaSLei Zhang    for (auto it : llvm::zip_equal(indexMap.getResults(), map.getResults())) {
76cf33daSThomas Raoux      AffineExpr d0, d1;
76cf33daSThomas Raoux      bindDims(read.getContext(), d0, d1);
1609f1c2Slong.chen      auto indexExpr = dyn_cast<AffineDimExpr>(std::get<0>(it));
76cf33daSThomas Raoux      if (!indexExpr)
76cf33daSThomas Raoux        continue;
76cf33daSThomas Raoux      unsigned indexPos = indexExpr.getPosition();
1609f1c2Slong.chen      unsigned vectorPos = cast<AffineDimExpr>(std::get<1>(it)).getPosition();
73ddc447SLei Zhang      int64_t scale = distributedType.getDimSize(vectorPos);
35c19fddSMatthias Springer      newIndices[indexPos] = affine::makeComposedAffineApply(
4c48f016SMatthias Springer          rewriter, read.getLoc(), d0 + scale * d1,
35c19fddSMatthias Springer          {newIndices[indexPos], delinearizedIds[vectorPos]});
76cf33daSThomas Raoux    }
35c19fddSMatthias Springer
35c19fddSMatthias Springer    // Distributed padding value was appended right after the indices.
35c19fddSMatthias Springer    Value newPadding = newWarpOp.getResult(newRetIndices[indices.size()]);
35c19fddSMatthias Springer    // Distributed mask value was added at the end (if the op has a mask).
35c19fddSMatthias Springer    Value newMask =
35c19fddSMatthias Springer        hasMask ? newWarpOp.getResult(newRetIndices[newRetIndices.size() - 1])
35c19fddSMatthias Springer                : Value();
018d8ac9SQuentin Colombet    auto newRead = rewriter.create<vector::TransferReadOp>(
35c19fddSMatthias Springer        read.getLoc(), distributedVal.getType(), read.getSource(), newIndices,
35c19fddSMatthias Springer        read.getPermutationMapAttr(), newPadding, newMask,
76cf33daSThomas Raoux        read.getInBoundsAttr());
018d8ac9SQuentin Colombet
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(distributedVal, newRead);
76cf33daSThomas Raoux    return success();
76cf33daSThomas Raoux  }
76cf33daSThomas Raoux};
76cf33daSThomas Raoux
76cf33daSThomas Raoux/// Remove any result that has no use along with the matching yieldOp operand.
76cf33daSThomas Raoux// TODO: Move this in WarpExecuteOnLane0Op canonicalization.
*bc29fc93SPetr Kurapovstruct WarpOpDeadResult : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
76cf33daSThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
76cf33daSThomas Raoux                                PatternRewriter &rewriter) const override {
20df17fdSNicolas Vasilache    SmallVector<Type> newResultTypes;
20df17fdSNicolas Vasilache    newResultTypes.reserve(warpOp->getNumResults());
20df17fdSNicolas Vasilache    SmallVector<Value> newYieldValues;
20df17fdSNicolas Vasilache    newYieldValues.reserve(warpOp->getNumResults());
20df17fdSNicolas Vasilache    DenseMap<Value, int64_t> dedupYieldOperandPositionMap;
20df17fdSNicolas Vasilache    DenseMap<OpResult, int64_t> dedupResultPositionMap;
ecaf2c33SPetr Kurapov    auto yield = cast<gpu::YieldOp>(
76cf33daSThomas Raoux        warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
20df17fdSNicolas Vasilache
20df17fdSNicolas Vasilache    // Some values may be yielded multiple times and correspond to multiple
20df17fdSNicolas Vasilache    // results. Deduplicating occurs by taking each result with its matching
20df17fdSNicolas Vasilache    // yielded value, and:
20df17fdSNicolas Vasilache    //   1. recording the unique first position at which the value is yielded.
20df17fdSNicolas Vasilache    //   2. recording for the result, the first position at which the dedup'ed
20df17fdSNicolas Vasilache    //      value is yielded.
20df17fdSNicolas Vasilache    //   3. skipping from the new result types / new yielded values any result
20df17fdSNicolas Vasilache    //      that has no use or whose yielded value has already been seen.
76cf33daSThomas Raoux    for (OpResult result : warpOp.getResults()) {
20df17fdSNicolas Vasilache      Value yieldOperand = yield.getOperand(result.getResultNumber());
20df17fdSNicolas Vasilache      auto it = dedupYieldOperandPositionMap.insert(
20df17fdSNicolas Vasilache          std::make_pair(yieldOperand, newResultTypes.size()));
20df17fdSNicolas Vasilache      dedupResultPositionMap.insert(std::make_pair(result, it.first->second));
20df17fdSNicolas Vasilache      if (result.use_empty() || !it.second)
76cf33daSThomas Raoux        continue;
20df17fdSNicolas Vasilache      newResultTypes.push_back(result.getType());
20df17fdSNicolas Vasilache      newYieldValues.push_back(yieldOperand);
76cf33daSThomas Raoux    }
20df17fdSNicolas Vasilache    // No modification, exit early.
20df17fdSNicolas Vasilache    if (yield.getNumOperands() == newYieldValues.size())
76cf33daSThomas Raoux      return failure();
20df17fdSNicolas Vasilache    // Move the body of the old warpOp to a new warpOp.
76cf33daSThomas Raoux    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
20df17fdSNicolas Vasilache        rewriter, warpOp, newYieldValues, newResultTypes);
7360d5d3SQuinn Dawkins
7360d5d3SQuinn Dawkins    // Simplify the new warp op after dropping dead results.
7360d5d3SQuinn Dawkins    newWarpOp.getBody()->walk([&](Operation *op) {
7360d5d3SQuinn Dawkins      if (isOpTriviallyDead(op))
7360d5d3SQuinn Dawkins        rewriter.eraseOp(op);
7360d5d3SQuinn Dawkins    });
7360d5d3SQuinn Dawkins
20df17fdSNicolas Vasilache    // Replace results of the old warpOp by the new, deduplicated results.
20df17fdSNicolas Vasilache    SmallVector<Value> newValues;
20df17fdSNicolas Vasilache    newValues.reserve(warpOp->getNumResults());
76cf33daSThomas Raoux    for (OpResult result : warpOp.getResults()) {
76cf33daSThomas Raoux      if (result.use_empty())
20df17fdSNicolas Vasilache        newValues.push_back(Value());
20df17fdSNicolas Vasilache      else
20df17fdSNicolas Vasilache        newValues.push_back(
20df17fdSNicolas Vasilache            newWarpOp.getResult(dedupResultPositionMap.lookup(result)));
76cf33daSThomas Raoux    }
20df17fdSNicolas Vasilache    rewriter.replaceOp(warpOp, newValues);
76cf33daSThomas Raoux    return success();
76cf33daSThomas Raoux  }
76cf33daSThomas Raoux};
76cf33daSThomas Raoux
76cf33daSThomas Raoux// If an operand is directly yielded out of the region we can forward it
76cf33daSThomas Raoux// directly and it doesn't need to go through the region.
*bc29fc93SPetr Kurapovstruct WarpOpForwardOperand : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
76cf33daSThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
76cf33daSThomas Raoux                                PatternRewriter &rewriter) const override {
76cf33daSThomas Raoux    SmallVector<Type> resultTypes;
76cf33daSThomas Raoux    SmallVector<Value> yieldValues;
ecaf2c33SPetr Kurapov    auto yield = cast<gpu::YieldOp>(
76cf33daSThomas Raoux        warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
76cf33daSThomas Raoux    Value valForwarded;
76cf33daSThomas Raoux    unsigned resultIndex;
76cf33daSThomas Raoux    for (OpOperand &operand : yield->getOpOperands()) {
76cf33daSThomas Raoux      Value result = warpOp.getResult(operand.getOperandNumber());
76cf33daSThomas Raoux      if (result.use_empty())
76cf33daSThomas Raoux        continue;
76cf33daSThomas Raoux
76cf33daSThomas Raoux      // Assume all the values coming from above are uniform.
76cf33daSThomas Raoux      if (!warpOp.getBodyRegion().isAncestor(operand.get().getParentRegion())) {
76cf33daSThomas Raoux        if (result.getType() != operand.get().getType())
76cf33daSThomas Raoux          continue;
76cf33daSThomas Raoux        valForwarded = operand.get();
76cf33daSThomas Raoux        resultIndex = operand.getOperandNumber();
76cf33daSThomas Raoux        break;
76cf33daSThomas Raoux      }
5550c821STres Popp      auto arg = dyn_cast<BlockArgument>(operand.get());
76cf33daSThomas Raoux      if (!arg || arg.getOwner()->getParentOp() != warpOp.getOperation())
76cf33daSThomas Raoux        continue;
76cf33daSThomas Raoux      Value warpOperand = warpOp.getArgs()[arg.getArgNumber()];
76cf33daSThomas Raoux      if (result.getType() != warpOperand.getType())
76cf33daSThomas Raoux        continue;
76cf33daSThomas Raoux      valForwarded = warpOperand;
76cf33daSThomas Raoux      resultIndex = operand.getOperandNumber();
76cf33daSThomas Raoux      break;
76cf33daSThomas Raoux    }
76cf33daSThomas Raoux    if (!valForwarded)
76cf33daSThomas Raoux      return failure();
aa2376a0SQuinn Dawkins    // Notify the rewriter that the warp op is changing (see the comment on
aa2376a0SQuinn Dawkins    // the WarpOpTransferRead pattern).
5fcf907bSMatthias Springer    rewriter.startOpModification(warpOp);
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(warpOp.getResult(resultIndex), valForwarded);
5fcf907bSMatthias Springer    rewriter.finalizeOpModification(warpOp);
76cf33daSThomas Raoux    return success();
76cf33daSThomas Raoux  }
76cf33daSThomas Raoux};
76cf33daSThomas Raoux
*bc29fc93SPetr Kurapovstruct WarpOpBroadcast : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
76cf33daSThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
76cf33daSThomas Raoux                                PatternRewriter &rewriter) const override {
971b8525SJakub Kuderski    OpOperand *operand =
971b8525SJakub Kuderski        getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>);
76cf33daSThomas Raoux    if (!operand)
76cf33daSThomas Raoux      return failure();
76cf33daSThomas Raoux    unsigned int operandNumber = operand->getOperandNumber();
76cf33daSThomas Raoux    auto broadcastOp = operand->get().getDefiningOp<vector::BroadcastOp>();
76cf33daSThomas Raoux    Location loc = broadcastOp.getLoc();
76cf33daSThomas Raoux    auto destVecType =
5550c821STres Popp        cast<VectorType>(warpOp->getResultTypes()[operandNumber]);
1dd00d39SQuentin Colombet    Value broadcastSrc = broadcastOp.getSource();
1dd00d39SQuentin Colombet    Type broadcastSrcType = broadcastSrc.getType();
1dd00d39SQuentin Colombet
1dd00d39SQuentin Colombet    // Check that the broadcast actually spans a set of values uniformly across
1dd00d39SQuentin Colombet    // all threads. In other words, check that each thread can reconstruct
1dd00d39SQuentin Colombet    // their own broadcast.
1dd00d39SQuentin Colombet    // For that we simply check that the broadcast we want to build makes sense.
1dd00d39SQuentin Colombet    if (vector::isBroadcastableTo(broadcastSrcType, destVecType) !=
1dd00d39SQuentin Colombet        vector::BroadcastableToResult::Success)
1dd00d39SQuentin Colombet      return failure();
d7d6443dSThomas Raoux    SmallVector<size_t> newRetIndices;
76cf33daSThomas Raoux    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1dd00d39SQuentin Colombet        rewriter, warpOp, {broadcastSrc}, {broadcastSrcType}, newRetIndices);
76cf33daSThomas Raoux    rewriter.setInsertionPointAfter(newWarpOp);
76cf33daSThomas Raoux    Value broadcasted = rewriter.create<vector::BroadcastOp>(
d7d6443dSThomas Raoux        loc, destVecType, newWarpOp->getResult(newRetIndices[0]));
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber),
7ecc921dSMatthias Springer                                broadcasted);
76cf33daSThomas Raoux    return success();
76cf33daSThomas Raoux  }
76cf33daSThomas Raoux};
76cf33daSThomas Raoux
73ddc447SLei Zhang/// Pattern to move shape cast out of the warp op. shape cast is basically a
73ddc447SLei Zhang/// no-op for warp distribution; we need to handle the shape though.
*bc29fc93SPetr Kurapovstruct WarpOpShapeCast : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
73ddc447SLei Zhang  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
73ddc447SLei Zhang                                PatternRewriter &rewriter) const override {
971b8525SJakub Kuderski    OpOperand *operand =
971b8525SJakub Kuderski        getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
73ddc447SLei Zhang    if (!operand)
73ddc447SLei Zhang      return failure();
aa2376a0SQuinn Dawkins
73ddc447SLei Zhang    auto oldCastOp = operand->get().getDefiningOp<vector::ShapeCastOp>();
73ddc447SLei Zhang
73ddc447SLei Zhang    unsigned int operandNumber = operand->getOperandNumber();
73ddc447SLei Zhang    auto castDistributedType =
73ddc447SLei Zhang        cast<VectorType>(warpOp->getResultTypes()[operandNumber]);
73ddc447SLei Zhang    VectorType castOriginalType = oldCastOp.getSourceVectorType();
73ddc447SLei Zhang    VectorType castResultType = castDistributedType;
73ddc447SLei Zhang
73ddc447SLei Zhang    // We expect the distributed type to have a smaller rank than the original
73ddc447SLei Zhang    // type. Prepend with size-one dimensions to make them the same.
73ddc447SLei Zhang    unsigned castDistributedRank = castDistributedType.getRank();
73ddc447SLei Zhang    unsigned castOriginalRank = castOriginalType.getRank();
73ddc447SLei Zhang    if (castDistributedRank < castOriginalRank) {
73ddc447SLei Zhang      SmallVector<int64_t> shape(castOriginalRank - castDistributedRank, 1);
73ddc447SLei Zhang      llvm::append_range(shape, castDistributedType.getShape());
73ddc447SLei Zhang      castDistributedType =
73ddc447SLei Zhang          VectorType::get(shape, castDistributedType.getElementType());
73ddc447SLei Zhang    }
73ddc447SLei Zhang
73ddc447SLei Zhang    SmallVector<size_t> newRetIndices;
73ddc447SLei Zhang    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
73ddc447SLei Zhang        rewriter, warpOp, {oldCastOp.getSource()}, {castDistributedType},
73ddc447SLei Zhang        newRetIndices);
73ddc447SLei Zhang    rewriter.setInsertionPointAfter(newWarpOp);
73ddc447SLei Zhang    Value newCast = rewriter.create<vector::ShapeCastOp>(
73ddc447SLei Zhang        oldCastOp.getLoc(), castResultType,
73ddc447SLei Zhang        newWarpOp->getResult(newRetIndices[0]));
73ddc447SLei Zhang    rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber), newCast);
73ddc447SLei Zhang    return success();
73ddc447SLei Zhang  }
73ddc447SLei Zhang};
73ddc447SLei Zhang
d4d28914SQuinn Dawkins/// Sink out vector.create_mask op feeding into a warp op yield.
d4d28914SQuinn Dawkins/// ```
d4d28914SQuinn Dawkins/// %0 = ...
ecaf2c33SPetr Kurapov/// %1 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
d4d28914SQuinn Dawkins///   ...
d4d28914SQuinn Dawkins///   %mask = vector.create_mask %0 : vector<32xi1>
ecaf2c33SPetr Kurapov///   gpu.yield %mask : vector<32xi1>
d4d28914SQuinn Dawkins/// }
d4d28914SQuinn Dawkins/// ```
d4d28914SQuinn Dawkins/// To
d4d28914SQuinn Dawkins/// ```
d4d28914SQuinn Dawkins/// %0 = ...
ecaf2c33SPetr Kurapov/// gpu.warp_execute_on_lane_0(%arg0) {
d4d28914SQuinn Dawkins///   ...
d4d28914SQuinn Dawkins/// }
d4d28914SQuinn Dawkins/// %cmp = arith.cmpi ult, %laneid, %0
d4d28914SQuinn Dawkins/// %ub = arith.select %cmp, %c0, %c1
d4d28914SQuinn Dawkins/// %1 = vector.create_mask %ub : vector<1xi1>
*bc29fc93SPetr Kurapovstruct WarpOpCreateMask : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
d4d28914SQuinn Dawkins  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
d4d28914SQuinn Dawkins                                PatternRewriter &rewriter) const override {
971b8525SJakub Kuderski    OpOperand *yieldOperand =
971b8525SJakub Kuderski        getWarpResult(warpOp, llvm::IsaPred<vector::CreateMaskOp>);
d4d28914SQuinn Dawkins    if (!yieldOperand)
d4d28914SQuinn Dawkins      return failure();
d4d28914SQuinn Dawkins
d4d28914SQuinn Dawkins    auto mask = yieldOperand->get().getDefiningOp<vector::CreateMaskOp>();
d4d28914SQuinn Dawkins
d4d28914SQuinn Dawkins    // Early exit if any values needed for calculating the new mask indices
d4d28914SQuinn Dawkins    // are defined inside the warp op.
d4d28914SQuinn Dawkins    if (!llvm::all_of(mask->getOperands(), [&](Value value) {
d4d28914SQuinn Dawkins          return warpOp.isDefinedOutsideOfRegion(value);
d4d28914SQuinn Dawkins        }))
d4d28914SQuinn Dawkins      return failure();
d4d28914SQuinn Dawkins
d4d28914SQuinn Dawkins    Location loc = mask.getLoc();
d4d28914SQuinn Dawkins    unsigned operandIndex = yieldOperand->getOperandNumber();
d4d28914SQuinn Dawkins
d4d28914SQuinn Dawkins    auto distType = cast<VectorType>(warpOp.getResult(operandIndex).getType());
d4d28914SQuinn Dawkins    VectorType seqType = mask.getVectorType();
d4d28914SQuinn Dawkins    ArrayRef<int64_t> seqShape = seqType.getShape();
d4d28914SQuinn Dawkins    ArrayRef<int64_t> distShape = distType.getShape();
d4d28914SQuinn Dawkins
d4d28914SQuinn Dawkins    rewriter.setInsertionPointAfter(warpOp);
d4d28914SQuinn Dawkins
d4d28914SQuinn Dawkins    // Delinearize the lane ID for constructing the distributed mask sizes.
d4d28914SQuinn Dawkins    SmallVector<Value> delinearizedIds;
d4d28914SQuinn Dawkins    if (!delinearizeLaneId(rewriter, loc, seqShape, distShape,
d4d28914SQuinn Dawkins                           warpOp.getWarpSize(), warpOp.getLaneid(),
d4d28914SQuinn Dawkins                           delinearizedIds))
d4d28914SQuinn Dawkins      return rewriter.notifyMatchFailure(
d4d28914SQuinn Dawkins          mask, "cannot delinearize lane ID for distribution");
d4d28914SQuinn Dawkins    assert(!delinearizedIds.empty());
d4d28914SQuinn Dawkins
aa2376a0SQuinn Dawkins    // Notify the rewriter that the warp op is changing (see the comment on
aa2376a0SQuinn Dawkins    // the WarpOpTransferRead pattern).
5fcf907bSMatthias Springer    rewriter.startOpModification(warpOp);
aa2376a0SQuinn Dawkins
d4d28914SQuinn Dawkins    AffineExpr s0, s1;
d4d28914SQuinn Dawkins    bindSymbols(rewriter.getContext(), s0, s1);
d4d28914SQuinn Dawkins    SmallVector<Value> newOperands;
d4d28914SQuinn Dawkins    for (int i = 0, e = distShape.size(); i < e; ++i) {
d4d28914SQuinn Dawkins      // Get `mask_dim_range_upper_limit[i] - lane_id[i] * dist_sizes[i]` to
d4d28914SQuinn Dawkins      // find the distance from the largest mask index owned by this lane to the
d4d28914SQuinn Dawkins      // original mask size. `vector.create_mask` implicitly clamps mask
d4d28914SQuinn Dawkins      // operands to the range [0, mask_vector_size[i]], or in other words, the
d4d28914SQuinn Dawkins      // mask sizes are always in the range [0, mask_vector_size[i]).
d4d28914SQuinn Dawkins      Value maskDimIdx = affine::makeComposedAffineApply(
d4d28914SQuinn Dawkins          rewriter, loc, s1 - s0 * distShape[i],
d4d28914SQuinn Dawkins          {delinearizedIds[i], mask.getOperand(i)});
d4d28914SQuinn Dawkins      newOperands.push_back(maskDimIdx);
d4d28914SQuinn Dawkins    }
d4d28914SQuinn Dawkins
d4d28914SQuinn Dawkins    auto newMask =
d4d28914SQuinn Dawkins        rewriter.create<vector::CreateMaskOp>(loc, distType, newOperands);
d4d28914SQuinn Dawkins    rewriter.replaceAllUsesWith(warpOp.getResult(operandIndex), newMask);
5fcf907bSMatthias Springer    rewriter.finalizeOpModification(warpOp);
d4d28914SQuinn Dawkins    return success();
d4d28914SQuinn Dawkins  }
d4d28914SQuinn Dawkins};
d4d28914SQuinn Dawkins
f48ce52cSThomas Raoux/// Pattern to move out vector.extract of single element vector. Those don't
f48ce52cSThomas Raoux/// need to be distributed and can just be propagated outside of the region.
*bc29fc93SPetr Kurapovstruct WarpOpExtract : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
f48ce52cSThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
f48ce52cSThomas Raoux                                PatternRewriter &rewriter) const override {
971b8525SJakub Kuderski    OpOperand *operand =
971b8525SJakub Kuderski        getWarpResult(warpOp, llvm::IsaPred<vector::ExtractOp>);
f48ce52cSThomas Raoux    if (!operand)
f48ce52cSThomas Raoux      return failure();
f48ce52cSThomas Raoux    unsigned int operandNumber = operand->getOperandNumber();
f48ce52cSThomas Raoux    auto extractOp = operand->get().getDefiningOp<vector::ExtractOp>();
a1aad28dSLei Zhang    VectorType extractSrcType = extractOp.getSourceVectorType();
f48ce52cSThomas Raoux    Location loc = extractOp.getLoc();
9085f00bSMatthias Springer
2f925d75SKunwar Grover    // For 1-d or 0-d source cases, we rely on WarpOpExtractScalar pattern.
2f925d75SKunwar Grover    if (extractSrcType.getRank() <= 1) {
9085f00bSMatthias Springer      return failure();
9085f00bSMatthias Springer    }
9085f00bSMatthias Springer
9085f00bSMatthias Springer    // All following cases are 2d or higher dimensional source vectors.
9085f00bSMatthias Springer
9085f00bSMatthias Springer    if (warpOp.getResult(operandNumber).getType() == operand->get().getType()) {
9085f00bSMatthias Springer      // There is no distribution, this is a broadcast. Simply move the extract
9085f00bSMatthias Springer      // out of the warp op.
9085f00bSMatthias Springer      // TODO: This could be optimized. E.g., in case of a scalar result, let
9085f00bSMatthias Springer      // one lane extract and shuffle the result to all other lanes (same as
9085f00bSMatthias Springer      // the 1d case).
f48ce52cSThomas Raoux      SmallVector<size_t> newRetIndices;
f48ce52cSThomas Raoux      WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
9085f00bSMatthias Springer          rewriter, warpOp, {extractOp.getVector()},
a1aad28dSLei Zhang          {extractOp.getSourceVectorType()}, newRetIndices);
9085f00bSMatthias Springer      rewriter.setInsertionPointAfter(newWarpOp);
9085f00bSMatthias Springer      Value distributedVec = newWarpOp->getResult(newRetIndices[0]);
9085f00bSMatthias Springer      // Extract from distributed vector.
9085f00bSMatthias Springer      Value newExtract = rewriter.create<vector::ExtractOp>(
98f6289aSDiego Caballero          loc, distributedVec, extractOp.getMixedPosition());
7ecc921dSMatthias Springer      rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber),
7ecc921dSMatthias Springer                                  newExtract);
9085f00bSMatthias Springer      return success();
9085f00bSMatthias Springer    }
9085f00bSMatthias Springer
9085f00bSMatthias Springer    // Find the distributed dimension. There should be exactly one.
9085f00bSMatthias Springer    auto distributedType =
5550c821STres Popp        cast<VectorType>(warpOp.getResult(operandNumber).getType());
5550c821STres Popp    auto yieldedType = cast<VectorType>(operand->get().getType());
9085f00bSMatthias Springer    int64_t distributedDim = -1;
9085f00bSMatthias Springer    for (int64_t i = 0; i < yieldedType.getRank(); ++i) {
9085f00bSMatthias Springer      if (distributedType.getDimSize(i) != yieldedType.getDimSize(i)) {
9085f00bSMatthias Springer        // Keep this assert here in case WarpExecuteOnLane0Op gets extended to
9085f00bSMatthias Springer        // support distributing multiple dimensions in the future.
9085f00bSMatthias Springer        assert(distributedDim == -1 && "found multiple distributed dims");
9085f00bSMatthias Springer        distributedDim = i;
9085f00bSMatthias Springer      }
9085f00bSMatthias Springer    }
9085f00bSMatthias Springer    assert(distributedDim != -1 && "could not find distributed dimension");
51ddfd76SKazu Hirata    (void)distributedDim;
9085f00bSMatthias Springer
9085f00bSMatthias Springer    // Yield source vector from warp op.
5262865aSKazu Hirata    SmallVector<int64_t> newDistributedShape(extractSrcType.getShape());
9085f00bSMatthias Springer    for (int i = 0; i < distributedType.getRank(); ++i)
98f6289aSDiego Caballero      newDistributedShape[i + extractOp.getNumIndices()] =
9085f00bSMatthias Springer          distributedType.getDimSize(i);
9085f00bSMatthias Springer    auto newDistributedType =
9085f00bSMatthias Springer        VectorType::get(newDistributedShape, distributedType.getElementType());
9085f00bSMatthias Springer    SmallVector<size_t> newRetIndices;
9085f00bSMatthias Springer    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
9085f00bSMatthias Springer        rewriter, warpOp, {extractOp.getVector()}, {newDistributedType},
f48ce52cSThomas Raoux        newRetIndices);
f48ce52cSThomas Raoux    rewriter.setInsertionPointAfter(newWarpOp);
9085f00bSMatthias Springer    Value distributedVec = newWarpOp->getResult(newRetIndices[0]);
9085f00bSMatthias Springer    // Extract from distributed vector.
f48ce52cSThomas Raoux    Value newExtract = rewriter.create<vector::ExtractOp>(
98f6289aSDiego Caballero        loc, distributedVec, extractOp.getMixedPosition());
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber),
7ecc921dSMatthias Springer                                newExtract);
f48ce52cSThomas Raoux    return success();
f48ce52cSThomas Raoux  }
f48ce52cSThomas Raoux};
f48ce52cSThomas Raoux
2f925d75SKunwar Grover/// Pattern to move out vector.extract with a scalar result.
2f925d75SKunwar Grover/// Only supports 1-D and 0-D sources for now.
*bc29fc93SPetr Kurapovstruct WarpOpExtractScalar : public WarpDistributionPattern {
2f925d75SKunwar Grover  WarpOpExtractScalar(MLIRContext *ctx, WarpShuffleFromIdxFn fn,
9d51b4e4SMatthias Springer                      PatternBenefit b = 1)
*bc29fc93SPetr Kurapov      : WarpDistributionPattern(ctx, b), warpShuffleFromIdxFn(std::move(fn)) {}
1757164eSThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
1757164eSThomas Raoux                                PatternRewriter &rewriter) const override {
971b8525SJakub Kuderski    OpOperand *operand =
2f925d75SKunwar Grover        getWarpResult(warpOp, llvm::IsaPred<vector::ExtractOp>);
1757164eSThomas Raoux    if (!operand)
1757164eSThomas Raoux      return failure();
1757164eSThomas Raoux    unsigned int operandNumber = operand->getOperandNumber();
2f925d75SKunwar Grover    auto extractOp = operand->get().getDefiningOp<vector::ExtractOp>();
a1aad28dSLei Zhang    VectorType extractSrcType = extractOp.getSourceVectorType();
2f925d75SKunwar Grover    // Only supports 1-D or 0-D sources for now.
2f925d75SKunwar Grover    if (extractSrcType.getRank() > 1) {
2f925d75SKunwar Grover      return rewriter.notifyMatchFailure(
2f925d75SKunwar Grover          extractOp, "only 0-D or 1-D source supported for now");
2f925d75SKunwar Grover    }
35c19fddSMatthias Springer    // TODO: Supported shuffle types should be parameterizable, similar to
35c19fddSMatthias Springer    // `WarpShuffleFromIdxFn`.
35c19fddSMatthias Springer    if (!extractSrcType.getElementType().isF32() &&
35c19fddSMatthias Springer        !extractSrcType.getElementType().isInteger(32))
35c19fddSMatthias Springer      return rewriter.notifyMatchFailure(
35c19fddSMatthias Springer          extractOp, "only f32/i32 element types are supported");
069d7d7eSThomas Raoux    bool is0dOrVec1Extract = extractSrcType.getNumElements() == 1;
9d51b4e4SMatthias Springer    Type elType = extractSrcType.getElementType();
9d51b4e4SMatthias Springer    VectorType distributedVecType;
069d7d7eSThomas Raoux    if (!is0dOrVec1Extract) {
9d51b4e4SMatthias Springer      assert(extractSrcType.getRank() == 1 &&
2f925d75SKunwar Grover             "expected that extract src rank is 0 or 1");
069d7d7eSThomas Raoux      if (extractSrcType.getShape()[0] % warpOp.getWarpSize() != 0)
069d7d7eSThomas Raoux        return failure();
9d51b4e4SMatthias Springer      int64_t elementsPerLane =
9d51b4e4SMatthias Springer          extractSrcType.getShape()[0] / warpOp.getWarpSize();
9d51b4e4SMatthias Springer      distributedVecType = VectorType::get({elementsPerLane}, elType);
9d51b4e4SMatthias Springer    } else {
9d51b4e4SMatthias Springer      distributedVecType = extractSrcType;
9d51b4e4SMatthias Springer    }
ad100b36SMatthias Springer    // Yield source vector and position (if present) from warp op.
ad100b36SMatthias Springer    SmallVector<Value> additionalResults{extractOp.getVector()};
ad100b36SMatthias Springer    SmallVector<Type> additionalResultTypes{distributedVecType};
2f925d75SKunwar Grover    additionalResults.append(
2f925d75SKunwar Grover        SmallVector<Value>(extractOp.getDynamicPosition()));
2f925d75SKunwar Grover    additionalResultTypes.append(
2f925d75SKunwar Grover        SmallVector<Type>(extractOp.getDynamicPosition().getTypes()));
2f925d75SKunwar Grover
1757164eSThomas Raoux    Location loc = extractOp.getLoc();
1757164eSThomas Raoux    SmallVector<size_t> newRetIndices;
1757164eSThomas Raoux    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
ad100b36SMatthias Springer        rewriter, warpOp, additionalResults, additionalResultTypes,
1757164eSThomas Raoux        newRetIndices);
1757164eSThomas Raoux    rewriter.setInsertionPointAfter(newWarpOp);
9d51b4e4SMatthias Springer    Value distributedVec = newWarpOp->getResult(newRetIndices[0]);
9d51b4e4SMatthias Springer
9d51b4e4SMatthias Springer    // 0d extract: The new warp op broadcasts the source vector to all lanes.
9d51b4e4SMatthias Springer    // All lanes extract the scalar.
069d7d7eSThomas Raoux    if (is0dOrVec1Extract) {
069d7d7eSThomas Raoux      Value newExtract;
2f925d75SKunwar Grover      SmallVector<int64_t> indices(extractSrcType.getRank(), 0);
069d7d7eSThomas Raoux      newExtract =
2f925d75SKunwar Grover          rewriter.create<vector::ExtractOp>(loc, distributedVec, indices);
7ecc921dSMatthias Springer      rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber),
7ecc921dSMatthias Springer                                  newExtract);
1757164eSThomas Raoux      return success();
1757164eSThomas Raoux    }
9d51b4e4SMatthias Springer
2f925d75SKunwar Grover    int64_t staticPos = extractOp.getStaticPosition()[0];
2f925d75SKunwar Grover    OpFoldResult pos = ShapedType::isDynamic(staticPos)
2f925d75SKunwar Grover                           ? (newWarpOp->getResult(newRetIndices[1]))
2f925d75SKunwar Grover                           : OpFoldResult(rewriter.getIndexAttr(staticPos));
9d51b4e4SMatthias Springer    // 1d extract: Distribute the source vector. One lane extracts and shuffles
9d51b4e4SMatthias Springer    // the value to all other lanes.
9d51b4e4SMatthias Springer    int64_t elementsPerLane = distributedVecType.getShape()[0];
9d51b4e4SMatthias Springer    AffineExpr sym0 = getAffineSymbolExpr(0, rewriter.getContext());
9d51b4e4SMatthias Springer    // tid of extracting thread: pos / elementsPerLane
2f925d75SKunwar Grover    Value broadcastFromTid = affine::makeComposedAffineApply(
2f925d75SKunwar Grover        rewriter, loc, sym0.ceilDiv(elementsPerLane), pos);
9d51b4e4SMatthias Springer    // Extract at position: pos % elementsPerLane
2f925d75SKunwar Grover    Value newPos =
73ce971cSMatthias Springer        elementsPerLane == 1
73ce971cSMatthias Springer            ? rewriter.create<arith::ConstantIndexOp>(loc, 0).getResult()
2f925d75SKunwar Grover            : affine::makeComposedAffineApply(rewriter, loc,
2f925d75SKunwar Grover                                              sym0 % elementsPerLane, pos);
9d51b4e4SMatthias Springer    Value extracted =
2f925d75SKunwar Grover        rewriter.create<vector::ExtractOp>(loc, distributedVec, newPos);
9d51b4e4SMatthias Springer
9d51b4e4SMatthias Springer    // Shuffle the extracted value to all lanes.
9d51b4e4SMatthias Springer    Value shuffled = warpShuffleFromIdxFn(
9d51b4e4SMatthias Springer        loc, rewriter, extracted, broadcastFromTid, newWarpOp.getWarpSize());
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber), shuffled);
9d51b4e4SMatthias Springer    return success();
9d51b4e4SMatthias Springer  }
9d51b4e4SMatthias Springer
9d51b4e4SMatthias Springerprivate:
9d51b4e4SMatthias Springer  WarpShuffleFromIdxFn warpShuffleFromIdxFn;
1757164eSThomas Raoux};
1757164eSThomas Raoux
2f925d75SKunwar Grover/// Pattern to convert vector.extractelement to vector.extract.
*bc29fc93SPetr Kurapovstruct WarpOpExtractElement : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
2f925d75SKunwar Grover  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
2f925d75SKunwar Grover                                PatternRewriter &rewriter) const override {
2f925d75SKunwar Grover    OpOperand *operand =
2f925d75SKunwar Grover        getWarpResult(warpOp, llvm::IsaPred<vector::ExtractElementOp>);
2f925d75SKunwar Grover    if (!operand)
2f925d75SKunwar Grover      return failure();
2f925d75SKunwar Grover    auto extractOp = operand->get().getDefiningOp<vector::ExtractElementOp>();
2f925d75SKunwar Grover    SmallVector<OpFoldResult> indices;
2f925d75SKunwar Grover    if (auto pos = extractOp.getPosition()) {
2f925d75SKunwar Grover      indices.push_back(pos);
2f925d75SKunwar Grover    }
2f925d75SKunwar Grover    rewriter.setInsertionPoint(extractOp);
2f925d75SKunwar Grover    rewriter.replaceOpWithNewOp<vector::ExtractOp>(
2f925d75SKunwar Grover        extractOp, extractOp.getVector(), indices);
2f925d75SKunwar Grover    return success();
2f925d75SKunwar Grover  }
2f925d75SKunwar Grover};
2f925d75SKunwar Grover
2f925d75SKunwar Grover/// Pattern to move out vector.insert with a scalar input.
2f925d75SKunwar Grover/// Only supports 1-D and 0-D destinations for now.
*bc29fc93SPetr Kurapovstruct WarpOpInsertScalar : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
73ce971cSMatthias Springer  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
73ce971cSMatthias Springer                                PatternRewriter &rewriter) const override {
2f925d75SKunwar Grover    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<vector::InsertOp>);
73ce971cSMatthias Springer    if (!operand)
73ce971cSMatthias Springer      return failure();
73ce971cSMatthias Springer    unsigned int operandNumber = operand->getOperandNumber();
2f925d75SKunwar Grover    auto insertOp = operand->get().getDefiningOp<vector::InsertOp>();
73ce971cSMatthias Springer    VectorType vecType = insertOp.getDestVectorType();
73ce971cSMatthias Springer    VectorType distrType =
5550c821STres Popp        cast<VectorType>(warpOp.getResult(operandNumber).getType());
2f925d75SKunwar Grover
2f925d75SKunwar Grover    // Only supports 1-D or 0-D destinations for now.
2f925d75SKunwar Grover    if (vecType.getRank() > 1) {
2f925d75SKunwar Grover      return rewriter.notifyMatchFailure(
2f925d75SKunwar Grover          insertOp, "only 0-D or 1-D source supported for now");
2f925d75SKunwar Grover    }
73ce971cSMatthias Springer
73ce971cSMatthias Springer    // Yield destination vector, source scalar and position from warp op.
73ce971cSMatthias Springer    SmallVector<Value> additionalResults{insertOp.getDest(),
73ce971cSMatthias Springer                                         insertOp.getSource()};
73ce971cSMatthias Springer    SmallVector<Type> additionalResultTypes{distrType,
73ce971cSMatthias Springer                                            insertOp.getSource().getType()};
2f925d75SKunwar Grover    additionalResults.append(SmallVector<Value>(insertOp.getDynamicPosition()));
2f925d75SKunwar Grover    additionalResultTypes.append(
2f925d75SKunwar Grover        SmallVector<Type>(insertOp.getDynamicPosition().getTypes()));
2f925d75SKunwar Grover
73ce971cSMatthias Springer    Location loc = insertOp.getLoc();
73ce971cSMatthias Springer    SmallVector<size_t> newRetIndices;
73ce971cSMatthias Springer    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
73ce971cSMatthias Springer        rewriter, warpOp, additionalResults, additionalResultTypes,
73ce971cSMatthias Springer        newRetIndices);
73ce971cSMatthias Springer    rewriter.setInsertionPointAfter(newWarpOp);
73ce971cSMatthias Springer    Value distributedVec = newWarpOp->getResult(newRetIndices[0]);
73ce971cSMatthias Springer    Value newSource = newWarpOp->getResult(newRetIndices[1]);
73ce971cSMatthias Springer    rewriter.setInsertionPointAfter(newWarpOp);
73ce971cSMatthias Springer
2f925d75SKunwar Grover    OpFoldResult pos;
2f925d75SKunwar Grover    if (vecType.getRank() != 0) {
2f925d75SKunwar Grover      int64_t staticPos = insertOp.getStaticPosition()[0];
2f925d75SKunwar Grover      pos = ShapedType::isDynamic(staticPos)
2f925d75SKunwar Grover                ? (newWarpOp->getResult(newRetIndices[2]))
2f925d75SKunwar Grover                : OpFoldResult(rewriter.getIndexAttr(staticPos));
2f925d75SKunwar Grover    }
2f925d75SKunwar Grover
2f925d75SKunwar Grover    // This condition is always true for 0-d vectors.
73ce971cSMatthias Springer    if (vecType == distrType) {
2f925d75SKunwar Grover      Value newInsert;
2f925d75SKunwar Grover      SmallVector<OpFoldResult> indices;
2f925d75SKunwar Grover      if (pos) {
2f925d75SKunwar Grover        indices.push_back(pos);
2f925d75SKunwar Grover      }
2f925d75SKunwar Grover      newInsert = rewriter.create<vector::InsertOp>(loc, newSource,
2f925d75SKunwar Grover                                                    distributedVec, indices);
2f925d75SKunwar Grover      // Broadcast: Simply move the vector.insert op out.
7ecc921dSMatthias Springer      rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber),
7ecc921dSMatthias Springer                                  newInsert);
73ce971cSMatthias Springer      return success();
73ce971cSMatthias Springer    }
73ce971cSMatthias Springer
73ce971cSMatthias Springer    // This is a distribution. Only one lane should insert.
73ce971cSMatthias Springer    int64_t elementsPerLane = distrType.getShape()[0];
73ce971cSMatthias Springer    AffineExpr sym0 = getAffineSymbolExpr(0, rewriter.getContext());
73ce971cSMatthias Springer    // tid of extracting thread: pos / elementsPerLane
2f925d75SKunwar Grover    Value insertingLane = affine::makeComposedAffineApply(
2f925d75SKunwar Grover        rewriter, loc, sym0.ceilDiv(elementsPerLane), pos);
73ce971cSMatthias Springer    // Insert position: pos % elementsPerLane
2f925d75SKunwar Grover    OpFoldResult newPos = affine::makeComposedFoldedAffineApply(
2f925d75SKunwar Grover        rewriter, loc, sym0 % elementsPerLane, pos);
73ce971cSMatthias Springer    Value isInsertingLane = rewriter.create<arith::CmpIOp>(
73ce971cSMatthias Springer        loc, arith::CmpIPredicate::eq, newWarpOp.getLaneid(), insertingLane);
73ce971cSMatthias Springer    Value newResult =
73ce971cSMatthias Springer        rewriter
73ce971cSMatthias Springer            .create<scf::IfOp>(
1125c5c0SFrederik Gossen                loc, isInsertingLane,
73ce971cSMatthias Springer                /*thenBuilder=*/
73ce971cSMatthias Springer                [&](OpBuilder &builder, Location loc) {
2f925d75SKunwar Grover                  Value newInsert = builder.create<vector::InsertOp>(
2f925d75SKunwar Grover                      loc, newSource, distributedVec, newPos);
73ce971cSMatthias Springer                  builder.create<scf::YieldOp>(loc, newInsert);
73ce971cSMatthias Springer                },
73ce971cSMatthias Springer                /*elseBuilder=*/
73ce971cSMatthias Springer                [&](OpBuilder &builder, Location loc) {
73ce971cSMatthias Springer                  builder.create<scf::YieldOp>(loc, distributedVec);
73ce971cSMatthias Springer                })
73ce971cSMatthias Springer            .getResult(0);
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber), newResult);
73ce971cSMatthias Springer    return success();
73ce971cSMatthias Springer  }
73ce971cSMatthias Springer};
73ce971cSMatthias Springer
*bc29fc93SPetr Kurapovstruct WarpOpInsert : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
1523b729SMatthias Springer  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
1523b729SMatthias Springer                                PatternRewriter &rewriter) const override {
971b8525SJakub Kuderski    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<vector::InsertOp>);
1523b729SMatthias Springer    if (!operand)
1523b729SMatthias Springer      return failure();
1523b729SMatthias Springer    unsigned int operandNumber = operand->getOperandNumber();
1523b729SMatthias Springer    auto insertOp = operand->get().getDefiningOp<vector::InsertOp>();
1523b729SMatthias Springer    Location loc = insertOp.getLoc();
1523b729SMatthias Springer
2f925d75SKunwar Grover    // For 1-d or 0-d destination cases, we rely on WarpOpInsertScalar pattern.
2f925d75SKunwar Grover    if (insertOp.getDestVectorType().getRank() <= 1) {
1523b729SMatthias Springer      return failure();
1523b729SMatthias Springer    }
1523b729SMatthias Springer
2f925d75SKunwar Grover    // All following cases are 2d or higher dimensional source vectors.
2f925d75SKunwar Grover
1523b729SMatthias Springer    if (warpOp.getResult(operandNumber).getType() == operand->get().getType()) {
1523b729SMatthias Springer      // There is no distribution, this is a broadcast. Simply move the insert
1523b729SMatthias Springer      // out of the warp op.
1523b729SMatthias Springer      SmallVector<size_t> newRetIndices;
1523b729SMatthias Springer      WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1523b729SMatthias Springer          rewriter, warpOp, {insertOp.getSource(), insertOp.getDest()},
1523b729SMatthias Springer          {insertOp.getSourceType(), insertOp.getDestVectorType()},
1523b729SMatthias Springer          newRetIndices);
1523b729SMatthias Springer      rewriter.setInsertionPointAfter(newWarpOp);
1523b729SMatthias Springer      Value distributedSrc = newWarpOp->getResult(newRetIndices[0]);
1523b729SMatthias Springer      Value distributedDest = newWarpOp->getResult(newRetIndices[1]);
1523b729SMatthias Springer      Value newResult = rewriter.create<vector::InsertOp>(
98f6289aSDiego Caballero          loc, distributedSrc, distributedDest, insertOp.getMixedPosition());
7ecc921dSMatthias Springer      rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber),
7ecc921dSMatthias Springer                                  newResult);
1523b729SMatthias Springer      return success();
1523b729SMatthias Springer    }
1523b729SMatthias Springer
1523b729SMatthias Springer    // Find the distributed dimension. There should be exactly one.
1523b729SMatthias Springer    auto distrDestType =
5550c821STres Popp        cast<VectorType>(warpOp.getResult(operandNumber).getType());
5550c821STres Popp    auto yieldedType = cast<VectorType>(operand->get().getType());
1523b729SMatthias Springer    int64_t distrDestDim = -1;
1523b729SMatthias Springer    for (int64_t i = 0; i < yieldedType.getRank(); ++i) {
1523b729SMatthias Springer      if (distrDestType.getDimSize(i) != yieldedType.getDimSize(i)) {
1523b729SMatthias Springer        // Keep this assert here in case WarpExecuteOnLane0Op gets extended to
1523b729SMatthias Springer        // support distributing multiple dimensions in the future.
1523b729SMatthias Springer        assert(distrDestDim == -1 && "found multiple distributed dims");
1523b729SMatthias Springer        distrDestDim = i;
1523b729SMatthias Springer      }
1523b729SMatthias Springer    }
1523b729SMatthias Springer    assert(distrDestDim != -1 && "could not find distributed dimension");
1523b729SMatthias Springer
1523b729SMatthias Springer    // Compute the distributed source vector type.
5550c821STres Popp    VectorType srcVecType = cast<VectorType>(insertOp.getSourceType());
5262865aSKazu Hirata    SmallVector<int64_t> distrSrcShape(srcVecType.getShape());
1523b729SMatthias Springer    // E.g.: vector.insert %s, %d [2] : vector<96xf32> into vector<128x96xf32>
1523b729SMatthias Springer    // Case 1: distrDestDim = 1 (dim of size 96). In that case, each lane will
1523b729SMatthias Springer    //         insert a smaller vector<3xf32>.
1523b729SMatthias Springer    // Case 2: distrDestDim = 0 (dim of size 128) => distrSrcDim = -1. In that
1523b729SMatthias Springer    //         case, one lane will insert the source vector<96xf32>. The other
1523b729SMatthias Springer    //         lanes will not do anything.
98f6289aSDiego Caballero    int64_t distrSrcDim = distrDestDim - insertOp.getNumIndices();
1523b729SMatthias Springer    if (distrSrcDim >= 0)
1523b729SMatthias Springer      distrSrcShape[distrSrcDim] = distrDestType.getDimSize(distrDestDim);
1523b729SMatthias Springer    auto distrSrcType =
1523b729SMatthias Springer        VectorType::get(distrSrcShape, distrDestType.getElementType());
1523b729SMatthias Springer
1523b729SMatthias Springer    // Yield source and dest vectors from warp op.
1523b729SMatthias Springer    SmallVector<size_t> newRetIndices;
1523b729SMatthias Springer    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1523b729SMatthias Springer        rewriter, warpOp, {insertOp.getSource(), insertOp.getDest()},
1523b729SMatthias Springer        {distrSrcType, distrDestType}, newRetIndices);
1523b729SMatthias Springer    rewriter.setInsertionPointAfter(newWarpOp);
1523b729SMatthias Springer    Value distributedSrc = newWarpOp->getResult(newRetIndices[0]);
1523b729SMatthias Springer    Value distributedDest = newWarpOp->getResult(newRetIndices[1]);
1523b729SMatthias Springer
1523b729SMatthias Springer    // Insert into the distributed vector.
1523b729SMatthias Springer    Value newResult;
1523b729SMatthias Springer    if (distrSrcDim >= 0) {
1523b729SMatthias Springer      // Every lane inserts a small piece.
1523b729SMatthias Springer      newResult = rewriter.create<vector::InsertOp>(
98f6289aSDiego Caballero          loc, distributedSrc, distributedDest, insertOp.getMixedPosition());
1523b729SMatthias Springer    } else {
1523b729SMatthias Springer      // One lane inserts the entire source vector.
1523b729SMatthias Springer      int64_t elementsPerLane = distrDestType.getDimSize(distrDestDim);
98f6289aSDiego Caballero      SmallVector<OpFoldResult> pos = insertOp.getMixedPosition();
98f6289aSDiego Caballero      SmallVector<int64_t> newPos = getAsIntegers(pos);
1523b729SMatthias Springer      // tid of inserting lane: pos / elementsPerLane
1523b729SMatthias Springer      Value insertingLane = rewriter.create<arith::ConstantIndexOp>(
1523b729SMatthias Springer          loc, newPos[distrDestDim] / elementsPerLane);
1523b729SMatthias Springer      Value isInsertingLane = rewriter.create<arith::CmpIOp>(
1523b729SMatthias Springer          loc, arith::CmpIPredicate::eq, newWarpOp.getLaneid(), insertingLane);
1523b729SMatthias Springer      // Insert position: pos % elementsPerLane
1523b729SMatthias Springer      newPos[distrDestDim] %= elementsPerLane;
1523b729SMatthias Springer      auto insertingBuilder = [&](OpBuilder &builder, Location loc) {
1523b729SMatthias Springer        Value newInsert = builder.create<vector::InsertOp>(
1523b729SMatthias Springer            loc, distributedSrc, distributedDest, newPos);
1523b729SMatthias Springer        builder.create<scf::YieldOp>(loc, newInsert);
1523b729SMatthias Springer      };
1523b729SMatthias Springer      auto nonInsertingBuilder = [&](OpBuilder &builder, Location loc) {
1523b729SMatthias Springer        builder.create<scf::YieldOp>(loc, distributedDest);
1523b729SMatthias Springer      };
1523b729SMatthias Springer      newResult = rewriter
1125c5c0SFrederik Gossen                      .create<scf::IfOp>(loc, isInsertingLane,
1523b729SMatthias Springer                                         /*thenBuilder=*/insertingBuilder,
1523b729SMatthias Springer                                         /*elseBuilder=*/nonInsertingBuilder)
1523b729SMatthias Springer                      .getResult(0);
1523b729SMatthias Springer    }
1523b729SMatthias Springer
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(newWarpOp->getResult(operandNumber), newResult);
1523b729SMatthias Springer    return success();
1523b729SMatthias Springer  }
1523b729SMatthias Springer};
1523b729SMatthias Springer
*bc29fc93SPetr Kurapovstruct WarpOpInsertElement : public WarpDistributionPattern {
*bc29fc93SPetr Kurapov  using Base::Base;
2f925d75SKunwar Grover  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
2f925d75SKunwar Grover                                PatternRewriter &rewriter) const override {
2f925d75SKunwar Grover    OpOperand *operand =
2f925d75SKunwar Grover        getWarpResult(warpOp, llvm::IsaPred<vector::InsertElementOp>);
2f925d75SKunwar Grover    if (!operand)
2f925d75SKunwar Grover      return failure();
2f925d75SKunwar Grover    auto insertOp = operand->get().getDefiningOp<vector::InsertElementOp>();
2f925d75SKunwar Grover    SmallVector<OpFoldResult> indices;
2f925d75SKunwar Grover    if (auto pos = insertOp.getPosition()) {
2f925d75SKunwar Grover      indices.push_back(pos);
2f925d75SKunwar Grover    }
2f925d75SKunwar Grover    rewriter.setInsertionPoint(insertOp);
2f925d75SKunwar Grover    rewriter.replaceOpWithNewOp<vector::InsertOp>(
2f925d75SKunwar Grover        insertOp, insertOp.getSource(), insertOp.getDest(), indices);
2f925d75SKunwar Grover    return success();
2f925d75SKunwar Grover  }
2f925d75SKunwar Grover};
2f925d75SKunwar Grover
76cf33daSThomas Raoux/// Sink scf.for region out of WarpExecuteOnLane0Op. This can be done only if
2f925d75SKunwar Grover/// the scf.ForOp is the last operation in the region so that it doesn't
2f925d75SKunwar Grover/// change the order of execution. This creates a new scf.for region after the
76cf33daSThomas Raoux/// WarpExecuteOnLane0Op. The new scf.for region will contain a new
76cf33daSThomas Raoux/// WarpExecuteOnLane0Op region. Example:
76cf33daSThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// %w = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4xf32>) {
76cf33daSThomas Raoux///   ...
76cf33daSThomas Raoux///   %v1 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %v)
76cf33daSThomas Raoux///   -> (vector<128xf32>) {
76cf33daSThomas Raoux///     ...
76cf33daSThomas Raoux///     scf.yield %r : vector<128xf32>
76cf33daSThomas Raoux///   }
ecaf2c33SPetr Kurapov///   gpu.yield %v1 : vector<128xf32>
76cf33daSThomas Raoux/// }
76cf33daSThomas Raoux/// ```
76cf33daSThomas Raoux/// To:
ecaf2c33SPetr Kurapov/// %w0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<4xf32>) {
76cf33daSThomas Raoux///   ...
ecaf2c33SPetr Kurapov///   gpu.yield %v : vector<128xf32>
76cf33daSThomas Raoux/// }
76cf33daSThomas Raoux/// %w = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%varg = %q0)
76cf33daSThomas Raoux///   -> (vector<4xf32>) {
ecaf2c33SPetr Kurapov///     %iw = gpu.warp_execute_on_lane_0(%laneid)
76cf33daSThomas Raoux///     args(%varg : vector<4xf32>) -> (vector<4xf32>) {
76cf33daSThomas Raoux///     ^bb0(%arg: vector<128xf32>):
76cf33daSThomas Raoux///       ...
ecaf2c33SPetr Kurapov///       gpu.yield %ir : vector<128xf32>
76cf33daSThomas Raoux///     }
76cf33daSThomas Raoux///     scf.yield %iw : vector<4xf32>
76cf33daSThomas Raoux///  }
76cf33daSThomas Raoux/// ```
*bc29fc93SPetr Kurapovstruct WarpOpScfForOp : public WarpDistributionPattern {
91f62f0eSThomas Raoux
91f62f0eSThomas Raoux  WarpOpScfForOp(MLIRContext *ctx, DistributionMapFn fn, PatternBenefit b = 1)
*bc29fc93SPetr Kurapov      : WarpDistributionPattern(ctx, b), distributionMapFn(std::move(fn)) {}
76cf33daSThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
76cf33daSThomas Raoux                                PatternRewriter &rewriter) const override {
ecaf2c33SPetr Kurapov    auto yield = cast<gpu::YieldOp>(
76cf33daSThomas Raoux        warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
76cf33daSThomas Raoux    // Only pick up forOp if it is the last op in the region.
76cf33daSThomas Raoux    Operation *lastNode = yield->getPrevNode();
76cf33daSThomas Raoux    auto forOp = dyn_cast_or_null<scf::ForOp>(lastNode);
76cf33daSThomas Raoux    if (!forOp)
76cf33daSThomas Raoux      return failure();
91f62f0eSThomas Raoux    // Collect Values that come from the warp op but are outside the forOp.
2f925d75SKunwar Grover    // Those Value needs to be returned by the original warpOp and passed to
2f925d75SKunwar Grover    // the new op.
91f62f0eSThomas Raoux    llvm::SmallSetVector<Value, 32> escapingValues;
91f62f0eSThomas Raoux    SmallVector<Type> inputTypes;
91f62f0eSThomas Raoux    SmallVector<Type> distTypes;
91f62f0eSThomas Raoux    mlir::visitUsedValuesDefinedAbove(
91f62f0eSThomas Raoux        forOp.getBodyRegion(), [&](OpOperand *operand) {
91f62f0eSThomas Raoux          Operation *parent = operand->get().getParentRegion()->getParentOp();
91f62f0eSThomas Raoux          if (warpOp->isAncestor(parent)) {
91f62f0eSThomas Raoux            if (!escapingValues.insert(operand->get()))
91f62f0eSThomas Raoux              return;
91f62f0eSThomas Raoux            Type distType = operand->get().getType();
d2433787SLei Zhang            if (auto vecType = dyn_cast<VectorType>(distType)) {
91f62f0eSThomas Raoux              AffineMap map = distributionMapFn(operand->get());
91f62f0eSThomas Raoux              distType = getDistributedType(vecType, map, warpOp.getWarpSize());
91f62f0eSThomas Raoux            }
91f62f0eSThomas Raoux            inputTypes.push_back(operand->get().getType());
91f62f0eSThomas Raoux            distTypes.push_back(distType);
91f62f0eSThomas Raoux          }
91f62f0eSThomas Raoux        });
91f62f0eSThomas Raoux
b5e47d2eSBangtian Liu    if (llvm::is_contained(distTypes, Type{}))
b5e47d2eSBangtian Liu      return failure();
b5e47d2eSBangtian Liu
91f62f0eSThomas Raoux    SmallVector<size_t> newRetIndices;
91f62f0eSThomas Raoux    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
91f62f0eSThomas Raoux        rewriter, warpOp, escapingValues.getArrayRef(), distTypes,
91f62f0eSThomas Raoux        newRetIndices);
ecaf2c33SPetr Kurapov    yield = cast<gpu::YieldOp>(
91f62f0eSThomas Raoux        newWarpOp.getBodyRegion().getBlocks().begin()->getTerminator());
91f62f0eSThomas Raoux
76cf33daSThomas Raoux    SmallVector<Value> newOperands;
76cf33daSThomas Raoux    SmallVector<unsigned> resultIdx;
76cf33daSThomas Raoux    // Collect all the outputs coming from the forOp.
76cf33daSThomas Raoux    for (OpOperand &yieldOperand : yield->getOpOperands()) {
76cf33daSThomas Raoux      if (yieldOperand.get().getDefiningOp() != forOp.getOperation())
76cf33daSThomas Raoux        continue;
5550c821STres Popp      auto forResult = cast<OpResult>(yieldOperand.get());
91f62f0eSThomas Raoux      newOperands.push_back(
91f62f0eSThomas Raoux          newWarpOp.getResult(yieldOperand.getOperandNumber()));
5cf714bbSMatthias Springer      yieldOperand.set(forOp.getInitArgs()[forResult.getResultNumber()]);
76cf33daSThomas Raoux      resultIdx.push_back(yieldOperand.getOperandNumber());
76cf33daSThomas Raoux    }
91f62f0eSThomas Raoux
76cf33daSThomas Raoux    OpBuilder::InsertionGuard g(rewriter);
91f62f0eSThomas Raoux    rewriter.setInsertionPointAfter(newWarpOp);
91f62f0eSThomas Raoux
2f925d75SKunwar Grover    // Create a new for op outside the region with a WarpExecuteOnLane0Op
2f925d75SKunwar Grover    // region inside.
76cf33daSThomas Raoux    auto newForOp = rewriter.create<scf::ForOp>(
76cf33daSThomas Raoux        forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(),
76cf33daSThomas Raoux        forOp.getStep(), newOperands);
b613a540SMatthias Springer    rewriter.setInsertionPointToStart(newForOp.getBody());
91f62f0eSThomas Raoux
91f62f0eSThomas Raoux    SmallVector<Value> warpInput(newForOp.getRegionIterArgs().begin(),
91f62f0eSThomas Raoux                                 newForOp.getRegionIterArgs().end());
91f62f0eSThomas Raoux    SmallVector<Type> warpInputType(forOp.getResultTypes().begin(),
91f62f0eSThomas Raoux                                    forOp.getResultTypes().end());
91f62f0eSThomas Raoux    llvm::SmallDenseMap<Value, int64_t> argIndexMapping;
91f62f0eSThomas Raoux    for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) {
91f62f0eSThomas Raoux      warpInput.push_back(newWarpOp.getResult(retIdx));
91f62f0eSThomas Raoux      argIndexMapping[escapingValues[i]] = warpInputType.size();
91f62f0eSThomas Raoux      warpInputType.push_back(inputTypes[i]);
91f62f0eSThomas Raoux    }
76cf33daSThomas Raoux    auto innerWarp = rewriter.create<WarpExecuteOnLane0Op>(
91f62f0eSThomas Raoux        newWarpOp.getLoc(), newForOp.getResultTypes(), newWarpOp.getLaneid(),
91f62f0eSThomas Raoux        newWarpOp.getWarpSize(), warpInput, warpInputType);
76cf33daSThomas Raoux
76cf33daSThomas Raoux    SmallVector<Value> argMapping;
76cf33daSThomas Raoux    argMapping.push_back(newForOp.getInductionVar());
76cf33daSThomas Raoux    for (Value args : innerWarp.getBody()->getArguments()) {
76cf33daSThomas Raoux      argMapping.push_back(args);
76cf33daSThomas Raoux    }
91f62f0eSThomas Raoux    argMapping.resize(forOp.getBody()->getNumArguments());
76cf33daSThomas Raoux    SmallVector<Value> yieldOperands;
76cf33daSThomas Raoux    for (Value operand : forOp.getBody()->getTerminator()->getOperands())
76cf33daSThomas Raoux      yieldOperands.push_back(operand);
76cf33daSThomas Raoux    rewriter.eraseOp(forOp.getBody()->getTerminator());
76cf33daSThomas Raoux    rewriter.mergeBlocks(forOp.getBody(), innerWarp.getBody(), argMapping);
b613a540SMatthias Springer    rewriter.setInsertionPointToEnd(innerWarp.getBody());
ecaf2c33SPetr Kurapov    rewriter.create<gpu::YieldOp>(innerWarp.getLoc(), yieldOperands);
76cf33daSThomas Raoux    rewriter.setInsertionPointAfter(innerWarp);
d343cdd5SThomas Raoux    if (!innerWarp.getResults().empty())
76cf33daSThomas Raoux      rewriter.create<scf::YieldOp>(forOp.getLoc(), innerWarp.getResults());
76cf33daSThomas Raoux    rewriter.eraseOp(forOp);
76cf33daSThomas Raoux    // Replace the warpOp result coming from the original ForOp.
76cf33daSThomas Raoux    for (const auto &res : llvm::enumerate(resultIdx)) {
7ecc921dSMatthias Springer      rewriter.replaceAllUsesWith(newWarpOp.getResult(res.value()),
7ecc921dSMatthias Springer                                  newForOp.getResult(res.index()));
91f62f0eSThomas Raoux      newForOp->setOperand(res.index() + 3, newWarpOp.getResult(res.value()));
76cf33daSThomas Raoux    }
91f62f0eSThomas Raoux    newForOp.walk([&](Operation *op) {
91f62f0eSThomas Raoux      for (OpOperand &operand : op->getOpOperands()) {
91f62f0eSThomas Raoux        auto it = argIndexMapping.find(operand.get());
91f62f0eSThomas Raoux        if (it == argIndexMapping.end())
91f62f0eSThomas Raoux          continue;
91f62f0eSThomas Raoux        operand.set(innerWarp.getBodyRegion().getArgument(it->second));
91f62f0eSThomas Raoux      }
91f62f0eSThomas Raoux    });
98dcd98aSQuinn Dawkins
98dcd98aSQuinn Dawkins    // Finally, hoist out any now uniform code from the inner warp op.
98dcd98aSQuinn Dawkins    mlir::vector::moveScalarUniformCode(innerWarp);
76cf33daSThomas Raoux    return success();
76cf33daSThomas Raoux  }
91f62f0eSThomas Raoux
91f62f0eSThomas Raouxprivate:
91f62f0eSThomas Raoux  DistributionMapFn distributionMapFn;
76cf33daSThomas Raoux};
76cf33daSThomas Raoux
087aba4fSThomas Raoux/// A pattern that extracts vector.reduction ops from a WarpExecuteOnLane0Op.
2f925d75SKunwar Grover/// The vector is reduced in parallel. Currently limited to vector size
2f925d75SKunwar Grover/// matching the warpOp size. E.g.:
087aba4fSThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
087aba4fSThomas Raoux///   %0 = "some_def"() : () -> (vector<32xf32>)
087aba4fSThomas Raoux///   %1 = vector.reduction "add", %0 : vector<32xf32> into f32
ecaf2c33SPetr Kurapov///   gpu.yield %1 : f32
087aba4fSThomas Raoux/// }
087aba4fSThomas Raoux/// ```
087aba4fSThomas Raoux/// is lowered to:
087aba4fSThomas Raoux/// ```
ecaf2c33SPetr Kurapov/// %0 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
087aba4fSThomas Raoux///   %1 = "some_def"() : () -> (vector<32xf32>)
ecaf2c33SPetr Kurapov///   gpu.yield %1 : vector<32xf32>
087aba4fSThomas Raoux/// }
9816edc9SCullen Rhodes/// %a = vector.extract %0[0] : f32 from vector<1xf32>
6834803cSThomas Raoux/// %r = ("warp.reduction %a")
087aba4fSThomas Raoux/// ```
*bc29fc93SPetr Kurapovstruct WarpOpReduction : public WarpDistributionPattern {
6834803cSThomas Raoux  WarpOpReduction(MLIRContext *context,
6834803cSThomas Raoux                  DistributedReductionFn distributedReductionFn,
6834803cSThomas Raoux                  PatternBenefit benefit = 1)
*bc29fc93SPetr Kurapov      : WarpDistributionPattern(context, benefit),
61f06774SMehdi Amini        distributedReductionFn(std::move(distributedReductionFn)) {}
087aba4fSThomas Raoux
087aba4fSThomas Raoux  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
087aba4fSThomas Raoux                                PatternRewriter &rewriter) const override {
971b8525SJakub Kuderski    OpOperand *yieldOperand =
971b8525SJakub Kuderski        getWarpResult(warpOp, llvm::IsaPred<vector::ReductionOp>);
087aba4fSThomas Raoux    if (!yieldOperand)
087aba4fSThomas Raoux      return failure();
087aba4fSThomas Raoux
087aba4fSThomas Raoux    auto reductionOp =
087aba4fSThomas Raoux        cast<vector::ReductionOp>(yieldOperand->get().getDefiningOp());
5550c821STres Popp    auto vectorType = cast<VectorType>(reductionOp.getVector().getType());
087aba4fSThomas Raoux    // Only rank 1 vectors supported.
087aba4fSThomas Raoux    if (vectorType.getRank() != 1)
087aba4fSThomas Raoux      return rewriter.notifyMatchFailure(
087aba4fSThomas Raoux          warpOp, "Only rank 1 reductions can be distributed.");
087aba4fSThomas Raoux    // Only warp_size-sized vectors supported.
0660f3c5SThomas Raoux    if (vectorType.getShape()[0] % warpOp.getWarpSize() != 0)
087aba4fSThomas Raoux      return rewriter.notifyMatchFailure(
087aba4fSThomas Raoux          warpOp, "Reduction vector dimension must match was size.");
f41abcdaSThomas Raoux    if (!reductionOp.getType().isIntOrFloat())
087aba4fSThomas Raoux      return rewriter.notifyMatchFailure(
f41abcdaSThomas Raoux          warpOp, "Reduction distribution currently only supports floats and "
f41abcdaSThomas Raoux                  "integer types.");
087aba4fSThomas Raoux
0660f3c5SThomas Raoux    int64_t numElements = vectorType.getShape()[0] / warpOp.getWarpSize();
087aba4fSThomas Raoux    // Return vector that will be reduced from the WarpExecuteOnLane0Op.
087aba4fSThomas Raoux    unsigned operandIndex = yieldOperand->getOperandNumber();
087aba4fSThomas Raoux    SmallVector<Value> yieldValues = {reductionOp.getVector()};
0660f3c5SThomas Raoux    SmallVector<Type> retTypes = {
0660f3c5SThomas Raoux        VectorType::get({numElements}, reductionOp.getType())};
ffa7384fSThomas Raoux    if (reductionOp.getAcc()) {
ffa7384fSThomas Raoux      yieldValues.push_back(reductionOp.getAcc());
ffa7384fSThomas Raoux      retTypes.push_back(reductionOp.getAcc().getType());
ffa7384fSThomas Raoux    }
d7d6443dSThomas Raoux    SmallVector<size_t> newRetIndices;
087aba4fSThomas Raoux    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
d7d6443dSThomas Raoux        rewriter, warpOp, yieldValues, retTypes, newRetIndices);
087aba4fSThomas Raoux    rewriter.setInsertionPointAfter(newWarpOp);
087aba4fSThomas Raoux
d2061530Sstanley-nod    // Obtain data to reduce for a single lane.
d7d6443dSThomas Raoux    Value laneValVec = newWarpOp.getResult(newRetIndices[0]);
d2061530Sstanley-nod    // Distribute and reduce across threads.
0660f3c5SThomas Raoux    Value fullReduce =
d2061530Sstanley-nod        distributedReductionFn(reductionOp.getLoc(), rewriter, laneValVec,
6834803cSThomas Raoux                               reductionOp.getKind(), newWarpOp.getWarpSize());
ffa7384fSThomas Raoux    if (reductionOp.getAcc()) {
ffa7384fSThomas Raoux      fullReduce = vector::makeArithReduction(
ffa7384fSThomas Raoux          rewriter, reductionOp.getLoc(), reductionOp.getKind(), fullReduce,
ffa7384fSThomas Raoux          newWarpOp.getResult(newRetIndices[1]));
ffa7384fSThomas Raoux    }
7ecc921dSMatthias Springer    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIndex), fullReduce);
087aba4fSThomas Raoux    return success();
087aba4fSThomas Raoux  }
6834803cSThomas Raoux
6834803cSThomas Raouxprivate:
6834803cSThomas Raoux  DistributedReductionFn distributedReductionFn;
087aba4fSThomas Raoux};
087aba4fSThomas Raoux
d02f10d9SThomas Raoux} // namespace
d02f10d9SThomas Raoux
d02f10d9SThomas Raouxvoid mlir::vector::populateWarpExecuteOnLane0OpToScfForPattern(
d02f10d9SThomas Raoux    RewritePatternSet &patterns,
27cc31b6SNicolas Vasilache    const WarpExecuteOnLane0LoweringOptions &options, PatternBenefit benefit) {
4abb9e5dSThomas Raoux  patterns.add<WarpOpToScfIfPattern>(patterns.getContext(), options, benefit);
d02f10d9SThomas Raoux}
ed0288f7SThomas Raoux
ed0288f7SThomas Raouxvoid mlir::vector::populateDistributeTransferWriteOpPatterns(
27cc31b6SNicolas Vasilache    RewritePatternSet &patterns, const DistributionMapFn &distributionMapFn,
80636227SJakub Kuderski    unsigned maxNumElementsToExtract, PatternBenefit benefit) {
27cc31b6SNicolas Vasilache  patterns.add<WarpOpTransferWrite>(patterns.getContext(), distributionMapFn,
80636227SJakub Kuderski                                    maxNumElementsToExtract, benefit);
ed0288f7SThomas Raoux}
ed0288f7SThomas Raoux
76cf33daSThomas Raouxvoid mlir::vector::populatePropagateWarpVectorDistributionPatterns(
91f62f0eSThomas Raoux    RewritePatternSet &patterns, const DistributionMapFn &distributionMapFn,
df49a97aSQuinn Dawkins    const WarpShuffleFromIdxFn &warpShuffleFromIdxFn, PatternBenefit benefit,
df49a97aSQuinn Dawkins    PatternBenefit readBenefit) {
df49a97aSQuinn Dawkins  patterns.add<WarpOpTransferRead>(patterns.getContext(), readBenefit);
2f925d75SKunwar Grover  patterns.add<WarpOpElementwise, WarpOpDeadResult, WarpOpBroadcast,
2f925d75SKunwar Grover               WarpOpShapeCast, WarpOpExtract, WarpOpForwardOperand,
2f925d75SKunwar Grover               WarpOpConstant, WarpOpExtractElement, WarpOpInsertElement,
2f925d75SKunwar Grover               WarpOpInsertScalar, WarpOpInsert, WarpOpCreateMask>(
df49a97aSQuinn Dawkins      patterns.getContext(), benefit);
2f925d75SKunwar Grover  patterns.add<WarpOpExtractScalar>(patterns.getContext(), warpShuffleFromIdxFn,
2f925d75SKunwar Grover                                    benefit);
91f62f0eSThomas Raoux  patterns.add<WarpOpScfForOp>(patterns.getContext(), distributionMapFn,
91f62f0eSThomas Raoux                               benefit);
76cf33daSThomas Raoux}
76cf33daSThomas Raoux
6834803cSThomas Raouxvoid mlir::vector::populateDistributeReduction(
6834803cSThomas Raoux    RewritePatternSet &patterns,
27cc31b6SNicolas Vasilache    const DistributedReductionFn &distributedReductionFn,
27cc31b6SNicolas Vasilache    PatternBenefit benefit) {
27cc31b6SNicolas Vasilache  patterns.add<WarpOpReduction>(patterns.getContext(), distributedReductionFn,
27cc31b6SNicolas Vasilache                                benefit);
087aba4fSThomas Raoux}
087aba4fSThomas Raoux
*bc29fc93SPetr Kurapov/// Helper to know if an op can be hoisted out of the region.
*bc29fc93SPetr Kurapovstatic bool canBeHoisted(Operation *op,
*bc29fc93SPetr Kurapov                         function_ref<bool(Value)> definedOutside) {
*bc29fc93SPetr Kurapov  return llvm::all_of(op->getOperands(), definedOutside) &&
*bc29fc93SPetr Kurapov         isMemoryEffectFree(op) && op->getNumRegions() == 0;
*bc29fc93SPetr Kurapov}
*bc29fc93SPetr Kurapov
ed0288f7SThomas Raouxvoid mlir::vector::moveScalarUniformCode(WarpExecuteOnLane0Op warpOp) {
ed0288f7SThomas Raoux  Block *body = warpOp.getBody();
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux  // Keep track of the ops we want to hoist.
ed0288f7SThomas Raoux  llvm::SmallSetVector<Operation *, 8> opsToMove;
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux  // Helper to check if a value is or will be defined outside of the region.
ed0288f7SThomas Raoux  auto isDefinedOutsideOfBody = [&](Value value) {
ed0288f7SThomas Raoux    auto *definingOp = value.getDefiningOp();
ed0288f7SThomas Raoux    return (definingOp && opsToMove.count(definingOp)) ||
ed0288f7SThomas Raoux           warpOp.isDefinedOutsideOfRegion(value);
ed0288f7SThomas Raoux  };
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux  // Do not use walk here, as we do not want to go into nested regions and hoist
ed0288f7SThomas Raoux  // operations from there.
ed0288f7SThomas Raoux  for (auto &op : body->without_terminator()) {
ed0288f7SThomas Raoux    bool hasVectorResult = llvm::any_of(op.getResults(), [](Value result) {
5550c821STres Popp      return isa<VectorType>(result.getType());
ed0288f7SThomas Raoux    });
ed0288f7SThomas Raoux    if (!hasVectorResult && canBeHoisted(&op, isDefinedOutsideOfBody))
ed0288f7SThomas Raoux      opsToMove.insert(&op);
ed0288f7SThomas Raoux  }
ed0288f7SThomas Raoux
ed0288f7SThomas Raoux  // Move all the ops marked as uniform outside of the region.
ed0288f7SThomas Raoux  for (Operation *op : opsToMove)
ed0288f7SThomas Raoux    op->moveBefore(warpOp);
ed0288f7SThomas Raoux}