xref: /llvm-project/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp (revision 84cc1865ef9202af39404ff4524a9b13df80cfc1)
189bb0caeSGuray Ozen //===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
289bb0caeSGuray Ozen //
389bb0caeSGuray Ozen // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
489bb0caeSGuray Ozen // See https://llvm.org/LICENSE.txt for license information.
589bb0caeSGuray Ozen // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
689bb0caeSGuray Ozen //
789bb0caeSGuray Ozen //===----------------------------------------------------------------------===//
889bb0caeSGuray Ozen 
989bb0caeSGuray Ozen #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
1089bb0caeSGuray Ozen 
11888717e8SNicolas Vasilache #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
12888717e8SNicolas Vasilache #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
13888717e8SNicolas Vasilache #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
14c59465e1SNicolas Vasilache #include "mlir/Dialect/Affine/IR/AffineOps.h"
1589bb0caeSGuray Ozen #include "mlir/Dialect/Arith/IR/Arith.h"
16c59465e1SNicolas Vasilache #include "mlir/Dialect/Func/IR/FuncOps.h"
1789bb0caeSGuray Ozen #include "mlir/Dialect/GPU/IR/GPUDialect.h"
1890ecfa2aSNicolas Vasilache #include "mlir/Dialect/GPU/TransformOps/Utils.h"
19888717e8SNicolas Vasilache #include "mlir/Dialect/GPU/Transforms/Passes.h"
20888717e8SNicolas Vasilache #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
219ab34689SAlex Zinenko #include "mlir/Dialect/MemRef/IR/MemRef.h"
22beaffb04SGuray Ozen #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
2389bb0caeSGuray Ozen #include "mlir/Dialect/SCF/IR/SCF.h"
2489bb0caeSGuray Ozen #include "mlir/Dialect/Transform/IR/TransformDialect.h"
255a9bdd85SOleksandr "Alex" Zinenko #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
26c59465e1SNicolas Vasilache #include "mlir/Dialect/Utils/IndexingUtils.h"
279ab34689SAlex Zinenko #include "mlir/Dialect/Vector/IR/VectorOps.h"
28ff8775f3SQuinn Dawkins #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
29c59465e1SNicolas Vasilache #include "mlir/IR/AffineExpr.h"
30c59465e1SNicolas Vasilache #include "mlir/IR/Builders.h"
31768615bbSNicolas Vasilache #include "mlir/IR/BuiltinAttributes.h"
324d67b278SJeff Niu #include "mlir/IR/IRMapping.h"
33c59465e1SNicolas Vasilache #include "mlir/IR/MLIRContext.h"
34aafb52d7SNicolas Vasilache #include "mlir/IR/OpDefinition.h"
35c59465e1SNicolas Vasilache #include "mlir/IR/Visitors.h"
36aafb52d7SNicolas Vasilache #include "mlir/Support/LLVM.h"
37888717e8SNicolas Vasilache #include "mlir/Transforms/DialectConversion.h"
38768615bbSNicolas Vasilache #include "llvm/ADT/STLExtras.h"
39768615bbSNicolas Vasilache #include "llvm/ADT/SmallVector.h"
409ab34689SAlex Zinenko #include "llvm/ADT/TypeSwitch.h"
41768615bbSNicolas Vasilache #include "llvm/Support/Debug.h"
4244e6318cSNicolas Vasilache #include "llvm/Support/ErrorHandling.h"
4392f088d3SNicolas Vasilache #include <type_traits>
4489bb0caeSGuray Ozen 
4589bb0caeSGuray Ozen using namespace mlir;
4689bb0caeSGuray Ozen using namespace mlir::gpu;
4789bb0caeSGuray Ozen using namespace mlir::transform;
48c59465e1SNicolas Vasilache using namespace mlir::transform::gpu;
4989bb0caeSGuray Ozen 
50768615bbSNicolas Vasilache #define DEBUG_TYPE "gpu-transforms"
519ab34689SAlex Zinenko #define DEBUG_TYPE_ALIAS "gpu-transforms-alias"
52768615bbSNicolas Vasilache 
53768615bbSNicolas Vasilache #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
54768615bbSNicolas Vasilache #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
559ab34689SAlex Zinenko #define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
569ab34689SAlex Zinenko 
579ab34689SAlex Zinenko //===----------------------------------------------------------------------===//
58888717e8SNicolas Vasilache // Apply...ConversionPatternsOp
59888717e8SNicolas Vasilache //===----------------------------------------------------------------------===//
60888717e8SNicolas Vasilache 
/// Install the GPU-to-NVVM dialect conversion patterns, plus the type
/// conversions they rely on (GPU memory-space attributes and MMA matrix
/// types), into `typeConverter`/`patterns`.
/// Precondition: `typeConverter` is an LLVMTypeConverter, enforced by the
/// companion verifyTypeConverter below.
void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  // NVVM uses alloca in the default address space to represent private
  // memory allocations, so drop private annotations. NVVM uses address
  // space 3 for shared memory. NVVM uses the default address space to
  // represent global memory.
  // Used in populateGpuToNVVMConversionPatterns so attaching here for now.
  // TODO: We should have a single to_nvvm_type_converter.
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) -> unsigned {
        switch (space) {
        case AddressSpace::Global:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kGlobalMemorySpace);
        case AddressSpace::Workgroup:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kSharedMemorySpace);
        case AddressSpace::Private:
          // Private memory maps to the default (0) address space.
          return 0;
        }
        llvm_unreachable("unknown address space enum value");
        return 0;
      });
  // Used in GPUToNVVM/WmmaOpsToNvvm.cpp so attaching here for now.
  // TODO: We should have a single to_nvvm_type_converter.
  llvmTypeConverter.addConversion(
      [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns);
}
91888717e8SNicolas Vasilache 
92888717e8SNicolas Vasilache LogicalResult
93888717e8SNicolas Vasilache transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
94888717e8SNicolas Vasilache     transform::TypeConverterBuilderOpInterface builder) {
95888717e8SNicolas Vasilache   if (builder.getTypeConverterType() != "LLVMTypeConverter")
96888717e8SNicolas Vasilache     return emitOpError("expected LLVMTypeConverter");
97888717e8SNicolas Vasilache   return success();
98888717e8SNicolas Vasilache }
99888717e8SNicolas Vasilache 
100888717e8SNicolas Vasilache void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
101888717e8SNicolas Vasilache     TypeConverter &typeConverter, RewritePatternSet &patterns) {
102888717e8SNicolas Vasilache   auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
103888717e8SNicolas Vasilache   populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
104888717e8SNicolas Vasilache }
105888717e8SNicolas Vasilache 
106888717e8SNicolas Vasilache LogicalResult
107888717e8SNicolas Vasilache transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
108888717e8SNicolas Vasilache     transform::TypeConverterBuilderOpInterface builder) {
109888717e8SNicolas Vasilache   if (builder.getTypeConverterType() != "LLVMTypeConverter")
110888717e8SNicolas Vasilache     return emitOpError("expected LLVMTypeConverter");
111888717e8SNicolas Vasilache   return success();
112888717e8SNicolas Vasilache }
113888717e8SNicolas Vasilache 
114888717e8SNicolas Vasilache void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
115888717e8SNicolas Vasilache     populatePatterns(TypeConverter &typeConverter,
116888717e8SNicolas Vasilache                      RewritePatternSet &patterns) {
117888717e8SNicolas Vasilache   auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
118888717e8SNicolas Vasilache   populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
119888717e8SNicolas Vasilache }
120888717e8SNicolas Vasilache 
121888717e8SNicolas Vasilache LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
122888717e8SNicolas Vasilache     verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
123888717e8SNicolas Vasilache   if (builder.getTypeConverterType() != "LLVMTypeConverter")
124888717e8SNicolas Vasilache     return emitOpError("expected LLVMTypeConverter");
125888717e8SNicolas Vasilache   return success();
126888717e8SNicolas Vasilache }
127888717e8SNicolas Vasilache 
128888717e8SNicolas Vasilache //===----------------------------------------------------------------------===//
129888717e8SNicolas Vasilache // Apply...PatternsOp
//===----------------------------------------------------------------------===//
131888717e8SNicolas Vasilache 
/// Install the generic GPU rewrite patterns (see populateGpuRewritePatterns).
void ApplyGPURewritePatternsOp::populatePatterns(RewritePatternSet &patterns) {
  populateGpuRewritePatterns(patterns);
}
135888717e8SNicolas Vasilache 
136888717e8SNicolas Vasilache //===----------------------------------------------------------------------===//
137ff8775f3SQuinn Dawkins // ApplyUnrollVectorsSubgroupMmaOp
138ff8775f3SQuinn Dawkins //===----------------------------------------------------------------------===//
139ff8775f3SQuinn Dawkins 
140ff8775f3SQuinn Dawkins /// Pick an unrolling order that will allow tensorcore operation to reuse LHS
141ff8775f3SQuinn Dawkins /// register.
142ff8775f3SQuinn Dawkins static std::optional<SmallVector<int64_t>>
143ff8775f3SQuinn Dawkins gpuMmaUnrollOrder(vector::ContractionOp contract) {
144ff8775f3SQuinn Dawkins   SmallVector<int64_t> order;
145ff8775f3SQuinn Dawkins   // First make reduction the outer dimensions.
146ff8775f3SQuinn Dawkins   for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
147ff8775f3SQuinn Dawkins     if (vector::isReductionIterator(iter)) {
148ff8775f3SQuinn Dawkins       order.push_back(index);
149ff8775f3SQuinn Dawkins     }
150ff8775f3SQuinn Dawkins   }
151ff8775f3SQuinn Dawkins 
152ff8775f3SQuinn Dawkins   llvm::SmallDenseSet<int64_t> dims;
153ff8775f3SQuinn Dawkins   for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
1541609f1c2Slong.chen     dims.insert(cast<AffineDimExpr>(expr).getPosition());
155ff8775f3SQuinn Dawkins   }
156ff8775f3SQuinn Dawkins   // Then parallel dimensions that are part of Lhs as we want to re-use Lhs.
157ff8775f3SQuinn Dawkins   for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
158ff8775f3SQuinn Dawkins     if (vector::isParallelIterator(iter) && dims.count(index)) {
159ff8775f3SQuinn Dawkins       order.push_back(index);
160ff8775f3SQuinn Dawkins     }
161ff8775f3SQuinn Dawkins   }
162ff8775f3SQuinn Dawkins   // Then the remaining parallel loops.
163ff8775f3SQuinn Dawkins   for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
164ff8775f3SQuinn Dawkins     if (vector::isParallelIterator(iter) && !dims.count(index)) {
165ff8775f3SQuinn Dawkins       order.push_back(index);
166ff8775f3SQuinn Dawkins     }
167ff8775f3SQuinn Dawkins   }
168ff8775f3SQuinn Dawkins   return order;
169ff8775f3SQuinn Dawkins }
170ff8775f3SQuinn Dawkins 
171ff8775f3SQuinn Dawkins /// Returns the target vector size for the target operation based on the native
172ff8775f3SQuinn Dawkins /// vector size specified with `m`, `n`, and `k`.
173ff8775f3SQuinn Dawkins static std::optional<SmallVector<int64_t>>
174ff8775f3SQuinn Dawkins getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) {
175ff8775f3SQuinn Dawkins   if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
176ff8775f3SQuinn Dawkins     int64_t contractRank = contract.getIteratorTypes().size();
177ff8775f3SQuinn Dawkins     if (contractRank < 3)
178ff8775f3SQuinn Dawkins       return std::nullopt;
179ff8775f3SQuinn Dawkins     SmallVector<int64_t> nativeSize(contractRank - 3, 1);
180ff8775f3SQuinn Dawkins     nativeSize.append({m, n, k});
181ff8775f3SQuinn Dawkins     return nativeSize;
182ff8775f3SQuinn Dawkins   }
183ff8775f3SQuinn Dawkins   if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
184ff8775f3SQuinn Dawkins     int64_t writeRank = writeOp.getVectorType().getRank();
185ff8775f3SQuinn Dawkins     if (writeRank < 2)
186ff8775f3SQuinn Dawkins       return std::nullopt;
187ff8775f3SQuinn Dawkins     SmallVector<int64_t> nativeSize(writeRank - 2, 1);
188ff8775f3SQuinn Dawkins     nativeSize.append({m, n});
189ff8775f3SQuinn Dawkins     return nativeSize;
190ff8775f3SQuinn Dawkins   }
191ff8775f3SQuinn Dawkins   if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
192ff8775f3SQuinn Dawkins     // Transfer read ops may need different shapes based on how they are being
193ff8775f3SQuinn Dawkins     // used. For simplicity just match the shape used by the extract strided op.
194ff8775f3SQuinn Dawkins     VectorType sliceType;
195ff8775f3SQuinn Dawkins     for (Operation *users : op->getUsers()) {
196ff8775f3SQuinn Dawkins       auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
197ff8775f3SQuinn Dawkins       if (!extract)
198ff8775f3SQuinn Dawkins         return std::nullopt;
199a5757c5bSChristian Sigg       auto vecType = cast<VectorType>(extract.getResult().getType());
200ff8775f3SQuinn Dawkins       if (sliceType && sliceType != vecType)
201ff8775f3SQuinn Dawkins         return std::nullopt;
202ff8775f3SQuinn Dawkins       sliceType = vecType;
203ff8775f3SQuinn Dawkins     }
204ff8775f3SQuinn Dawkins     return llvm::to_vector(sliceType.getShape());
205ff8775f3SQuinn Dawkins   }
206ff8775f3SQuinn Dawkins   if ((OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)) {
207a5757c5bSChristian Sigg     if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
208ff8775f3SQuinn Dawkins       // TODO: The condition for unrolling elementwise should be restricted
209ff8775f3SQuinn Dawkins       // only to operations that need unrolling (connected to the contract).
210ff8775f3SQuinn Dawkins       if (vecType.getRank() < 2)
211ff8775f3SQuinn Dawkins         return std::nullopt;
212ff8775f3SQuinn Dawkins 
213ff8775f3SQuinn Dawkins       // First check whether there is a slice to infer the shape from. This is
214ff8775f3SQuinn Dawkins       // required for cases where the accumulator type differs from the input
215ff8775f3SQuinn Dawkins       // types, in which case we will see an `arith.ext_` between the contract
216ff8775f3SQuinn Dawkins       // and transfer_read which needs to be unrolled.
217ff8775f3SQuinn Dawkins       VectorType sliceType;
218ff8775f3SQuinn Dawkins       for (Operation *users : op->getUsers()) {
219ff8775f3SQuinn Dawkins         auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
220ff8775f3SQuinn Dawkins         if (!extract)
221ff8775f3SQuinn Dawkins           return std::nullopt;
222a5757c5bSChristian Sigg         auto vecType = cast<VectorType>(extract.getResult().getType());
223ff8775f3SQuinn Dawkins         if (sliceType && sliceType != vecType)
224ff8775f3SQuinn Dawkins           return std::nullopt;
225ff8775f3SQuinn Dawkins         sliceType = vecType;
226ff8775f3SQuinn Dawkins       }
227ff8775f3SQuinn Dawkins       if (sliceType)
228ff8775f3SQuinn Dawkins         return llvm::to_vector(sliceType.getShape());
229ff8775f3SQuinn Dawkins 
230ff8775f3SQuinn Dawkins       // Else unroll for trailing elementwise.
231ff8775f3SQuinn Dawkins       SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
232ff8775f3SQuinn Dawkins       // Map elementwise ops to the output shape.
233ff8775f3SQuinn Dawkins       nativeSize.append({m, n});
234ff8775f3SQuinn Dawkins       return nativeSize;
235ff8775f3SQuinn Dawkins     }
236ff8775f3SQuinn Dawkins   }
237ff8775f3SQuinn Dawkins   return std::nullopt;
238ff8775f3SQuinn Dawkins }
239ff8775f3SQuinn Dawkins 
240ff8775f3SQuinn Dawkins void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
241ff8775f3SQuinn Dawkins     RewritePatternSet &patterns) {
242ff8775f3SQuinn Dawkins   auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
243ff8775f3SQuinn Dawkins     auto contract = dyn_cast<vector::ContractionOp>(op);
244ff8775f3SQuinn Dawkins     if (!contract)
245ff8775f3SQuinn Dawkins       return std::nullopt;
246ff8775f3SQuinn Dawkins     return gpuMmaUnrollOrder(contract);
247ff8775f3SQuinn Dawkins   };
248ff8775f3SQuinn Dawkins 
249ff8775f3SQuinn Dawkins   int64_t m = getM();
250ff8775f3SQuinn Dawkins   int64_t n = getN();
251ff8775f3SQuinn Dawkins   int64_t k = getK();
252ff8775f3SQuinn Dawkins   auto nativeShapeFn =
253ff8775f3SQuinn Dawkins       [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> {
254ff8775f3SQuinn Dawkins     return getSubgroupMmaNativeVectorSize(op, m, n, k);
255ff8775f3SQuinn Dawkins   };
256ff8775f3SQuinn Dawkins   vector::populateVectorUnrollPatterns(
257ff8775f3SQuinn Dawkins       patterns, vector::UnrollVectorOptions()
258ff8775f3SQuinn Dawkins                     .setNativeShapeFn(nativeShapeFn)
259ff8775f3SQuinn Dawkins                     .setUnrollTraversalOrderFn(unrollOrder));
260ff8775f3SQuinn Dawkins }
261ff8775f3SQuinn Dawkins 
262ff8775f3SQuinn Dawkins //===----------------------------------------------------------------------===//
2639ab34689SAlex Zinenko // EliminateBarriersOp
2649ab34689SAlex Zinenko //===----------------------------------------------------------------------===//
2659ab34689SAlex Zinenko 
/// Install the barrier-elimination patterns
/// (see populateGpuEliminateBarriersPatterns).
void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
  populateGpuEliminateBarriersPatterns(patterns);
}
2699ab34689SAlex Zinenko 
2709ab34689SAlex Zinenko //===----------------------------------------------------------------------===//
2719ab34689SAlex Zinenko // Block and thread mapping utilities.
2729ab34689SAlex Zinenko //===----------------------------------------------------------------------===//
273768615bbSNicolas Vasilache 
namespace {
/// Local types used for mapping verification.
/// Empty tag types selecting, at compile time, which family of mapping
/// attributes checkMappingAttributeTypes/verifyGpuMapping should accept.
struct MappingKind {};
/// Tag for block-level (grid) mappings.
struct BlockMappingKind : MappingKind {};
/// Tag for thread/warp/warpgroup-level mappings.
struct ThreadMappingKind : MappingKind {};
} // namespace
28092f088d3SNicolas Vasilache 
281aafb52d7SNicolas Vasilache static DiagnosedSilenceableFailure
282c59465e1SNicolas Vasilache definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
283c59465e1SNicolas Vasilache                       Operation *target, const Twine &message) {
284aafb52d7SNicolas Vasilache   if (transformOp.has_value())
285c59465e1SNicolas Vasilache     return transformOp->emitDefiniteFailure() << message;
286c59465e1SNicolas Vasilache   return emitDefiniteFailure(target, message);
287aafb52d7SNicolas Vasilache }
288aafb52d7SNicolas Vasilache 
/// Check if given mapping attributes are one of the desired attributes
/// for the requested MappingKindType (BlockMappingKind or ThreadMappingKind).
/// Emits a definite failure (attached to `transformOp` when present, else to
/// `forallOp`) when: the mapping attribute is missing, mapping families are
/// mixed, the family does not match MappingKindType, a mapping id is used
/// twice, or linear and non-linear mapping modes are mixed.
/// NOTE: diagnostic strings here are matched by lit tests; do not reword.
template <typename MappingKindType>
static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                           scf::ForallOp forallOp) {
  if (!forallOp.getMapping().has_value()) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute");
  }

  // Detect which mapping families are present among the attributes.
  bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
                                      llvm::IsaPred<GPUBlockMappingAttr>);
  bool hasWarpgroupMapping = llvm::any_of(
      forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
  bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPUWarpMappingAttr>);
  bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                       llvm::IsaPred<GPUThreadMappingAttr>);
  // At most one family may be used on a single forall; nesting is the way to
  // combine levels.
  int64_t countMappingTypes = 0;
  countMappingTypes += hasBlockMapping ? 1 : 0;
  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
  countMappingTypes += hasWarpMapping ? 1 : 0;
  countMappingTypes += hasThreadMapping ? 1 : 0;
  if (countMappingTypes > 1) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix different mapping types, use nesting");
  }
  // The family present must match the requested MappingKindType.
  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
      !hasBlockMapping) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "scf.forall op requires a mapping attribute of kind 'block'");
  }
  // ThreadMappingKind accepts thread, warp, and warpgroup mappings.
  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
      !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute "
                                 "of kind 'thread' or 'warp'");
  }

  // Each mapping id (e.g. DimX) may be used by at most one loop.
  DenseSet<Attribute> seen;
  for (Attribute map : forallOp.getMapping()->getValue()) {
    if (seen.contains(map)) {
      return definiteFailureHelper(
          transformOp, forallOp,
          "duplicate attribute, cannot map different loops "
          "to the same mapping id");
    }
    seen.insert(map);
  }

  // Linear and non-linear (3-D grid) mapping modes cannot be combined.
  auto isLinear = [](Attribute a) {
    return cast<DeviceMappingAttrInterface>(a).isLinearMapping();
  };
  if (llvm::any_of(forallOp.getMapping()->getValue(), isLinear) &&
      !llvm::all_of(forallOp.getMapping()->getValue(), isLinear)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix linear and non-linear mapping modes");
  }

  return DiagnosedSilenceableFailure::success();
}
353beaffb04SGuray Ozen 
/// Verify that `forallOp` can be mapped to GPU ids of kind MappingKindType:
/// mapping attributes are well-formed (checkMappingAttributeTypes), the loop
/// is normalized and bufferized (no results), its rank does not exceed what
/// the mapping mode supports, and its upper bounds are static.
template <typename MappingKindType>
static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
                 scf::ForallOp forallOp) {
  // Check the types of the mapping attributes match.
  DiagnosedSilenceableFailure typeRes =
      checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
  if (!typeRes.succeeded())
    return typeRes;

  // Perform other non-types verifications.
  if (!forallOp.isNormalized())
    return definiteFailureHelper(transformOp, forallOp,
                                 "unsupported non-normalized loops");
  if (forallOp.getNumResults() > 0)
    return definiteFailureHelper(transformOp, forallOp,
                                 "only bufferized scf.forall can be mapped");
  // The linear mode supports more dimensions than the 3-D (x, y, z) mode.
  bool useLinearMapping = cast<DeviceMappingAttrInterface>(
                              forallOp.getMapping()->getValue().front())
                              .isLinearMapping();
  // TODO: This would be more natural with support for Optional<EnumParameter>
  // in GPUDeviceMappingAttr.
  int64_t maxNumMappingsSupported =
      useLinearMapping ? (getMaxEnumValForMappingId() -
                          static_cast<uint64_t>(MappingId::DimZ))
                       : 3;
  if (forallOp.getRank() > maxNumMappingsSupported) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall with rank > ")
           << maxNumMappingsSupported
           << " does not lower for the specified mapping attribute type";
  }
  // All upper bounds must be static constants.
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  // NOTE(review): the isNormalized() re-check is redundant — a non-normalized
  // loop already bailed above, so only the static-size half can trigger here.
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  return DiagnosedSilenceableFailure::success();
}
395aafb52d7SNicolas Vasilache 
/// Struct to return the result of the rewrite of a forall operation.
struct ForallRewriteResult {
  // Sizes (one per mapped dimension) used for the mapping.
  SmallVector<int64_t> mappingSizes;
  // The id values (one per mapped dimension) the loop ivs were mapped to.
  SmallVector<Value> mappingIds;
};
40189bb0caeSGuray Ozen 
402c59465e1SNicolas Vasilache /// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
403c59465e1SNicolas Vasilache template <typename OpTy, typename OperationOrBlock>
404c59465e1SNicolas Vasilache static void
405c59465e1SNicolas Vasilache replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
406c59465e1SNicolas Vasilache                             OperationOrBlock *parent, Value replacement,
407c59465e1SNicolas Vasilache                             ArrayRef<int64_t> availableMappingSizes) {
408c59465e1SNicolas Vasilache   parent->walk([&](OpTy idOp) {
409c59465e1SNicolas Vasilache     if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
410c59465e1SNicolas Vasilache       rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
411c59465e1SNicolas Vasilache   });
412c59465e1SNicolas Vasilache }
413c59465e1SNicolas Vasilache 
414c59465e1SNicolas Vasilache static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
415768615bbSNicolas Vasilache     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
41644e6318cSNicolas Vasilache     scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
41744e6318cSNicolas Vasilache     ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) {
418c59465e1SNicolas Vasilache   LDBG("--start rewriteOneForallCommonImpl");
419beaffb04SGuray Ozen 
420768615bbSNicolas Vasilache   // Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
42144e6318cSNicolas Vasilache   auto numParallelIterations =
42244e6318cSNicolas Vasilache       getConstantIntValues(forallOp.getMixedUpperBound());
42344e6318cSNicolas Vasilache   assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
42444e6318cSNicolas Vasilache          "requires statically sized, normalized forall op");
42544e6318cSNicolas Vasilache   SmallVector<int64_t> tmpMappingSizes = numParallelIterations.value();
42644e6318cSNicolas Vasilache   SetVector<Attribute> forallMappingAttrs;
42744e6318cSNicolas Vasilache   forallMappingAttrs.insert(forallOp.getMapping()->getValue().begin(),
42844e6318cSNicolas Vasilache                             forallOp.getMapping()->getValue().end());
42944e6318cSNicolas Vasilache   auto comparator = [](Attribute a, Attribute b) -> bool {
43044e6318cSNicolas Vasilache     return cast<DeviceMappingAttrInterface>(a).getMappingId() <
43144e6318cSNicolas Vasilache            cast<DeviceMappingAttrInterface>(b).getMappingId();
43244e6318cSNicolas Vasilache   };
43344e6318cSNicolas Vasilache 
43444e6318cSNicolas Vasilache   // Step 1.b. In the linear case, compute the max mapping to avoid needlessly
43544e6318cSNicolas Vasilache   // mapping all dimensions. In the 3-D mapping case we need to map all
43644e6318cSNicolas Vasilache   // dimensions.
437fab2bb8bSJustin Lebar   DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
438fab2bb8bSJustin Lebar       *llvm::max_element(forallMappingAttrs, comparator));
43944e6318cSNicolas Vasilache   DeviceMappingAttrInterface maxLinearMapping;
44044e6318cSNicolas Vasilache   if (maxMapping.isLinearMapping())
44144e6318cSNicolas Vasilache     maxLinearMapping = maxMapping;
442c59465e1SNicolas Vasilache   for (auto attr : gpuIdBuilder.mappingAttributes) {
44344e6318cSNicolas Vasilache     // If attr overflows, just skip.
44444e6318cSNicolas Vasilache     if (maxLinearMapping && comparator(maxLinearMapping, attr))
445768615bbSNicolas Vasilache       continue;
44644e6318cSNicolas Vasilache     // Try to insert. If element was already present, just continue.
44744e6318cSNicolas Vasilache     if (!forallMappingAttrs.insert(attr))
44844e6318cSNicolas Vasilache       continue;
44944e6318cSNicolas Vasilache     // Otherwise, we have a new insertion without a size -> use size 1.
450768615bbSNicolas Vasilache     tmpMappingSizes.push_back(1);
4516663f347SGuray Ozen   }
452c59465e1SNicolas Vasilache   LLVM_DEBUG(
453c59465e1SNicolas Vasilache       llvm::interleaveComma(
454c59465e1SNicolas Vasilache           tmpMappingSizes,
455c59465e1SNicolas Vasilache           DBGS() << "----tmpMappingSizes extracted from scf.forall op: ");
456c59465e1SNicolas Vasilache       llvm::dbgs() << "\n");
4576663f347SGuray Ozen 
458beaffb04SGuray Ozen   // Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
45944e6318cSNicolas Vasilache   SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey(
46044e6318cSNicolas Vasilache       forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
461c59465e1SNicolas Vasilache   LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,
462c59465e1SNicolas Vasilache                                    DBGS() << "----forallMappingSizes: ");
463c59465e1SNicolas Vasilache              llvm::dbgs() << "\n"; llvm::interleaveComma(
46444e6318cSNicolas Vasilache                  forallMappingAttrs, DBGS() << "----forallMappingAttrs: ");
465768615bbSNicolas Vasilache              llvm::dbgs() << "\n");
46689bb0caeSGuray Ozen 
467c59465e1SNicolas Vasilache   // Step 3. Generate the mappingIdOps using the provided generator.
468768615bbSNicolas Vasilache   Location loc = forallOp.getLoc();
469c59465e1SNicolas Vasilache   OpBuilder::InsertionGuard guard(rewriter);
470c59465e1SNicolas Vasilache   rewriter.setInsertionPoint(forallOp);
47144e6318cSNicolas Vasilache   SmallVector<int64_t> originalBasis(availableMappingSizes);
47244e6318cSNicolas Vasilache   bool originalBasisWasProvided = !originalBasis.empty();
47344e6318cSNicolas Vasilache   if (!originalBasisWasProvided) {
47444e6318cSNicolas Vasilache     originalBasis = forallMappingSizes;
47544e6318cSNicolas Vasilache     while (originalBasis.size() < 3)
47644e6318cSNicolas Vasilache       originalBasis.push_back(1);
47744e6318cSNicolas Vasilache   }
47889bb0caeSGuray Ozen 
47944e6318cSNicolas Vasilache   IdBuilderResult builderResult =
48044e6318cSNicolas Vasilache       gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
48144e6318cSNicolas Vasilache 
48244e6318cSNicolas Vasilache   // Step 4. Map the induction variables to the mappingIdOps, this may involve
48344e6318cSNicolas Vasilache   // a permutation.
484c59465e1SNicolas Vasilache   SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
485768615bbSNicolas Vasilache   IRMapping bvm;
48644e6318cSNicolas Vasilache   for (auto [iv, dim] : llvm::zip_equal(
48744e6318cSNicolas Vasilache            forallOp.getInductionVars(),
48844e6318cSNicolas Vasilache            forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
48944e6318cSNicolas Vasilache     auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
49044e6318cSNicolas Vasilache     Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
491768615bbSNicolas Vasilache     bvm.map(iv, peIdOp);
492768615bbSNicolas Vasilache   }
493768615bbSNicolas Vasilache 
49444e6318cSNicolas Vasilache   // Step 5. If the originalBasis is already known, create conditionals to
49544e6318cSNicolas Vasilache   // predicate the region. Otherwise, the current forall determines the
49644e6318cSNicolas Vasilache   // originalBasis and no predication occurs.
497768615bbSNicolas Vasilache   Value predicate;
49844e6318cSNicolas Vasilache   if (originalBasisWasProvided) {
49944e6318cSNicolas Vasilache     SmallVector<int64_t> activeMappingSizes = builderResult.activeMappingSizes;
50044e6318cSNicolas Vasilache     SmallVector<int64_t> availableMappingSizes =
50144e6318cSNicolas Vasilache         builderResult.availableMappingSizes;
50244e6318cSNicolas Vasilache     SmallVector<Value> activeIdOps = builderResult.activeIdOps;
503c59465e1SNicolas Vasilache     // clang-format off
504c59465e1SNicolas Vasilache     LLVM_DEBUG(
505c59465e1SNicolas Vasilache         llvm::interleaveComma(
50644e6318cSNicolas Vasilache           activeMappingSizes, DBGS() << "----activeMappingSizes: ");
507c59465e1SNicolas Vasilache         llvm::dbgs() << "\n";
508c59465e1SNicolas Vasilache         llvm::interleaveComma(
509c59465e1SNicolas Vasilache           availableMappingSizes, DBGS() << "----availableMappingSizes: ");
510c59465e1SNicolas Vasilache         llvm::dbgs() << "\n";
51144e6318cSNicolas Vasilache         llvm::interleaveComma(activeIdOps, DBGS() << "----activeIdOps: ");
512768615bbSNicolas Vasilache         llvm::dbgs() << "\n");
513c59465e1SNicolas Vasilache     // clang-format on
51444e6318cSNicolas Vasilache     for (auto [activeId, activeMappingSize, availableMappingSize] :
51544e6318cSNicolas Vasilache          llvm::zip_equal(activeIdOps, activeMappingSizes,
51644e6318cSNicolas Vasilache                          availableMappingSizes)) {
51744e6318cSNicolas Vasilache       if (activeMappingSize > availableMappingSize) {
518c59465e1SNicolas Vasilache         return definiteFailureHelper(
519768615bbSNicolas Vasilache             transformOp, forallOp,
520768615bbSNicolas Vasilache             "Trying to map to fewer GPU threads than loop iterations but "
521768615bbSNicolas Vasilache             "overprovisioning is not yet supported. "
522768615bbSNicolas Vasilache             "Try additional tiling of the before mapping or map to more "
523768615bbSNicolas Vasilache             "threads.");
524768615bbSNicolas Vasilache       }
52544e6318cSNicolas Vasilache       if (activeMappingSize == availableMappingSize)
526768615bbSNicolas Vasilache         continue;
52744e6318cSNicolas Vasilache       Value idx =
52844e6318cSNicolas Vasilache           rewriter.create<arith::ConstantIndexOp>(loc, activeMappingSize);
529768615bbSNicolas Vasilache       Value tmpPredicate = rewriter.create<arith::CmpIOp>(
53044e6318cSNicolas Vasilache           loc, arith::CmpIPredicate::ult, activeId, idx);
531c59465e1SNicolas Vasilache       LDBG("----predicate: " << tmpPredicate);
532768615bbSNicolas Vasilache       predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,
533768615bbSNicolas Vasilache                                                              tmpPredicate)
534768615bbSNicolas Vasilache                             : tmpPredicate;
535768615bbSNicolas Vasilache     }
536768615bbSNicolas Vasilache   }
537768615bbSNicolas Vasilache 
538c59465e1SNicolas Vasilache   // Step 6. Move the body of forallOp.
539768615bbSNicolas Vasilache   // Erase the terminator first, it will not be used.
540eb2f946eSAlexander Belyaev   rewriter.eraseOp(forallOp.getTerminator());
541768615bbSNicolas Vasilache   Block *targetBlock;
542768615bbSNicolas Vasilache   Block::iterator insertionPoint;
543768615bbSNicolas Vasilache   if (predicate) {
544c59465e1SNicolas Vasilache     // Step 6.a. If predicated, move at the beginning.
545c59465e1SNicolas Vasilache     auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
546c59465e1SNicolas Vasilache                                            /*withElseRegion=*/false);
547768615bbSNicolas Vasilache     targetBlock = ifOp.thenBlock();
548768615bbSNicolas Vasilache     insertionPoint = ifOp.thenBlock()->begin();
549768615bbSNicolas Vasilache   } else {
550c59465e1SNicolas Vasilache     // Step 6.b. Otherwise, move inline just at the rewriter insertion
551c59465e1SNicolas Vasilache     // point.
552768615bbSNicolas Vasilache     targetBlock = forallOp->getBlock();
553768615bbSNicolas Vasilache     insertionPoint = rewriter.getInsertionPoint();
554768615bbSNicolas Vasilache   }
555eb2f946eSAlexander Belyaev   Block &sourceBlock = forallOp.getRegion().front();
55689bb0caeSGuray Ozen   targetBlock->getOperations().splice(insertionPoint,
55789bb0caeSGuray Ozen                                       sourceBlock.getOperations());
55889bb0caeSGuray Ozen 
559c59465e1SNicolas Vasilache   // Step 7. RAUW indices.
560eb2f946eSAlexander Belyaev   for (Value loopIndex : forallOp.getInductionVars()) {
561768615bbSNicolas Vasilache     Value threadIdx = bvm.lookup(loopIndex);
562768615bbSNicolas Vasilache     rewriter.replaceAllUsesWith(loopIndex, threadIdx);
56389bb0caeSGuray Ozen   }
56489bb0caeSGuray Ozen 
565c59465e1SNicolas Vasilache   // Step 8. Erase old op.
566eb2f946eSAlexander Belyaev   rewriter.eraseOp(forallOp);
56789bb0caeSGuray Ozen 
56844e6318cSNicolas Vasilache   LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,
56944e6318cSNicolas Vasilache                                    DBGS() << "----result forallMappingSizes: ");
57044e6318cSNicolas Vasilache              llvm::dbgs() << "\n"; llvm::interleaveComma(
57144e6318cSNicolas Vasilache                  mappingIdOps, DBGS() << "----result mappingIdOps: ");
57244e6318cSNicolas Vasilache              llvm::dbgs() << "\n");
57344e6318cSNicolas Vasilache 
574c59465e1SNicolas Vasilache   result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
575c59465e1SNicolas Vasilache   return DiagnosedSilenceableFailure::success();
576768615bbSNicolas Vasilache }
577768615bbSNicolas Vasilache 
578c59465e1SNicolas Vasilache //===----------------------------------------------------------------------===//
579c59465e1SNicolas Vasilache // MapForallToBlocks
580c59465e1SNicolas Vasilache //===----------------------------------------------------------------------===//
581c59465e1SNicolas Vasilache 
582768615bbSNicolas Vasilache DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
583768615bbSNicolas Vasilache     RewriterBase &rewriter, TransformOpInterface transformOp,
584768615bbSNicolas Vasilache     scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
585c59465e1SNicolas Vasilache     const GpuIdBuilder &gpuIdBuilder) {
586c59465e1SNicolas Vasilache   LDBG("Start mapForallToBlocksImpl");
587c59465e1SNicolas Vasilache 
58892f088d3SNicolas Vasilache   {
58992f088d3SNicolas Vasilache     // GPU-specific verifications. There is no better place to anchor
59092f088d3SNicolas Vasilache     // those right now: the ForallOp is target-independent and the transform
59192f088d3SNicolas Vasilache     // op does not apply to individual ForallOp.
59292f088d3SNicolas Vasilache     DiagnosedSilenceableFailure diag =
59392f088d3SNicolas Vasilache         verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
59492f088d3SNicolas Vasilache     if (!diag.succeeded())
59592f088d3SNicolas Vasilache       return diag;
59692f088d3SNicolas Vasilache   }
59792f088d3SNicolas Vasilache 
598c59465e1SNicolas Vasilache   Location loc = forallOp.getLoc();
599c59465e1SNicolas Vasilache   Block *parentBlock = forallOp->getBlock();
600c59465e1SNicolas Vasilache   Value zero;
601c59465e1SNicolas Vasilache   {
602c59465e1SNicolas Vasilache     // Create an early zero index value for replacements and immediately reset
603c59465e1SNicolas Vasilache     // the insertion point.
604c59465e1SNicolas Vasilache     OpBuilder::InsertionGuard guard(rewriter);
605c59465e1SNicolas Vasilache     rewriter.setInsertionPointToStart(parentBlock);
606c59465e1SNicolas Vasilache     zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
607c59465e1SNicolas Vasilache   }
608c59465e1SNicolas Vasilache 
609c59465e1SNicolas Vasilache   ForallRewriteResult rewriteResult;
61044e6318cSNicolas Vasilache   DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
61144e6318cSNicolas Vasilache       rewriter, transformOp, forallOp,
61244e6318cSNicolas Vasilache       /*availableMappingSizes=*/gridDims, rewriteResult, gpuIdBuilder);
613c59465e1SNicolas Vasilache 
61444e6318cSNicolas Vasilache   // Return if anything goes wrong, use silenceable failure as a match
61544e6318cSNicolas Vasilache   // failure.
616c59465e1SNicolas Vasilache   if (!diag.succeeded())
617c59465e1SNicolas Vasilache     return diag;
618c59465e1SNicolas Vasilache 
61944e6318cSNicolas Vasilache   // If gridDims was not provided already, set it from the return.
62044e6318cSNicolas Vasilache   if (gridDims.empty()) {
621c59465e1SNicolas Vasilache     gridDims = rewriteResult.mappingSizes;
62244e6318cSNicolas Vasilache     while (gridDims.size() < 3)
62344e6318cSNicolas Vasilache       gridDims.push_back(1);
62444e6318cSNicolas Vasilache   }
62544e6318cSNicolas Vasilache   assert(gridDims.size() == 3 && "Need 3-D gridDims");
626c59465e1SNicolas Vasilache 
627c59465e1SNicolas Vasilache   // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
628c59465e1SNicolas Vasilache   // Here, the result of mapping determines the available mapping sizes.
629c59465e1SNicolas Vasilache   replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
63044e6318cSNicolas Vasilache                                           rewriteResult.mappingSizes);
631c59465e1SNicolas Vasilache 
63289bb0caeSGuray Ozen   return DiagnosedSilenceableFailure::success();
63389bb0caeSGuray Ozen }
63489bb0caeSGuray Ozen 
63544e6318cSNicolas Vasilache DiagnosedSilenceableFailure
63644e6318cSNicolas Vasilache mlir::transform::gpu::findTopLevelForallOp(Operation *target,
63744e6318cSNicolas Vasilache                                            scf::ForallOp &topLevelForallOp,
63844e6318cSNicolas Vasilache                                            TransformOpInterface transformOp) {
63944e6318cSNicolas Vasilache   auto walkResult = target->walk([&](scf::ForallOp forallOp) {
64044e6318cSNicolas Vasilache     if (forallOp->getParentOfType<scf::ForallOp>())
64144e6318cSNicolas Vasilache       return WalkResult::advance();
64244e6318cSNicolas Vasilache     if (topLevelForallOp)
64344e6318cSNicolas Vasilache       // TODO: Handle multiple forall if they are independent.
64444e6318cSNicolas Vasilache       return WalkResult::interrupt();
64544e6318cSNicolas Vasilache     topLevelForallOp = forallOp;
64644e6318cSNicolas Vasilache     return WalkResult::advance();
64744e6318cSNicolas Vasilache   });
64844e6318cSNicolas Vasilache 
64992f088d3SNicolas Vasilache   if (walkResult.wasInterrupted() || !topLevelForallOp)
65044e6318cSNicolas Vasilache     return transformOp.emitSilenceableError()
65144e6318cSNicolas Vasilache            << "could not find a unique topLevel scf.forall";
65244e6318cSNicolas Vasilache   return DiagnosedSilenceableFailure::success();
65344e6318cSNicolas Vasilache }
65444e6318cSNicolas Vasilache 
655c63d2b2cSMatthias Springer DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
656c63d2b2cSMatthias Springer     transform::TransformRewriter &rewriter, Operation *target,
657c63d2b2cSMatthias Springer     ApplyToEachResultList &results, transform::TransformState &state) {
65889bb0caeSGuray Ozen   LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
65989bb0caeSGuray Ozen   auto transformOp = cast<TransformOpInterface>(getOperation());
66089bb0caeSGuray Ozen 
66189bb0caeSGuray Ozen   if (!getGenerateGpuLaunch() && !gpuLaunch) {
66289bb0caeSGuray Ozen     DiagnosedSilenceableFailure diag =
66389bb0caeSGuray Ozen         emitSilenceableError()
66489bb0caeSGuray Ozen         << "Given target is not gpu.launch, set `generate_gpu_launch` "
66589bb0caeSGuray Ozen            "attribute";
66689bb0caeSGuray Ozen     diag.attachNote(target->getLoc()) << "when applied to this payload op";
66789bb0caeSGuray Ozen     return diag;
66889bb0caeSGuray Ozen   }
66989bb0caeSGuray Ozen 
670eb2f946eSAlexander Belyaev   scf::ForallOp topLevelForallOp;
671eb2f946eSAlexander Belyaev   DiagnosedSilenceableFailure diag = mlir::transform::gpu::findTopLevelForallOp(
672eb2f946eSAlexander Belyaev       target, topLevelForallOp, transformOp);
67389bb0caeSGuray Ozen   if (!diag.succeeded()) {
67489bb0caeSGuray Ozen     diag.attachNote(target->getLoc()) << "when applied to this payload op";
67589bb0caeSGuray Ozen     return diag;
67689bb0caeSGuray Ozen   }
67792f088d3SNicolas Vasilache   assert(topLevelForallOp && "expect an scf.forall");
67889bb0caeSGuray Ozen 
679c59465e1SNicolas Vasilache   SmallVector<int64_t> gridDims{getGridDims()};
680768615bbSNicolas Vasilache   if (!getGenerateGpuLaunch() && gridDims.size() != 3)
681aafb52d7SNicolas Vasilache     return transformOp.emitDefiniteFailure("transform require size-3 mapping");
682aafb52d7SNicolas Vasilache 
68389bb0caeSGuray Ozen   OpBuilder::InsertionGuard guard(rewriter);
684eb2f946eSAlexander Belyaev   rewriter.setInsertionPoint(topLevelForallOp);
68589bb0caeSGuray Ozen 
686eb2f946eSAlexander Belyaev   // Generate gpu launch here and move the forall inside
68789bb0caeSGuray Ozen   if (getGenerateGpuLaunch()) {
68889bb0caeSGuray Ozen     DiagnosedSilenceableFailure diag =
68989bb0caeSGuray Ozen         createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
69044e6318cSNicolas Vasilache     if (!diag.succeeded())
69189bb0caeSGuray Ozen       return diag;
69244e6318cSNicolas Vasilache 
69389bb0caeSGuray Ozen     rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
694eb2f946eSAlexander Belyaev     Operation *newForallOp = rewriter.clone(*topLevelForallOp);
695eb2f946eSAlexander Belyaev     rewriter.eraseOp(topLevelForallOp);
696eb2f946eSAlexander Belyaev     topLevelForallOp = cast<scf::ForallOp>(newForallOp);
69789bb0caeSGuray Ozen   }
69889bb0caeSGuray Ozen 
69944e6318cSNicolas Vasilache   // The BlockIdBuilder adapts to whatever is thrown at it.
70092f088d3SNicolas Vasilache   bool useLinearMapping = false;
70192f088d3SNicolas Vasilache   if (topLevelForallOp.getMapping()) {
70244e6318cSNicolas Vasilache     auto mappingAttr = cast<DeviceMappingAttrInterface>(
70344e6318cSNicolas Vasilache         topLevelForallOp.getMapping()->getValue().front());
70492f088d3SNicolas Vasilache     useLinearMapping = mappingAttr.isLinearMapping();
70592f088d3SNicolas Vasilache   }
70644e6318cSNicolas Vasilache   GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping);
70744e6318cSNicolas Vasilache 
7081cff4cbdSNicolas Vasilache   diag = mlir::transform::gpu::mapForallToBlocksImpl(
709c59465e1SNicolas Vasilache       rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
710aafb52d7SNicolas Vasilache   if (!diag.succeeded())
711aafb52d7SNicolas Vasilache     return diag;
712aafb52d7SNicolas Vasilache 
71344e6318cSNicolas Vasilache   // Set the GPU launch configuration for the grid dims late, this is
71444e6318cSNicolas Vasilache   // subject to IR inspection.
71589bb0caeSGuray Ozen   diag = alterGpuLaunch(rewriter, gpuLaunch,
716768615bbSNicolas Vasilache                         cast<TransformOpInterface>(getOperation()), gridDims[0],
717768615bbSNicolas Vasilache                         gridDims[1], gridDims[2]);
71889bb0caeSGuray Ozen 
7194b455a71SAlex Zinenko   results.push_back(gpuLaunch);
72089bb0caeSGuray Ozen   return diag;
72189bb0caeSGuray Ozen }
72289bb0caeSGuray Ozen 
72392f088d3SNicolas Vasilache LogicalResult transform::MapForallToBlocks::verify() {
72492f088d3SNicolas Vasilache   if (!getGridDims().empty() && getGridDims().size() != 3) {
72592f088d3SNicolas Vasilache     return emitOpError() << "transform requires empty or size-3 grid_dims";
72692f088d3SNicolas Vasilache   }
72792f088d3SNicolas Vasilache   return success();
72892f088d3SNicolas Vasilache }
72992f088d3SNicolas Vasilache 
73089bb0caeSGuray Ozen //===----------------------------------------------------------------------===//
7311cff4cbdSNicolas Vasilache // MapNestedForallToThreads
73289bb0caeSGuray Ozen //===----------------------------------------------------------------------===//
73389bb0caeSGuray Ozen 
73444e6318cSNicolas Vasilache static DiagnosedSilenceableFailure checkMappingSpec(
73544e6318cSNicolas Vasilache     std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
73644e6318cSNicolas Vasilache     ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
73744e6318cSNicolas Vasilache     int factor, bool useLinearMapping = false) {
73844e6318cSNicolas Vasilache   if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
73944e6318cSNicolas Vasilache     auto diag = definiteFailureHelper(
74044e6318cSNicolas Vasilache         transformOp, forallOp,
74144e6318cSNicolas Vasilache         Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
74244e6318cSNicolas Vasilache             std::to_string(factor));
74344e6318cSNicolas Vasilache     return diag;
74444e6318cSNicolas Vasilache   }
74544e6318cSNicolas Vasilache   if (computeProduct(numParallelIterations) * factor >
74644e6318cSNicolas Vasilache       computeProduct(blockOrGridSizes)) {
74744e6318cSNicolas Vasilache     auto diag = definiteFailureHelper(
74844e6318cSNicolas Vasilache         transformOp, forallOp,
74992f088d3SNicolas Vasilache         Twine("the number of required parallel resources (blocks or "
75092f088d3SNicolas Vasilache               "threads) ") +
75144e6318cSNicolas Vasilache             std::to_string(computeProduct(numParallelIterations) * factor) +
75244e6318cSNicolas Vasilache             std::string(" overflows the number of available resources ") +
75344e6318cSNicolas Vasilache             std::to_string(computeProduct(blockOrGridSizes)));
75444e6318cSNicolas Vasilache     return diag;
75544e6318cSNicolas Vasilache   }
75644e6318cSNicolas Vasilache   return DiagnosedSilenceableFailure::success();
75744e6318cSNicolas Vasilache }
75844e6318cSNicolas Vasilache 
75944e6318cSNicolas Vasilache static DiagnosedSilenceableFailure
76044e6318cSNicolas Vasilache getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
76144e6318cSNicolas Vasilache                    scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes,
76244e6318cSNicolas Vasilache                    int64_t warpSize, GpuIdBuilder &gpuIdBuilder) {
76344e6318cSNicolas Vasilache   auto mappingAttr = cast<DeviceMappingAttrInterface>(
76444e6318cSNicolas Vasilache       forallOp.getMapping()->getValue().front());
76544e6318cSNicolas Vasilache   bool useLinearMapping = mappingAttr.isLinearMapping();
76644e6318cSNicolas Vasilache 
76744e6318cSNicolas Vasilache   // Sanity checks that may result in runtime verification errors.
76844e6318cSNicolas Vasilache   auto numParallelIterations =
76944e6318cSNicolas Vasilache       getConstantIntValues((forallOp.getMixedUpperBound()));
77044e6318cSNicolas Vasilache   if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
77144e6318cSNicolas Vasilache     return definiteFailureHelper(
77244e6318cSNicolas Vasilache         transformOp, forallOp,
77344e6318cSNicolas Vasilache         "requires statically sized, normalized forall op");
77444e6318cSNicolas Vasilache   }
77544e6318cSNicolas Vasilache   int64_t factor = 1;
77644e6318cSNicolas Vasilache   if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
77744e6318cSNicolas Vasilache     factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
77844e6318cSNicolas Vasilache   } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
77944e6318cSNicolas Vasilache     factor = warpSize;
78044e6318cSNicolas Vasilache   }
78144e6318cSNicolas Vasilache   DiagnosedSilenceableFailure diag =
78244e6318cSNicolas Vasilache       checkMappingSpec(transformOp, forallOp, numParallelIterations.value(),
78344e6318cSNicolas Vasilache                        blockSizes, factor, useLinearMapping);
78444e6318cSNicolas Vasilache   if (!diag.succeeded())
78544e6318cSNicolas Vasilache     return diag;
78644e6318cSNicolas Vasilache 
78744e6318cSNicolas Vasilache   // Start mapping.
78844e6318cSNicolas Vasilache   MLIRContext *ctx = forallOp.getContext();
78944e6318cSNicolas Vasilache   gpuIdBuilder =
79044e6318cSNicolas Vasilache       TypeSwitch<DeviceMappingAttrInterface, GpuIdBuilder>(mappingAttr)
79144e6318cSNicolas Vasilache           .Case([&](GPUWarpgroupMappingAttr) {
79244e6318cSNicolas Vasilache             return GpuWarpgroupIdBuilder(ctx, warpSize, useLinearMapping);
79344e6318cSNicolas Vasilache           })
79444e6318cSNicolas Vasilache           .Case([&](GPUWarpMappingAttr) {
79544e6318cSNicolas Vasilache             return GpuWarpIdBuilder(ctx, warpSize, useLinearMapping);
79644e6318cSNicolas Vasilache           })
79744e6318cSNicolas Vasilache           .Case([&](GPUThreadMappingAttr) {
79844e6318cSNicolas Vasilache             return GpuThreadIdBuilder(ctx, useLinearMapping);
79944e6318cSNicolas Vasilache           })
80044e6318cSNicolas Vasilache           .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
80144e6318cSNicolas Vasilache             llvm_unreachable("unknown mapping attribute");
80244e6318cSNicolas Vasilache           });
80344e6318cSNicolas Vasilache   return DiagnosedSilenceableFailure::success();
80444e6318cSNicolas Vasilache }
80544e6318cSNicolas Vasilache 
806c59465e1SNicolas Vasilache DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
807768615bbSNicolas Vasilache     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
80844e6318cSNicolas Vasilache     scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize,
80944e6318cSNicolas Vasilache     bool syncAfterDistribute) {
81044e6318cSNicolas Vasilache 
81192f088d3SNicolas Vasilache   {
81292f088d3SNicolas Vasilache     // GPU-specific verifications. There is no better place to anchor
81392f088d3SNicolas Vasilache     // those right now: the ForallOp is target-independent and the transform
81492f088d3SNicolas Vasilache     // op does not apply to individual ForallOp.
81592f088d3SNicolas Vasilache     DiagnosedSilenceableFailure diag =
81692f088d3SNicolas Vasilache         verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
81792f088d3SNicolas Vasilache     if (!diag.succeeded())
81892f088d3SNicolas Vasilache       return diag;
81992f088d3SNicolas Vasilache   }
82092f088d3SNicolas Vasilache 
82144e6318cSNicolas Vasilache   GpuIdBuilder gpuIdBuilder;
82244e6318cSNicolas Vasilache   {
82344e6318cSNicolas Vasilache     // Try to construct the id builder, if it fails, return.
82444e6318cSNicolas Vasilache     DiagnosedSilenceableFailure diag = getThreadIdBuilder(
82544e6318cSNicolas Vasilache         transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
82644e6318cSNicolas Vasilache     if (!diag.succeeded())
82744e6318cSNicolas Vasilache       return diag;
828a7686db8SThomas Raoux   }
829c59465e1SNicolas Vasilache 
830768615bbSNicolas Vasilache   Location loc = forallOp.getLoc();
831768615bbSNicolas Vasilache   OpBuilder::InsertionGuard g(rewriter);
832c59465e1SNicolas Vasilache   // Insert after to allow for syncthreads after `forall` is erased.
833768615bbSNicolas Vasilache   rewriter.setInsertionPointAfter(forallOp);
834c59465e1SNicolas Vasilache   ForallRewriteResult rewriteResult;
83544e6318cSNicolas Vasilache   DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
83644e6318cSNicolas Vasilache       rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
837c59465e1SNicolas Vasilache   if (!diag.succeeded())
838c59465e1SNicolas Vasilache     return diag;
839768615bbSNicolas Vasilache   // Add a syncthreads if needed. TODO: warpsync
840768615bbSNicolas Vasilache   if (syncAfterDistribute)
841768615bbSNicolas Vasilache     rewriter.create<BarrierOp>(loc);
842c59465e1SNicolas Vasilache 
843c59465e1SNicolas Vasilache   return DiagnosedSilenceableFailure::success();
844beaffb04SGuray Ozen }
845c59465e1SNicolas Vasilache 
846c59465e1SNicolas Vasilache DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
847c59465e1SNicolas Vasilache     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
84844e6318cSNicolas Vasilache     Operation *target, ArrayRef<int64_t> blockDims, int64_t warpSize,
849c59465e1SNicolas Vasilache     bool syncAfterDistribute) {
850c59465e1SNicolas Vasilache   LDBG("Start mapNestedForallToThreadsImpl");
85144e6318cSNicolas Vasilache   if (blockDims.size() != 3) {
852c59465e1SNicolas Vasilache     return definiteFailureHelper(transformOp, target,
853c59465e1SNicolas Vasilache                                  "requires size-3 thread mapping");
854c59465e1SNicolas Vasilache   }
855c59465e1SNicolas Vasilache 
856c59465e1SNicolas Vasilache   // Create an early zero index value for replacements.
857c59465e1SNicolas Vasilache   Location loc = target->getLoc();
858c59465e1SNicolas Vasilache   Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
859c59465e1SNicolas Vasilache   DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
860c59465e1SNicolas Vasilache   WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
861c59465e1SNicolas Vasilache     diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
86244e6318cSNicolas Vasilache         rewriter, transformOp, forallOp, blockDims, warpSize,
86344e6318cSNicolas Vasilache         syncAfterDistribute);
864c59465e1SNicolas Vasilache     if (diag.isDefiniteFailure())
865c59465e1SNicolas Vasilache       return WalkResult::interrupt();
866c59465e1SNicolas Vasilache     if (diag.succeeded())
867c59465e1SNicolas Vasilache       return WalkResult::skip();
868c59465e1SNicolas Vasilache     return WalkResult::advance();
86989bb0caeSGuray Ozen   });
870c59465e1SNicolas Vasilache   if (walkResult.wasInterrupted())
87189bb0caeSGuray Ozen     return diag;
872c59465e1SNicolas Vasilache 
873c59465e1SNicolas Vasilache   // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
874c59465e1SNicolas Vasilache   // Here, the result of mapping determines the available mapping sizes.
875c59465e1SNicolas Vasilache   replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
876c59465e1SNicolas Vasilache                                           blockDims);
877c59465e1SNicolas Vasilache 
878c59465e1SNicolas Vasilache   return DiagnosedSilenceableFailure::success();
87989bb0caeSGuray Ozen }
88089bb0caeSGuray Ozen 
8811cff4cbdSNicolas Vasilache DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
882c63d2b2cSMatthias Springer     transform::TransformRewriter &rewriter, Operation *target,
883c63d2b2cSMatthias Springer     ApplyToEachResultList &results, TransformState &state) {
88489bb0caeSGuray Ozen   LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
88589bb0caeSGuray Ozen   auto transformOp = cast<TransformOpInterface>(getOperation());
88689bb0caeSGuray Ozen 
887aafb52d7SNicolas Vasilache   // Basic high-level verifications.
888aafb52d7SNicolas Vasilache   if (!gpuLaunch)
889aafb52d7SNicolas Vasilache     return emitSilenceableError() << "Given target is not a gpu.launch";
89089bb0caeSGuray Ozen 
891c59465e1SNicolas Vasilache   // Mapping to block ids.
892c59465e1SNicolas Vasilache   SmallVector<int64_t> blockDims{getBlockDims()};
89389bb0caeSGuray Ozen   DiagnosedSilenceableFailure diag =
8941a36588eSKazu Hirata       checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
895768615bbSNicolas Vasilache                      blockDims[0], blockDims[1], blockDims[2]);
89689bb0caeSGuray Ozen   if (diag.isSilenceableFailure()) {
897c59465e1SNicolas Vasilache     diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
89889bb0caeSGuray Ozen     return diag;
89989bb0caeSGuray Ozen   }
90089bb0caeSGuray Ozen 
901c59465e1SNicolas Vasilache   // Set the GPU launch configuration for the block dims early, this is not
902c59465e1SNicolas Vasilache   // subject to IR inspection.
9031a36588eSKazu Hirata   diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
904768615bbSNicolas Vasilache                         std::nullopt, std::nullopt, blockDims[0], blockDims[1],
905768615bbSNicolas Vasilache                         blockDims[2]);
90689bb0caeSGuray Ozen 
907c59465e1SNicolas Vasilache   rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
908c59465e1SNicolas Vasilache   diag =
909c59465e1SNicolas Vasilache       mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims,
91044e6318cSNicolas Vasilache                                    getWarpSize(), getSyncAfterDistribute());
911c59465e1SNicolas Vasilache 
912015cd84dSNicolas Vasilache   results.push_back(gpuLaunch.getOperation());
91389bb0caeSGuray Ozen   return diag;
91489bb0caeSGuray Ozen }
91589bb0caeSGuray Ozen 
91689bb0caeSGuray Ozen //===----------------------------------------------------------------------===//
91789bb0caeSGuray Ozen // Transform op registration
91889bb0caeSGuray Ozen //===----------------------------------------------------------------------===//
91989bb0caeSGuray Ozen 
92089bb0caeSGuray Ozen namespace {
/// Transform dialect extension that registers the GPU transform ops and
/// declares the dialects their application may generate.
class GPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          GPUTransformDialectExtension> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)

  GPUTransformDialectExtension() {
    // Dialects whose ops may be created when the registered transforms are
    // applied; declaring them ensures they are loaded in the payload context.
    declareGeneratedDialect<scf::SCFDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<GPUDialect>();
    // Register all transform ops generated from GPUTransformOps.td.
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
        >();
  }
};
93989bb0caeSGuray Ozen } // namespace
94089bb0caeSGuray Ozen 
94189bb0caeSGuray Ozen #define GET_OP_CLASSES
94289bb0caeSGuray Ozen #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
94389bb0caeSGuray Ozen 
// Makes the GPU transform ops available in any MLIRContext constructed from
// `registry`.
void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<GPUTransformDialectExtension>();
}
947