Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

894a591cSThomas Raoux//===- NVGPUToNVVM.cpp - NVGPU to NVVM dialect conversion -----------------===//
894a591cSThomas Raoux//
894a591cSThomas Raoux// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
894a591cSThomas Raoux// See https://llvm.org/LICENSE.txt for license information.
894a591cSThomas Raoux// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
894a591cSThomas Raoux//
894a591cSThomas Raoux//===----------------------------------------------------------------------===//
894a591cSThomas Raoux
894a591cSThomas Raoux#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
67d0d7acSMichele Scuttari
e56d6745SGuray Ozen#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
894a591cSThomas Raoux#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
894a591cSThomas Raoux#include "mlir/Conversion/LLVMCommon/Pattern.h"
2b23e6c8SObserver007#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
d20fbc90SGuray Ozen#include "mlir/Dialect/Arith/IR/Arith.h"
d7ef488bSMogball#include "mlir/Dialect/GPU/IR/GPUDialect.h"
708185f0SChristopher Bate#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
e56d6745SGuray Ozen#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
894a591cSThomas Raoux#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
affcfccdSGuray Ozen#include "mlir/Dialect/MemRef/IR/MemRef.h"
51b925dfSChristopher Bate#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
23882226SGuray Ozen#include "mlir/Dialect/SCF/Transforms/Patterns.h"
17649a77SGuray Ozen#include "mlir/IR/BuiltinTypes.h"
ee49cda7SGuray Ozen#include "mlir/IR/ImplicitLocOpBuilder.h"
e56d6745SGuray Ozen#include "mlir/IR/PatternMatch.h"
708185f0SChristopher Bate#include "mlir/IR/TypeUtilities.h"
17649a77SGuray Ozen#include "mlir/IR/Value.h"
67d0d7acSMichele Scuttari#include "mlir/Pass/Pass.h"
b96d0693SGuray Ozen#include "llvm/Support/Debug.h"
23882226SGuray Ozen#include "llvm/Support/ErrorHandling.h"
e56d6745SGuray Ozen#include "llvm/Support/raw_ostream.h"
63389326SGuray Ozen#include <optional>
67d0d7acSMichele Scuttari
b96d0693SGuray Ozen#define DEBUG_TYPE "nvgpu-to-nvvm"
b96d0693SGuray Ozen#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
b96d0693SGuray Ozen#define DBGSE() (llvm::dbgs())
b96d0693SGuray Ozen
67d0d7acSMichele Scuttarinamespace mlir {
53689fdfSMarkus Böck#define GEN_PASS_DEF_CONVERTNVGPUTONVVMPASS
67d0d7acSMichele Scuttari#include "mlir/Conversion/Passes.h.inc"
67d0d7acSMichele Scuttari} // namespace mlir
894a591cSThomas Raoux
894a591cSThomas Raouxusing namespace mlir;
894a591cSThomas Raoux
/// Number of least-significant bits that are excluded when building the
/// matrix descriptor for wgmma operations.
static constexpr int exclude4LSB = 4;
23882226SGuray Ozen
836dbb85SGuray Ozen/// GPU has 32 bit registers, this function truncates values when larger width
836dbb85SGuray Ozen/// is not needed.
ee49cda7SGuray Ozenstatic Value truncToI32(ImplicitLocOpBuilder &b, Value value) {
836dbb85SGuray Ozen  Type type = value.getType();
836dbb85SGuray Ozen  assert(llvm::isa<IntegerType>(type) && "expected an integer Value");
836dbb85SGuray Ozen  if (type.getIntOrFloatBitWidth() <= 32)
836dbb85SGuray Ozen    return value;
ee49cda7SGuray Ozen  return b.create<LLVM::TruncOp>(b.getI32Type(), value);
836dbb85SGuray Ozen}
836dbb85SGuray Ozen
894a591cSThomas Raoux/// Returns the type for the intrinsic given the vectorResultType of the
894a591cSThomas Raoux/// `gpu.mma.sync` operation.
894a591cSThomas Raouxstatic Type inferIntrinsicResultType(Type vectorResultType) {
894a591cSThomas Raoux  MLIRContext *ctx = vectorResultType.getContext();
5550c821STres Popp  auto a = cast<LLVM::LLVMArrayType>(vectorResultType);
894a591cSThomas Raoux  auto f16x2Ty = LLVM::getFixedVectorType(Float16Type::get(ctx), 2);
894a591cSThomas Raoux  auto i32Ty = IntegerType::get(ctx, 32);
894a591cSThomas Raoux  auto i32x2Ty = LLVM::getFixedVectorType(i32Ty, 2);
894a591cSThomas Raoux  Type f64Ty = Float64Type::get(ctx);
894a591cSThomas Raoux  Type f64x2Ty = LLVM::getFixedVectorType(f64Ty, 2);
98798073SChristopher Bate  Type f32Ty = Float32Type::get(ctx);
98798073SChristopher Bate  Type f32x2Ty = LLVM::getFixedVectorType(f32Ty, 2);
894a591cSThomas Raoux  if (a.getElementType() == f16x2Ty) {
894a591cSThomas Raoux    return LLVM::LLVMStructType::getLiteral(
894a591cSThomas Raoux        ctx, SmallVector<Type>(a.getNumElements(), f16x2Ty));
894a591cSThomas Raoux  }
894a591cSThomas Raoux  if (a.getElementType() == i32x2Ty) {
894a591cSThomas Raoux    return LLVM::LLVMStructType::getLiteral(
894a591cSThomas Raoux        ctx,
894a591cSThomas Raoux        SmallVector<Type>(static_cast<size_t>(a.getNumElements()) * 2, i32Ty));
894a591cSThomas Raoux  }
894a591cSThomas Raoux  if (a.getElementType() == f64x2Ty) {
894a591cSThomas Raoux    return LLVM::LLVMStructType::getLiteral(ctx, {f64Ty, f64Ty});
894a591cSThomas Raoux  }
98798073SChristopher Bate  if (a.getElementType() == f32x2Ty) {
98798073SChristopher Bate    return LLVM::LLVMStructType::getLiteral(
98798073SChristopher Bate        ctx,
98798073SChristopher Bate        SmallVector<Type>(static_cast<size_t>(a.getNumElements()) * 2, f32Ty));
98798073SChristopher Bate  }
98798073SChristopher Bate  if (a.getElementType() == LLVM::getFixedVectorType(f32Ty, 1)) {
98798073SChristopher Bate    return LLVM::LLVMStructType::getLiteral(
98798073SChristopher Bate        ctx, SmallVector<Type>(static_cast<size_t>(a.getNumElements()), f32Ty));
98798073SChristopher Bate  }
894a591cSThomas Raoux  return vectorResultType;
894a591cSThomas Raoux}
894a591cSThomas Raoux
894a591cSThomas Raoux/// Convert the SSA result of the NVVM intrinsic `nvvm.mma.sync` (which is
894a591cSThomas Raoux/// always an LLVM struct) into a fragment that is compatible with the vector
894a591cSThomas Raoux/// type of this operation. This involves extracting elements from the struct
894a591cSThomas Raoux/// and inserting them into an LLVM array. These extra data-movement
894a591cSThomas Raoux/// operations should be canonicalized away by the LLVM backend.
894a591cSThomas Raouxstatic Value convertIntrinsicResult(Location loc, Type intrinsicResultType,
894a591cSThomas Raoux                                    Type resultType, Value intrinsicResult,
894a591cSThomas Raoux                                    RewriterBase &rewriter) {
894a591cSThomas Raoux  MLIRContext *ctx = rewriter.getContext();
5550c821STres Popp  auto structType = dyn_cast<LLVM::LLVMStructType>(intrinsicResultType);
5550c821STres Popp  auto arrayType = dyn_cast<LLVM::LLVMArrayType>(resultType);
894a591cSThomas Raoux  Type i32Ty = rewriter.getI32Type();
98798073SChristopher Bate  Type f32Ty = rewriter.getF32Type();
894a591cSThomas Raoux  Type f64Ty = rewriter.getF64Type();
894a591cSThomas Raoux  Type f16x2Ty = LLVM::getFixedVectorType(rewriter.getF16Type(), 2);
894a591cSThomas Raoux  Type i32x2Ty = LLVM::getFixedVectorType(i32Ty, 2);
894a591cSThomas Raoux  Type f64x2Ty = LLVM::getFixedVectorType(f64Ty, 2);
98798073SChristopher Bate  Type f32x2Ty = LLVM::getFixedVectorType(f32Ty, 2);
98798073SChristopher Bate  Type f32x1Ty = LLVM::getFixedVectorType(f32Ty, 1);
894a591cSThomas Raoux
894a591cSThomas Raoux  auto makeConst = [&](int32_t index) -> Value {
894a591cSThomas Raoux    return rewriter.create<LLVM::ConstantOp>(loc, IntegerType::get(ctx, 32),
894a591cSThomas Raoux                                             rewriter.getI32IntegerAttr(index));
894a591cSThomas Raoux  };
894a591cSThomas Raoux
894a591cSThomas Raoux  if (arrayType) {
894a591cSThomas Raoux    SmallVector<Value, 4> elements;
894a591cSThomas Raoux
98798073SChristopher Bate    // The intrinsic returns 32-bit wide elements in a form which can be
98798073SChristopher Bate    // directly bitcasted and inserted into the result vector.
98798073SChristopher Bate    if (arrayType.getElementType() == f16x2Ty ||
98798073SChristopher Bate        arrayType.getElementType() == f32x1Ty) {
894a591cSThomas Raoux      for (unsigned i = 0; i < structType.getBody().size(); i++) {
5c5af910SJeff Niu        Value el =
5c5af910SJeff Niu            rewriter.create<LLVM::ExtractValueOp>(loc, intrinsicResult, i);
98798073SChristopher Bate        el = rewriter.createOrFold<LLVM::BitcastOp>(
98798073SChristopher Bate            loc, arrayType.getElementType(), el);
98798073SChristopher Bate        elements.push_back(el);
894a591cSThomas Raoux      }
894a591cSThomas Raoux    }
894a591cSThomas Raoux
98798073SChristopher Bate    // The intrinsic returns i32, f64, and f32 values as individual scalars,
98798073SChristopher Bate    // even when the result is notionally a 64-bit wide element (e.g. f32x2). We
98798073SChristopher Bate    // need to extract them from the struct and pack them into the 64-bit wide
98798073SChristopher Bate    // rows of the vector result.
894a591cSThomas Raoux    if (arrayType.getElementType() == i32x2Ty ||
98798073SChristopher Bate        arrayType.getElementType() == f64x2Ty ||
98798073SChristopher Bate        arrayType.getElementType() == f32x2Ty) {
98798073SChristopher Bate
98798073SChristopher Bate      for (unsigned i = 0, e = structType.getBody().size() / 2; i < e; i++) {
894a591cSThomas Raoux        Value vec =
894a591cSThomas Raoux            rewriter.create<LLVM::UndefOp>(loc, arrayType.getElementType());
5c5af910SJeff Niu        Value x1 =
5c5af910SJeff Niu            rewriter.create<LLVM::ExtractValueOp>(loc, intrinsicResult, i * 2);
5c5af910SJeff Niu        Value x2 = rewriter.create<LLVM::ExtractValueOp>(loc, intrinsicResult,
5c5af910SJeff Niu                                                         i * 2 + 1);
894a591cSThomas Raoux        vec = rewriter.create<LLVM::InsertElementOp>(loc, vec.getType(), vec,
894a591cSThomas Raoux                                                     x1, makeConst(0));
894a591cSThomas Raoux        vec = rewriter.create<LLVM::InsertElementOp>(loc, vec.getType(), vec,
894a591cSThomas Raoux                                                     x2, makeConst(1));
894a591cSThomas Raoux        elements.push_back(vec);
894a591cSThomas Raoux      }
98798073SChristopher Bate    }
894a591cSThomas Raoux
894a591cSThomas Raoux    // Create the final vectorized result.
894a591cSThomas Raoux    Value result = rewriter.create<LLVM::UndefOp>(loc, arrayType);
894a591cSThomas Raoux    for (const auto &el : llvm::enumerate(elements)) {
5c5af910SJeff Niu      result = rewriter.create<LLVM::InsertValueOp>(loc, result, el.value(),
5c5af910SJeff Niu                                                    el.index());
894a591cSThomas Raoux    }
894a591cSThomas Raoux    return result;
894a591cSThomas Raoux  }
894a591cSThomas Raoux
894a591cSThomas Raoux  return intrinsicResult;
894a591cSThomas Raoux}
894a591cSThomas Raoux
894a591cSThomas Raoux/// The `gpu.mma.sync` converter below expects matrix fragment operands to be
894a591cSThomas Raoux/// given as 2D `vectors` where the rows are 32b or 64b wide. The
894a591cSThomas Raoux/// `nvvm.mma.sync` op expects these argments to be a given in a long list of
894a591cSThomas Raoux/// scalars of certain types. This function helps unpack the `vector` arguments
894a591cSThomas Raoux/// and cast them to the types expected by `nvvm.mma.sync`.
ee49cda7SGuray Ozenstatic SmallVector<Value> unpackOperandVector(ImplicitLocOpBuilder &b,
ee49cda7SGuray Ozen                                              Value operand,
98798073SChristopher Bate                                              NVVM::MMATypes operandPtxType) {
894a591cSThomas Raoux  SmallVector<Value> result;
ee49cda7SGuray Ozen  Type i32Ty = b.getI32Type();
ee49cda7SGuray Ozen  Type f64Ty = b.getF64Type();
ee49cda7SGuray Ozen  Type f32Ty = b.getF32Type();
ee49cda7SGuray Ozen  Type i64Ty = b.getI64Type();
ee49cda7SGuray Ozen  Type i8x4Ty = LLVM::getFixedVectorType(b.getI8Type(), 4);
ee49cda7SGuray Ozen  Type i4x8Ty = LLVM::getFixedVectorType(b.getIntegerType(4), 8);
98798073SChristopher Bate  Type f32x1Ty = LLVM::getFixedVectorType(f32Ty, 1);
5550c821STres Popp  auto arrayTy = cast<LLVM::LLVMArrayType>(operand.getType());
894a591cSThomas Raoux
894a591cSThomas Raoux  for (unsigned i = 0, e = arrayTy.getNumElements(); i < e; ++i) {
ee49cda7SGuray Ozen    Value toUse = b.create<LLVM::ExtractValueOp>(operand, i);
894a591cSThomas Raoux
894a591cSThomas Raoux    // For 4xi8 vectors, the intrinsic expects these to be provided as i32
894a591cSThomas Raoux    // scalar types.
98798073SChristopher Bate    if (arrayTy.getElementType() == i8x4Ty ||
334f63e7SChristopher Bate        arrayTy.getElementType() == i4x8Ty ||
98798073SChristopher Bate        (arrayTy.getElementType() == f32x1Ty &&
98798073SChristopher Bate         operandPtxType == NVVM::MMATypes::tf32)) {
ee49cda7SGuray Ozen      result.push_back(b.create<LLVM::BitcastOp>(i32Ty, toUse));
894a591cSThomas Raoux      continue;
894a591cSThomas Raoux    }
894a591cSThomas Raoux
98798073SChristopher Bate    // For some element types (i32, f32, f64), we need to unpack the inner
894a591cSThomas Raoux    // vector/array type as well because the intrinsic expects individual
894a591cSThomas Raoux    // scalars to be provided.
5550c821STres Popp    VectorType innerArrayTy = dyn_cast<VectorType>(arrayTy.getElementType());
894a591cSThomas Raoux    if (innerArrayTy && (innerArrayTy.getElementType() == i32Ty ||
98798073SChristopher Bate                         innerArrayTy.getElementType() == f64Ty ||
98798073SChristopher Bate                         innerArrayTy.getElementType() == f32Ty)) {
894a591cSThomas Raoux      for (unsigned idx = 0, innerSize = innerArrayTy.getNumElements();
894a591cSThomas Raoux           idx < innerSize; idx++) {
ee49cda7SGuray Ozen        result.push_back(b.create<LLVM::ExtractElementOp>(
ee49cda7SGuray Ozen            toUse,
ee49cda7SGuray Ozen            b.create<LLVM::ConstantOp>(i64Ty, b.getI64IntegerAttr(idx))));
894a591cSThomas Raoux      }
894a591cSThomas Raoux      continue;
894a591cSThomas Raoux    }
894a591cSThomas Raoux    result.push_back(toUse);
894a591cSThomas Raoux  }
894a591cSThomas Raoux  return result;
894a591cSThomas Raoux}
894a591cSThomas Raoux
99475f5bSNicolas Vasilache/// Returns whether mbarrier object has shared memory address space.
17649a77SGuray Ozenstatic bool isMbarrierShared(nvgpu::MBarrierGroupType barrierType) {
99475f5bSNicolas Vasilache  return (mlir::nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(
99475f5bSNicolas Vasilache      barrierType.getMemorySpace()));
99475f5bSNicolas Vasilache}
99475f5bSNicolas Vasilache
99475f5bSNicolas Vasilache/// Returns the memory space attribute of the mbarrier object.
99475f5bSNicolas VasilacheAttribute nvgpu::getMbarrierMemorySpace(MLIRContext *context,
17649a77SGuray Ozen                                        nvgpu::MBarrierGroupType barrierType) {
99475f5bSNicolas Vasilache  Attribute memorySpace = {};
99475f5bSNicolas Vasilache  if (isMbarrierShared(barrierType)) {
99475f5bSNicolas Vasilache    memorySpace =
99475f5bSNicolas Vasilache        IntegerAttr::get(IntegerType::get(context, 64),
99475f5bSNicolas Vasilache                         nvgpu::NVGPUDialect::kSharedMemoryAddressSpace);
99475f5bSNicolas Vasilache  }
99475f5bSNicolas Vasilache  return memorySpace;
99475f5bSNicolas Vasilache}
99475f5bSNicolas Vasilache
99475f5bSNicolas Vasilache/// Returns memref type of the mbarrier object. The type is defined in the
17649a77SGuray Ozen/// MBarrierGroupType.
99475f5bSNicolas VasilacheMemRefType nvgpu::getMBarrierMemrefType(MLIRContext *context,
17649a77SGuray Ozen                                        nvgpu::MBarrierGroupType barrierType) {
99475f5bSNicolas Vasilache  Attribute memorySpace = nvgpu::getMbarrierMemorySpace(context, barrierType);
99475f5bSNicolas Vasilache  MemRefLayoutAttrInterface layout;
17649a77SGuray Ozen  return MemRefType::get({barrierType.getNumBarriers()},
17649a77SGuray Ozen                         IntegerType::get(context, 64), layout, memorySpace);
99475f5bSNicolas Vasilache}
99475f5bSNicolas Vasilache
894a591cSThomas Raouxnamespace {
894a591cSThomas Raoux
894a591cSThomas Raouxstruct MmaLdMatrixOpToNVVM : public ConvertOpToLLVMPattern<nvgpu::LdMatrixOp> {
894a591cSThomas Raoux  using ConvertOpToLLVMPattern<nvgpu::LdMatrixOp>::ConvertOpToLLVMPattern;
894a591cSThomas Raoux
894a591cSThomas Raoux  LogicalResult
894a591cSThomas Raoux  matchAndRewrite(nvgpu::LdMatrixOp op, OpAdaptor adaptor,
894a591cSThomas Raoux                  ConversionPatternRewriter &rewriter) const override {
894a591cSThomas Raoux    MLIRContext *ctx = getContext();
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
894a591cSThomas Raoux
894a591cSThomas Raoux    // The result type of ldmatrix will always be a struct of 32bit integer
894a591cSThomas Raoux    // registers if more than one 32bit value is returned. Otherwise, the result
894a591cSThomas Raoux    // is a single i32. The result type of the GPU operation is always a vector
894a591cSThomas Raoux    // of shape (NumRegisters, VectorRegister) where VectorRegister is the
894a591cSThomas Raoux    // vector type of the result and always 32 bits long. We bitcast the result
894a591cSThomas Raoux    // of the NVVM::LdMatrix to this vector type.
5550c821STres Popp    auto vectorResultType = dyn_cast<VectorType>(op->getResultTypes()[0]);
894a591cSThomas Raoux    if (!vectorResultType) {
894a591cSThomas Raoux      return failure();
894a591cSThomas Raoux    }
894a591cSThomas Raoux    Type innerVectorType = LLVM::getFixedVectorType(
894a591cSThomas Raoux        vectorResultType.getElementType(), vectorResultType.getDimSize(1));
894a591cSThomas Raoux
894a591cSThomas Raoux    int64_t num32BitRegs = vectorResultType.getDimSize(0);
894a591cSThomas Raoux
894a591cSThomas Raoux    Type ldMatrixResultType;
894a591cSThomas Raoux    if (num32BitRegs > 1) {
894a591cSThomas Raoux      ldMatrixResultType = LLVM::LLVMStructType::getLiteral(
894a591cSThomas Raoux          ctx, SmallVector<Type>(num32BitRegs, rewriter.getI32Type()));
894a591cSThomas Raoux    } else {
894a591cSThomas Raoux      ldMatrixResultType = rewriter.getI32Type();
894a591cSThomas Raoux    }
894a591cSThomas Raoux
5550c821STres Popp    auto srcMemrefType = cast<MemRefType>(op.getSrcMemref().getType());
8df54a6aSJacques Pienaar    Value srcPtr =
ee49cda7SGuray Ozen        getStridedElementPtr(b.getLoc(), srcMemrefType, adaptor.getSrcMemref(),
8df54a6aSJacques Pienaar                             adaptor.getIndices(), rewriter);
ee49cda7SGuray Ozen    Value ldMatrixResult = b.create<NVVM::LdMatrixOp>(
ee49cda7SGuray Ozen        ldMatrixResultType, srcPtr,
8df54a6aSJacques Pienaar        /*num=*/op.getNumTiles(),
8df54a6aSJacques Pienaar        /*layout=*/op.getTranspose() ? NVVM::MMALayout::col
894a591cSThomas Raoux                                     : NVVM::MMALayout::row);
894a591cSThomas Raoux
894a591cSThomas Raoux    // The ldmatrix operation returns either a single i32 value or a struct of
894a591cSThomas Raoux    // i32 values. Here we unpack those values and cast them back to their
894a591cSThomas Raoux    // actual vector type (still of width 32b) and repack them into a result
894a591cSThomas Raoux    // struct.
894a591cSThomas Raoux    Type finalResultType = typeConverter->convertType(vectorResultType);
ee49cda7SGuray Ozen    Value result = b.create<LLVM::UndefOp>(finalResultType);
894a591cSThomas Raoux    for (int64_t i = 0, e = vectorResultType.getDimSize(0); i < e; i++) {
5c5af910SJeff Niu      Value i32Register =
ee49cda7SGuray Ozen          num32BitRegs > 1 ? b.create<LLVM::ExtractValueOp>(ldMatrixResult, i)
894a591cSThomas Raoux                           : ldMatrixResult;
ee49cda7SGuray Ozen      Value casted = b.create<LLVM::BitcastOp>(innerVectorType, i32Register);
ee49cda7SGuray Ozen      result = b.create<LLVM::InsertValueOp>(result, casted, i);
894a591cSThomas Raoux    }
894a591cSThomas Raoux
894a591cSThomas Raoux    rewriter.replaceOp(op, result);
894a591cSThomas Raoux    return success();
894a591cSThomas Raoux  }
894a591cSThomas Raoux};
894a591cSThomas Raoux
708185f0SChristopher Bate/// Convert the given type into the corresponding PTX type (NVVM::MMATypes
708185f0SChristopher Bate/// enum).
708185f0SChristopher Batestatic FailureOr<NVVM::MMATypes> getNvvmMmaType(Type t) {
708185f0SChristopher Bate  Type elType = getElementTypeOrSelf(t);
708185f0SChristopher Bate  if (elType.isInteger(8))
708185f0SChristopher Bate    return NVVM::MMATypes::s8;
708185f0SChristopher Bate  if (elType.isInteger(4))
708185f0SChristopher Bate    return NVVM::MMATypes::s4;
708185f0SChristopher Bate  if (elType.isF16())
708185f0SChristopher Bate    return NVVM::MMATypes::f16;
708185f0SChristopher Bate  if (elType.isF64())
708185f0SChristopher Bate    return NVVM::MMATypes::f64;
708185f0SChristopher Bate  if (elType.isF32())
708185f0SChristopher Bate    return NVVM::MMATypes::tf32;
708185f0SChristopher Bate  return failure();
708185f0SChristopher Bate}
708185f0SChristopher Bate
894a591cSThomas Raouxstruct MmaSyncOptoNVVM : public ConvertOpToLLVMPattern<nvgpu::MmaSyncOp> {
894a591cSThomas Raoux  using ConvertOpToLLVMPattern<nvgpu::MmaSyncOp>::ConvertOpToLLVMPattern;
894a591cSThomas Raoux
894a591cSThomas Raoux  LogicalResult
894a591cSThomas Raoux  matchAndRewrite(nvgpu::MmaSyncOp op, OpAdaptor adaptor,
894a591cSThomas Raoux                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
894a591cSThomas Raoux    // Get the shapes of the MMAMatrix type being used. The shapes will
894a591cSThomas Raoux    // choose which intrinsic this op will be lowered to.
708185f0SChristopher Bate    VectorType aType = op.getMatrixA().getType();
708185f0SChristopher Bate    VectorType bType = op.getMatrixA().getType();
708185f0SChristopher Bate    VectorType cType = op.getMatrixC().getType();
894a591cSThomas Raoux
708185f0SChristopher Bate    std::array<int64_t, 3> gemmShape = op.getMmaShapeAsArray();
14d79afeSManish Gupta
14d79afeSManish Gupta    // Tensor Cores (mma.sync) on F32 works only with TensorFloat32 (TF32).
14d79afeSManish Gupta    bool tf32Enabled = op->hasAttr(op.getTf32EnabledAttrName());
14d79afeSManish Gupta    if (aType.getElementType().isF32() && !tf32Enabled)
14d79afeSManish Gupta      return failure();
98798073SChristopher Bate
708185f0SChristopher Bate    FailureOr<NVVM::MMATypes> ptxTypeA = getNvvmMmaType(aType);
708185f0SChristopher Bate    if (failed(ptxTypeA))
708185f0SChristopher Bate      return op->emitOpError("failed to deduce operand PTX types");
708185f0SChristopher Bate    FailureOr<NVVM::MMATypes> ptxTypeB = getNvvmMmaType(bType);
708185f0SChristopher Bate    if (failed(ptxTypeB))
708185f0SChristopher Bate      return op->emitOpError("failed to deduce operand PTX types");
22426110SRamkumar Ramachandra    std::optional<NVVM::MMATypes> ptxTypeC =
22426110SRamkumar Ramachandra        NVVM::MmaOp::inferOperandMMAType(cType.getElementType(),
22426110SRamkumar Ramachandra                                         /*isAccumulator=*/true);
708185f0SChristopher Bate    if (!ptxTypeC)
708185f0SChristopher Bate      return op->emitError(
708185f0SChristopher Bate          "could not infer the PTX type for the accumulator/result");
708185f0SChristopher Bate
708185f0SChristopher Bate    // TODO: add an attribute to the op to customize this behavior.
22426110SRamkumar Ramachandra    std::optional<NVVM::MMAIntOverflow> overflow(std::nullopt);
5550c821STres Popp    if (isa<IntegerType>(aType.getElementType()))
894a591cSThomas Raoux      overflow = NVVM::MMAIntOverflow::satfinite;
894a591cSThomas Raoux
98798073SChristopher Bate    SmallVector<Value> matA =
ee49cda7SGuray Ozen        unpackOperandVector(b, adaptor.getMatrixA(), *ptxTypeA);
98798073SChristopher Bate    SmallVector<Value> matB =
ee49cda7SGuray Ozen        unpackOperandVector(b, adaptor.getMatrixB(), *ptxTypeB);
98798073SChristopher Bate    SmallVector<Value> matC =
ee49cda7SGuray Ozen        unpackOperandVector(b, adaptor.getMatrixC(), *ptxTypeC);
98798073SChristopher Bate
894a591cSThomas Raoux    Type desiredRetTy = typeConverter->convertType(op->getResultTypes()[0]);
894a591cSThomas Raoux    Type intrinsicResTy = inferIntrinsicResultType(
894a591cSThomas Raoux        typeConverter->convertType(op->getResultTypes()[0]));
ee49cda7SGuray Ozen    Value intrinsicResult = b.create<NVVM::MmaOp>(
ee49cda7SGuray Ozen        intrinsicResTy, matA, matB, matC,
894a591cSThomas Raoux        /*shape=*/gemmShape,
1a36588eSKazu Hirata        /*b1Op=*/std::nullopt,
894a591cSThomas Raoux        /*intOverflow=*/overflow,
894a591cSThomas Raoux        /*multiplicandPtxTypes=*/
708185f0SChristopher Bate        std::array<NVVM::MMATypes, 2>{*ptxTypeA, *ptxTypeB},
894a591cSThomas Raoux        /*multiplicandLayouts=*/
894a591cSThomas Raoux        std::array<NVVM::MMALayout, 2>{NVVM::MMALayout::row,
894a591cSThomas Raoux                                       NVVM::MMALayout::col});
894a591cSThomas Raoux    rewriter.replaceOp(op, convertIntrinsicResult(op.getLoc(), intrinsicResTy,
894a591cSThomas Raoux                                                  desiredRetTy, intrinsicResult,
894a591cSThomas Raoux                                                  rewriter));
894a591cSThomas Raoux    return success();
894a591cSThomas Raoux  }
894a591cSThomas Raoux};
894a591cSThomas Raoux
894a591cSThomas Raouxstruct ConvertNVGPUToNVVMPass
53689fdfSMarkus Böck    : public impl::ConvertNVGPUToNVVMPassBase<ConvertNVGPUToNVVMPass> {
53689fdfSMarkus Böck  using Base::Base;
894a591cSThomas Raoux
affcfccdSGuray Ozen  void getDependentDialects(DialectRegistry &registry) const override {
d20fbc90SGuray Ozen    registry.insert<memref::MemRefDialect, LLVM::LLVMDialect, NVVM::NVVMDialect,
d20fbc90SGuray Ozen                    arith::ArithDialect>();
affcfccdSGuray Ozen  }
affcfccdSGuray Ozen
894a591cSThomas Raoux  void runOnOperation() override {
53689fdfSMarkus Böck    LowerToLLVMOptions options(&getContext());
894a591cSThomas Raoux    RewritePatternSet patterns(&getContext());
53689fdfSMarkus Böck    LLVMTypeConverter converter(&getContext(), options);
affcfccdSGuray Ozen    IRRewriter rewriter(&getContext());
3a03da37SGuray Ozen    populateGpuMemorySpaceAttributeConversions(
3a03da37SGuray Ozen        converter, [](gpu::AddressSpace space) -> unsigned {
3a03da37SGuray Ozen          switch (space) {
3a03da37SGuray Ozen          case gpu::AddressSpace::Global:
3a03da37SGuray Ozen            return static_cast<unsigned>(
3a03da37SGuray Ozen                NVVM::NVVMMemorySpace::kGlobalMemorySpace);
3a03da37SGuray Ozen          case gpu::AddressSpace::Workgroup:
3a03da37SGuray Ozen            return static_cast<unsigned>(
3a03da37SGuray Ozen                NVVM::NVVMMemorySpace::kSharedMemorySpace);
3a03da37SGuray Ozen          case gpu::AddressSpace::Private:
3a03da37SGuray Ozen            return 0;
3a03da37SGuray Ozen          }
3a03da37SGuray Ozen          llvm_unreachable("unknown address space enum value");
3a03da37SGuray Ozen          return 0;
3a03da37SGuray Ozen        });
affcfccdSGuray Ozen    /// device-side async tokens cannot be materialized in nvvm. We just
affcfccdSGuray Ozen    /// convert them to a dummy i32 type in order to easily drop them during
affcfccdSGuray Ozen    /// conversion.
15bcc36eSThomas Raoux    converter.addConversion([&](nvgpu::DeviceAsyncTokenType type) -> Type {
15bcc36eSThomas Raoux      return converter.convertType(IntegerType::get(type.getContext(), 32));
15bcc36eSThomas Raoux    });
23882226SGuray Ozen    converter.addConversion([&](nvgpu::WarpgroupAccumulatorType type) -> Type {
52db7e27SGuray Ozen      Type elemType = type.getFragmented().getElementType();
52db7e27SGuray Ozen      int64_t sizeM = type.getFragmented().getDimSize(0);
52db7e27SGuray Ozen      int64_t sizeN = type.getFragmented().getDimSize(1);
52db7e27SGuray Ozen
52db7e27SGuray Ozen      unsigned numMembers;
52db7e27SGuray Ozen      if (elemType.isF32() || elemType.isInteger(32))
52db7e27SGuray Ozen        numMembers = sizeN / 2;
52db7e27SGuray Ozen      else if (elemType.isF16())
52db7e27SGuray Ozen        numMembers = sizeN / 4;
52db7e27SGuray Ozen      else
52db7e27SGuray Ozen        llvm_unreachable("unsupported type for warpgroup accumulator");
52db7e27SGuray Ozen
52db7e27SGuray Ozen      SmallVector<Type> innerStructBody;
52db7e27SGuray Ozen      for (unsigned i = 0; i < numMembers; i++)
52db7e27SGuray Ozen        innerStructBody.push_back(elemType);
52db7e27SGuray Ozen      auto innerStructType =
52db7e27SGuray Ozen          LLVM::LLVMStructType::getLiteral(type.getContext(), innerStructBody);
52db7e27SGuray Ozen
23882226SGuray Ozen      SmallVector<Type> structBody;
52db7e27SGuray Ozen      for (int i = 0; i < sizeM; i += kWgmmaSizeM)
52db7e27SGuray Ozen        structBody.push_back(innerStructType);
52db7e27SGuray Ozen
23882226SGuray Ozen      auto convertedType =
23882226SGuray Ozen          LLVM::LLVMStructType::getLiteral(type.getContext(), structBody);
23882226SGuray Ozen      return converter.convertType(convertedType);
23882226SGuray Ozen    });
affcfccdSGuray Ozen    converter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type {
affcfccdSGuray Ozen      return converter.convertType(IntegerType::get(type.getContext(), 64));
affcfccdSGuray Ozen    });
50ab427aSGuray Ozen    converter.addConversion(
50ab427aSGuray Ozen        [&](nvgpu::WarpgroupMatrixDescriptorType type) -> Type {
50ab427aSGuray Ozen          return converter.convertType(IntegerType::get(type.getContext(), 64));
50ab427aSGuray Ozen        });
17649a77SGuray Ozen    converter.addConversion([&](nvgpu::MBarrierGroupType type) -> Type {
99475f5bSNicolas Vasilache      return converter.convertType(
99475f5bSNicolas Vasilache          nvgpu::getMBarrierMemrefType(rewriter.getContext(), type));
affcfccdSGuray Ozen    });
70c2e061SGuray Ozen    converter.addConversion([&](nvgpu::TensorMapDescriptorType type) -> Type {
2f17c9f6SChristian Ulmann      return LLVM::LLVMPointerType::get(type.getContext());
70c2e061SGuray Ozen    });
894a591cSThomas Raoux    populateNVGPUToNVVMConversionPatterns(converter, patterns);
894a591cSThomas Raoux    LLVMConversionTarget target(getContext());
894a591cSThomas Raoux    target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
d20fbc90SGuray Ozen    target.addLegalDialect<::mlir::arith::ArithDialect>();
affcfccdSGuray Ozen    target.addLegalDialect<::mlir::memref::MemRefDialect>();
894a591cSThomas Raoux    target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
23882226SGuray Ozen    mlir::scf::populateSCFStructuralTypeConversionsAndLegality(
23882226SGuray Ozen        converter, patterns, target);
894a591cSThomas Raoux    if (failed(applyPartialConversion(getOperation(), target,
894a591cSThomas Raoux                                      std::move(patterns))))
894a591cSThomas Raoux      signalPassFailure();
894a591cSThomas Raoux  }
894a591cSThomas Raoux};
894a591cSThomas Raoux
/// Returns the constraint string for the sparse MMA inline assembly
/// instruction: one "=r" output per matC element, one "r" input per element
/// of matA, matB and matC, and a final "r" for the sparsity metadata operand.
/// The sparsity selector is not part of the constraints because it appears as
/// a direct literal in the assembly string.
static std::string buildMmaSparseAsmConstraintString(unsigned matASize,
                                                     unsigned matBSize,
                                                     unsigned matCSize) {
  const unsigned numOutputs = matCSize;
  const unsigned numInputs = matASize + matBSize + matCSize;

  std::string constraints;
  for (unsigned out = 0; out < numOutputs; ++out)
    constraints += "=r,";
  for (unsigned in = 0; in < numInputs; ++in)
    constraints += "r,";
  // Trailing operand: the sparsity metadata register.
  constraints += "r";
  return constraints;
}
708185f0SChristopher Bate
708185f0SChristopher Bate/// Returns the string for the `mma.sp.sync` instruction that corresponds to
4e4af133SAart Bik/// the given parameters. Note that this function doesn't do any validation,
708185f0SChristopher Bate/// it's expected that the provided parameters correspond to a valid
708185f0SChristopher Bate/// instruction.
4e4af133SAart Bikstatic std::string buildMmaSparseAsmString(
4e4af133SAart Bik    const std::array<int64_t, 3> &shape, unsigned matASize, unsigned matBSize,
4e4af133SAart Bik    unsigned matCSize, NVVM::MMATypes ptxTypeA, NVVM::MMATypes ptxTypeB,
708185f0SChristopher Bate    NVVM::MMATypes ptxTypeC, NVVM::MMATypes ptxTypeD,
4e4af133SAart Bik    std::optional<NVVM::MMAIntOverflow> overflow, unsigned metaDataSelector) {
708185f0SChristopher Bate  auto ptxTypeStr = [](NVVM::MMATypes ptxType) {
708185f0SChristopher Bate    return NVVM::stringifyMMATypes(ptxType);
708185f0SChristopher Bate  };
708185f0SChristopher Bate
708185f0SChristopher Bate  std::string asmStr;
708185f0SChristopher Bate  llvm::raw_string_ostream ss(asmStr);
708185f0SChristopher Bate  ss << "mma.sp.sync.aligned.m" << shape[0] << "n" << shape[1] << "k"
708185f0SChristopher Bate     << shape[2] << ".row.col.";
708185f0SChristopher Bate
708185f0SChristopher Bate  if (overflow)
708185f0SChristopher Bate    ss << NVVM::stringifyMMAIntOverflow(*overflow) << ".";
708185f0SChristopher Bate
708185f0SChristopher Bate  ss << ptxTypeStr(ptxTypeD) << "." << ptxTypeStr(ptxTypeA) << "."
708185f0SChristopher Bate     << ptxTypeStr(ptxTypeB) << "." << ptxTypeStr(ptxTypeC) << " ";
708185f0SChristopher Bate  unsigned asmArgIdx = 0;
708185f0SChristopher Bate
708185f0SChristopher Bate  // The operand string is structured into sections `{matC elements...},
708185f0SChristopher Bate  // {matA elements...}, {matB elements...}, {matC elements}`.
708185f0SChristopher Bate  for (const auto arrSize : {matCSize, matASize, matBSize, matCSize}) {
708185f0SChristopher Bate    ss << "{";
708185f0SChristopher Bate    for (unsigned i = 0; i < arrSize; i++)
708185f0SChristopher Bate      ss << "$" << asmArgIdx++ << (i < arrSize - 1 ? "," : "");
708185f0SChristopher Bate    ss << "},";
708185f0SChristopher Bate  }
b0bbc9b5Srkayaith  ss << "$" << asmArgIdx++ << ",";
4e4af133SAart Bik  assert(metaDataSelector <= 1);
4e4af133SAart Bik  ss << "0x" << metaDataSelector << ";";
708185f0SChristopher Bate  return asmStr;
708185f0SChristopher Bate}
708185f0SChristopher Bate
708185f0SChristopher Bate/// Builds an inline assembly operation corresponding to the specified MMA
708185f0SChristopher Bate/// sparse sync operation.
708185f0SChristopher Batestatic FailureOr<LLVM::InlineAsmOp> emitMmaSparseSyncOpAsm(
ee49cda7SGuray Ozen    ImplicitLocOpBuilder &b, NVVM::MMATypes ptxTypeA, NVVM::MMATypes ptxTypeB,
708185f0SChristopher Bate    NVVM::MMATypes ptxTypeC, NVVM::MMATypes ptxTypeD,
22426110SRamkumar Ramachandra    std::optional<NVVM::MMAIntOverflow> overflow, ArrayRef<Value> unpackedAData,
708185f0SChristopher Bate    ArrayRef<Value> unpackedB, ArrayRef<Value> unpackedC, Value indexData,
708185f0SChristopher Bate    int64_t metadataSelector, const std::array<int64_t, 3> &shape,
ee49cda7SGuray Ozen    Type intrinsicResultType) {
ee49cda7SGuray Ozen  auto asmDialectAttr =
ee49cda7SGuray Ozen      LLVM::AsmDialectAttr::get(b.getContext(), LLVM::AsmDialect::AD_ATT);
708185f0SChristopher Bate
4e4af133SAart Bik  const unsigned matASize = unpackedAData.size();
4e4af133SAart Bik  const unsigned matBSize = unpackedB.size();
4e4af133SAart Bik  const unsigned matCSize = unpackedC.size();
708185f0SChristopher Bate
4e4af133SAart Bik  std::string asmStr = buildMmaSparseAsmString(
4e4af133SAart Bik      shape, matASize, matBSize, matCSize, ptxTypeA, ptxTypeB, ptxTypeC,
4e4af133SAart Bik      ptxTypeD, overflow, metadataSelector);
4e4af133SAart Bik  std::string constraintStr =
4e4af133SAart Bik      buildMmaSparseAsmConstraintString(matASize, matBSize, matCSize);
708185f0SChristopher Bate
708185f0SChristopher Bate  SmallVector<Value> asmVals;
4e4af133SAart Bik  asmVals.reserve(matASize + matBSize + matCSize + 1);
708185f0SChristopher Bate  for (ArrayRef<Value> args : {unpackedAData, unpackedB, unpackedC})
708185f0SChristopher Bate    llvm::append_range(asmVals, args);
708185f0SChristopher Bate  asmVals.push_back(indexData);
708185f0SChristopher Bate
ee49cda7SGuray Ozen  return b.create<LLVM::InlineAsmOp>(
708185f0SChristopher Bate      /*resultTypes=*/intrinsicResultType,
708185f0SChristopher Bate      /*operands=*/asmVals,
708185f0SChristopher Bate      /*asm_string=*/asmStr,
708185f0SChristopher Bate      /*constraints=*/constraintStr,
708185f0SChristopher Bate      /*has_side_effects=*/true,
708185f0SChristopher Bate      /*is_align_stack=*/false,
708185f0SChristopher Bate      /*asm_dialect=*/asmDialectAttr,
708185f0SChristopher Bate      /*operand_attrs=*/ArrayAttr());
708185f0SChristopher Bate}
708185f0SChristopher Bate
708185f0SChristopher Bate/// Lowers `nvgpu.mma.sp.sync` to inline assembly.
708185f0SChristopher Batestruct NVGPUMmaSparseSyncLowering
708185f0SChristopher Bate    : public ConvertOpToLLVMPattern<nvgpu::MmaSparseSyncOp> {
708185f0SChristopher Bate  using ConvertOpToLLVMPattern<nvgpu::MmaSparseSyncOp>::ConvertOpToLLVMPattern;
708185f0SChristopher Bate
708185f0SChristopher Bate  LogicalResult
708185f0SChristopher Bate  matchAndRewrite(nvgpu::MmaSparseSyncOp op, OpAdaptor adaptor,
708185f0SChristopher Bate                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
708185f0SChristopher Bate    // Get the shapes of the MMAMatrix type being used. The shapes will
708185f0SChristopher Bate    // choose which intrinsic this op will be lowered to.
708185f0SChristopher Bate    VectorType aType = op.getMatrixA().getType();
708185f0SChristopher Bate    VectorType bType = op.getMatrixB().getType();
708185f0SChristopher Bate    VectorType cType = op.getMatrixC().getType();
708185f0SChristopher Bate
708185f0SChristopher Bate    FailureOr<NVVM::MMATypes> ptxTypeA = getNvvmMmaType(aType);
708185f0SChristopher Bate    if (failed(ptxTypeA))
708185f0SChristopher Bate      return op->emitOpError("failed to deduce operand PTX types");
708185f0SChristopher Bate    FailureOr<NVVM::MMATypes> ptxTypeB = getNvvmMmaType(bType);
708185f0SChristopher Bate    if (failed(ptxTypeB))
708185f0SChristopher Bate      return op->emitOpError("failed to deduce operand PTX types");
22426110SRamkumar Ramachandra    std::optional<NVVM::MMATypes> ptxTypeC =
22426110SRamkumar Ramachandra        NVVM::MmaOp::inferOperandMMAType(cType.getElementType(),
22426110SRamkumar Ramachandra                                         /*isAccumulator=*/true);
708185f0SChristopher Bate    if (!ptxTypeC)
708185f0SChristopher Bate      return op->emitError(
708185f0SChristopher Bate          "could not infer the PTX type for the accumulator/result");
708185f0SChristopher Bate
708185f0SChristopher Bate    // Same as `mma.sync`, F32 works only with TensorFloat32 (TF32).
708185f0SChristopher Bate    bool tf32Enabled = op->hasAttr(op.getTf32EnabledAttrName());
708185f0SChristopher Bate    if (aType.getElementType().isF32() && !tf32Enabled)
708185f0SChristopher Bate      return failure();
708185f0SChristopher Bate
708185f0SChristopher Bate    // TODO: add an attribute to the op to customize this behavior.
22426110SRamkumar Ramachandra    std::optional<NVVM::MMAIntOverflow> overflow(std::nullopt);
5550c821STres Popp    if (isa<IntegerType>(aType.getElementType()))
708185f0SChristopher Bate      overflow = NVVM::MMAIntOverflow::satfinite;
708185f0SChristopher Bate
708185f0SChristopher Bate    SmallVector<Value> matA =
ee49cda7SGuray Ozen        unpackOperandVector(b, adaptor.getMatrixA(), *ptxTypeA);
708185f0SChristopher Bate    SmallVector<Value> matB =
ee49cda7SGuray Ozen        unpackOperandVector(b, adaptor.getMatrixB(), *ptxTypeB);
708185f0SChristopher Bate    SmallVector<Value> matC =
ee49cda7SGuray Ozen        unpackOperandVector(b, adaptor.getMatrixC(), *ptxTypeC);
708185f0SChristopher Bate
708185f0SChristopher Bate    Type desiredRetTy = typeConverter->convertType(op->getResultTypes()[0]);
708185f0SChristopher Bate    Type intrinsicResTy = inferIntrinsicResultType(
708185f0SChristopher Bate        typeConverter->convertType(op->getResultTypes()[0]));
708185f0SChristopher Bate
708185f0SChristopher Bate    // Bitcast the sparse metadata from vector<2xf16> to an i32.
708185f0SChristopher Bate    Value sparseMetadata = adaptor.getSparseMetadata();
708185f0SChristopher Bate    if (sparseMetadata.getType() !=
708185f0SChristopher Bate        LLVM::getFixedVectorType(rewriter.getI16Type(), 2))
708185f0SChristopher Bate      return op->emitOpError() << "Expected metadata type to be LLVM "
708185f0SChristopher Bate                                  "VectorType of 2 i16 elements";
ee49cda7SGuray Ozen    sparseMetadata =
ee49cda7SGuray Ozen        b.create<LLVM::BitcastOp>(rewriter.getI32Type(), sparseMetadata);
708185f0SChristopher Bate
708185f0SChristopher Bate    FailureOr<LLVM::InlineAsmOp> intrinsicResult = emitMmaSparseSyncOpAsm(
ee49cda7SGuray Ozen        b, *ptxTypeA, *ptxTypeB, *ptxTypeC, *ptxTypeC, overflow, matA, matB,
708185f0SChristopher Bate        matC, sparseMetadata, op.getSparsitySelector(), op.getMmaShapeAsArray(),
ee49cda7SGuray Ozen        intrinsicResTy);
708185f0SChristopher Bate    if (failed(intrinsicResult))
708185f0SChristopher Bate      return failure();
708185f0SChristopher Bate
708185f0SChristopher Bate    assert((*intrinsicResult).getNumResults() == 1 &&
708185f0SChristopher Bate           "expected inline asm op returns a single LLVM struct type");
708185f0SChristopher Bate    rewriter.replaceOp(
708185f0SChristopher Bate        op, convertIntrinsicResult(op.getLoc(), intrinsicResTy, desiredRetTy,
708185f0SChristopher Bate                                   (*intrinsicResult)->getResult(0), rewriter));
708185f0SChristopher Bate    return success();
708185f0SChristopher Bate  }
708185f0SChristopher Bate};
708185f0SChristopher Bate
15bcc36eSThomas Raouxstruct NVGPUAsyncCopyLowering
15bcc36eSThomas Raoux    : public ConvertOpToLLVMPattern<nvgpu::DeviceAsyncCopyOp> {
15bcc36eSThomas Raoux  using ConvertOpToLLVMPattern<
15bcc36eSThomas Raoux      nvgpu::DeviceAsyncCopyOp>::ConvertOpToLLVMPattern;
15bcc36eSThomas Raoux
15bcc36eSThomas Raoux  LogicalResult
15bcc36eSThomas Raoux  matchAndRewrite(nvgpu::DeviceAsyncCopyOp op, OpAdaptor adaptor,
15bcc36eSThomas Raoux                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
ee49cda7SGuray Ozen    Location loc = op.getLoc();
5550c821STres Popp    auto dstMemrefType = cast<MemRefType>(op.getDst().getType());
ee49cda7SGuray Ozen    Value dstPtr =
ee49cda7SGuray Ozen        getStridedElementPtr(b.getLoc(), dstMemrefType, adaptor.getDst(),
8df54a6aSJacques Pienaar                             adaptor.getDstIndices(), rewriter);
499abb24SKrzysztof Drewniak    FailureOr<unsigned> dstAddressSpace =
499abb24SKrzysztof Drewniak        getTypeConverter()->getMemRefAddressSpace(dstMemrefType);
499abb24SKrzysztof Drewniak    if (failed(dstAddressSpace))
499abb24SKrzysztof Drewniak      return rewriter.notifyMatchFailure(
499abb24SKrzysztof Drewniak          loc, "destination memref address space not convertible to integer");
15bcc36eSThomas Raoux
5550c821STres Popp    auto srcMemrefType = cast<MemRefType>(op.getSrc().getType());
499abb24SKrzysztof Drewniak    FailureOr<unsigned> srcAddressSpace =
499abb24SKrzysztof Drewniak        getTypeConverter()->getMemRefAddressSpace(srcMemrefType);
499abb24SKrzysztof Drewniak    if (failed(srcAddressSpace))
499abb24SKrzysztof Drewniak      return rewriter.notifyMatchFailure(
499abb24SKrzysztof Drewniak          loc, "source memref address space not convertible to integer");
15bcc36eSThomas Raoux
8df54a6aSJacques Pienaar    Value scrPtr = getStridedElementPtr(loc, srcMemrefType, adaptor.getSrc(),
8df54a6aSJacques Pienaar                                        adaptor.getSrcIndices(), rewriter);
15bcc36eSThomas Raoux    // Intrinsics takes a global pointer so we need an address space cast.
2f17c9f6SChristian Ulmann    auto srcPointerGlobalType = LLVM::LLVMPointerType::get(
2f17c9f6SChristian Ulmann        op->getContext(), NVVM::NVVMMemorySpace::kGlobalMemorySpace);
ee49cda7SGuray Ozen    scrPtr = b.create<LLVM::AddrSpaceCastOp>(srcPointerGlobalType, scrPtr);
fbf69f95SManish Gupta    int64_t dstElements = adaptor.getDstElements().getZExtValue();
15bcc36eSThomas Raoux    int64_t sizeInBytes =
fbf69f95SManish Gupta        (dstMemrefType.getElementTypeBitWidth() * dstElements) / 8;
fbf69f95SManish Gupta    // When the optional SrcElements argument is *not* present, the regular
fbf69f95SManish Gupta    // CpAsyncOp is generated. CopyAsyncOp reads bytes from source (global
2c573967SGuray Ozen    // memory) to fill DstElements number of elements in the destination
2c573967SGuray Ozen    // (shared memory).
2c573967SGuray Ozen    Value srcBytes = adaptor.getSrcElements();
2c573967SGuray Ozen    if (srcBytes) {
2c573967SGuray Ozen      // When the optional SrcElements argument is present, the source (global
2c573967SGuray Ozen      // memory) of CpAsyncOp is read only for SrcElements number of elements.
2c573967SGuray Ozen      // The rest of the DstElements in the destination (shared memory) are
2c573967SGuray Ozen      // filled with zeros.
ee49cda7SGuray Ozen      Value c3I32 =
ee49cda7SGuray Ozen          b.create<LLVM::ConstantOp>(b.getI32Type(), b.getI32IntegerAttr(3));
ee49cda7SGuray Ozen      Value bitwidth = b.create<LLVM::ConstantOp>(
ee49cda7SGuray Ozen          b.getI32Type(),
ee49cda7SGuray Ozen          b.getI32IntegerAttr(srcMemrefType.getElementTypeBitWidth()));
ee49cda7SGuray Ozen      Value srcElementsI32 = b.create<LLVM::TruncOp>(b.getI32Type(), srcBytes);
ee49cda7SGuray Ozen      srcBytes = b.create<LLVM::LShrOp>(
ee49cda7SGuray Ozen          b.create<LLVM::MulOp>(bitwidth, srcElementsI32), c3I32);
2c573967SGuray Ozen    }
2c573967SGuray Ozen    // Cache global (.cg) for 16 dst bytes, Cache all (.ca) for sizes other than
2c573967SGuray Ozen    // 16 dst bytes.
2c573967SGuray Ozen    NVVM::LoadCacheModifierKind cacheModifier =
2c573967SGuray Ozen        (op.getBypassL1().value_or(false) && sizeInBytes == 16)
2c573967SGuray Ozen            ? NVVM::LoadCacheModifierKind::CG
2c573967SGuray Ozen            : NVVM::LoadCacheModifierKind::CA;
2c573967SGuray Ozen
ee49cda7SGuray Ozen    b.create<NVVM::CpAsyncOp>(
ee49cda7SGuray Ozen        dstPtr, scrPtr, rewriter.getI32IntegerAttr(sizeInBytes),
2c573967SGuray Ozen        NVVM::LoadCacheModifierKindAttr::get(op->getContext(), cacheModifier),
2c573967SGuray Ozen        srcBytes);
15bcc36eSThomas Raoux
15bcc36eSThomas Raoux    // Drop the result token.
ee49cda7SGuray Ozen    Value zero = b.create<LLVM::ConstantOp>(
ee49cda7SGuray Ozen        IntegerType::get(op.getContext(), 32), rewriter.getI32IntegerAttr(0));
15bcc36eSThomas Raoux    rewriter.replaceOp(op, zero);
15bcc36eSThomas Raoux    return success();
15bcc36eSThomas Raoux  }
15bcc36eSThomas Raoux};
15bcc36eSThomas Raoux
15bcc36eSThomas Raouxstruct NVGPUAsyncCreateGroupLowering
15bcc36eSThomas Raoux    : public ConvertOpToLLVMPattern<nvgpu::DeviceAsyncCreateGroupOp> {
15bcc36eSThomas Raoux  using ConvertOpToLLVMPattern<
15bcc36eSThomas Raoux      nvgpu::DeviceAsyncCreateGroupOp>::ConvertOpToLLVMPattern;
15bcc36eSThomas Raoux
15bcc36eSThomas Raoux  LogicalResult
15bcc36eSThomas Raoux  matchAndRewrite(nvgpu::DeviceAsyncCreateGroupOp op, OpAdaptor adaptor,
15bcc36eSThomas Raoux                  ConversionPatternRewriter &rewriter) const override {
15bcc36eSThomas Raoux    rewriter.create<NVVM::CpAsyncCommitGroupOp>(op.getLoc());
15bcc36eSThomas Raoux    // Drop the result token.
15bcc36eSThomas Raoux    Value zero = rewriter.create<LLVM::ConstantOp>(
15bcc36eSThomas Raoux        op->getLoc(), IntegerType::get(op.getContext(), 32),
15bcc36eSThomas Raoux        rewriter.getI32IntegerAttr(0));
15bcc36eSThomas Raoux    rewriter.replaceOp(op, zero);
15bcc36eSThomas Raoux    return success();
15bcc36eSThomas Raoux  }
15bcc36eSThomas Raoux};
15bcc36eSThomas Raoux
15bcc36eSThomas Raouxstruct NVGPUAsyncWaitLowering
15bcc36eSThomas Raoux    : public ConvertOpToLLVMPattern<nvgpu::DeviceAsyncWaitOp> {
15bcc36eSThomas Raoux  using ConvertOpToLLVMPattern<
15bcc36eSThomas Raoux      nvgpu::DeviceAsyncWaitOp>::ConvertOpToLLVMPattern;
15bcc36eSThomas Raoux
15bcc36eSThomas Raoux  LogicalResult
15bcc36eSThomas Raoux  matchAndRewrite(nvgpu::DeviceAsyncWaitOp op, OpAdaptor adaptor,
15bcc36eSThomas Raoux                  ConversionPatternRewriter &rewriter) const override {
15bcc36eSThomas Raoux    // If numGroup is not present pick 0 as a conservative correct value.
2789c4f5SKazu Hirata    int32_t numGroups = adaptor.getNumGroups().value_or(0);
15bcc36eSThomas Raoux    rewriter.create<NVVM::CpAsyncWaitGroupOp>(op.getLoc(), numGroups);
15bcc36eSThomas Raoux    rewriter.eraseOp(op);
15bcc36eSThomas Raoux    return success();
15bcc36eSThomas Raoux  }
15bcc36eSThomas Raoux};
15bcc36eSThomas Raoux
affcfccdSGuray Ozen/// Creates mbarrier object in shared memory
affcfccdSGuray Ozenstruct NVGPUMBarrierCreateLowering
affcfccdSGuray Ozen    : public ConvertOpToLLVMPattern<nvgpu::MBarrierCreateOp> {
affcfccdSGuray Ozen  using ConvertOpToLLVMPattern<nvgpu::MBarrierCreateOp>::ConvertOpToLLVMPattern;
affcfccdSGuray Ozen
affcfccdSGuray Ozen  template <typename moduleT>
affcfccdSGuray Ozen  memref::GlobalOp generateGlobalBarrier(ConversionPatternRewriter &rewriter,
affcfccdSGuray Ozen                                         Operation *funcOp, moduleT moduleOp,
affcfccdSGuray Ozen                                         MemRefType barrierType) const {
affcfccdSGuray Ozen    SymbolTable symbolTable(moduleOp);
affcfccdSGuray Ozen    OpBuilder::InsertionGuard guard(rewriter);
affcfccdSGuray Ozen    rewriter.setInsertionPoint(&moduleOp.front());
affcfccdSGuray Ozen    auto global = rewriter.create<memref::GlobalOp>(
affcfccdSGuray Ozen        funcOp->getLoc(), "__mbarrier",
affcfccdSGuray Ozen        /*sym_visibility=*/rewriter.getStringAttr("private"),
affcfccdSGuray Ozen        /*type=*/barrierType,
affcfccdSGuray Ozen        /*initial_value=*/ElementsAttr(),
affcfccdSGuray Ozen        /*constant=*/false,
affcfccdSGuray Ozen        /*alignment=*/rewriter.getI64IntegerAttr(8));
affcfccdSGuray Ozen    symbolTable.insert(global);
affcfccdSGuray Ozen    return global;
affcfccdSGuray Ozen  }
affcfccdSGuray Ozen
affcfccdSGuray Ozen  LogicalResult
affcfccdSGuray Ozen  matchAndRewrite(nvgpu::MBarrierCreateOp op, OpAdaptor adaptor,
affcfccdSGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
affcfccdSGuray Ozen    Operation *funcOp = op->getParentOp();
99475f5bSNicolas Vasilache    MemRefType barrierType = nvgpu::getMBarrierMemrefType(
17649a77SGuray Ozen        rewriter.getContext(), op.getBarriers().getType());
affcfccdSGuray Ozen
affcfccdSGuray Ozen    memref::GlobalOp global;
9dad32cbSGuray Ozen    if (auto moduleOp = funcOp->getParentOfType<gpu::GPUModuleOp>())
affcfccdSGuray Ozen      global = generateGlobalBarrier(rewriter, funcOp, moduleOp, barrierType);
9dad32cbSGuray Ozen    else if (auto moduleOp = funcOp->getParentOfType<ModuleOp>())
affcfccdSGuray Ozen      global = generateGlobalBarrier(rewriter, funcOp, moduleOp, barrierType);
affcfccdSGuray Ozen
affcfccdSGuray Ozen    rewriter.setInsertionPoint(op);
affcfccdSGuray Ozen    rewriter.replaceOpWithNewOp<memref::GetGlobalOp>(op, barrierType,
affcfccdSGuray Ozen                                                     global.getName());
affcfccdSGuray Ozen    return success();
affcfccdSGuray Ozen  }
affcfccdSGuray Ozen};
affcfccdSGuray Ozen
17649a77SGuray Ozen/// Base class for lowering mbarrier operations to nvvm intrinsics.
17649a77SGuray Ozentemplate <typename SourceOp>
17649a77SGuray Ozenstruct MBarrierBasePattern : public ConvertOpToLLVMPattern<SourceOp> {
17649a77SGuray Ozenpublic:
17649a77SGuray Ozen  using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
17649a77SGuray Ozen  /// Returns the base pointer of the mbarrier object.
ee49cda7SGuray Ozen  Value getMbarrierPtr(ImplicitLocOpBuilder &b,
ee49cda7SGuray Ozen                       nvgpu::MBarrierGroupType mbarType, Value memrefDesc,
ee49cda7SGuray Ozen                       Value mbarId,
17649a77SGuray Ozen                       ConversionPatternRewriter &rewriter) const {
17649a77SGuray Ozen    MemRefType mbarrierMemrefType =
17649a77SGuray Ozen        nvgpu::getMBarrierMemrefType(rewriter.getContext(), mbarType);
17649a77SGuray Ozen    return ConvertToLLVMPattern::getStridedElementPtr(
ee49cda7SGuray Ozen        b.getLoc(), mbarrierMemrefType, memrefDesc, {mbarId}, rewriter);
17649a77SGuray Ozen  }
17649a77SGuray Ozen};
17649a77SGuray Ozen
affcfccdSGuray Ozen/// Lowers `nvgpu.mbarrier.init` to `nvvm.mbarrier.init`
affcfccdSGuray Ozenstruct NVGPUMBarrierInitLowering
17649a77SGuray Ozen    : public MBarrierBasePattern<nvgpu::MBarrierInitOp> {
17649a77SGuray Ozen  using MBarrierBasePattern<nvgpu::MBarrierInitOp>::MBarrierBasePattern;
affcfccdSGuray Ozen
affcfccdSGuray Ozen  LogicalResult
affcfccdSGuray Ozen  matchAndRewrite(nvgpu::MBarrierInitOp op, OpAdaptor adaptor,
affcfccdSGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
17649a77SGuray Ozen    nvgpu::MBarrierGroupType mbarrierType = op.getBarriers().getType();
affcfccdSGuray Ozen    rewriter.setInsertionPoint(op);
ee49cda7SGuray Ozen    Value barrier = getMbarrierPtr(b, mbarrierType, adaptor.getBarriers(),
17649a77SGuray Ozen                                   adaptor.getMbarId(), rewriter);
ee49cda7SGuray Ozen    Value count = truncToI32(b, adaptor.getCount());
17649a77SGuray Ozen    if (isMbarrierShared(mbarrierType)) {
192d3320SGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(
192d3320SGuray Ozen          op, barrier, count, adaptor.getPredicate());
affcfccdSGuray Ozen    } else {
63389326SGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count,
192d3320SGuray Ozen                                                        adaptor.getPredicate());
affcfccdSGuray Ozen    }
affcfccdSGuray Ozen    return success();
affcfccdSGuray Ozen  }
affcfccdSGuray Ozen};
affcfccdSGuray Ozen
affcfccdSGuray Ozen/// Lowers `nvgpu.mbarrier.arrive` to `nvvm.mbarrier.arrive`
affcfccdSGuray Ozenstruct NVGPUMBarrierArriveLowering
17649a77SGuray Ozen    : public MBarrierBasePattern<nvgpu::MBarrierArriveOp> {
17649a77SGuray Ozen  using MBarrierBasePattern<nvgpu::MBarrierArriveOp>::MBarrierBasePattern;
affcfccdSGuray Ozen  LogicalResult
affcfccdSGuray Ozen  matchAndRewrite(nvgpu::MBarrierArriveOp op, OpAdaptor adaptor,
affcfccdSGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
17649a77SGuray Ozen    Value barrier =
ee49cda7SGuray Ozen        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
17649a77SGuray Ozen                       adaptor.getMbarId(), rewriter);
affcfccdSGuray Ozen    Type tokenType = getTypeConverter()->convertType(
affcfccdSGuray Ozen        nvgpu::MBarrierTokenType::get(op->getContext()));
17649a77SGuray Ozen    if (isMbarrierShared(op.getBarriers().getType())) {
affcfccdSGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType,
affcfccdSGuray Ozen                                                                barrier);
affcfccdSGuray Ozen    } else {
affcfccdSGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType,
affcfccdSGuray Ozen                                                          barrier);
affcfccdSGuray Ozen    }
affcfccdSGuray Ozen    return success();
affcfccdSGuray Ozen  }
affcfccdSGuray Ozen};
affcfccdSGuray Ozen
affcfccdSGuray Ozen/// Lowers `nvgpu.mbarrier.arrive.nocomplete` to
affcfccdSGuray Ozen/// `nvvm.mbarrier.arrive.nocomplete`
affcfccdSGuray Ozenstruct NVGPUMBarrierArriveNoCompleteLowering
17649a77SGuray Ozen    : public MBarrierBasePattern<nvgpu::MBarrierArriveNoCompleteOp> {
17649a77SGuray Ozen  using MBarrierBasePattern<
17649a77SGuray Ozen      nvgpu::MBarrierArriveNoCompleteOp>::MBarrierBasePattern;
affcfccdSGuray Ozen  LogicalResult
affcfccdSGuray Ozen  matchAndRewrite(nvgpu::MBarrierArriveNoCompleteOp op, OpAdaptor adaptor,
affcfccdSGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
17649a77SGuray Ozen    Value barrier =
ee49cda7SGuray Ozen        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
17649a77SGuray Ozen                       adaptor.getMbarId(), rewriter);
affcfccdSGuray Ozen    Type tokenType = getTypeConverter()->convertType(
affcfccdSGuray Ozen        nvgpu::MBarrierTokenType::get(op->getContext()));
ee49cda7SGuray Ozen    Value count = truncToI32(b, adaptor.getCount());
17649a77SGuray Ozen    if (isMbarrierShared(op.getBarriers().getType())) {
affcfccdSGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>(
affcfccdSGuray Ozen          op, tokenType, barrier, count);
affcfccdSGuray Ozen    } else {
affcfccdSGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>(
affcfccdSGuray Ozen          op, tokenType, barrier, count);
affcfccdSGuray Ozen    }
affcfccdSGuray Ozen    return success();
affcfccdSGuray Ozen  }
affcfccdSGuray Ozen};
affcfccdSGuray Ozen
affcfccdSGuray Ozen/// Lowers `nvgpu.mbarrier.test.wait` to `nvvm.mbarrier.test.wait`
affcfccdSGuray Ozenstruct NVGPUMBarrierTestWaitLowering
17649a77SGuray Ozen    : public MBarrierBasePattern<nvgpu::MBarrierTestWaitOp> {
17649a77SGuray Ozen  using MBarrierBasePattern<nvgpu::MBarrierTestWaitOp>::MBarrierBasePattern;
affcfccdSGuray Ozen  LogicalResult
affcfccdSGuray Ozen  matchAndRewrite(nvgpu::MBarrierTestWaitOp op, OpAdaptor adaptor,
affcfccdSGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
17649a77SGuray Ozen    Value barrier =
ee49cda7SGuray Ozen        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
17649a77SGuray Ozen                       adaptor.getMbarId(), rewriter);
affcfccdSGuray Ozen    Type retType = rewriter.getI1Type();
17649a77SGuray Ozen    if (isMbarrierShared(op.getBarriers().getType())) {
affcfccdSGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>(
affcfccdSGuray Ozen          op, retType, barrier, adaptor.getToken());
affcfccdSGuray Ozen    } else {
affcfccdSGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>(
affcfccdSGuray Ozen          op, retType, barrier, adaptor.getToken());
affcfccdSGuray Ozen    }
affcfccdSGuray Ozen    return success();
affcfccdSGuray Ozen  }
affcfccdSGuray Ozen};
affcfccdSGuray Ozen
836dbb85SGuray Ozenstruct NVGPUMBarrierArriveExpectTxLowering
17649a77SGuray Ozen    : public MBarrierBasePattern<nvgpu::MBarrierArriveExpectTxOp> {
17649a77SGuray Ozen  using MBarrierBasePattern<
17649a77SGuray Ozen      nvgpu::MBarrierArriveExpectTxOp>::MBarrierBasePattern;
836dbb85SGuray Ozen  LogicalResult
836dbb85SGuray Ozen  matchAndRewrite(nvgpu::MBarrierArriveExpectTxOp op, OpAdaptor adaptor,
836dbb85SGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
17649a77SGuray Ozen    Value barrier =
ee49cda7SGuray Ozen        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
17649a77SGuray Ozen                       adaptor.getMbarId(), rewriter);
ee49cda7SGuray Ozen    Value txcount = truncToI32(b, adaptor.getTxcount());
836dbb85SGuray Ozen
17649a77SGuray Ozen    if (isMbarrierShared(op.getBarriers().getType())) {
836dbb85SGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxSharedOp>(
192d3320SGuray Ozen          op, barrier, txcount, adaptor.getPredicate());
836dbb85SGuray Ozen      return success();
836dbb85SGuray Ozen    }
836dbb85SGuray Ozen
63389326SGuray Ozen    rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxOp>(
192d3320SGuray Ozen        op, barrier, txcount, adaptor.getPredicate());
836dbb85SGuray Ozen    return success();
836dbb85SGuray Ozen  }
836dbb85SGuray Ozen};
836dbb85SGuray Ozen
836dbb85SGuray Ozenstruct NVGPUMBarrierTryWaitParityLowering
17649a77SGuray Ozen    : public MBarrierBasePattern<nvgpu::MBarrierTryWaitParityOp> {
17649a77SGuray Ozen  using MBarrierBasePattern<
17649a77SGuray Ozen      nvgpu::MBarrierTryWaitParityOp>::MBarrierBasePattern;
836dbb85SGuray Ozen  LogicalResult
836dbb85SGuray Ozen  matchAndRewrite(nvgpu::MBarrierTryWaitParityOp op, OpAdaptor adaptor,
836dbb85SGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
17649a77SGuray Ozen    Value barrier =
ee49cda7SGuray Ozen        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
17649a77SGuray Ozen                       adaptor.getMbarId(), rewriter);
ee49cda7SGuray Ozen    Value ticks = truncToI32(b, adaptor.getTicks());
0a600c34SGuray Ozen    Value phase =
0a600c34SGuray Ozen        b.create<LLVM::ZExtOp>(b.getI32Type(), adaptor.getPhaseParity());
836dbb85SGuray Ozen
17649a77SGuray Ozen    if (isMbarrierShared(op.getBarriers().getType())) {
836dbb85SGuray Ozen      rewriter.replaceOpWithNewOp<NVVM::MBarrierTryWaitParitySharedOp>(
836dbb85SGuray Ozen          op, barrier, phase, ticks);
836dbb85SGuray Ozen      return success();
836dbb85SGuray Ozen    }
836dbb85SGuray Ozen
836dbb85SGuray Ozen    rewriter.replaceOpWithNewOp<NVVM::MBarrierTryWaitParityOp>(op, barrier,
836dbb85SGuray Ozen                                                               phase, ticks);
836dbb85SGuray Ozen    return success();
836dbb85SGuray Ozen  }
836dbb85SGuray Ozen};
836dbb85SGuray Ozen
70c2e061SGuray Ozenstruct NVGPUTmaAsyncLoadOpLowering
17649a77SGuray Ozen    : public MBarrierBasePattern<nvgpu::TmaAsyncLoadOp> {
17649a77SGuray Ozen  using MBarrierBasePattern<nvgpu::TmaAsyncLoadOp>::MBarrierBasePattern;
70c2e061SGuray Ozen  LogicalResult
70c2e061SGuray Ozen  matchAndRewrite(nvgpu::TmaAsyncLoadOp op, OpAdaptor adaptor,
70c2e061SGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
50a76a7dSGuray Ozen    auto srcMemrefType = cast<MemRefType>(op.getDst().getType());
50a76a7dSGuray Ozen    Value dest = getStridedElementPtr(op->getLoc(), srcMemrefType,
50a76a7dSGuray Ozen                                      adaptor.getDst(), {}, rewriter);
17649a77SGuray Ozen    Value barrier =
ee49cda7SGuray Ozen        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
17649a77SGuray Ozen                       adaptor.getMbarId(), rewriter);
70c2e061SGuray Ozen
70c2e061SGuray Ozen    SmallVector<Value> coords = adaptor.getCoordinates();
70c2e061SGuray Ozen    for (auto [index, value] : llvm::enumerate(coords)) {
ee49cda7SGuray Ozen      coords[index] = truncToI32(b, value);
70c2e061SGuray Ozen    }
70c2e061SGuray Ozen    rewriter.replaceOpWithNewOp<NVVM::CpAsyncBulkTensorGlobalToSharedClusterOp>(
9ceea088SGuray Ozen        op, dest, adaptor.getTensorMapDescriptor(), coords, barrier,
4319e191SGuray Ozen        ValueRange{}, adaptor.getMulticastMask(), Value{},
4319e191SGuray Ozen        adaptor.getPredicate());
70c2e061SGuray Ozen    return success();
70c2e061SGuray Ozen  }
70c2e061SGuray Ozen};
8dd0d95cSGuray Ozen
8dd0d95cSGuray Ozenstruct NVGPUTmaAsyncStoreOpLowering
8dd0d95cSGuray Ozen    : public MBarrierBasePattern<nvgpu::TmaAsyncStoreOp> {
8dd0d95cSGuray Ozen  using MBarrierBasePattern<nvgpu::TmaAsyncStoreOp>::MBarrierBasePattern;
8dd0d95cSGuray Ozen  LogicalResult
8dd0d95cSGuray Ozen  matchAndRewrite(nvgpu::TmaAsyncStoreOp op, OpAdaptor adaptor,
8dd0d95cSGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
8dd0d95cSGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
8dd0d95cSGuray Ozen    auto srcMemrefType = cast<MemRefType>(op.getSrc().getType());
8dd0d95cSGuray Ozen    Value dest = getStridedElementPtr(op->getLoc(), srcMemrefType,
8dd0d95cSGuray Ozen                                      adaptor.getSrc(), {}, rewriter);
8dd0d95cSGuray Ozen    SmallVector<Value> coords = adaptor.getCoordinates();
8dd0d95cSGuray Ozen    for (auto [index, value] : llvm::enumerate(coords)) {
8dd0d95cSGuray Ozen      coords[index] = truncToI32(b, value);
8dd0d95cSGuray Ozen    }
8dd0d95cSGuray Ozen
8dd0d95cSGuray Ozen    rewriter.replaceOpWithNewOp<NVVM::CpAsyncBulkTensorSharedCTAToGlobalOp>(
8dd0d95cSGuray Ozen        op, adaptor.getTensorMapDescriptor(), dest, coords,
8dd0d95cSGuray Ozen        adaptor.getPredicate());
8dd0d95cSGuray Ozen    return success();
8dd0d95cSGuray Ozen  }
8dd0d95cSGuray Ozen};
8dd0d95cSGuray Ozen
6dc7717bSGuray Ozenstruct NVGPUGenerateWarpgroupDescriptorLowering
7eb2b99fSGuray Ozen    : public ConvertOpToLLVMPattern<nvgpu::WarpgroupGenerateDescriptorOp> {
cce3e8edSGuray Ozen  using ConvertOpToLLVMPattern<
7eb2b99fSGuray Ozen      nvgpu::WarpgroupGenerateDescriptorOp>::ConvertOpToLLVMPattern;
cce3e8edSGuray Ozen
cce3e8edSGuray Ozen  LogicalResult
7eb2b99fSGuray Ozen  matchAndRewrite(nvgpu::WarpgroupGenerateDescriptorOp op, OpAdaptor adaptor,
cce3e8edSGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
cce3e8edSGuray Ozen
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
cce3e8edSGuray Ozen
cce3e8edSGuray Ozen    nvgpu::TensorMapSwizzleKind swizzleKind =
cce3e8edSGuray Ozen        op.getTensorMap().getType().getSwizzle();
cce3e8edSGuray Ozen
cce3e8edSGuray Ozen    unsigned layout =
cce3e8edSGuray Ozen        (swizzleKind == nvgpu::TensorMapSwizzleKind::SWIZZLE_128B)  ? 128
cce3e8edSGuray Ozen        : (swizzleKind == nvgpu::TensorMapSwizzleKind::SWIZZLE_64B) ? 64
cce3e8edSGuray Ozen        : (swizzleKind == nvgpu::TensorMapSwizzleKind::SWIZZLE_32B) ? 32
cce3e8edSGuray Ozen                                                                    : 1;
cce3e8edSGuray Ozen    unsigned swizzle =
cce3e8edSGuray Ozen        (swizzleKind == nvgpu::TensorMapSwizzleKind::SWIZZLE_128B)  ? 1
cce3e8edSGuray Ozen        : (swizzleKind == nvgpu::TensorMapSwizzleKind::SWIZZLE_64B) ? 2
cce3e8edSGuray Ozen        : (swizzleKind == nvgpu::TensorMapSwizzleKind::SWIZZLE_32B) ? 3
cce3e8edSGuray Ozen                                                                    : 0;
cce3e8edSGuray Ozen
ee49cda7SGuray Ozen    auto ti64 = b.getIntegerType(64);
cce3e8edSGuray Ozen    auto makeConst = [&](uint64_t index) -> Value {
ee49cda7SGuray Ozen      return b.create<LLVM::ConstantOp>(ti64, b.getI64IntegerAttr(index));
cce3e8edSGuray Ozen    };
cce3e8edSGuray Ozen    auto shiftLeft = [&](Value value, unsigned shift) -> Value {
ee49cda7SGuray Ozen      return b.create<LLVM::ShlOp>(ti64, value, makeConst(shift));
cce3e8edSGuray Ozen    };
cce3e8edSGuray Ozen    auto shiftRight = [&](Value value, unsigned shift) -> Value {
ee49cda7SGuray Ozen      return b.create<LLVM::LShrOp>(ti64, value, makeConst(shift));
cce3e8edSGuray Ozen    };
cce3e8edSGuray Ozen    auto insertBit = [&](Value desc, Value val, int startBit) {
ee49cda7SGuray Ozen      return b.create<LLVM::OrOp>(ti64, desc, shiftLeft(val, startBit));
cce3e8edSGuray Ozen    };
cce3e8edSGuray Ozen
cce3e8edSGuray Ozen    int64_t sizeN = op.getTensorMap().getType().getTensor().getDimSize(0);
23882226SGuray Ozen    uint64_t strideDimVal = (layout << 3) >> exclude4LSB;
23882226SGuray Ozen    uint64_t leadDimVal = (sizeN * layout) >> exclude4LSB;
b96d0693SGuray Ozen    uint64_t offsetVal = 0;
b96d0693SGuray Ozen
b96d0693SGuray Ozen    Value strideDim = makeConst(strideDimVal);
b96d0693SGuray Ozen    Value leadDim = makeConst(leadDimVal);
b96d0693SGuray Ozen
cce3e8edSGuray Ozen    Value baseAddr = getStridedElementPtr(
cce3e8edSGuray Ozen        op->getLoc(), cast<MemRefType>(op.getTensor().getType()),
cce3e8edSGuray Ozen        adaptor.getTensor(), {}, rewriter);
ee49cda7SGuray Ozen    Value basePtr = b.create<LLVM::PtrToIntOp>(ti64, baseAddr);
cce3e8edSGuray Ozen    // Just use 14 bits for base address
cce3e8edSGuray Ozen    Value basePtr14bit = shiftRight(shiftLeft(basePtr, 46), 50);
cce3e8edSGuray Ozen
cce3e8edSGuray Ozen    int startSwizzleBit = 62, startOffsetBit = 49, startStrideBit = 32,
cce3e8edSGuray Ozen        startLeadBit = 16, startBaseAddrBit = 0;
cce3e8edSGuray Ozen    Value dsc = makeConst(0);
cce3e8edSGuray Ozen    // // [62,64)  swizzle type
cce3e8edSGuray Ozen    dsc = insertBit(dsc, makeConst(swizzle), startSwizzleBit);
cce3e8edSGuray Ozen    // // [49,52)  base_offset
b96d0693SGuray Ozen    dsc = insertBit(dsc, makeConst(offsetVal), startOffsetBit);
cce3e8edSGuray Ozen    // // [32,46)  stride
cce3e8edSGuray Ozen    dsc = insertBit(dsc, strideDim, startStrideBit);
cce3e8edSGuray Ozen    // // [16,30)  leading dimension
cce3e8edSGuray Ozen    dsc = insertBit(dsc, leadDim, startLeadBit);
cce3e8edSGuray Ozen    // // [0,14)   start_address
cce3e8edSGuray Ozen    dsc = insertBit(dsc, basePtr14bit, startBaseAddrBit);
cce3e8edSGuray Ozen
6dc7717bSGuray Ozen    LLVM_DEBUG(DBGS() << "Generating warpgroup.descriptor: "
b96d0693SGuray Ozen                      << "leading_off:" << leadDimVal << "\t"
b96d0693SGuray Ozen                      << "stride_off :" << strideDimVal << "\t"
b96d0693SGuray Ozen                      << "base_offset:" << offsetVal << "\t"
b96d0693SGuray Ozen                      << "layout_type:" << swizzle << " ("
b96d0693SGuray Ozen                      << nvgpu::stringifyTensorMapSwizzleKind(swizzleKind)
b96d0693SGuray Ozen                      << ")\n start_addr :  " << baseAddr << "\n");
b96d0693SGuray Ozen
cce3e8edSGuray Ozen    rewriter.replaceOp(op, dsc);
cce3e8edSGuray Ozen    return success();
cce3e8edSGuray Ozen  }
cce3e8edSGuray Ozen};
e56d6745SGuray Ozen
ee49cda7SGuray Ozenstatic Value makeI64Const(ImplicitLocOpBuilder &b, int32_t index) {
ee49cda7SGuray Ozen  return b.create<LLVM::ConstantOp>(b.getIntegerType(64),
ee49cda7SGuray Ozen                                    b.getI32IntegerAttr(index));
e56d6745SGuray Ozen}
e56d6745SGuray Ozen
e56d6745SGuray Ozen/// Returns a Value that holds data type enum that is expected by CUDA driver.
ee49cda7SGuray Ozenstatic Value elementTypeAsLLVMConstant(ImplicitLocOpBuilder &b, Type type) {
e56d6745SGuray Ozen  // Enum is from CUDA driver API
e56d6745SGuray Ozen  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
e56d6745SGuray Ozen  enum CUtensorMapDataTypeEnum {
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_UINT16,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_UINT32,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_INT32,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_UINT64,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_INT64,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_FLOAT16,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_FLOAT32,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_FLOAT64,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,
e56d6745SGuray Ozen    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ
e56d6745SGuray Ozen  };
e56d6745SGuray Ozen
e56d6745SGuray Ozen  if (type.isUnsignedInteger(8))
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_UINT8);
e56d6745SGuray Ozen  if (type.isUnsignedInteger(16))
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_UINT16);
e56d6745SGuray Ozen  if (type.isUnsignedInteger(32))
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_UINT32);
e56d6745SGuray Ozen  if (type.isUnsignedInteger(64))
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_UINT64);
e56d6745SGuray Ozen  if (type.isSignlessInteger(32))
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_INT32);
e56d6745SGuray Ozen  if (type.isSignlessInteger(64))
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_INT64);
e56d6745SGuray Ozen  if (type.isF16())
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_FLOAT16);
e56d6745SGuray Ozen  if (type.isF32())
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_FLOAT32);
e56d6745SGuray Ozen  if (type.isF64())
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_FLOAT64);
e56d6745SGuray Ozen  if (type.isBF16())
ee49cda7SGuray Ozen    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16);
e56d6745SGuray Ozen
e56d6745SGuray Ozen  llvm_unreachable("Not supported data type");
e56d6745SGuray Ozen}
e56d6745SGuray Ozen
e56d6745SGuray Ozenstruct NVGPUTmaCreateDescriptorOpLowering
e56d6745SGuray Ozen    : public ConvertOpToLLVMPattern<nvgpu::TmaCreateDescriptorOp> {
e56d6745SGuray Ozen  using ConvertOpToLLVMPattern<
e56d6745SGuray Ozen      nvgpu::TmaCreateDescriptorOp>::ConvertOpToLLVMPattern;
e56d6745SGuray Ozen  LogicalResult
e56d6745SGuray Ozen  matchAndRewrite(nvgpu::TmaCreateDescriptorOp op, OpAdaptor adaptor,
e56d6745SGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
ee49cda7SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
2f17c9f6SChristian Ulmann    auto llvmPointerType = LLVM::LLVMPointerType::get(op->getContext());
e56d6745SGuray Ozen    Type llvmInt64Type = IntegerType::get(op->getContext(), 64);
e56d6745SGuray Ozen
ee49cda7SGuray Ozen    Value tensorElementType =
ee49cda7SGuray Ozen        elementTypeAsLLVMConstant(b, op.getTensor().getType().getElementType());
e56d6745SGuray Ozen    auto promotedOperands = getTypeConverter()->promoteOperands(
ee49cda7SGuray Ozen        b.getLoc(), op->getOperands(), adaptor.getOperands(), b);
e56d6745SGuray Ozen
ee49cda7SGuray Ozen    Value boxArrayPtr = b.create<LLVM::AllocaOp>(llvmPointerType, llvmInt64Type,
ee49cda7SGuray Ozen                                                 makeI64Const(b, 5));
e56d6745SGuray Ozen    for (auto [index, value] : llvm::enumerate(adaptor.getBoxDimensions())) {
ee49cda7SGuray Ozen      Value gep = b.create<LLVM::GEPOp>(llvmPointerType, llvmPointerType,
ee49cda7SGuray Ozen                                        boxArrayPtr, makeI64Const(b, index));
ee49cda7SGuray Ozen      b.create<LLVM::StoreOp>(value, gep);
e56d6745SGuray Ozen    }
e56d6745SGuray Ozen
e56d6745SGuray Ozen    nvgpu::TensorMapDescriptorType desc = op.getTensorMap().getType();
e56d6745SGuray Ozen    // Set Arguments for the function call
e56d6745SGuray Ozen    SmallVector<Value> arguments;
e56d6745SGuray Ozen    arguments.push_back(promotedOperands[0]); // rank
e56d6745SGuray Ozen    arguments.push_back(promotedOperands[1]); // descriptor
e56d6745SGuray Ozen    arguments.push_back(tensorElementType);   // data type
e56d6745SGuray Ozen    arguments.push_back(
ee49cda7SGuray Ozen        makeI64Const(b, (int)desc.getInterleave()));              // interleave
ee49cda7SGuray Ozen    arguments.push_back(makeI64Const(b, (int)desc.getSwizzle())); // swizzle
ee49cda7SGuray Ozen    arguments.push_back(makeI64Const(b, (int)desc.getL2promo())); // l2promo
ee49cda7SGuray Ozen    arguments.push_back(makeI64Const(b, (int)desc.getOob()));     // oob
e56d6745SGuray Ozen    arguments.push_back(boxArrayPtr); // box dimensions
e56d6745SGuray Ozen
e56d6745SGuray Ozen    // Set data types of the arguments
e56d6745SGuray Ozen    SmallVector<Type> argTypes = {
e56d6745SGuray Ozen        llvmInt64Type,   /* int64_t tensorRank */
e56d6745SGuray Ozen        llvmPointerType, /* ptr */
e56d6745SGuray Ozen        llvmInt64Type,   /* int64_t */
e56d6745SGuray Ozen        llvmInt64Type,   /* int64_t */
e56d6745SGuray Ozen        llvmInt64Type,   /* int64_t */
e56d6745SGuray Ozen        llvmInt64Type,   /* int64_t */
e56d6745SGuray Ozen        llvmInt64Type,   /* int64_t */
e56d6745SGuray Ozen        llvmPointerType  /* ptr  */
e56d6745SGuray Ozen    };
e56d6745SGuray Ozen    FunctionCallBuilder hostRegisterCallBuilder = {
e56d6745SGuray Ozen        "mgpuTensorMapEncodeTiledMemref", llvmPointerType, argTypes};
e56d6745SGuray Ozen    Value tensorMap =
ee49cda7SGuray Ozen        hostRegisterCallBuilder.create(b.getLoc(), b, arguments).getResult();
e56d6745SGuray Ozen
e56d6745SGuray Ozen    rewriter.replaceOp(op, tensorMap);
e56d6745SGuray Ozen    return success();
e56d6745SGuray Ozen  }
e56d6745SGuray Ozen};
e56d6745SGuray Ozen
23882226SGuray Ozenstruct NVGPUWarpgroupMmaOpLowering
23882226SGuray Ozen    : public ConvertOpToLLVMPattern<nvgpu::WarpgroupMmaOp> {
23882226SGuray Ozen  using ConvertOpToLLVMPattern<nvgpu::WarpgroupMmaOp>::ConvertOpToLLVMPattern;
23882226SGuray Ozen
b74cfc13SGuray Ozen  /// This is a helper class to generate required NVVM Ops for warp-group level
b74cfc13SGuray Ozen  /// matrix multiplication.
b74cfc13SGuray Ozen  /// When the given GEMM shape is larger than the shape of
b74cfc13SGuray Ozen  /// a wgmma instrution in PTX, it can generate multiple NVVM::WgmmaMmaAsyncOp
b74cfc13SGuray Ozen  /// Op(s), group and execute them asynchronously. The class also handles
b74cfc13SGuray Ozen  /// waiting for completion and iterates through WarpgroupMatrixDescriptor to
b74cfc13SGuray Ozen  /// create descriptors for each instruction.
b74cfc13SGuray Ozen  ///
b74cfc13SGuray Ozen  /// For example this is the case when the shape of GEMM is 128x128x128
b74cfc13SGuray Ozen  ///
b74cfc13SGuray Ozen  ///    nvvm.wgmma.fence.aligned
b74cfc13SGuray Ozen  ///
b74cfc13SGuray Ozen  ///    nvvm.wgmma.mma.async descA, descB
b74cfc13SGuray Ozen  ///    iterate(descA, descB)
b74cfc13SGuray Ozen  ///    nvvm.wgmma.mma.async descA, descB
b74cfc13SGuray Ozen  ///    [6x times more]
b74cfc13SGuray Ozen  ///
b74cfc13SGuray Ozen  ///    nvvm.wgmma.group.sync.aligned
b74cfc13SGuray Ozen  ///    nvvm.wgmma.wait.group.sync [groupId]
b74cfc13SGuray Ozen  ///
b74cfc13SGuray Ozen  class WarpgroupGemm {
b74cfc13SGuray Ozen    nvgpu::WarpgroupMmaOp op;
b74cfc13SGuray Ozen    ImplicitLocOpBuilder b;
b74cfc13SGuray Ozen    OpAdaptor adaptor;
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen    // Entire shape of the given Op
b74cfc13SGuray Ozen    int64_t totalM, totalN, totalK;
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen    // Shape of one wgmma instruction
b74cfc13SGuray Ozen    int wgmmaM = 0, wgmmaN = 0, wgmmaK = 0;
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen    // Iteration counts for GEMM
b74cfc13SGuray Ozen    int iterationM = 0, iterationN = 0, iterationK = 0;
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen    /// The function returns the shape of wgmma instruction that is defined in
b74cfc13SGuray Ozen    /// PTX programming guide.
b74cfc13SGuray Ozen    /// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shape
b74cfc13SGuray Ozen    void findWgmmaShape(int64_t sizeM, int64_t sizeN, Type inputElemType) {
b74cfc13SGuray Ozen      wgmmaM = 64;
b74cfc13SGuray Ozen      wgmmaN = sizeN;
23882226SGuray Ozen      if (inputElemType.isTF32()) {
b74cfc13SGuray Ozen        wgmmaK = 8;
23882226SGuray Ozen      } else if (inputElemType.isF16() || inputElemType.isBF16()) {
b74cfc13SGuray Ozen        wgmmaK = 16;
*7a77f14cSMatthias Springer      } else if (isa<Float8E4M3FNType, Float8E5M2Type>(inputElemType) ||
*7a77f14cSMatthias Springer                 inputElemType.isInteger(16)) {
b74cfc13SGuray Ozen        wgmmaK = 32;
23882226SGuray Ozen      } else if (inputElemType.isInteger(1)) {
b74cfc13SGuray Ozen        wgmmaK = 256;
23882226SGuray Ozen      } else {
23882226SGuray Ozen        llvm_unreachable("msg: not supported K shape");
23882226SGuray Ozen      }
b74cfc13SGuray Ozen      LLVM_DEBUG(DBGS() << "Generating WgmmaMmaAsyncOp shape[m = " << wgmmaM
b74cfc13SGuray Ozen                        << ", n = " << wgmmaN << ", k = " << wgmmaK << "]\n");
23882226SGuray Ozen    }
23882226SGuray Ozen
b74cfc13SGuray Ozen    /// Generates WGMMATypesAttr from MLIR Type
12c241b3SGuray Ozen    NVVM::WGMMATypesAttr generateWgmmaType(Type type,
12c241b3SGuray Ozen                                           bool useF32 = false) const {
12c241b3SGuray Ozen      auto getWgmmaType = [=](Type elemType) {
b74cfc13SGuray Ozen        if (elemType.isF32() || elemType.isTF32())
12c241b3SGuray Ozen          return useF32 ? NVVM::WGMMATypes::f32 : NVVM::WGMMATypes::tf32;
b74cfc13SGuray Ozen        if (elemType.isF16())
b74cfc13SGuray Ozen          return NVVM::WGMMATypes::f16;
b74cfc13SGuray Ozen        if (elemType.isBF16())
b74cfc13SGuray Ozen          return NVVM::WGMMATypes::bf16;
*7a77f14cSMatthias Springer        if (isa<Float8E4M3FNType>(elemType))
b74cfc13SGuray Ozen          return NVVM::WGMMATypes::e4m3;
*7a77f14cSMatthias Springer        if (isa<Float8E5M2Type>(elemType))
b74cfc13SGuray Ozen          return NVVM::WGMMATypes::e5m2;
b74cfc13SGuray Ozen        if (elemType.isInteger(1))
b74cfc13SGuray Ozen          return NVVM::WGMMATypes::b1;
b74cfc13SGuray Ozen        if (elemType.isInteger(8))
b74cfc13SGuray Ozen          return NVVM::WGMMATypes::s8;
b74cfc13SGuray Ozen        if (elemType.isUnsignedInteger(8))
b74cfc13SGuray Ozen          return NVVM::WGMMATypes::u8;
12c241b3SGuray Ozen        if (elemType.isInteger(32))
12c241b3SGuray Ozen          return NVVM::WGMMATypes::s32;
b74cfc13SGuray Ozen        llvm_unreachable("unsupported type");
b74cfc13SGuray Ozen      };
b74cfc13SGuray Ozen      return NVVM::WGMMATypesAttr::get(op->getContext(), getWgmmaType(type));
23882226SGuray Ozen    }
23882226SGuray Ozen
b74cfc13SGuray Ozen    /// Generates layout attribute for the input matrix for wgmma instruction
b74cfc13SGuray Ozen    NVVM::MMALayoutAttr
b74cfc13SGuray Ozen    generateWgmmaLayout(std::optional<bool> transpose) const {
b74cfc13SGuray Ozen      if (transpose.value_or(false))
b74cfc13SGuray Ozen        return NVVM::MMALayoutAttr::get(op->getContext(), NVVM::MMALayout::col);
b74cfc13SGuray Ozen      return NVVM::MMALayoutAttr::get(op->getContext(), NVVM::MMALayout::row);
23882226SGuray Ozen    }
23882226SGuray Ozen
b74cfc13SGuray Ozen    /// Generates shape attribute for wgmma instruction
b74cfc13SGuray Ozen    NVVM::MMAShapeAttr generateWgmmaShape() const {
b74cfc13SGuray Ozen      return NVVM::MMAShapeAttr::get(op->getContext(), wgmmaM, wgmmaN, wgmmaK);
b74cfc13SGuray Ozen    }
23882226SGuray Ozen
b74cfc13SGuray Ozen    /// Generates scale attributes of output matrix for wgmma instruction
b74cfc13SGuray Ozen    NVVM::WGMMAScaleOutAttr generateScaleOut() const {
b74cfc13SGuray Ozen      return NVVM::WGMMAScaleOutAttr::get(op->getContext(),
b74cfc13SGuray Ozen                                          NVVM::WGMMAScaleOut::one);
b74cfc13SGuray Ozen    }
b74cfc13SGuray Ozen    /// Generates scale attributes of input matrix for wgmma instruction
b74cfc13SGuray Ozen    NVVM::WGMMAScaleInAttr generateScaleIn() const {
b74cfc13SGuray Ozen      return NVVM::WGMMAScaleInAttr::get(op->getContext(),
b74cfc13SGuray Ozen                                         NVVM::WGMMAScaleIn::one);
b74cfc13SGuray Ozen    }
23882226SGuray Ozen
b74cfc13SGuray Ozen    /// Basic function to generate Add
b74cfc13SGuray Ozen    Value makeAdd(Value lhs, Value rhs) {
ee49cda7SGuray Ozen      return b.create<LLVM::AddOp>(lhs.getType(), lhs, rhs);
23882226SGuray Ozen    };
23882226SGuray Ozen
b74cfc13SGuray Ozen    /// Moves the descriptor pointer of matrix-A for the next wgmma instruction.
b74cfc13SGuray Ozen    /// Currently, it only handles row-major.
b74cfc13SGuray Ozen    ///
b74cfc13SGuray Ozen    /// It moves the pointer like below for [128][64] size:
b74cfc13SGuray Ozen    ///                 +2 +4 +6
b74cfc13SGuray Ozen    ///                  ↓  ↓  ↓
b74cfc13SGuray Ozen    /// descA    ---> +--+--+--+--+
b74cfc13SGuray Ozen    ///               |->|->|->|->|
b74cfc13SGuray Ozen    ///               |  |  |  |  |
b74cfc13SGuray Ozen    ///               |  |  |  |  |
b74cfc13SGuray Ozen    ///               |  |  |  |  |
b74cfc13SGuray Ozen    /// descA+512---> +-----------+
b74cfc13SGuray Ozen    ///               |  |  |  |  |
b74cfc13SGuray Ozen    ///               |  |  |  |  |
b74cfc13SGuray Ozen    ///               |  |  |  |  |
b74cfc13SGuray Ozen    ///               |  |  |  |  |
b74cfc13SGuray Ozen    ///               +-----------+
b74cfc13SGuray Ozen    ///
b74cfc13SGuray Ozen    Value iterateDescriptorA(Value desc, int i, int j, int k) {
b74cfc13SGuray Ozen      MemRefType matrixTypeA = op.getDescriptorA().getType().getTensor();
b74cfc13SGuray Ozen      Type elemA = matrixTypeA.getElementType();
b74cfc13SGuray Ozen      int byte = elemA.getIntOrFloatBitWidth() / 8;
b74cfc13SGuray Ozen      int tileShapeA = matrixTypeA.getDimSize(1);
b74cfc13SGuray Ozen      int incrementVal = ((wgmmaK * k) + (totalK * tileShapeA * i)) * byte;
23882226SGuray Ozen      incrementVal = incrementVal >> exclude4LSB;
b74cfc13SGuray Ozen      LLVM_DEBUG(DBGS() << "\t\t[m: " << i << " n: " << j << " k: " << k
b74cfc13SGuray Ozen                        << "] [wgmma descriptors] Descriptor A + "
23882226SGuray Ozen                        << incrementVal << " | \t ");
23882226SGuray Ozen      if (!incrementVal)
23882226SGuray Ozen        return desc;
ee49cda7SGuray Ozen      return makeAdd(desc, makeI64Const(b, incrementVal));
b74cfc13SGuray Ozen    }
23882226SGuray Ozen
b74cfc13SGuray Ozen    /// Moves the descriptor pointer of matrix-B for the next wgmma instruction.
b74cfc13SGuray Ozen    /// Currently, it only handles column-major.
b74cfc13SGuray Ozen    ///
b74cfc13SGuray Ozen    /// It moves the pointer like below for [128][64] size:
b74cfc13SGuray Ozen    /// descB     ---> +--+--+--+--+--+--+--+--+
b74cfc13SGuray Ozen    ///                |↓ |  |  |  |  |  |  |  |
b74cfc13SGuray Ozen    ///                |↓ |  |  |  |  |  |  |  |
b74cfc13SGuray Ozen    ///                |↓ |  |  |  |  |  |  |  |
b74cfc13SGuray Ozen    ///                |↓ |  |  |  |  |  |  |  |
b74cfc13SGuray Ozen    ///                +--+--+--+--+--+--+--+--+
b74cfc13SGuray Ozen    ///
b74cfc13SGuray Ozen    Value iterateDescriptorB(Value desc, int i, int j, int k) {
b74cfc13SGuray Ozen      MemRefType matrixTypeB = op.getDescriptorB().getType().getTensor();
b74cfc13SGuray Ozen      Type elemB = matrixTypeB.getElementType();
b74cfc13SGuray Ozen      int byte = elemB.getIntOrFloatBitWidth() / 8;
b74cfc13SGuray Ozen      int incrementVal = matrixTypeB.getDimSize(0) * wgmmaK * k * byte;
23882226SGuray Ozen      incrementVal = incrementVal >> exclude4LSB;
23882226SGuray Ozen      LLVM_DEBUG(DBGSE() << "Descriptor B + " << incrementVal << "\n");
23882226SGuray Ozen      if (!incrementVal)
23882226SGuray Ozen        return desc;
ee49cda7SGuray Ozen      return makeAdd(desc, makeI64Const(b, incrementVal));
23882226SGuray Ozen    }
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen    /// This function generates a WgmmaMmaAsyncOp using provided GMMA matrix
b74cfc13SGuray Ozen    /// descriptors and arranges them based on induction variables: i, j, and k.
52db7e27SGuray Ozen    Value generateWgmma(int i, int j, int k, Value matrixC) {
b74cfc13SGuray Ozen      LLVM_DEBUG(DBGS() << "\t wgmma."
b74cfc13SGuray Ozen                        << "m" << wgmmaM << "n" << wgmmaN << "k" << wgmmaK
b74cfc13SGuray Ozen                        << "(A[" << (iterationM * wgmmaM) << ":"
b74cfc13SGuray Ozen                        << (iterationM * wgmmaM) + wgmmaM << "]["
b74cfc13SGuray Ozen                        << (iterationK * wgmmaK) << ":"
b74cfc13SGuray Ozen                        << (iterationK * wgmmaK + wgmmaK) << "] * "
b74cfc13SGuray Ozen                        << " B[" << (iterationK * wgmmaK) << ":"
b74cfc13SGuray Ozen                        << (iterationK * wgmmaK + wgmmaK) << "][" << 0 << ":"
b74cfc13SGuray Ozen                        << wgmmaN << "])\n");
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen      Value descriptorA = iterateDescriptorA(adaptor.getDescriptorA(), i, j, k);
b74cfc13SGuray Ozen      Value descriptorB = iterateDescriptorB(adaptor.getDescriptorB(), i, j, k);
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen      Type elemA = op.getDescriptorA().getType().getTensor().getElementType();
b74cfc13SGuray Ozen      NVVM::WGMMATypesAttr itypeA = generateWgmmaType(elemA);
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen      Type elemB = op.getDescriptorB().getType().getTensor().getElementType();
b74cfc13SGuray Ozen      NVVM::WGMMATypesAttr itypeB = generateWgmmaType(elemB);
b74cfc13SGuray Ozen
12c241b3SGuray Ozen      Type elemD = op.getMatrixC().getType().getFragmented().getElementType();
12c241b3SGuray Ozen      NVVM::WGMMATypesAttr itypeD = generateWgmmaType(elemD, true);
12c241b3SGuray Ozen
b74cfc13SGuray Ozen      NVVM::MMAShapeAttr shape = generateWgmmaShape();
b74cfc13SGuray Ozen      NVVM::WGMMAScaleOutAttr scaleOut = generateScaleOut();
b74cfc13SGuray Ozen      NVVM::WGMMAScaleInAttr scaleIn = generateScaleIn();
b74cfc13SGuray Ozen      NVVM::MMALayoutAttr layoutA = generateWgmmaLayout(op.getTransposeA());
fa13c3eeSGuray Ozen      NVVM::MMALayoutAttr layoutB = generateWgmmaLayout(!op.getTransposeB());
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen      auto overflow = NVVM::MMAIntOverflowAttr::get(
b74cfc13SGuray Ozen          op->getContext(), NVVM::MMAIntOverflow::wrapped);
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen      return b.create<NVVM::WgmmaMmaAsyncOp>(
52db7e27SGuray Ozen          matrixC.getType(), matrixC, descriptorA, descriptorB, shape, itypeA,
12c241b3SGuray Ozen          itypeB, itypeD, scaleOut, scaleIn, scaleIn, layoutA, layoutB,
12c241b3SGuray Ozen          overflow);
b74cfc13SGuray Ozen    }
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen    /// Generates multiple wgmma instructions to complete the given GEMM shape
52db7e27SGuray Ozen    Value generateWgmmaGroup() {
52db7e27SGuray Ozen      Value wgmmaResult =
52db7e27SGuray Ozen          b.create<LLVM::UndefOp>(adaptor.getMatrixC().getType());
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen      // Perform GEMM
52db7e27SGuray Ozen      SmallVector<Value> wgmmaResults;
b74cfc13SGuray Ozen      for (int i = 0; i < iterationM; ++i) {
52db7e27SGuray Ozen        Value matrixC = b.create<LLVM::ExtractValueOp>(adaptor.getMatrixC(), i);
b74cfc13SGuray Ozen        for (int j = 0; j < iterationN; ++j)
b74cfc13SGuray Ozen          for (int k = 0; k < iterationK; ++k)
52db7e27SGuray Ozen            matrixC = generateWgmma(i, j, k, matrixC);
23882226SGuray Ozen        wgmmaResults.push_back(matrixC);
23882226SGuray Ozen      }
52db7e27SGuray Ozen      for (auto [idx, matrix] : llvm::enumerate(wgmmaResults)) {
52db7e27SGuray Ozen        wgmmaResult = b.create<LLVM::InsertValueOp>(wgmmaResult.getType(),
52db7e27SGuray Ozen                                                    wgmmaResult, matrix, idx);
52db7e27SGuray Ozen      }
52db7e27SGuray Ozen      return wgmmaResult;
b74cfc13SGuray Ozen    }
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen  public:
b74cfc13SGuray Ozen    WarpgroupGemm(nvgpu::WarpgroupMmaOp op, ImplicitLocOpBuilder &b,
52db7e27SGuray Ozen                  OpAdaptor adaptor)
52db7e27SGuray Ozen        : op(op), b(b), adaptor(adaptor) {
b74cfc13SGuray Ozen      // Find the entire GEMM Shape
b74cfc13SGuray Ozen      totalM = op.getDescriptorA().getType().getTensor().getDimSize(0);
b74cfc13SGuray Ozen      totalN = op.getDescriptorB().getType().getTensor().getDimSize(1);
b74cfc13SGuray Ozen      totalK = op.getDescriptorA().getType().getTensor().getDimSize(1);
b74cfc13SGuray Ozen      LLVM_DEBUG(DBGS() << "===--- GEMM D[" << totalM << "][" << totalN
b74cfc13SGuray Ozen                        << "] += A[" << totalM << "][" << totalK << "] * B["
b74cfc13SGuray Ozen                        << totalK << "][" << totalN << "] ---===\n");
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen      // Find the shape for one wgmma instruction
b74cfc13SGuray Ozen      findWgmmaShape(
b74cfc13SGuray Ozen          totalM, totalN,
b74cfc13SGuray Ozen          op.getDescriptorA().getType().getTensor().getElementType());
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen      // Iterations counts to complete the given shape with wgmma shape
b74cfc13SGuray Ozen      iterationM = totalM / wgmmaM;
b74cfc13SGuray Ozen      iterationN = totalN / wgmmaN;
b74cfc13SGuray Ozen      iterationK = totalK / wgmmaK;
b74cfc13SGuray Ozen    }
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen    /// Generates WgmmaMmaAsync Ops to complete the specified GEMM  shape. It
b74cfc13SGuray Ozen    /// includes generating a fence Op (WgmmaFenceAlignedOp) before the
b74cfc13SGuray Ozen    /// instructions and group synchronization, as well as waiting
b74cfc13SGuray Ozen    /// (WgmmaGroupSyncAlignedOp) for group synchronization
b74cfc13SGuray Ozen    /// (WgmmaWaitGroupSyncOp) after the instructions.
52db7e27SGuray Ozen    Value generateWarpgroupMma() {
b74cfc13SGuray Ozen      b.create<NVVM::WgmmaFenceAlignedOp>();
52db7e27SGuray Ozen      Value wgmmaResult = generateWgmmaGroup();
ee49cda7SGuray Ozen      b.create<NVVM::WgmmaGroupSyncAlignedOp>();
ee49cda7SGuray Ozen      b.create<NVVM::WgmmaWaitGroupSyncOp>(op.getWaitGroup());
52db7e27SGuray Ozen      return wgmmaResult;
b74cfc13SGuray Ozen    }
b74cfc13SGuray Ozen  };
b74cfc13SGuray Ozen  LogicalResult
b74cfc13SGuray Ozen  matchAndRewrite(nvgpu::WarpgroupMmaOp op, OpAdaptor adaptor,
b74cfc13SGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
b74cfc13SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
52db7e27SGuray Ozen
b74cfc13SGuray Ozen    // Step 1. Build a helper class
52db7e27SGuray Ozen    WarpgroupGemm warpgroupGemm(op, b, adaptor);
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen    // Step 2. Get the entire GEMM Shape
52db7e27SGuray Ozen    Value wgmmaResult = warpgroupGemm.generateWarpgroupMma();
b74cfc13SGuray Ozen
b74cfc13SGuray Ozen    // Step 3. Replace fragmented result struct with the op results
52db7e27SGuray Ozen    rewriter.replaceOp(op, wgmmaResult);
23882226SGuray Ozen    return success();
23882226SGuray Ozen  }
23882226SGuray Ozen};
23882226SGuray Ozen
d20fbc90SGuray Ozenstruct NVGPUWarpgroupMmaStoreOpLowering
d20fbc90SGuray Ozen    : public ConvertOpToLLVMPattern<nvgpu::WarpgroupMmaStoreOp> {
d20fbc90SGuray Ozen  using ConvertOpToLLVMPattern<
d20fbc90SGuray Ozen      nvgpu::WarpgroupMmaStoreOp>::ConvertOpToLLVMPattern;
d20fbc90SGuray Ozen
d20fbc90SGuray Ozen  /// This function stores a fragmented register matrix owned by a warp group
d20fbc90SGuray Ozen  /// (128 threads) into a memref. Each thread has 64 registers, each the size
d20fbc90SGuray Ozen  /// of a struct.
  /// Here is what each thread (T) holds; each `d` is a struct value with a
  /// number.
  ///
  /// Threads in a warp-group (128 threads) and what they own in matrixD:
  /// 0-31    Warp-0  -> MatrixD[0:15 ][0:N]
  /// 32-63   Warp-1  -> MatrixD[16:31][0:N]
  /// 64-95   Warp-2  -> MatrixD[32:47][0:N]
  /// 96-127  Warp-3  -> MatrixD[48:63][0:N]
d20fbc90SGuray Ozen  ///
d20fbc90SGuray Ozen  /// Matrix-D:
d20fbc90SGuray Ozen  ///   +______________________________________________________________________+
d20fbc90SGuray Ozen  ///   |     0-1  |    2-3  |    4-5  |    6-7  |   8-9  |   10-11|..|N-8,N-7 |
d20fbc90SGuray Ozen  /// 0 | T0:d0-d1 |T1:d0-d1 |T2:d0-d1 |T3:d0-d1 |T0:d4-d5| T1:d4-d5..|T0:dX-dY|
d20fbc90SGuray Ozen  /// 1 | T4:d0-d1 |T5:d0-d1 |T6:d0-d1 |T7:d0-d1 |T4:d4-d5| T5:d4-d5..|T4:dX-dY|
d20fbc90SGuray Ozen  /// ..| .........|.........|.........|.........|........|...........|........|
d20fbc90SGuray Ozen  /// 8 | T0:d2-d3 |T1:d2-d3 |T2:d2-d3 |T3:d2-d3 |T0:d6-d7|T1:d6-d7,..|T0:dZ-dW|
d20fbc90SGuray Ozen  /// 9 | T4:d2-d3 |T5:d2-d3 |T6:d2-d3 |T7:d2-d3 |T4:d6-d7| T5:d6-d7..|T4:dZ-dW|
d20fbc90SGuray Ozen  /// ..| .........|.........|.........|.........|........|...........|........|
d20fbc90SGuray Ozen  /// 15| T28:d2-d3|T29:d2-d3|T30:d2-d3|T31:d2-d3|........|...........|........|
d20fbc90SGuray Ozen  /// 16| T32:d2-d3|T33:d2-d3|T34:d2-d3|T35:d2-d3|........|...........|........|
d20fbc90SGuray Ozen  /// ..| .........|.........|.........|.........|........|...........|........|
d20fbc90SGuray Ozen  /// 32| T64:d2-d3|T65:d2-d3|T66:d2-d3|T67:d2-d3|........|...........|........|
d20fbc90SGuray Ozen  /// ..| .........|.........|.........|.........|........|...........|........|
d20fbc90SGuray Ozen  /// 48| T96:d2-d3|T97:d2-d3|T98:d2-d3|T99:d2-d3|........|...........|........|
d20fbc90SGuray Ozen  /// ..| .........|.........|.........|.........|........|...........|........|
d20fbc90SGuray Ozen  ///   +______________________________________________________________________+
d20fbc90SGuray Ozen  ///
d20fbc90SGuray Ozen  /// \param rewriter: The pattern rewriter.
d20fbc90SGuray Ozen  /// \param matrixD: Result of the warp-group MMA operation (fragmented
d20fbc90SGuray Ozen  /// matrix). It is holded by a thread and a struct with 64 elements.
d20fbc90SGuray Ozen  /// \param dstMemref: The memref where the registers will be stored.
d20fbc90SGuray Ozen  /// \param offset: the offset within the memref where the registers will be
d20fbc90SGuray Ozen  /// stored.
d20fbc90SGuray Ozen  void storeFragmentedMatrix(ImplicitLocOpBuilder &b, Value matrixD,
d20fbc90SGuray Ozen                             TypedValue<MemRefType> dstMemref,
d20fbc90SGuray Ozen                             int offset) const {
d20fbc90SGuray Ozen    Type i32 = b.getI32Type();
d20fbc90SGuray Ozen
d20fbc90SGuray Ozen    auto makeConst = [&](int32_t index) -> Value {
d20fbc90SGuray Ozen      return b.create<LLVM::ConstantOp>(i32, b.getI32IntegerAttr(index));
d20fbc90SGuray Ozen    };
d20fbc90SGuray Ozen    Value c1 = makeConst(1);
d20fbc90SGuray Ozen    Value c2 = makeConst(2);
d20fbc90SGuray Ozen    Value c4 = makeConst(4);
d20fbc90SGuray Ozen    Value c8 = makeConst(8);
d20fbc90SGuray Ozen    Value c16 = makeConst(16);
d20fbc90SGuray Ozen    Value warpSize = makeConst(kWarpSize);
d20fbc90SGuray Ozen
d20fbc90SGuray Ozen    auto makeMul = [&](Value lhs, Value rhs) -> Value {
d20fbc90SGuray Ozen      return b.create<LLVM::MulOp>(lhs.getType(), lhs, rhs);
d20fbc90SGuray Ozen    };
d20fbc90SGuray Ozen    auto makeAdd = [&](Value lhs, Value rhs) -> Value {
d20fbc90SGuray Ozen      return b.create<LLVM::AddOp>(lhs.getType(), lhs, rhs);
d20fbc90SGuray Ozen    };
d20fbc90SGuray Ozen
d20fbc90SGuray Ozen    auto makeExtractAndStore = [&](int i, Value wgmmaResult, Value x, Value y,
d20fbc90SGuray Ozen                                   TypedValue<::mlir::MemRefType> memref) {
d20fbc90SGuray Ozen      Type it = b.getIndexType();
d20fbc90SGuray Ozen      Value idx = b.create<arith::IndexCastOp>(it, x);
d20fbc90SGuray Ozen      Value idy0 = b.create<arith::IndexCastOp>(it, y);
d20fbc90SGuray Ozen      Value idy1 = b.create<arith::IndexCastOp>(it, makeAdd(y, c1));
d20fbc90SGuray Ozen      Value d0 = b.create<LLVM::ExtractValueOp>(wgmmaResult, i);
d20fbc90SGuray Ozen      Value d1 = b.create<LLVM::ExtractValueOp>(wgmmaResult, i + 1);
d20fbc90SGuray Ozen      b.create<memref::StoreOp>(d0, memref, ValueRange{idx, idy0});
d20fbc90SGuray Ozen      b.create<memref::StoreOp>(d1, memref, ValueRange{idx, idy1});
d20fbc90SGuray Ozen    };
d20fbc90SGuray Ozen
21830c91SGuray Ozen    Value tidx = b.create<NVVM::ThreadIdXOp>(i32);
21830c91SGuray Ozen    Value laneId = b.create<LLVM::URemOp>(i32, tidx, warpSize);
21830c91SGuray Ozen    Value warpId = b.create<LLVM::UDivOp>(i32, tidx, warpSize);
21830c91SGuray Ozen    Value lane4Id = b.create<LLVM::UDivOp>(i32, laneId, c4);
21830c91SGuray Ozen    Value lane4modId = b.create<LLVM::URemOp>(i32, laneId, c4);
21830c91SGuray Ozen
d20fbc90SGuray Ozen    Value tj = makeMul(lane4modId, c2);
d20fbc90SGuray Ozen    Value ti = makeAdd(lane4Id, makeMul(warpId, c16));
d20fbc90SGuray Ozen    if (offset)
d20fbc90SGuray Ozen      ti = makeAdd(ti, makeConst(offset));
21830c91SGuray Ozen
a5757c5bSChristian Sigg    auto structType = cast<LLVM::LLVMStructType>(matrixD.getType());
21830c91SGuray Ozen
21830c91SGuray Ozen    // Number of 32-bit registers owns per thread
21830c91SGuray Ozen    constexpr unsigned numAdjacentRegisters = 2;
21830c91SGuray Ozen    // Number of 8x8 matrices one below another per warp
21830c91SGuray Ozen    constexpr unsigned numStackedMatrices = 2;
21830c91SGuray Ozen
21830c91SGuray Ozen    size_t storeCount = (structType.getBody().size() /
21830c91SGuray Ozen                         (numStackedMatrices * numAdjacentRegisters));
21830c91SGuray Ozen
21830c91SGuray Ozen    for (size_t i = 0; i < numStackedMatrices; ++i) {
d20fbc90SGuray Ozen      Value idx = makeAdd(ti, makeMul(makeConst(i), c8));
21830c91SGuray Ozen      for (size_t j = 0; j < storeCount; ++j) {
d20fbc90SGuray Ozen        Value idy = makeAdd(tj, makeMul(makeConst(j), c8));
21830c91SGuray Ozen        size_t structIndex = (i * numAdjacentRegisters) +
21830c91SGuray Ozen                             (j * (numStackedMatrices * numAdjacentRegisters));
21830c91SGuray Ozen        makeExtractAndStore(structIndex, matrixD, idx, idy, dstMemref);
d20fbc90SGuray Ozen      }
d20fbc90SGuray Ozen    }
d20fbc90SGuray Ozen  }
d20fbc90SGuray Ozen
d20fbc90SGuray Ozen  LogicalResult
d20fbc90SGuray Ozen  matchAndRewrite(nvgpu::WarpgroupMmaStoreOp op, OpAdaptor adaptor,
d20fbc90SGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
d20fbc90SGuray Ozen    int offset = 0;
52db7e27SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
52db7e27SGuray Ozen    Value matriDValue = adaptor.getMatrixD();
a5757c5bSChristian Sigg    auto stype = cast<LLVM::LLVMStructType>(matriDValue.getType());
52db7e27SGuray Ozen    for (auto [idx, matrixD] : llvm::enumerate(stype.getBody())) {
a5757c5bSChristian Sigg      auto structType = cast<LLVM::LLVMStructType>(matrixD);
52db7e27SGuray Ozen      Value innerStructValue = b.create<LLVM::ExtractValueOp>(matriDValue, idx);
52db7e27SGuray Ozen      storeFragmentedMatrix(b, innerStructValue, op.getDstMemref(), offset);
d20fbc90SGuray Ozen      offset += structType.getBody().size();
d20fbc90SGuray Ozen    }
d20fbc90SGuray Ozen    rewriter.eraseOp(op);
d20fbc90SGuray Ozen    return success();
d20fbc90SGuray Ozen  }
d20fbc90SGuray Ozen};
d20fbc90SGuray Ozen
315ab3c4SGuray Ozenstruct NVGPUWarpgroupMmaInitAccumulatorOpLowering
315ab3c4SGuray Ozen    : public ConvertOpToLLVMPattern<nvgpu::WarpgroupMmaInitAccumulatorOp> {
315ab3c4SGuray Ozen  using ConvertOpToLLVMPattern<
315ab3c4SGuray Ozen      nvgpu::WarpgroupMmaInitAccumulatorOp>::ConvertOpToLLVMPattern;
315ab3c4SGuray Ozen  LogicalResult
315ab3c4SGuray Ozen  matchAndRewrite(nvgpu::WarpgroupMmaInitAccumulatorOp op, OpAdaptor adaptor,
315ab3c4SGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
315ab3c4SGuray Ozen    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
a5757c5bSChristian Sigg    LLVM::LLVMStructType packStructType = cast<LLVM::LLVMStructType>(
a5757c5bSChristian Sigg        getTypeConverter()->convertType(op.getMatrixC().getType()));
a5757c5bSChristian Sigg    Type elemType = cast<LLVM::LLVMStructType>(packStructType.getBody().front())
52db7e27SGuray Ozen                        .getBody()
52db7e27SGuray Ozen                        .front();
52db7e27SGuray Ozen    Value zero = b.create<LLVM::ConstantOp>(elemType, b.getZeroAttr(elemType));
c4ba84d6SGuray Ozen    Value packStruct = b.create<LLVM::UndefOp>(packStructType);
c4ba84d6SGuray Ozen    SmallVector<Value> innerStructs;
c4ba84d6SGuray Ozen    // Unpack the structs and set all values to zero
c4ba84d6SGuray Ozen    for (auto [idx, s] : llvm::enumerate(packStructType.getBody())) {
a5757c5bSChristian Sigg      auto structType = cast<LLVM::LLVMStructType>(s);
c4ba84d6SGuray Ozen      Value structValue = b.create<LLVM::ExtractValueOp>(packStruct, idx);
c4ba84d6SGuray Ozen      for (unsigned i = 0; i < structType.getBody().size(); ++i) {
c4ba84d6SGuray Ozen        structValue = b.create<LLVM::InsertValueOp>(
c4ba84d6SGuray Ozen            structType, structValue, zero, ArrayRef<int64_t>({i}));
315ab3c4SGuray Ozen      }
c4ba84d6SGuray Ozen      innerStructs.push_back(structValue);
315ab3c4SGuray Ozen    }
c4ba84d6SGuray Ozen    // Pack the inner structs into a single struct
c4ba84d6SGuray Ozen    for (auto [idx, matrix] : llvm::enumerate(innerStructs)) {
c4ba84d6SGuray Ozen      packStruct = b.create<LLVM::InsertValueOp>(packStruct.getType(),
c4ba84d6SGuray Ozen                                                 packStruct, matrix, idx);
c4ba84d6SGuray Ozen    }
c4ba84d6SGuray Ozen    rewriter.replaceOp(op, packStruct);
315ab3c4SGuray Ozen    return success();
315ab3c4SGuray Ozen  }
315ab3c4SGuray Ozen};
315ab3c4SGuray Ozen
39cdefb5SGuray Ozenstruct NVGPUTmaPrefetchOpLowering
39cdefb5SGuray Ozen    : public ConvertOpToLLVMPattern<nvgpu::TmaPrefetchOp> {
39cdefb5SGuray Ozen  using ConvertOpToLLVMPattern<nvgpu::TmaPrefetchOp>::ConvertOpToLLVMPattern;
39cdefb5SGuray Ozen  LogicalResult
39cdefb5SGuray Ozen  matchAndRewrite(nvgpu::TmaPrefetchOp op, OpAdaptor adaptor,
39cdefb5SGuray Ozen                  ConversionPatternRewriter &rewriter) const override {
39cdefb5SGuray Ozen    rewriter.replaceOpWithNewOp<NVVM::PrefetchTensorMapOp>(
39cdefb5SGuray Ozen        op, adaptor.getTensorMapDescriptor(), adaptor.getPredicate());
39cdefb5SGuray Ozen    return success();
39cdefb5SGuray Ozen  }
39cdefb5SGuray Ozen};
39cdefb5SGuray Ozen
2b23e6c8SObserver007struct NVGPURcpOpLowering : public ConvertOpToLLVMPattern<nvgpu::RcpOp> {
2b23e6c8SObserver007  using ConvertOpToLLVMPattern<nvgpu::RcpOp>::ConvertOpToLLVMPattern;
2b23e6c8SObserver007  LogicalResult
2b23e6c8SObserver007  matchAndRewrite(nvgpu::RcpOp op, OpAdaptor adaptor,
2b23e6c8SObserver007                  ConversionPatternRewriter &rewriter) const override {
2b23e6c8SObserver007    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
2b23e6c8SObserver007    auto i64Ty = b.getI64Type();
2b23e6c8SObserver007    auto f32Ty = b.getF32Type();
2b23e6c8SObserver007    VectorType inTy = op.getIn().getType();
2b23e6c8SObserver007    // apply rcp.approx.ftz.f on each element in vector.
2b23e6c8SObserver007    auto convert1DVec = [&](Type llvm1DVectorTy, Value inVec) {
2b23e6c8SObserver007      Value ret1DVec = b.create<LLVM::UndefOp>(llvm1DVectorTy);
2b23e6c8SObserver007      int numElems = llvm::cast<VectorType>(llvm1DVectorTy).getNumElements();
2b23e6c8SObserver007      for (int i = 0; i < numElems; i++) {
2b23e6c8SObserver007        Value idx = b.create<LLVM::ConstantOp>(i64Ty, b.getI64IntegerAttr(i));
2b23e6c8SObserver007        Value elem = b.create<LLVM::ExtractElementOp>(inVec, idx);
2b23e6c8SObserver007        Value dst = b.create<NVVM::RcpApproxFtzF32Op>(f32Ty, elem);
2b23e6c8SObserver007        ret1DVec = b.create<LLVM::InsertElementOp>(ret1DVec, dst, idx);
2b23e6c8SObserver007      }
2b23e6c8SObserver007      return ret1DVec;
2b23e6c8SObserver007    };
2b23e6c8SObserver007    if (inTy.getRank() == 1) {
2b23e6c8SObserver007      rewriter.replaceOp(op, convert1DVec(inTy, adaptor.getIn()));
2b23e6c8SObserver007      return success();
2b23e6c8SObserver007    }
2b23e6c8SObserver007    return LLVM::detail::handleMultidimensionalVectors(
2b23e6c8SObserver007        op.getOperation(), adaptor.getOperands(), *(this->getTypeConverter()),
2b23e6c8SObserver007        [&](Type llvm1DVectorTy, ValueRange operands) -> Value {
2b23e6c8SObserver007          OpAdaptor adaptor(operands);
2b23e6c8SObserver007          return convert1DVec(llvm1DVectorTy, adaptor.getIn());
2b23e6c8SObserver007        },
2b23e6c8SObserver007        rewriter);
2b23e6c8SObserver007  }
2b23e6c8SObserver007};
894a591cSThomas Raoux} // namespace
15bcc36eSThomas Raoux
206fad0eSMatthias Springervoid mlir::populateNVGPUToNVVMConversionPatterns(
206fad0eSMatthias Springer    const LLVMTypeConverter &converter, RewritePatternSet &patterns) {
affcfccdSGuray Ozen  patterns.add<
affcfccdSGuray Ozen      NVGPUMBarrierCreateLowering,           // nvgpu.mbarrier.create
affcfccdSGuray Ozen      NVGPUMBarrierInitLowering,             // nvgpu.mbarrier.init
affcfccdSGuray Ozen      NVGPUMBarrierArriveLowering,           // nvgpu.mbarrier.arrive
affcfccdSGuray Ozen      NVGPUMBarrierArriveNoCompleteLowering, // nvgpu.mbarrier.arrive.no_complete
836dbb85SGuray Ozen      NVGPUMBarrierTestWaitLowering,         // nvgpu.mbarrier.test_wait_parity
836dbb85SGuray Ozen      NVGPUMBarrierTryWaitParityLowering,    // nvgpu.mbarrier.try_wait_parity
e56d6745SGuray Ozen      NVGPUTmaAsyncLoadOpLowering,           // nvgpu.tma.async.load
8dd0d95cSGuray Ozen      NVGPUTmaAsyncStoreOpLowering,          // nvgpu.tma.async.store
e56d6745SGuray Ozen      NVGPUTmaCreateDescriptorOpLowering,    // nvgpu.tma.create.descriptor
39cdefb5SGuray Ozen      NVGPUTmaPrefetchOpLowering,            // nvgpu.tma.prefetch.descriptor
836dbb85SGuray Ozen      NVGPUMBarrierArriveExpectTxLowering,   // nvgpu.mbarrier.arrive.expect_tx
6dc7717bSGuray Ozen      NVGPUGenerateWarpgroupDescriptorLowering, // nvgpu.warpgroup.generate.descriptor
23882226SGuray Ozen      NVGPUWarpgroupMmaOpLowering,              // nvgpu.warpgroup.mma
d20fbc90SGuray Ozen      NVGPUWarpgroupMmaStoreOpLowering,         // nvgpu.warpgroup.mma.store
315ab3c4SGuray Ozen      NVGPUWarpgroupMmaInitAccumulatorOpLowering, // nvgpu.warpgroup.mma.init.accumulator
affcfccdSGuray Ozen      MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
708185f0SChristopher Bate      NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
2b23e6c8SObserver007      NVGPUMmaSparseSyncLowering, NVGPURcpOpLowering>(converter);
894a591cSThomas Raoux}