xref: /llvm-project/mlir/lib/Target/LLVM/NVVM/Target.cpp (revision 6a7d6c5f69dda254ec92f982985fd10fa51c63ef)
1 //===- Target.cpp - MLIR LLVM NVVM target compilation -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This files defines NVVM target related functions including registration
10 // calls for the `#nvvm.target` compilation attribute.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "mlir/Target/LLVM/NVVM/Target.h"
15 
16 #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
17 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
18 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
19 #include "mlir/IR/BuiltinAttributeInterfaces.h"
20 #include "mlir/IR/BuiltinDialect.h"
21 #include "mlir/IR/BuiltinTypes.h"
22 #include "mlir/IR/DialectResourceBlobManager.h"
23 #include "mlir/Target/LLVM/NVVM/Utils.h"
24 #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
25 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
26 #include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
27 #include "mlir/Target/LLVMIR/Export.h"
28 
29 #include "llvm/Config/llvm-config.h"
30 #include "llvm/Support/FileSystem.h"
31 #include "llvm/Support/FileUtilities.h"
32 #include "llvm/Support/FormatVariadic.h"
33 #include "llvm/Support/MemoryBuffer.h"
34 #include "llvm/Support/Path.h"
35 #include "llvm/Support/Process.h"
36 #include "llvm/Support/Program.h"
37 #include "llvm/Support/TargetSelect.h"
38 #include "llvm/Support/raw_ostream.h"
39 
40 #include <cstdint>
41 #include <cstdlib>
42 
43 using namespace mlir;
44 using namespace mlir::NVVM;
45 
46 #ifndef __DEFAULT_CUDATOOLKIT_PATH__
47 #define __DEFAULT_CUDATOOLKIT_PATH__ ""
48 #endif
49 
50 extern "C" const char _mlir_embedded_libdevice[];
51 extern "C" const unsigned _mlir_embedded_libdevice_size;
52 
namespace {
// Implementation of the `TargetAttrInterface` model.
class NVVMTargetAttrImpl
    : public gpu::TargetAttrInterface::FallbackModel<NVVMTargetAttrImpl> {
public:
  /// Serializes `module` (which must be a `gpu.module`) to an object blob;
  /// the format (LLVM IR, PTX, cubin or fatbin) is picked from `options`.
  /// Returns std::nullopt on failure.
  std::optional<SmallVector<char, 0>>
  serializeToObject(Attribute attribute, Operation *module,
                    const gpu::TargetOptions &options) const;

  /// Wraps a serialized `object` into a `#gpu.object` attribute, attaching
  /// format-dependent properties (opt level, ELF section).
  Attribute createObject(Attribute attribute, Operation *module,
                         const SmallVector<char, 0> &object,
                         const gpu::TargetOptions &options) const;
};
} // namespace
67 
68 // Register the NVVM dialect, the NVVM translation & the target interface.
69 void mlir::NVVM::registerNVVMTargetInterfaceExternalModels(
70     DialectRegistry &registry) {
71   registry.addExtension(+[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) {
72     NVVMTargetAttr::attachInterface<NVVMTargetAttrImpl>(*ctx);
73   });
74 }
75 
76 void mlir::NVVM::registerNVVMTargetInterfaceExternalModels(
77     MLIRContext &context) {
78   DialectRegistry registry;
79   registerNVVMTargetInterfaceExternalModels(registry);
80   context.appendDialectRegistry(registry);
81 }
82 
83 // Search for the CUDA toolkit path.
84 StringRef mlir::NVVM::getCUDAToolkitPath() {
85   if (const char *var = std::getenv("CUDA_ROOT"))
86     return var;
87   if (const char *var = std::getenv("CUDA_HOME"))
88     return var;
89   if (const char *var = std::getenv("CUDA_PATH"))
90     return var;
91   return __DEFAULT_CUDATOOLKIT_PATH__;
92 }
93 
// Constructs the base serializer: forwards the target's triple/chip/features,
// opt level, and the LLVM IR/ISA callbacks from `targetOptions` to
// `ModuleToObject`, then resolves the toolkit path and the list of bitcode
// libraries to link against.
SerializeGPUModuleBase::SerializeGPUModuleBase(
    Operation &module, NVVMTargetAttr target,
    const gpu::TargetOptions &targetOptions)
    : ModuleToObject(module, target.getTriple(), target.getChip(),
                     target.getFeatures(), target.getO(),
                     targetOptions.getInitialLlvmIRCallback(),
                     targetOptions.getLinkedLlvmIRCallback(),
                     targetOptions.getOptimizedLlvmIRCallback(),
                     targetOptions.getISACallback()),
      target(target), toolkitPath(targetOptions.getToolkitPath()),
      librariesToLink(targetOptions.getLibrariesToLink()) {

  // If `targetOptions` have an empty toolkitPath use `getCUDAToolkitPath`
  // (environment variables or the compile-time default).
  if (toolkitPath.empty())
    toolkitPath = getCUDAToolkitPath();

  // Append the files in the target attribute.
  if (target.getLink())
    librariesToLink.append(target.getLink().begin(), target.getLink().end());

  // Append libdevice to the files to be loaded. A failure is deliberately
  // ignored here; `appendStandardLibs` already emits a diagnostic on error.
  (void)appendStandardLibs();
}
117 
// Initializes the NVPTX LLVM backend exactly once per process so a target
// machine can later be created for PTX emission.
void SerializeGPUModuleBase::init() {
  static llvm::once_flag initializeBackendOnce;
  llvm::call_once(initializeBackendOnce, []() {
  // If the `NVPTX` LLVM target was built, initialize it.
#if LLVM_HAS_NVPTX_TARGET
    LLVMInitializeNVPTXTarget();
    LLVMInitializeNVPTXTargetInfo();
    LLVMInitializeNVPTXTargetMC();
    LLVMInitializeNVPTXAsmPrinter();
#endif
  });
}
130 
131 NVVMTargetAttr SerializeGPUModuleBase::getTarget() const { return target; }
132 
133 StringRef SerializeGPUModuleBase::getToolkitPath() const { return toolkitPath; }
134 
/// Returns the libraries to link: path `StringAttr`s and/or embedded-blob
/// `DenseResourceElementsAttr`s (see `appendStandardLibs`).
ArrayRef<Attribute> SerializeGPUModuleBase::getLibrariesToLink() const {
  return librariesToLink;
}
138 
// Try to append `libdevice` from a CUDA toolkit installation.
LogicalResult SerializeGPUModuleBase::appendStandardLibs() {
#if MLIR_NVVM_EMBED_LIBDEVICE
  // If libdevice is embedded in the binary, we don't look it up on the
  // filesystem.
  MLIRContext *ctx = target.getContext();
  // Expose the embedded bytes as a 1-D i8 tensor sized to the blob.
  auto type =
      RankedTensorType::get(ArrayRef<int64_t>{_mlir_embedded_libdevice_size},
                            IntegerType::get(ctx, 8));
  auto resourceManager = DenseResourceElementsHandle::getManagerInterface(ctx);

  // Lookup if we already loaded the resource, otherwise create it.
  DialectResourceBlobManager::BlobEntry *blob =
      resourceManager.getBlobManager().lookup("_mlir_embedded_libdevice");
  if (blob) {
    librariesToLink.push_back(DenseResourceElementsAttr::get(
        type, DenseResourceElementsHandle(
                  blob, ctx->getLoadedDialect<BuiltinDialect>())));
    return success();
  }

  // Allocate a resource using one of the UnManagedResourceBlob method to wrap
  // the embedded data; the blob manager does not take ownership of the
  // storage, which lives in the binary itself.
  auto unmanagedBlob = UnmanagedAsmResourceBlob::allocateInferAlign(
      ArrayRef<char>{_mlir_embedded_libdevice, _mlir_embedded_libdevice_size});
  librariesToLink.push_back(DenseResourceElementsAttr::get(
      type, resourceManager.insert("_mlir_embedded_libdevice",
                                   std::move(unmanagedBlob))));
#else
  StringRef pathRef = getToolkitPath();
  if (!pathRef.empty()) {
    // Copy the toolkit path into a mutable buffer so components can be
    // appended below.
    SmallVector<char, 256> path;
    path.insert(path.begin(), pathRef.begin(), pathRef.end());
    pathRef = StringRef(path.data(), path.size());
    if (!llvm::sys::fs::is_directory(pathRef)) {
      getOperation().emitError() << "CUDA path: " << pathRef
                                 << " does not exist or is not a directory.\n";
      return failure();
    }
    // Standard toolkit layout: <toolkit>/nvvm/libdevice/libdevice.10.bc.
    llvm::sys::path::append(path, "nvvm", "libdevice", "libdevice.10.bc");
    pathRef = StringRef(path.data(), path.size());
    if (!llvm::sys::fs::is_regular_file(pathRef)) {
      getOperation().emitError() << "LibDevice path: " << pathRef
                                 << " does not exist or is not a file.\n";
      return failure();
    }
    librariesToLink.push_back(StringAttr::get(target.getContext(), pathRef));
  }
#endif
  return success();
}
190 
191 std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
192 SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
193   SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
194   if (failed(loadBitcodeFilesFromList(module.getContext(), librariesToLink,
195                                       bcFiles, true)))
196     return std::nullopt;
197   return std::move(bcFiles);
198 }
199 
namespace {
/// Serializer that lowers a GPU module to PTX through the NVPTX backend and
/// can further assemble the PTX into a cubin or fatbin with CUDA tooling.
class NVPTXSerializer : public SerializeGPUModuleBase {
public:
  NVPTXSerializer(Operation &module, NVVMTargetAttr target,
                  const gpu::TargetOptions &targetOptions);

  /// Returns the GPU module op being serialized.
  gpu::GPUModuleOp getOperation();

  /// Compiles PTX to cubin using `ptxas`.
  std::optional<SmallVector<char, 0>>
  compileToBinary(const std::string &ptxCode);

  /// Compiles PTX to cubin using the `nvptxcompiler` library.
  std::optional<SmallVector<char, 0>>
  compileToBinaryNVPTX(const std::string &ptxCode);

  /// Serializes the LLVM module to an object format, depending on the
  /// compilation target selected in target options.
  std::optional<SmallVector<char, 0>>
  moduleToObject(llvm::Module &llvmModule) override;

private:
  /// A temporary file path paired with a remover that deletes the file when
  /// the pair goes out of scope.
  using TmpFile = std::pair<llvm::SmallString<128>, llvm::FileRemover>;

  /// Creates a temp file.
  std::optional<TmpFile> createTemp(StringRef name, StringRef suffix);

  /// Finds the `tool` path, where `tool` is the name of the binary to search,
  /// i.e. `ptxas` or `fatbinary`. The search order is:
  /// 1. The toolkit path in `targetOptions`.
  /// 2. In the system PATH.
  /// 3. The path from `getCUDAToolkitPath()`.
  std::optional<std::string> findTool(StringRef tool);

  /// Target options.
  gpu::TargetOptions targetOptions;
};
} // namespace
239 
// Forwards construction to the base serializer and keeps a copy of the target
// options for the compilation helpers (`compileToBinary*`, `findTool`).
NVPTXSerializer::NVPTXSerializer(Operation &module, NVVMTargetAttr target,
                                 const gpu::TargetOptions &targetOptions)
    : SerializeGPUModuleBase(module, target, targetOptions),
      targetOptions(targetOptions) {}
244 
245 std::optional<NVPTXSerializer::TmpFile>
246 NVPTXSerializer::createTemp(StringRef name, StringRef suffix) {
247   llvm::SmallString<128> filename;
248   std::error_code ec =
249       llvm::sys::fs::createTemporaryFile(name, suffix, filename);
250   if (ec) {
251     getOperation().emitError() << "Couldn't create the temp file: `" << filename
252                                << "`, error message: " << ec.message();
253     return std::nullopt;
254   }
255   return TmpFile(filename, llvm::FileRemover(filename.c_str()));
256 }
257 
// Returns the operation being serialized as a `gpu.module`; the dyn_cast
// yields null if it is any other op (serializeToObject checks this upfront).
gpu::GPUModuleOp NVPTXSerializer::getOperation() {
  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
}
261 
262 std::optional<std::string> NVPTXSerializer::findTool(StringRef tool) {
263   // Find the `tool` path.
264   // 1. Check the toolkit path given in the command line.
265   StringRef pathRef = targetOptions.getToolkitPath();
266   SmallVector<char, 256> path;
267   if (!pathRef.empty()) {
268     path.insert(path.begin(), pathRef.begin(), pathRef.end());
269     llvm::sys::path::append(path, "bin", tool);
270     if (llvm::sys::fs::can_execute(path))
271       return StringRef(path.data(), path.size()).str();
272   }
273 
274   // 2. Check PATH.
275   if (std::optional<std::string> toolPath =
276           llvm::sys::Process::FindInEnvPath("PATH", tool))
277     return *toolPath;
278 
279   // 3. Check `getCUDAToolkitPath()`.
280   pathRef = getCUDAToolkitPath();
281   path.clear();
282   if (!pathRef.empty()) {
283     path.insert(path.begin(), pathRef.begin(), pathRef.end());
284     llvm::sys::path::append(path, "bin", tool);
285     if (llvm::sys::fs::can_execute(path))
286       return StringRef(path.data(), path.size()).str();
287   }
288   getOperation().emitError()
289       << "Couldn't find the `" << tool
290       << "` binary. Please specify the toolkit "
291          "path, add the compiler to $PATH, or set one of the environment "
292          "variables in `NVVM::getCUDAToolkitPath()`.";
293   return std::nullopt;
294 }
295 
// TODO: clean this method & have a generic tool driver or never emit binaries
// with this mechanism and let another stage take care of it.
//
// Assembles `ptxCode` with `ptxas` (and, for fatbin targets, wraps cubin+PTX
// with `fatbinary`) through temp files, returning the resulting bytes or
// std::nullopt on any failure.
std::optional<SmallVector<char, 0>>
NVPTXSerializer::compileToBinary(const std::string &ptxCode) {
  // Determine if the serializer should create a fatbinary with the PTX embeded
  // or a simple CUBIN binary.
  const bool createFatbin =
      targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin;

  // Find the `ptxas` & `fatbinary` tools.
  std::optional<std::string> ptxasCompiler = findTool("ptxas");
  if (!ptxasCompiler)
    return std::nullopt;
  std::optional<std::string> fatbinaryTool;
  if (createFatbin) {
    fatbinaryTool = findTool("fatbinary");
    if (!fatbinaryTool)
      return std::nullopt;
  }
  Location loc = getOperation().getLoc();

  // Base name for all temp files: mlir-<module name>-<target triple>-<chip>.
  std::string basename =
      llvm::formatv("mlir-{0}-{1}-{2}", getOperation().getNameAttr().getValue(),
                    getTarget().getTriple(), getTarget().getChip());

  // Create temp files:
  std::optional<TmpFile> ptxFile = createTemp(basename, "ptx");
  if (!ptxFile)
    return std::nullopt;
  std::optional<TmpFile> logFile = createTemp(basename, "log");
  if (!logFile)
    return std::nullopt;
  std::optional<TmpFile> binaryFile = createTemp(basename, "bin");
  if (!binaryFile)
    return std::nullopt;
  TmpFile cubinFile;
  if (createFatbin) {
    // Fatbin path: ptxas writes an intermediate `<ptx>.cubin` that fatbinary
    // embeds later; give it its own remover.
    Twine cubinFilename = ptxFile->first + ".cubin";
    cubinFile = TmpFile(cubinFilename.str(), llvm::FileRemover(cubinFilename));
  } else {
    // Plain binary path: ptxas writes straight into `binaryFile`; no second
    // remover is attached since `binaryFile` already owns deletion.
    cubinFile.first = binaryFile->first;
  }

  std::error_code ec;
  // Dump the PTX to a temp file.
  {
    llvm::raw_fd_ostream ptxStream(ptxFile->first, ec);
    if (ec) {
      emitError(loc) << "Couldn't open the file: `" << ptxFile->first
                     << "`, error message: " << ec.message();
      return std::nullopt;
    }
    ptxStream << ptxCode;
    if (ptxStream.has_error()) {
      emitError(loc) << "An error occurred while writing the PTX to: `"
                     << ptxFile->first << "`.";
      return std::nullopt;
    }
    ptxStream.flush();
  }

  // Command redirects: both stdout and stderr of the invoked tools go to the
  // log file so they can be surfaced in diagnostics below.
  std::optional<StringRef> redirects[] = {
      std::nullopt,
      logFile->first,
      logFile->first,
  };

  // Get any extra args passed in `targetOptions`.
  std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> cmdOpts =
      targetOptions.tokenizeCmdOptions();

  // Create ptxas args.
  std::string optLevel = std::to_string(this->optLevel);
  SmallVector<StringRef, 12> ptxasArgs(
      {StringRef("ptxas"), StringRef("-arch"), getTarget().getChip(),
       StringRef(ptxFile->first), StringRef("-o"), StringRef(cubinFile.first),
       "--opt-level", optLevel});

  bool useFatbin32 = false;
  for (const auto *cArg : cmdOpts.second) {
    // All `cmdOpts` are for `ptxas` except `-32` which passes `-32` to
    // `fatbinary`, indicating a 32-bit target. By default a 64-bit target is
    // assumed.
    if (StringRef arg(cArg); arg != "-32")
      ptxasArgs.push_back(arg);
    else
      useFatbin32 = true;
  }

  // Create the `fatbinary` args.
  StringRef chip = getTarget().getChip();
  // Remove the arch prefix to obtain the compute capability.
  chip.consume_front("sm_"), chip.consume_front("compute_");
  // Embed the cubin object.
  std::string cubinArg =
      llvm::formatv("--image3=kind=elf,sm={0},file={1}", chip, cubinFile.first)
          .str();
  // Embed the PTX file so the driver can JIT if needed.
  std::string ptxArg =
      llvm::formatv("--image3=kind=ptx,sm={0},file={1}", chip, ptxFile->first)
          .str();
  SmallVector<StringRef, 6> fatbinArgs({StringRef("fatbinary"),
                                        useFatbin32 ? "-32" : "-64", cubinArg,
                                        ptxArg, "--create", binaryFile->first});

  // Dump tool invocation commands.
#define DEBUG_TYPE "serialize-to-binary"
  LLVM_DEBUG({
    llvm::dbgs() << "Tool invocation for module: "
                 << getOperation().getNameAttr() << "\n";
    llvm::interleave(ptxasArgs, llvm::dbgs(), " ");
    llvm::dbgs() << "\n";
    if (createFatbin) {
      llvm::interleave(fatbinArgs, llvm::dbgs(), " ");
      llvm::dbgs() << "\n";
    }
  });
#undef DEBUG_TYPE

  // Helper function for printing tool error logs. Prefers the process-level
  // error `message` when present; otherwise dumps the tool's log file.
  std::string message;
  auto emitLogError =
      [&](StringRef toolName) -> std::optional<SmallVector<char, 0>> {
    if (message.empty()) {
      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> toolStderr =
          llvm::MemoryBuffer::getFile(logFile->first);
      if (toolStderr)
        emitError(loc) << toolName << " invocation failed. Log:\n"
                       << toolStderr->get()->getBuffer();
      else
        emitError(loc) << toolName << " invocation failed.";
      return std::nullopt;
    }
    emitError(loc) << toolName
                   << " invocation failed, error message: " << message;
    return std::nullopt;
  };

  // Invoke PTXAS.
  if (llvm::sys::ExecuteAndWait(ptxasCompiler.value(), ptxasArgs,
                                /*Env=*/std::nullopt,
                                /*Redirects=*/redirects,
                                /*SecondsToWait=*/0,
                                /*MemoryLimit=*/0,
                                /*ErrMsg=*/&message))
    return emitLogError("`ptxas`");
// Under the `dump-sass` debug flag, disassemble the cubin with `nvdisasm`.
#define DEBUG_TYPE "dump-sass"
  LLVM_DEBUG({
    // NOTE(review): `nvdisasm.value()` is dereferenced without checking the
    // optional; if `findTool` fails this debug-only path misbehaves — should
    // guard on `nvdisasm` first.
    std::optional<std::string> nvdisasm = findTool("nvdisasm");
    SmallVector<StringRef> nvdisasmArgs(
        {StringRef("nvdisasm"), StringRef(cubinFile.first)});
    if (llvm::sys::ExecuteAndWait(nvdisasm.value(), nvdisasmArgs,
                                  /*Env=*/std::nullopt,
                                  /*Redirects=*/redirects,
                                  /*SecondsToWait=*/0,
                                  /*MemoryLimit=*/0,
                                  /*ErrMsg=*/&message))
      return emitLogError("`nvdisasm`");
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> logBuffer =
        llvm::MemoryBuffer::getFile(logFile->first);
    if (logBuffer && !(*logBuffer)->getBuffer().empty()) {
      llvm::dbgs() << "Output:\n" << (*logBuffer)->getBuffer() << "\n";
      llvm::dbgs().flush();
    }
  });
#undef DEBUG_TYPE

  // Invoke `fatbin`.
  message.clear();
  if (createFatbin && llvm::sys::ExecuteAndWait(*fatbinaryTool, fatbinArgs,
                                                /*Env=*/std::nullopt,
                                                /*Redirects=*/redirects,
                                                /*SecondsToWait=*/0,
                                                /*MemoryLimit=*/0,
                                                /*ErrMsg=*/&message))
    return emitLogError("`fatbinary`");

// Dump the output of the tools, helpful if the verbose flag was passed.
#define DEBUG_TYPE "serialize-to-binary"
  LLVM_DEBUG({
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> logBuffer =
        llvm::MemoryBuffer::getFile(logFile->first);
    if (logBuffer && !(*logBuffer)->getBuffer().empty()) {
      llvm::dbgs() << "Output:\n" << (*logBuffer)->getBuffer() << "\n";
      llvm::dbgs().flush();
    }
  });
#undef DEBUG_TYPE

  // Read the fatbin.
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> binaryBuffer =
      llvm::MemoryBuffer::getFile(binaryFile->first);
  if (!binaryBuffer) {
    emitError(loc) << "Couldn't open the file: `" << binaryFile->first
                   << "`, error message: " << binaryBuffer.getError().message();
    return std::nullopt;
  }
  StringRef fatbin = (*binaryBuffer)->getBuffer();
  return SmallVector<char, 0>(fatbin.begin(), fatbin.end());
}
498 
499 #if MLIR_ENABLE_NVPTXCOMPILER
500 #include "nvPTXCompiler.h"
501 
502 #define RETURN_ON_NVPTXCOMPILER_ERROR(expr)                                    \
503   do {                                                                         \
504     if (auto status = (expr)) {                                                \
505       emitError(loc) << llvm::Twine(#expr).concat(" failed with error code ")  \
506                      << status;                                                \
507       return std::nullopt;                                                     \
508     }                                                                          \
509   } while (false)
510 
511 #include "nvFatbin.h"
512 
513 #define RETURN_ON_NVFATBIN_ERROR(expr)                                         \
514   do {                                                                         \
515     auto result = (expr);                                                      \
516     if (result != nvFatbinResult::NVFATBIN_SUCCESS) {                          \
517       emitError(loc) << llvm::Twine(#expr).concat(" failed with error: ")      \
518                      << nvFatbinGetErrorString(result);                        \
519       return std::nullopt;                                                     \
520     }                                                                          \
521   } while (false)
522 
// Compiles `ptxCode` in-process with the nvPTXCompiler library (and, for
// fatbin targets, the nvFatbin library) instead of shelling out to `ptxas`.
// Returns the binary/fatbin bytes, or std::nullopt on failure (via the
// RETURN_ON_* macros, which also emit a diagnostic).
std::optional<SmallVector<char, 0>>
NVPTXSerializer::compileToBinaryNVPTX(const std::string &ptxCode) {
  Location loc = getOperation().getLoc();
  nvPTXCompilerHandle compiler = nullptr;
  nvPTXCompileResult status;
  size_t logSize;

  // Create the options.
  std::string optLevel = std::to_string(this->optLevel);
  std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> cmdOpts =
      targetOptions.tokenizeCmdOptions();
  // NOTE(review): `getChip().data()` assumes the chip StringRef is
  // null-terminated — TODO confirm this holds for NVVMTargetAttr storage.
  cmdOpts.second.append(
      {"-arch", getTarget().getChip().data(), "--opt-level", optLevel.c_str()});

  // Create the compiler handle.
  RETURN_ON_NVPTXCOMPILER_ERROR(
      nvPTXCompilerCreate(&compiler, ptxCode.size(), ptxCode.c_str()));

  // Try to compile the binary.
  status = nvPTXCompilerCompile(compiler, cmdOpts.second.size(),
                                cmdOpts.second.data());

  // Check if compilation failed.
  if (status != NVPTXCOMPILE_SUCCESS) {
    RETURN_ON_NVPTXCOMPILER_ERROR(
        nvPTXCompilerGetErrorLogSize(compiler, &logSize));
    if (logSize != 0) {
      // +1 so the buffer is null-terminated when streamed via `data()`.
      SmallVector<char> log(logSize + 1, 0);
      RETURN_ON_NVPTXCOMPILER_ERROR(
          nvPTXCompilerGetErrorLog(compiler, log.data()));
      emitError(loc) << "NVPTX compiler invocation failed, error log: "
                     << log.data();
    } else
      emitError(loc) << "NVPTX compiler invocation failed with error code: "
                     << status;
    return std::nullopt;
  }

  // Retrieve the binary.
  size_t elfSize;
  RETURN_ON_NVPTXCOMPILER_ERROR(
      nvPTXCompilerGetCompiledProgramSize(compiler, &elfSize));
  SmallVector<char, 0> binary(elfSize, 0);
  RETURN_ON_NVPTXCOMPILER_ERROR(
      nvPTXCompilerGetCompiledProgram(compiler, (void *)binary.data()));

// Dump the log of the compiler, helpful if the verbose flag was passed.
#define DEBUG_TYPE "serialize-to-binary"
  LLVM_DEBUG({
    RETURN_ON_NVPTXCOMPILER_ERROR(
        nvPTXCompilerGetInfoLogSize(compiler, &logSize));
    if (logSize != 0) {
      SmallVector<char> log(logSize + 1, 0);
      RETURN_ON_NVPTXCOMPILER_ERROR(
          nvPTXCompilerGetInfoLog(compiler, log.data()));
      llvm::dbgs() << "NVPTX compiler invocation for module: "
                   << getOperation().getNameAttr() << "\n";
      llvm::dbgs() << "Arguments: ";
      llvm::interleave(cmdOpts.second, llvm::dbgs(), " ");
      llvm::dbgs() << "\nOutput\n" << log.data() << "\n";
      llvm::dbgs().flush();
    }
  });
#undef DEBUG_TYPE
  RETURN_ON_NVPTXCOMPILER_ERROR(nvPTXCompilerDestroy(&compiler));

  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin) {
    // A `-32` among the command options selects a 32-bit fatbin; 64-bit is
    // the default.
    bool useFatbin32 = llvm::any_of(cmdOpts.second, [](const char *option) {
      return llvm::StringRef(option) == "-32";
    });

    const char *cubinOpts[1] = {useFatbin32 ? "-32" : "-64"};
    nvFatbinHandle handle;

    // NOTE(review): only the "sm_" prefix is stripped here, whereas
    // `compileToBinary` also strips "compute_" — verify the asymmetry is
    // intentional.
    auto chip = getTarget().getChip();
    chip.consume_front("sm_");

    RETURN_ON_NVFATBIN_ERROR(nvFatbinCreate(&handle, cubinOpts, 1));
    RETURN_ON_NVFATBIN_ERROR(nvFatbinAddCubin(
        handle, binary.data(), binary.size(), chip.data(), nullptr));
    RETURN_ON_NVFATBIN_ERROR(nvFatbinAddPTX(
        handle, ptxCode.data(), ptxCode.size(), chip.data(), nullptr, nullptr));

    size_t fatbinSize;
    RETURN_ON_NVFATBIN_ERROR(nvFatbinSize(handle, &fatbinSize));
    SmallVector<char, 0> fatbin(fatbinSize, 0);
    RETURN_ON_NVFATBIN_ERROR(nvFatbinGet(handle, (void *)fatbin.data()));
    RETURN_ON_NVFATBIN_ERROR(nvFatbinDestroy(&handle));
    return fatbin;
  }

  return binary;
}
616 #endif // MLIR_ENABLE_NVPTXCOMPILER
617 
// Serializes `llvmModule` according to the compilation target selected in the
// options: LLVM IR for `offload`, PTX text for `assembly`, otherwise a
// cubin/fatbin produced by one of the compileToBinary* helpers.
std::optional<SmallVector<char, 0>>
NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
  // Return LLVM IR if the compilation target is `offload`.
#define DEBUG_TYPE "serialize-to-llvm"
  LLVM_DEBUG({
    llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
                 << "\n";
    llvm::dbgs() << llvmModule << "\n";
    llvm::dbgs().flush();
  });
#undef DEBUG_TYPE
  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
    return SerializeGPUModuleBase::moduleToObject(llvmModule);

  // Everything below needs the NVPTX backend compiled into LLVM.
#if !LLVM_HAS_NVPTX_TARGET
  getOperation()->emitError(
      "The `NVPTX` target was not built. Please enable it when building LLVM.");
  return std::nullopt;
#endif // LLVM_HAS_NVPTX_TARGET

  // Emit PTX code.
  std::optional<llvm::TargetMachine *> targetMachine =
      getOrCreateTargetMachine();
  if (!targetMachine) {
    getOperation().emitError() << "Target Machine unavailable for triple "
                               << triple << ", can't optimize with LLVM\n";
    return std::nullopt;
  }
  std::optional<std::string> serializedISA =
      translateToISA(llvmModule, **targetMachine);
  if (!serializedISA) {
    getOperation().emitError() << "Failed translating the module to ISA.";
    return std::nullopt;
  }
  // Hand the generated PTX to the user-provided callback, if one was set.
  if (isaCallback)
    isaCallback(serializedISA.value());

#define DEBUG_TYPE "serialize-to-isa"
  LLVM_DEBUG({
    llvm::dbgs() << "PTX for module: " << getOperation().getNameAttr() << "\n";
    llvm::dbgs() << *serializedISA << "\n";
    llvm::dbgs().flush();
  });
#undef DEBUG_TYPE

  // Return PTX if the compilation target is `assembly`.
  if (targetOptions.getCompilationTarget() ==
      gpu::CompilationTarget::Assembly) {
    // Make sure to include the null terminator.
    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
    return SmallVector<char, 0>(bin.begin(), bin.end());
  }

  // Compile to binary: in-process via nvPTXCompiler when available, otherwise
  // by invoking the external `ptxas`/`fatbinary` tools.
#if MLIR_ENABLE_NVPTXCOMPILER
  return compileToBinaryNVPTX(*serializedISA);
#else
  return compileToBinary(*serializedISA);
#endif // MLIR_ENABLE_NVPTXCOMPILER
}
678 
679 std::optional<SmallVector<char, 0>>
680 NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
681                                       const gpu::TargetOptions &options) const {
682   assert(module && "The module must be non null.");
683   if (!module)
684     return std::nullopt;
685   if (!mlir::isa<gpu::GPUModuleOp>(module)) {
686     module->emitError("Module must be a GPU module.");
687     return std::nullopt;
688   }
689   NVPTXSerializer serializer(*module, cast<NVVMTargetAttr>(attribute), options);
690   serializer.init();
691   return serializer.run();
692 }
693 
694 Attribute
695 NVVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
696                                  const SmallVector<char, 0> &object,
697                                  const gpu::TargetOptions &options) const {
698   auto target = cast<NVVMTargetAttr>(attribute);
699   gpu::CompilationTarget format = options.getCompilationTarget();
700   DictionaryAttr objectProps;
701   Builder builder(attribute.getContext());
702   SmallVector<NamedAttribute, 2> properties;
703   if (format == gpu::CompilationTarget::Assembly)
704     properties.push_back(
705         builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO())));
706 
707   if (StringRef section = options.getELFSection(); !section.empty())
708     properties.push_back(builder.getNamedAttr(gpu::elfSectionName,
709                                               builder.getStringAttr(section)));
710 
711   if (!properties.empty())
712     objectProps = builder.getDictionaryAttr(properties);
713 
714   return builder.getAttr<gpu::ObjectAttr>(
715       attribute, format,
716       builder.getStringAttr(StringRef(object.data(), object.size())),
717       objectProps, /*kernels=*/nullptr);
718 }
719