xref: /llvm-project/mlir/lib/Target/LLVM/NVVM/Target.cpp (revision 6a7d6c5f69dda254ec92f982985fd10fa51c63ef)
1 //===- Target.cpp - MLIR LLVM NVVM target compilation -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This files defines NVVM target related functions including registration
10 // calls for the `#nvvm.target` compilation attribute.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "mlir/Target/LLVM/NVVM/Target.h"
15 
16 #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
17 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
18 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
19 #include "mlir/IR/BuiltinAttributeInterfaces.h"
20 #include "mlir/IR/BuiltinDialect.h"
21 #include "mlir/IR/BuiltinTypes.h"
22 #include "mlir/IR/DialectResourceBlobManager.h"
23 #include "mlir/Target/LLVM/NVVM/Utils.h"
24 #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
25 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
26 #include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
27 #include "mlir/Target/LLVMIR/Export.h"
28 
29 #include "llvm/Config/llvm-config.h"
30 #include "llvm/Support/FileSystem.h"
31 #include "llvm/Support/FileUtilities.h"
32 #include "llvm/Support/FormatVariadic.h"
33 #include "llvm/Support/MemoryBuffer.h"
34 #include "llvm/Support/Path.h"
35 #include "llvm/Support/Process.h"
36 #include "llvm/Support/Program.h"
37 #include "llvm/Support/TargetSelect.h"
38 #include "llvm/Support/raw_ostream.h"
39 
40 #include <cstdint>
41 #include <cstdlib>
42 
43 using namespace mlir;
44 using namespace mlir::NVVM;
45 
46 #ifndef __DEFAULT_CUDATOOLKIT_PATH__
47 #define __DEFAULT_CUDATOOLKIT_PATH__ ""
48 #endif
49 
50 extern "C" const char _mlir_embedded_libdevice[];
51 extern "C" const unsigned _mlir_embedded_libdevice_size;
52 
namespace {
// Implementation of the `TargetAttrInterface` model.
class NVVMTargetAttrImpl
    : public gpu::TargetAttrInterface::FallbackModel<NVVMTargetAttrImpl> {
public:
  /// Serializes `module` (which must be a `gpu.module`) to an object blob;
  /// the format (LLVM IR, PTX, cubin or fatbin) is picked from `options`.
  /// Returns std::nullopt on failure.
  std::optional<SmallVector<char, 0>>
  serializeToObject(Attribute attribute, Operation *module,
                    const gpu::TargetOptions &options) const;

  /// Wraps a serialized `object` into a `#gpu.object` attribute, attaching
  /// format-dependent properties (opt level, ELF section).
  Attribute createObject(Attribute attribute, Operation *module,
                         const SmallVector<char, 0> &object,
                         const gpu::TargetOptions &options) const;
};
} // namespace
67 
68 // Register the NVVM dialect, the NVVM translation & the target interface.
69 void mlir::NVVM::registerNVVMTargetInterfaceExternalModels(
70     DialectRegistry &registry) {
71   registry.addExtension(+[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) {
72     NVVMTargetAttr::attachInterface<NVVMTargetAttrImpl>(*ctx);
73   });
74 }
75 
76 void mlir::NVVM::registerNVVMTargetInterfaceExternalModels(
77     MLIRContext &context) {
78   DialectRegistry registry;
79   registerNVVMTargetInterfaceExternalModels(registry);
80   context.appendDialectRegistry(registry);
81 }
82 
83 // Search for the CUDA toolkit path.
84 StringRef mlir::NVVM::getCUDAToolkitPath() {
85   if (const char *var = std::getenv("CUDA_ROOT"))
86     return var;
87   if (const char *var = std::getenv("CUDA_HOME"))
88     return var;
89   if (const char *var = std::getenv("CUDA_PATH"))
90     return var;
91   return __DEFAULT_CUDATOOLKIT_PATH__;
92 }
93 
// Constructs the base serializer: forwards the target's triple/chip/features,
// opt level, and the LLVM IR/ISA callbacks from `targetOptions` to
// `ModuleToObject`, then resolves the toolkit path and the list of bitcode
// libraries to link against.
SerializeGPUModuleBase::SerializeGPUModuleBase(
    Operation &module, NVVMTargetAttr target,
    const gpu::TargetOptions &targetOptions)
    : ModuleToObject(module, target.getTriple(), target.getChip(),
                     target.getFeatures(), target.getO(),
                     targetOptions.getInitialLlvmIRCallback(),
                     targetOptions.getLinkedLlvmIRCallback(),
                     targetOptions.getOptimizedLlvmIRCallback(),
                     targetOptions.getISACallback()),
      target(target), toolkitPath(targetOptions.getToolkitPath()),
      librariesToLink(targetOptions.getLibrariesToLink()) {

  // If `targetOptions` have an empty toolkitPath use `getCUDAToolkitPath`
  // (environment variables or the compile-time default).
  if (toolkitPath.empty())
    toolkitPath = getCUDAToolkitPath();

  // Append the files in the target attribute.
  if (target.getLink())
    librariesToLink.append(target.getLink().begin(), target.getLink().end());

  // Append libdevice to the files to be loaded. A failure is deliberately
  // ignored here; `appendStandardLibs` already emits a diagnostic on error.
  (void)appendStandardLibs();
}
117 
// Initializes the NVPTX LLVM backend exactly once per process so a target
// machine can later be created for PTX emission.
void SerializeGPUModuleBase::init() {
  static llvm::once_flag initializeBackendOnce;
  llvm::call_once(initializeBackendOnce, []() {
  // If the `NVPTX` LLVM target was built, initialize it.
#if LLVM_HAS_NVPTX_TARGET
    LLVMInitializeNVPTXTarget();
    LLVMInitializeNVPTXTargetInfo();
    LLVMInitializeNVPTXTargetMC();
    LLVMInitializeNVPTXAsmPrinter();
#endif
  });
}
130 
131 NVVMTargetAttr SerializeGPUModuleBase::getTarget() const { return target; }
132 
133 StringRef SerializeGPUModuleBase::getToolkitPath() const { return toolkitPath; }
134 
/// Returns the libraries to link: path `StringAttr`s and/or embedded-blob
/// `DenseResourceElementsAttr`s (see `appendStandardLibs`).
ArrayRef<Attribute> SerializeGPUModuleBase::getLibrariesToLink() const {
  return librariesToLink;
}
138 
// Try to append `libdevice` from a CUDA toolkit installation.
LogicalResult SerializeGPUModuleBase::appendStandardLibs() {
#if MLIR_NVVM_EMBED_LIBDEVICE
  // If libdevice is embedded in the binary, we don't look it up on the
  // filesystem.
  MLIRContext *ctx = target.getContext();
  // Expose the embedded bytes as a 1-D i8 tensor sized to the blob.
  auto type =
      RankedTensorType::get(ArrayRef<int64_t>{_mlir_embedded_libdevice_size},
                            IntegerType::get(ctx, 8));
  auto resourceManager = DenseResourceElementsHandle::getManagerInterface(ctx);

  // Lookup if we already loaded the resource, otherwise create it.
  DialectResourceBlobManager::BlobEntry *blob =
      resourceManager.getBlobManager().lookup("_mlir_embedded_libdevice");
  if (blob) {
    librariesToLink.push_back(DenseResourceElementsAttr::get(
        type, DenseResourceElementsHandle(
                  blob, ctx->getLoadedDialect<BuiltinDialect>())));
    return success();
  }

  // Allocate a resource using one of the UnManagedResourceBlob method to wrap
  // the embedded data; the blob manager does not take ownership of the
  // storage, which lives in the binary itself.
  auto unmanagedBlob = UnmanagedAsmResourceBlob::allocateInferAlign(
      ArrayRef<char>{_mlir_embedded_libdevice, _mlir_embedded_libdevice_size});
  librariesToLink.push_back(DenseResourceElementsAttr::get(
      type, resourceManager.insert("_mlir_embedded_libdevice",
                                   std::move(unmanagedBlob))));
#else
  StringRef pathRef = getToolkitPath();
  if (!pathRef.empty()) {
    // Copy the toolkit path into a mutable buffer so components can be
    // appended below.
    SmallVector<char, 256> path;
    path.insert(path.begin(), pathRef.begin(), pathRef.end());
    pathRef = StringRef(path.data(), path.size());
    if (!llvm::sys::fs::is_directory(pathRef)) {
      getOperation().emitError() << "CUDA path: " << pathRef
                                 << " does not exist or is not a directory.\n";
      return failure();
    }
    // Standard toolkit layout: <toolkit>/nvvm/libdevice/libdevice.10.bc.
    llvm::sys::path::append(path, "nvvm", "libdevice", "libdevice.10.bc");
    pathRef = StringRef(path.data(), path.size());
    if (!llvm::sys::fs::is_regular_file(pathRef)) {
      getOperation().emitError() << "LibDevice path: " << pathRef
                                 << " does not exist or is not a file.\n";
      return failure();
    }
    librariesToLink.push_back(StringAttr::get(target.getContext(), pathRef));
  }
#endif
  return success();
}
190 
191 std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
192 SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
193   SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
194   if (failed(loadBitcodeFilesFromList(module.getContext(), librariesToLink,
195                                       bcFiles, true)))
196     return std::nullopt;
197   return std::move(bcFiles);
198 }
199 
namespace {
/// Serializer that lowers a GPU module to PTX through the NVPTX backend and
/// can further assemble the PTX into a cubin or fatbin with CUDA tooling.
class NVPTXSerializer : public SerializeGPUModuleBase {
public:
  NVPTXSerializer(Operation &module, NVVMTargetAttr target,
                  const gpu::TargetOptions &targetOptions);

  /// Returns the GPU module op being serialized.
  gpu::GPUModuleOp getOperation();

  /// Compiles PTX to cubin using `ptxas`.
  std::optional<SmallVector<char, 0>>
  compileToBinary(const std::string &ptxCode);

  /// Compiles PTX to cubin using the `nvptxcompiler` library.
  std::optional<SmallVector<char, 0>>
  compileToBinaryNVPTX(const std::string &ptxCode);

  /// Serializes the LLVM module to an object format, depending on the
  /// compilation target selected in target options.
  std::optional<SmallVector<char, 0>>
  moduleToObject(llvm::Module &llvmModule) override;

private:
  /// A temporary file path paired with a remover that deletes the file when
  /// the pair goes out of scope.
  using TmpFile = std::pair<llvm::SmallString<128>, llvm::FileRemover>;

  /// Creates a temp file.
  std::optional<TmpFile> createTemp(StringRef name, StringRef suffix);

  /// Finds the `tool` path, where `tool` is the name of the binary to search,
  /// i.e. `ptxas` or `fatbinary`. The search order is:
  /// 1. The toolkit path in `targetOptions`.
  /// 2. In the system PATH.
  /// 3. The path from `getCUDAToolkitPath()`.
  std::optional<std::string> findTool(StringRef tool);

  /// Target options.
  gpu::TargetOptions targetOptions;
};
} // namespace
239 
// Forwards construction to the base serializer and keeps a copy of the target
// options for the compilation helpers (`compileToBinary*`, `findTool`).
NVPTXSerializer::NVPTXSerializer(Operation &module, NVVMTargetAttr target,
                                 const gpu::TargetOptions &targetOptions)
    : SerializeGPUModuleBase(module, target, targetOptions),
      targetOptions(targetOptions) {}
244 
245 std::optional<NVPTXSerializer::TmpFile>
246 NVPTXSerializer::createTemp(StringRef name, StringRef suffix) {
247   llvm::SmallString<128> filename;
248   std::error_code ec =
249       llvm::sys::fs::createTemporaryFile(name, suffix, filename);
250   if (ec) {
251     getOperation().emitError() << "Couldn't create the temp file: `" << filename
252                                << "`, error message: " << ec.message();
253     return std::nullopt;
254   }
255   return TmpFile(filename, llvm::FileRemover(filename.c_str()));
256 }
257 
// Returns the operation being serialized as a `gpu.module`; the dyn_cast
// yields null if it is any other op (serializeToObject checks this upfront).
gpu::GPUModuleOp NVPTXSerializer::getOperation() {
  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
}
261 
262 std::optional<std::string> NVPTXSerializer::findTool(StringRef tool) {
263   // Find the `tool` path.
264   // 1. Check the toolkit path given in the command line.
265   StringRef pathRef = targetOptions.getToolkitPath();
266   SmallVector<char, 256> path;
267   if (!pathRef.empty()) {
268     path.insert(path.begin(), pathRef.begin(), pathRef.end());
269     llvm::sys::path::append(path, "bin", tool);
270     if (llvm::sys::fs::can_execute(path))
271       return StringRef(path.data(), path.size()).str();
272   }
273 
274   // 2. Check PATH.
275   if (std::optional<std::string> toolPath =
276           llvm::sys::Process::FindInEnvPath("PATH", tool))
277     return *toolPath;
278 
279   // 3. Check `getCUDAToolkitPath()`.
280   pathRef = getCUDAToolkitPath();
281   path.clear();
282   if (!pathRef.empty()) {
283     path.insert(path.begin(), pathRef.begin(), pathRef.end());
284     llvm::sys::path::append(path, "bin", tool);
285     if (llvm::sys::fs::can_execute(path))
286       return StringRef(path.data(), path.size()).str();
287   }
288   getOperation().emitError()
289       << "Couldn't find the `" << tool
290       << "` binary. Please specify the toolkit "
291          "path, add the compiler to $PATH, or set one of the environment "
292          "variables in `NVVM::getCUDAToolkitPath()`.";
293   return std::nullopt;
294 }
295 
// TODO: clean this method & have a generic tool driver or never emit binaries
// with this mechanism and let another stage take care of it.
//
// Assembles `ptxCode` with `ptxas` (and, for fatbin targets, wraps cubin+PTX
// with `fatbinary`) through temp files, returning the resulting bytes or
// std::nullopt on any failure.
std::optional<SmallVector<char, 0>>
NVPTXSerializer::compileToBinary(const std::string &ptxCode) {
  // Determine if the serializer should create a fatbinary with the PTX embeded
  // or a simple CUBIN binary.
  const bool createFatbin =
      targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin;

  // Find the `ptxas` & `fatbinary` tools.
  std::optional<std::string> ptxasCompiler = findTool("ptxas");
  if (!ptxasCompiler)
    return std::nullopt;
  std::optional<std::string> fatbinaryTool;
  if (createFatbin) {
    fatbinaryTool = findTool("fatbinary");
    if (!fatbinaryTool)
      return std::nullopt;
  }
  Location loc = getOperation().getLoc();

  // Base name for all temp files: mlir-<module name>-<target triple>-<chip>.
  std::string basename =
      llvm::formatv("mlir-{0}-{1}-{2}", getOperation().getNameAttr().getValue(),
                    getTarget().getTriple(), getTarget().getChip());

  // Create temp files:
  std::optional<TmpFile> ptxFile = createTemp(basename, "ptx");
  if (!ptxFile)
    return std::nullopt;
  std::optional<TmpFile> logFile = createTemp(basename, "log");
  if (!logFile)
    return std::nullopt;
  std::optional<TmpFile> binaryFile = createTemp(basename, "bin");
  if (!binaryFile)
    return std::nullopt;
  TmpFile cubinFile;
  if (createFatbin) {
    // Fatbin path: ptxas writes an intermediate `<ptx>.cubin` that fatbinary
    // embeds later; give it its own remover.
    Twine cubinFilename = ptxFile->first + ".cubin";
    cubinFile = TmpFile(cubinFilename.str(), llvm::FileRemover(cubinFilename));
  } else {
    // Plain binary path: ptxas writes straight into `binaryFile`; no second
    // remover is attached since `binaryFile` already owns deletion.
    cubinFile.first = binaryFile->first;
  }

  std::error_code ec;
  // Dump the PTX to a temp file.
  {
    llvm::raw_fd_ostream ptxStream(ptxFile->first, ec);
    if (ec) {
      emitError(loc) << "Couldn't open the file: `" << ptxFile->first
                     << "`, error message: " << ec.message();
      return std::nullopt;
    }
    ptxStream << ptxCode;
    if (ptxStream.has_error()) {
      emitError(loc) << "An error occurred while writing the PTX to: `"
                     << ptxFile->first << "`.";
      return std::nullopt;
    }
    ptxStream.flush();
  }

  // Command redirects: both stdout and stderr of the invoked tools go to the
  // log file so they can be surfaced in diagnostics below.
  std::optional<StringRef> redirects[] = {
      std::nullopt,
      logFile->first,
      logFile->first,
  };

  // Get any extra args passed in `targetOptions`.
  std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> cmdOpts =
      targetOptions.tokenizeCmdOptions();

  // Create ptxas args.
  std::string optLevel = std::to_string(this->optLevel);
  SmallVector<StringRef, 12> ptxasArgs(
      {StringRef("ptxas"), StringRef("-arch"), getTarget().getChip(),
       StringRef(ptxFile->first), StringRef("-o"), StringRef(cubinFile.first),
       "--opt-level", optLevel});

  bool useFatbin32 = false;
  for (const auto *cArg : cmdOpts.second) {
    // All `cmdOpts` are for `ptxas` except `-32` which passes `-32` to
    // `fatbinary`, indicating a 32-bit target. By default a 64-bit target is
    // assumed.
    if (StringRef arg(cArg); arg != "-32")
      ptxasArgs.push_back(arg);
    else
      useFatbin32 = true;
  }

  // Create the `fatbinary` args.
  StringRef chip = getTarget().getChip();
  // Remove the arch prefix to obtain the compute capability.
  chip.consume_front("sm_"), chip.consume_front("compute_");
  // Embed the cubin object.
  std::string cubinArg =
      llvm::formatv("--image3=kind=elf,sm={0},file={1}", chip, cubinFile.first)
          .str();
  // Embed the PTX file so the driver can JIT if needed.
  std::string ptxArg =
      llvm::formatv("--image3=kind=ptx,sm={0},file={1}", chip, ptxFile->first)
          .str();
  SmallVector<StringRef, 6> fatbinArgs({StringRef("fatbinary"),
                                        useFatbin32 ? "-32" : "-64", cubinArg,
                                        ptxArg, "--create", binaryFile->first});

  // Dump tool invocation commands.
#define DEBUG_TYPE "serialize-to-binary"
  LLVM_DEBUG({
    llvm::dbgs() << "Tool invocation for module: "
                 << getOperation().getNameAttr() << "\n";
    llvm::interleave(ptxasArgs, llvm::dbgs(), " ");
    llvm::dbgs() << "\n";
    if (createFatbin) {
      llvm::interleave(fatbinArgs, llvm::dbgs(), " ");
      llvm::dbgs() << "\n";
    }
  });
#undef DEBUG_TYPE

  // Helper function for printing tool error logs. Prefers the process-level
  // error `message` when present; otherwise dumps the tool's log file.
  std::string message;
  auto emitLogError =
      [&](StringRef toolName) -> std::optional<SmallVector<char, 0>> {
    if (message.empty()) {
      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> toolStderr =
          llvm::MemoryBuffer::getFile(logFile->first);
      if (toolStderr)
        emitError(loc) << toolName << " invocation failed. Log:\n"
                       << toolStderr->get()->getBuffer();
      else
        emitError(loc) << toolName << " invocation failed.";
      return std::nullopt;
    }
    emitError(loc) << toolName
                   << " invocation failed, error message: " << message;
    return std::nullopt;
  };

  // Invoke PTXAS.
  if (llvm::sys::ExecuteAndWait(ptxasCompiler.value(), ptxasArgs,
                                /*Env=*/std::nullopt,
                                /*Redirects=*/redirects,
                                /*SecondsToWait=*/0,
                                /*MemoryLimit=*/0,
                                /*ErrMsg=*/&message))
    return emitLogError("`ptxas`");
// Under the `dump-sass` debug flag, disassemble the cubin with `nvdisasm`.
#define DEBUG_TYPE "dump-sass"
  LLVM_DEBUG({
    // NOTE(review): `nvdisasm.value()` is dereferenced without checking the
    // optional; if `findTool` fails this debug-only path misbehaves — should
    // guard on `nvdisasm` first.
    std::optional<std::string> nvdisasm = findTool("nvdisasm");
    SmallVector<StringRef> nvdisasmArgs(
        {StringRef("nvdisasm"), StringRef(cubinFile.first)});
    if (llvm::sys::ExecuteAndWait(nvdisasm.value(), nvdisasmArgs,
                                  /*Env=*/std::nullopt,
                                  /*Redirects=*/redirects,
                                  /*SecondsToWait=*/0,
                                  /*MemoryLimit=*/0,
                                  /*ErrMsg=*/&message))
      return emitLogError("`nvdisasm`");
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> logBuffer =
        llvm::MemoryBuffer::getFile(logFile->first);
    if (logBuffer && !(*logBuffer)->getBuffer().empty()) {
      llvm::dbgs() << "Output:\n" << (*logBuffer)->getBuffer() << "\n";
      llvm::dbgs().flush();
    }
  });
#undef DEBUG_TYPE

  // Invoke `fatbin`.
  message.clear();
  if (createFatbin && llvm::sys::ExecuteAndWait(*fatbinaryTool, fatbinArgs,
                                                /*Env=*/std::nullopt,
                                                /*Redirects=*/redirects,
                                                /*SecondsToWait=*/0,
                                                /*MemoryLimit=*/0,
                                                /*ErrMsg=*/&message))
    return emitLogError("`fatbinary`");

// Dump the output of the tools, helpful if the verbose flag was passed.
#define DEBUG_TYPE "serialize-to-binary"
  LLVM_DEBUG({
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> logBuffer =
        llvm::MemoryBuffer::getFile(logFile->first);
    if (logBuffer && !(*logBuffer)->getBuffer().empty()) {
      llvm::dbgs() << "Output:\n" << (*logBuffer)->getBuffer() << "\n";
      llvm::dbgs().flush();
    }
  });
#undef DEBUG_TYPE

  // Read the fatbin.
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> binaryBuffer =
      llvm::MemoryBuffer::getFile(binaryFile->first);
  if (!binaryBuffer) {
    emitError(loc) << "Couldn't open the file: `" << binaryFile->first
                   << "`, error message: " << binaryBuffer.getError().message();
    return std::nullopt;
  }
  StringRef fatbin = (*binaryBuffer)->getBuffer();
  return SmallVector<char, 0>(fatbin.begin(), fatbin.end());
}
498 
499 #if MLIR_ENABLE_NVPTXCOMPILER
500 #include "nvPTXCompiler.h"
501 
502 #define RETURN_ON_NVPTXCOMPILER_ERROR(expr)                                    \
503   do {                                                                         \
504     if (auto status = (expr)) {                                                \
505       emitError(loc) << llvm::Twine(#expr).concat(" failed with error code ")  \
506                      << status;                                                \
507       return std::nullopt;                                                     \
508     }                                                                          \
509   } while (false)
510 
511 #include "nvFatbin.h"
512 
513 #define RETURN_ON_NVFATBIN_ERROR(expr)                                         \
514   do {                                                                         \
515     auto result = (expr);                                                      \
516     if (result != nvFatbinResult::NVFATBIN_SUCCESS) {                          \
517       emitError(loc) << llvm::Twine(#expr).concat(" failed with error: ")      \
518                      << nvFatbinGetErrorString(result);                        \
519       return std::nullopt;                                                     \
520     }                                                                          \
521   } while (false)
522 
// Compiles `ptxCode` in-process with the nvPTXCompiler library (and, for
// fatbin targets, the nvFatbin library) instead of shelling out to `ptxas`.
// Returns the binary/fatbin bytes, or std::nullopt on failure (via the
// RETURN_ON_* macros, which also emit a diagnostic).
std::optional<SmallVector<char, 0>>
NVPTXSerializer::compileToBinaryNVPTX(const std::string &ptxCode) {
  Location loc = getOperation().getLoc();
  nvPTXCompilerHandle compiler = nullptr;
  nvPTXCompileResult status;
  size_t logSize;

  // Create the options.
  std::string optLevel = std::to_string(this->optLevel);
  std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> cmdOpts =
      targetOptions.tokenizeCmdOptions();
  // NOTE(review): `getChip().data()` assumes the chip StringRef is
  // null-terminated — TODO confirm this holds for NVVMTargetAttr storage.
  cmdOpts.second.append(
      {"-arch", getTarget().getChip().data(), "--opt-level", optLevel.c_str()});

  // Create the compiler handle.
  RETURN_ON_NVPTXCOMPILER_ERROR(
      nvPTXCompilerCreate(&compiler, ptxCode.size(), ptxCode.c_str()));

  // Try to compile the binary.
  status = nvPTXCompilerCompile(compiler, cmdOpts.second.size(),
                                cmdOpts.second.data());

  // Check if compilation failed.
  if (status != NVPTXCOMPILE_SUCCESS) {
    RETURN_ON_NVPTXCOMPILER_ERROR(
        nvPTXCompilerGetErrorLogSize(compiler, &logSize));
    if (logSize != 0) {
      // +1 so the buffer is null-terminated when streamed via `data()`.
      SmallVector<char> log(logSize + 1, 0);
      RETURN_ON_NVPTXCOMPILER_ERROR(
          nvPTXCompilerGetErrorLog(compiler, log.data()));
      emitError(loc) << "NVPTX compiler invocation failed, error log: "
                     << log.data();
    } else
      emitError(loc) << "NVPTX compiler invocation failed with error code: "
                     << status;
    return std::nullopt;
  }

  // Retrieve the binary.
  size_t elfSize;
  RETURN_ON_NVPTXCOMPILER_ERROR(
      nvPTXCompilerGetCompiledProgramSize(compiler, &elfSize));
  SmallVector<char, 0> binary(elfSize, 0);
  RETURN_ON_NVPTXCOMPILER_ERROR(
      nvPTXCompilerGetCompiledProgram(compiler, (void *)binary.data()));

// Dump the log of the compiler, helpful if the verbose flag was passed.
#define DEBUG_TYPE "serialize-to-binary"
  LLVM_DEBUG({
    RETURN_ON_NVPTXCOMPILER_ERROR(
        nvPTXCompilerGetInfoLogSize(compiler, &logSize));
    if (logSize != 0) {
      SmallVector<char> log(logSize + 1, 0);
      RETURN_ON_NVPTXCOMPILER_ERROR(
          nvPTXCompilerGetInfoLog(compiler, log.data()));
      llvm::dbgs() << "NVPTX compiler invocation for module: "
                   << getOperation().getNameAttr() << "\n";
      llvm::dbgs() << "Arguments: ";
      llvm::interleave(cmdOpts.second, llvm::dbgs(), " ");
      llvm::dbgs() << "\nOutput\n" << log.data() << "\n";
      llvm::dbgs().flush();
    }
  });
#undef DEBUG_TYPE
  RETURN_ON_NVPTXCOMPILER_ERROR(nvPTXCompilerDestroy(&compiler));

  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin) {
    // A `-32` among the command options selects a 32-bit fatbin; 64-bit is
    // the default.
    bool useFatbin32 = llvm::any_of(cmdOpts.second, [](const char *option) {
      return llvm::StringRef(option) == "-32";
    });

    const char *cubinOpts[1] = {useFatbin32 ? "-32" : "-64"};
    nvFatbinHandle handle;

    // NOTE(review): only the "sm_" prefix is stripped here, whereas
    // `compileToBinary` also strips "compute_" — verify the asymmetry is
    // intentional.
    auto chip = getTarget().getChip();
    chip.consume_front("sm_");

    RETURN_ON_NVFATBIN_ERROR(nvFatbinCreate(&handle, cubinOpts, 1));
    RETURN_ON_NVFATBIN_ERROR(nvFatbinAddCubin(
        handle, binary.data(), binary.size(), chip.data(), nullptr));
    RETURN_ON_NVFATBIN_ERROR(nvFatbinAddPTX(
        handle, ptxCode.data(), ptxCode.size(), chip.data(), nullptr, nullptr));

    size_t fatbinSize;
    RETURN_ON_NVFATBIN_ERROR(nvFatbinSize(handle, &fatbinSize));
    SmallVector<char, 0> fatbin(fatbinSize, 0);
    RETURN_ON_NVFATBIN_ERROR(nvFatbinGet(handle, (void *)fatbin.data()));
    RETURN_ON_NVFATBIN_ERROR(nvFatbinDestroy(&handle));
    return fatbin;
  }

  return binary;
}
616 #endif // MLIR_ENABLE_NVPTXCOMPILER
617 
// Serializes `llvmModule` according to the compilation target selected in the
// options: LLVM IR for `offload`, PTX text for `assembly`, otherwise a
// cubin/fatbin produced by one of the compileToBinary* helpers.
std::optional<SmallVector<char, 0>>
NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
  // Return LLVM IR if the compilation target is `offload`.
#define DEBUG_TYPE "serialize-to-llvm"
  LLVM_DEBUG({
    llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
                 << "\n";
    llvm::dbgs() << llvmModule << "\n";
    llvm::dbgs().flush();
  });
#undef DEBUG_TYPE
  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
    return SerializeGPUModuleBase::moduleToObject(llvmModule);

  // Everything below needs the NVPTX backend compiled into LLVM.
#if !LLVM_HAS_NVPTX_TARGET
  getOperation()->emitError(
      "The `NVPTX` target was not built. Please enable it when building LLVM.");
  return std::nullopt;
#endif // LLVM_HAS_NVPTX_TARGET

  // Emit PTX code.
  std::optional<llvm::TargetMachine *> targetMachine =
      getOrCreateTargetMachine();
  if (!targetMachine) {
    getOperation().emitError() << "Target Machine unavailable for triple "
                               << triple << ", can't optimize with LLVM\n";
    return std::nullopt;
  }
  std::optional<std::string> serializedISA =
      translateToISA(llvmModule, **targetMachine);
  if (!serializedISA) {
    getOperation().emitError() << "Failed translating the module to ISA.";
    return std::nullopt;
  }
  // Hand the generated PTX to the user-provided callback, if one was set.
  if (isaCallback)
    isaCallback(serializedISA.value());

#define DEBUG_TYPE "serialize-to-isa"
  LLVM_DEBUG({
    llvm::dbgs() << "PTX for module: " << getOperation().getNameAttr() << "\n";
    llvm::dbgs() << *serializedISA << "\n";
    llvm::dbgs().flush();
  });
#undef DEBUG_TYPE

  // Return PTX if the compilation target is `assembly`.
  if (targetOptions.getCompilationTarget() ==
      gpu::CompilationTarget::Assembly) {
    // Make sure to include the null terminator.
    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
    return SmallVector<char, 0>(bin.begin(), bin.end());
  }

  // Compile to binary: in-process via nvPTXCompiler when available, otherwise
  // by invoking the external `ptxas`/`fatbinary` tools.
#if MLIR_ENABLE_NVPTXCOMPILER
  return compileToBinaryNVPTX(*serializedISA);
#else
  return compileToBinary(*serializedISA);
#endif // MLIR_ENABLE_NVPTXCOMPILER
}
678 
679 std::optional<SmallVector<char, 0>>
680 NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
681                                       const gpu::TargetOptions &options) const {
682   assert(module && "The module must be non null.");
683   if (!module)
684     return std::nullopt;
685   if (!mlir::isa<gpu::GPUModuleOp>(module)) {
686     module->emitError("Module must be a GPU module.");
687     return std::nullopt;
688   }
689   NVPTXSerializer serializer(*module, cast<NVVMTargetAttr>(attribute), options);
690   serializer.init();
691   return serializer.run();
692 }
693 
694 Attribute
695 NVVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
696                                  const SmallVector<char, 0> &object,
697                                  const gpu::TargetOptions &options) const {
698   auto target = cast<NVVMTargetAttr>(attribute);
699   gpu::CompilationTarget format = options.getCompilationTarget();
700   DictionaryAttr objectProps;
701   Builder builder(attribute.getContext());
702   SmallVector<NamedAttribute, 2> properties;
703   if (format == gpu::CompilationTarget::Assembly)
704     properties.push_back(
705         builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO())));
706 
707   if (StringRef section = options.getELFSection(); !section.empty())
708     properties.push_back(builder.getNamedAttr(gpu::elfSectionName,
709                                               builder.getStringAttr(section)));
710 
711   if (!properties.empty())
712     objectProps = builder.getDictionaryAttr(properties);
713 
714   return builder.getAttr<gpu::ObjectAttr>(
715       attribute, format,
716       builder.getStringAttr(StringRef(object.data(), object.size())),
717       objectProps, /*kernels=*/nullptr);
718 }
719