xref: /llvm-project/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp (revision 89d8e70031189eacb915beae2ffc642c0de1ec1a)
1feeb8335SJoseph Huber //===-- Loader Implementation for NVPTX devices --------------------------===//
2feeb8335SJoseph Huber //
3feeb8335SJoseph Huber // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4feeb8335SJoseph Huber // See https://llvm.org/LICENSE.txt for license information.
5feeb8335SJoseph Huber // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6feeb8335SJoseph Huber //
7feeb8335SJoseph Huber //===----------------------------------------------------------------------===//
8feeb8335SJoseph Huber //
9feeb8335SJoseph Huber // This file implements a simple loader to run images supporting the NVPTX
10feeb8335SJoseph Huber // architecture. The file launches the '_start' kernel which should be provided
11feeb8335SJoseph Huber // by the device application start code and ultimately call the 'main'
12feeb8335SJoseph Huber // function.
13feeb8335SJoseph Huber //
14feeb8335SJoseph Huber //===----------------------------------------------------------------------===//
15feeb8335SJoseph Huber 
16feeb8335SJoseph Huber #include "Loader.h"
17feeb8335SJoseph Huber 
18feeb8335SJoseph Huber #include "cuda.h"
19feeb8335SJoseph Huber 
20feeb8335SJoseph Huber #include "llvm/Object/ELF.h"
21feeb8335SJoseph Huber #include "llvm/Object/ELFObjectFile.h"
22feeb8335SJoseph Huber 
23ee57a685SJoseph Huber #include <atomic>
24feeb8335SJoseph Huber #include <cstddef>
25feeb8335SJoseph Huber #include <cstdio>
26feeb8335SJoseph Huber #include <cstdlib>
27feeb8335SJoseph Huber #include <cstring>
28ee57a685SJoseph Huber #include <thread>
29feeb8335SJoseph Huber #include <vector>
30feeb8335SJoseph Huber 
31feeb8335SJoseph Huber using namespace llvm;
32feeb8335SJoseph Huber using namespace object;
33feeb8335SJoseph Huber 
34feeb8335SJoseph Huber static void handle_error_impl(const char *file, int32_t line, CUresult err) {
35feeb8335SJoseph Huber   if (err == CUDA_SUCCESS)
36feeb8335SJoseph Huber     return;
37feeb8335SJoseph Huber 
38feeb8335SJoseph Huber   const char *err_str = nullptr;
39feeb8335SJoseph Huber   CUresult result = cuGetErrorString(err, &err_str);
40feeb8335SJoseph Huber   if (result != CUDA_SUCCESS)
41feeb8335SJoseph Huber     fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line);
42feeb8335SJoseph Huber   else
43feeb8335SJoseph Huber     fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str);
44feeb8335SJoseph Huber   exit(1);
45feeb8335SJoseph Huber }
46feeb8335SJoseph Huber 
47feeb8335SJoseph Huber // Gets the names of all the globals that contain functions to initialize or
48feeb8335SJoseph Huber // deinitialize. We need to do this manually because the NVPTX toolchain does
49feeb8335SJoseph Huber // not contain the necessary binary manipulation tools.
50feeb8335SJoseph Huber template <typename Alloc>
51feeb8335SJoseph Huber Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
52feeb8335SJoseph Huber                                      Alloc allocator, CUmodule binary) {
53feeb8335SJoseph Huber   auto mem_buffer = MemoryBuffer::getMemBuffer(
54feeb8335SJoseph Huber       StringRef(reinterpret_cast<const char *>(image), size), "image",
55feeb8335SJoseph Huber       /*RequiresNullTerminator=*/false);
56feeb8335SJoseph Huber   Expected<ELF64LEObjectFile> elf_or_err =
57feeb8335SJoseph Huber       ELF64LEObjectFile::create(*mem_buffer);
58feeb8335SJoseph Huber   if (!elf_or_err)
59feeb8335SJoseph Huber     handle_error(toString(elf_or_err.takeError()).c_str());
60feeb8335SJoseph Huber 
61feeb8335SJoseph Huber   std::vector<std::pair<const char *, uint16_t>> ctors;
62feeb8335SJoseph Huber   std::vector<std::pair<const char *, uint16_t>> dtors;
63feeb8335SJoseph Huber   // CUDA has no way to iterate over all the symbols so we need to inspect the
64feeb8335SJoseph Huber   // ELF directly using the LLVM libraries.
65feeb8335SJoseph Huber   for (const auto &symbol : elf_or_err->symbols()) {
66feeb8335SJoseph Huber     auto name_or_err = symbol.getName();
67feeb8335SJoseph Huber     if (!name_or_err)
68feeb8335SJoseph Huber       handle_error(toString(name_or_err.takeError()).c_str());
69feeb8335SJoseph Huber 
70feeb8335SJoseph Huber     // Search for all symbols that contain a constructor or destructor.
71feeb8335SJoseph Huber     if (!name_or_err->starts_with("__init_array_object_") &&
72feeb8335SJoseph Huber         !name_or_err->starts_with("__fini_array_object_"))
73feeb8335SJoseph Huber       continue;
74feeb8335SJoseph Huber 
75feeb8335SJoseph Huber     uint16_t priority;
76feeb8335SJoseph Huber     if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
77feeb8335SJoseph Huber       handle_error("Invalid priority for constructor or destructor");
78feeb8335SJoseph Huber 
79feeb8335SJoseph Huber     if (name_or_err->starts_with("__init"))
80feeb8335SJoseph Huber       ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
81feeb8335SJoseph Huber     else
82feeb8335SJoseph Huber       dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
83feeb8335SJoseph Huber   }
84feeb8335SJoseph Huber   // Lower priority constructors are run before higher ones. The reverse is true
85feeb8335SJoseph Huber   // for destructors.
86feeb8335SJoseph Huber   llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
87feeb8335SJoseph Huber   llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });
88feeb8335SJoseph Huber 
89feeb8335SJoseph Huber   // Allocate host pinned memory to make these arrays visible to the GPU.
90feeb8335SJoseph Huber   CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
91feeb8335SJoseph Huber       ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
92feeb8335SJoseph Huber   uint64_t global_size = 0;
93feeb8335SJoseph Huber 
94feeb8335SJoseph Huber   // Get the address of the global and then store the address of the constructor
95feeb8335SJoseph Huber   // function to call in the constructor array.
96feeb8335SJoseph Huber   CUdeviceptr *dev_ctors_start = dev_memory;
97feeb8335SJoseph Huber   CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
98feeb8335SJoseph Huber   for (uint64_t i = 0; i < ctors.size(); ++i) {
99feeb8335SJoseph Huber     CUdeviceptr dev_ptr;
100feeb8335SJoseph Huber     if (CUresult err =
101feeb8335SJoseph Huber             cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
102feeb8335SJoseph Huber       handle_error(err);
103feeb8335SJoseph Huber     if (CUresult err =
104feeb8335SJoseph Huber             cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
105feeb8335SJoseph Huber       handle_error(err);
106feeb8335SJoseph Huber   }
107feeb8335SJoseph Huber 
108feeb8335SJoseph Huber   // Get the address of the global and then store the address of the destructor
109feeb8335SJoseph Huber   // function to call in the destructor array.
110feeb8335SJoseph Huber   CUdeviceptr *dev_dtors_start = dev_ctors_end;
111feeb8335SJoseph Huber   CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
112feeb8335SJoseph Huber   for (uint64_t i = 0; i < dtors.size(); ++i) {
113feeb8335SJoseph Huber     CUdeviceptr dev_ptr;
114feeb8335SJoseph Huber     if (CUresult err =
115feeb8335SJoseph Huber             cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
116feeb8335SJoseph Huber       handle_error(err);
117feeb8335SJoseph Huber     if (CUresult err =
118feeb8335SJoseph Huber             cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
119feeb8335SJoseph Huber       handle_error(err);
120feeb8335SJoseph Huber   }
121feeb8335SJoseph Huber 
122feeb8335SJoseph Huber   // Obtain the address of the pointers the startup implementation uses to
123feeb8335SJoseph Huber   // iterate the constructors and destructors.
124feeb8335SJoseph Huber   CUdeviceptr init_start;
125feeb8335SJoseph Huber   if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
126feeb8335SJoseph Huber                                        "__init_array_start"))
127feeb8335SJoseph Huber     handle_error(err);
128feeb8335SJoseph Huber   CUdeviceptr init_end;
129feeb8335SJoseph Huber   if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
130feeb8335SJoseph Huber                                        "__init_array_end"))
131feeb8335SJoseph Huber     handle_error(err);
132feeb8335SJoseph Huber   CUdeviceptr fini_start;
133feeb8335SJoseph Huber   if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
134feeb8335SJoseph Huber                                        "__fini_array_start"))
135feeb8335SJoseph Huber     handle_error(err);
136feeb8335SJoseph Huber   CUdeviceptr fini_end;
137feeb8335SJoseph Huber   if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
138feeb8335SJoseph Huber                                        "__fini_array_end"))
139feeb8335SJoseph Huber     handle_error(err);
140feeb8335SJoseph Huber 
141feeb8335SJoseph Huber   // Copy the pointers to the newly written array to the symbols so the startup
142feeb8335SJoseph Huber   // implementation can iterate them.
143feeb8335SJoseph Huber   if (CUresult err =
144feeb8335SJoseph Huber           cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
145feeb8335SJoseph Huber     handle_error(err);
146feeb8335SJoseph Huber   if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
147feeb8335SJoseph Huber     handle_error(err);
148feeb8335SJoseph Huber   if (CUresult err =
149feeb8335SJoseph Huber           cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
150feeb8335SJoseph Huber     handle_error(err);
151feeb8335SJoseph Huber   if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
152feeb8335SJoseph Huber     handle_error(err);
153feeb8335SJoseph Huber 
154feeb8335SJoseph Huber   return dev_memory;
155feeb8335SJoseph Huber }
156feeb8335SJoseph Huber 
157feeb8335SJoseph Huber void print_kernel_resources(CUmodule binary, const char *kernel_name) {
158feeb8335SJoseph Huber   CUfunction function;
159feeb8335SJoseph Huber   if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
160feeb8335SJoseph Huber     handle_error(err);
161feeb8335SJoseph Huber   int num_regs;
162feeb8335SJoseph Huber   if (CUresult err =
163feeb8335SJoseph Huber           cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
164feeb8335SJoseph Huber     handle_error(err);
165feeb8335SJoseph Huber   printf("Executing kernel %s:\n", kernel_name);
166feeb8335SJoseph Huber   printf("%6s registers: %d\n", kernel_name, num_regs);
167feeb8335SJoseph Huber }
168feeb8335SJoseph Huber 
169feeb8335SJoseph Huber template <typename args_t>
170b4d49fb5SJoseph Huber CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
171b4d49fb5SJoseph Huber                        const LaunchParameters &params, const char *kernel_name,
172b4d49fb5SJoseph Huber                        args_t kernel_args, bool print_resource_usage) {
173feeb8335SJoseph Huber   // look up the '_start' kernel in the loaded module.
174feeb8335SJoseph Huber   CUfunction function;
175feeb8335SJoseph Huber   if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
176feeb8335SJoseph Huber     handle_error(err);
177feeb8335SJoseph Huber 
178feeb8335SJoseph Huber   // Set up the arguments to the '_start' kernel on the GPU.
179feeb8335SJoseph Huber   uint64_t args_size = sizeof(args_t);
180feeb8335SJoseph Huber   void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
181feeb8335SJoseph Huber                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
182feeb8335SJoseph Huber                          CU_LAUNCH_PARAM_END};
183b4d49fb5SJoseph Huber   if (print_resource_usage)
184b4d49fb5SJoseph Huber     print_kernel_resources(binary, kernel_name);
185feeb8335SJoseph Huber 
186b4d49fb5SJoseph Huber   // Initialize a non-blocking CUDA stream to allocate memory if needed.
187b4d49fb5SJoseph Huber   // This needs to be done on a separate stream or else it will deadlock
188b4d49fb5SJoseph Huber   // with the executing kernel.
189feeb8335SJoseph Huber   CUstream memory_stream;
190feeb8335SJoseph Huber   if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
191feeb8335SJoseph Huber     handle_error(err);
192feeb8335SJoseph Huber 
193b4d49fb5SJoseph Huber   std::atomic<bool> finished = false;
194b4d49fb5SJoseph Huber   std::thread server_thread(
195b4d49fb5SJoseph Huber       [](std::atomic<bool> *finished, rpc::Server *server,
196b4d49fb5SJoseph Huber          CUstream memory_stream) {
197b4d49fb5SJoseph Huber         auto malloc_handler = [&](size_t size) -> void * {
198feeb8335SJoseph Huber           CUdeviceptr dev_ptr;
199feeb8335SJoseph Huber           if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
2008c6a6f1aSJoseph Huber             dev_ptr = 0UL;
201feeb8335SJoseph Huber 
202feeb8335SJoseph Huber           // Wait until the memory allocation is complete.
203feeb8335SJoseph Huber           while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
204feeb8335SJoseph Huber             ;
205b4d49fb5SJoseph Huber           return reinterpret_cast<void *>(dev_ptr);
206feeb8335SJoseph Huber         };
207b4d49fb5SJoseph Huber 
208b4d49fb5SJoseph Huber         auto free_handler = [&](void *ptr) -> void {
209b4d49fb5SJoseph Huber           if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr),
210b4d49fb5SJoseph Huber                                             memory_stream))
211feeb8335SJoseph Huber             handle_error(err);
212feeb8335SJoseph Huber         };
213feeb8335SJoseph Huber 
214b4d49fb5SJoseph Huber         uint32_t index = 0;
215ee57a685SJoseph Huber         while (!*finished) {
216b4d49fb5SJoseph Huber           index =
217b4d49fb5SJoseph Huber               handle_server<32>(*server, index, malloc_handler, free_handler);
218ee57a685SJoseph Huber         }
219ee57a685SJoseph Huber       },
220b4d49fb5SJoseph Huber       &finished, &server, memory_stream);
221ee57a685SJoseph Huber 
222feeb8335SJoseph Huber   // Call the kernel with the given arguments.
223feeb8335SJoseph Huber   if (CUresult err = cuLaunchKernel(
224feeb8335SJoseph Huber           function, params.num_blocks_x, params.num_blocks_y,
225feeb8335SJoseph Huber           params.num_blocks_z, params.num_threads_x, params.num_threads_y,
226feeb8335SJoseph Huber           params.num_threads_z, 0, stream, nullptr, args_config))
227feeb8335SJoseph Huber     handle_error(err);
228feeb8335SJoseph Huber 
229ee57a685SJoseph Huber   if (CUresult err = cuStreamSynchronize(stream))
230feeb8335SJoseph Huber     handle_error(err);
231feeb8335SJoseph Huber 
232ee57a685SJoseph Huber   finished = true;
233b4d49fb5SJoseph Huber   if (server_thread.joinable())
234b4d49fb5SJoseph Huber     server_thread.join();
235feeb8335SJoseph Huber 
236feeb8335SJoseph Huber   return CUDA_SUCCESS;
237feeb8335SJoseph Huber }
238feeb8335SJoseph Huber 
2395e326983SJoseph Huber int load(int argc, const char **argv, const char **envp, void *image,
2405e326983SJoseph Huber          size_t size, const LaunchParameters &params,
2415e326983SJoseph Huber          bool print_resource_usage) {
242feeb8335SJoseph Huber   if (CUresult err = cuInit(0))
243feeb8335SJoseph Huber     handle_error(err);
244feeb8335SJoseph Huber   // Obtain the first device found on the system.
245feeb8335SJoseph Huber   uint32_t device_id = 0;
246feeb8335SJoseph Huber   CUdevice device;
247feeb8335SJoseph Huber   if (CUresult err = cuDeviceGet(&device, device_id))
248feeb8335SJoseph Huber     handle_error(err);
249feeb8335SJoseph Huber 
250feeb8335SJoseph Huber   // Initialize the CUDA context and claim it for this execution.
251feeb8335SJoseph Huber   CUcontext context;
252feeb8335SJoseph Huber   if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
253feeb8335SJoseph Huber     handle_error(err);
254feeb8335SJoseph Huber   if (CUresult err = cuCtxSetCurrent(context))
255feeb8335SJoseph Huber     handle_error(err);
256feeb8335SJoseph Huber 
257feeb8335SJoseph Huber   // Increase the stack size per thread.
258feeb8335SJoseph Huber   // TODO: We should allow this to be passed in so only the tests that require a
259feeb8335SJoseph Huber   // larger stack can specify it to save on memory usage.
260feeb8335SJoseph Huber   if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
261feeb8335SJoseph Huber     handle_error(err);
262feeb8335SJoseph Huber 
263feeb8335SJoseph Huber   // Initialize a non-blocking CUDA stream to execute the kernel.
264feeb8335SJoseph Huber   CUstream stream;
265feeb8335SJoseph Huber   if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
266feeb8335SJoseph Huber     handle_error(err);
267feeb8335SJoseph Huber 
268feeb8335SJoseph Huber   // Load the image into a CUDA module.
269feeb8335SJoseph Huber   CUmodule binary;
270feeb8335SJoseph Huber   if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
271feeb8335SJoseph Huber     handle_error(err);
272feeb8335SJoseph Huber 
273feeb8335SJoseph Huber   // Allocate pinned memory on the host to hold the pointer array for the
274feeb8335SJoseph Huber   // copied argv and allow the GPU device to access it.
275feeb8335SJoseph Huber   auto allocator = [&](uint64_t size) -> void * {
276feeb8335SJoseph Huber     void *dev_ptr;
277feeb8335SJoseph Huber     if (CUresult err = cuMemAllocHost(&dev_ptr, size))
278feeb8335SJoseph Huber       handle_error(err);
279feeb8335SJoseph Huber     return dev_ptr;
280feeb8335SJoseph Huber   };
281feeb8335SJoseph Huber 
282feeb8335SJoseph Huber   auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
283feeb8335SJoseph Huber   if (!memory_or_err)
284feeb8335SJoseph Huber     handle_error(toString(memory_or_err.takeError()).c_str());
285feeb8335SJoseph Huber 
286feeb8335SJoseph Huber   void *dev_argv = copy_argument_vector(argc, argv, allocator);
287feeb8335SJoseph Huber   if (!dev_argv)
288feeb8335SJoseph Huber     handle_error("Failed to allocate device argv");
289feeb8335SJoseph Huber 
290feeb8335SJoseph Huber   // Allocate pinned memory on the host to hold the pointer array for the
291feeb8335SJoseph Huber   // copied environment array and allow the GPU device to access it.
292feeb8335SJoseph Huber   void *dev_envp = copy_environment(envp, allocator);
293feeb8335SJoseph Huber   if (!dev_envp)
294feeb8335SJoseph Huber     handle_error("Failed to allocate device environment");
295feeb8335SJoseph Huber 
296feeb8335SJoseph Huber   // Allocate space for the return pointer and initialize it to zero.
297feeb8335SJoseph Huber   CUdeviceptr dev_ret;
298feeb8335SJoseph Huber   if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
299feeb8335SJoseph Huber     handle_error(err);
300feeb8335SJoseph Huber   if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
301feeb8335SJoseph Huber     handle_error(err);
302feeb8335SJoseph Huber 
303feeb8335SJoseph Huber   uint32_t warp_size = 32;
304b4d49fb5SJoseph Huber   void *rpc_buffer = nullptr;
305b4d49fb5SJoseph Huber   if (CUresult err = cuMemAllocHost(
306b4d49fb5SJoseph Huber           &rpc_buffer,
307b4d49fb5SJoseph Huber           rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
308feeb8335SJoseph Huber     handle_error(err);
309b4d49fb5SJoseph Huber   rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
310b4d49fb5SJoseph Huber   rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
311feeb8335SJoseph Huber 
312feeb8335SJoseph Huber   // Initialize the RPC client on the device by copying the local data to the
313feeb8335SJoseph Huber   // device's internal pointer.
314feeb8335SJoseph Huber   CUdeviceptr rpc_client_dev = 0;
315feeb8335SJoseph Huber   uint64_t client_ptr_size = sizeof(void *);
316feeb8335SJoseph Huber   if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
317*89d8e700SJoseph Huber                                        binary, "__llvm_rpc_client"))
318feeb8335SJoseph Huber     handle_error(err);
319feeb8335SJoseph Huber 
320*89d8e700SJoseph Huber   if (CUresult err = cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client)))
321feeb8335SJoseph Huber     handle_error(err);
322feeb8335SJoseph Huber 
323feeb8335SJoseph Huber   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
324feeb8335SJoseph Huber   begin_args_t init_args = {argc, dev_argv, dev_envp};
325feeb8335SJoseph Huber   if (CUresult err =
326b4d49fb5SJoseph Huber           launch_kernel(binary, stream, server, single_threaded_params,
327feeb8335SJoseph Huber                         "_begin", init_args, print_resource_usage))
328feeb8335SJoseph Huber     handle_error(err);
329feeb8335SJoseph Huber 
330feeb8335SJoseph Huber   start_args_t args = {argc, dev_argv, dev_envp,
331feeb8335SJoseph Huber                        reinterpret_cast<void *>(dev_ret)};
332b4d49fb5SJoseph Huber   if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
333feeb8335SJoseph Huber                                    args, print_resource_usage))
334feeb8335SJoseph Huber     handle_error(err);
335feeb8335SJoseph Huber 
336feeb8335SJoseph Huber   // Copy the return value back from the kernel and wait.
337feeb8335SJoseph Huber   int host_ret = 0;
338feeb8335SJoseph Huber   if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
339feeb8335SJoseph Huber     handle_error(err);
340feeb8335SJoseph Huber 
341feeb8335SJoseph Huber   if (CUresult err = cuStreamSynchronize(stream))
342feeb8335SJoseph Huber     handle_error(err);
343feeb8335SJoseph Huber 
344feeb8335SJoseph Huber   end_args_t fini_args = {host_ret};
345feeb8335SJoseph Huber   if (CUresult err =
346b4d49fb5SJoseph Huber           launch_kernel(binary, stream, server, single_threaded_params, "_end",
347b4d49fb5SJoseph Huber                         fini_args, print_resource_usage))
348feeb8335SJoseph Huber     handle_error(err);
349feeb8335SJoseph Huber 
350feeb8335SJoseph Huber   // Free the memory allocated for the device.
351feeb8335SJoseph Huber   if (CUresult err = cuMemFreeHost(*memory_or_err))
352feeb8335SJoseph Huber     handle_error(err);
353feeb8335SJoseph Huber   if (CUresult err = cuMemFree(dev_ret))
354feeb8335SJoseph Huber     handle_error(err);
355feeb8335SJoseph Huber   if (CUresult err = cuMemFreeHost(dev_argv))
356feeb8335SJoseph Huber     handle_error(err);
357b4d49fb5SJoseph Huber   if (CUresult err = cuMemFreeHost(rpc_buffer))
358feeb8335SJoseph Huber     handle_error(err);
359feeb8335SJoseph Huber 
360feeb8335SJoseph Huber   // Destroy the context and the loaded binary.
361feeb8335SJoseph Huber   if (CUresult err = cuModuleUnload(binary))
362feeb8335SJoseph Huber     handle_error(err);
363feeb8335SJoseph Huber   if (CUresult err = cuDevicePrimaryCtxRelease(device))
364feeb8335SJoseph Huber     handle_error(err);
365feeb8335SJoseph Huber   return host_ret;
366feeb8335SJoseph Huber }
367