1feeb8335SJoseph Huber //===-- Loader Implementation for NVPTX devices --------------------------===// 2feeb8335SJoseph Huber // 3feeb8335SJoseph Huber // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4feeb8335SJoseph Huber // See https://llvm.org/LICENSE.txt for license information. 5feeb8335SJoseph Huber // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6feeb8335SJoseph Huber // 7feeb8335SJoseph Huber //===----------------------------------------------------------------------===// 8feeb8335SJoseph Huber // 9feeb8335SJoseph Huber // This file impelements a simple loader to run images supporting the NVPTX 10feeb8335SJoseph Huber // architecture. The file launches the '_start' kernel which should be provided 11feeb8335SJoseph Huber // by the device application start code and call ultimately call the 'main' 12feeb8335SJoseph Huber // function. 13feeb8335SJoseph Huber // 14feeb8335SJoseph Huber //===----------------------------------------------------------------------===// 15feeb8335SJoseph Huber 16feeb8335SJoseph Huber #include "Loader.h" 17feeb8335SJoseph Huber 18feeb8335SJoseph Huber #include "cuda.h" 19feeb8335SJoseph Huber 20feeb8335SJoseph Huber #include "llvm/Object/ELF.h" 21feeb8335SJoseph Huber #include "llvm/Object/ELFObjectFile.h" 22feeb8335SJoseph Huber 23ee57a685SJoseph Huber #include <atomic> 24feeb8335SJoseph Huber #include <cstddef> 25feeb8335SJoseph Huber #include <cstdio> 26feeb8335SJoseph Huber #include <cstdlib> 27feeb8335SJoseph Huber #include <cstring> 28ee57a685SJoseph Huber #include <thread> 29feeb8335SJoseph Huber #include <vector> 30feeb8335SJoseph Huber 31feeb8335SJoseph Huber using namespace llvm; 32feeb8335SJoseph Huber using namespace object; 33feeb8335SJoseph Huber 34feeb8335SJoseph Huber static void handle_error_impl(const char *file, int32_t line, CUresult err) { 35feeb8335SJoseph Huber if (err == CUDA_SUCCESS) 36feeb8335SJoseph Huber return; 37feeb8335SJoseph Huber 38feeb8335SJoseph Huber const char *err_str = nullptr; 39feeb8335SJoseph Huber CUresult result = cuGetErrorString(err, &err_str); 40feeb8335SJoseph Huber if (result != CUDA_SUCCESS) 41feeb8335SJoseph Huber fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line); 42feeb8335SJoseph Huber else 43feeb8335SJoseph Huber fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str); 44feeb8335SJoseph Huber exit(1); 45feeb8335SJoseph Huber } 46feeb8335SJoseph Huber 47feeb8335SJoseph Huber // Gets the names of all the globals that contain functions to initialize or 48feeb8335SJoseph Huber // deinitialize. We need to do this manually because the NVPTX toolchain does 49feeb8335SJoseph Huber // not contain the necessary binary manipulation tools. 50feeb8335SJoseph Huber template <typename Alloc> 51feeb8335SJoseph Huber Expected<void *> get_ctor_dtor_array(const void *image, const size_t size, 52feeb8335SJoseph Huber Alloc allocator, CUmodule binary) { 53feeb8335SJoseph Huber auto mem_buffer = MemoryBuffer::getMemBuffer( 54feeb8335SJoseph Huber StringRef(reinterpret_cast<const char *>(image), size), "image", 55feeb8335SJoseph Huber /*RequiresNullTerminator=*/false); 56feeb8335SJoseph Huber Expected<ELF64LEObjectFile> elf_or_err = 57feeb8335SJoseph Huber ELF64LEObjectFile::create(*mem_buffer); 58feeb8335SJoseph Huber if (!elf_or_err) 59feeb8335SJoseph Huber handle_error(toString(elf_or_err.takeError()).c_str()); 60feeb8335SJoseph Huber 61feeb8335SJoseph Huber std::vector<std::pair<const char *, uint16_t>> ctors; 62feeb8335SJoseph Huber std::vector<std::pair<const char *, uint16_t>> dtors; 63feeb8335SJoseph Huber // CUDA has no way to iterate over all the symbols so we need to inspect the 64feeb8335SJoseph Huber // ELF directly using the LLVM libraries. 65feeb8335SJoseph Huber for (const auto &symbol : elf_or_err->symbols()) { 66feeb8335SJoseph Huber auto name_or_err = symbol.getName(); 67feeb8335SJoseph Huber if (!name_or_err) 68feeb8335SJoseph Huber handle_error(toString(name_or_err.takeError()).c_str()); 69feeb8335SJoseph Huber 70feeb8335SJoseph Huber // Search for all symbols that contain a constructor or destructor. 71feeb8335SJoseph Huber if (!name_or_err->starts_with("__init_array_object_") && 72feeb8335SJoseph Huber !name_or_err->starts_with("__fini_array_object_")) 73feeb8335SJoseph Huber continue; 74feeb8335SJoseph Huber 75feeb8335SJoseph Huber uint16_t priority; 76feeb8335SJoseph Huber if (name_or_err->rsplit('_').second.getAsInteger(10, priority)) 77feeb8335SJoseph Huber handle_error("Invalid priority for constructor or destructor"); 78feeb8335SJoseph Huber 79feeb8335SJoseph Huber if (name_or_err->starts_with("__init")) 80feeb8335SJoseph Huber ctors.emplace_back(std::make_pair(name_or_err->data(), priority)); 81feeb8335SJoseph Huber else 82feeb8335SJoseph Huber dtors.emplace_back(std::make_pair(name_or_err->data(), priority)); 83feeb8335SJoseph Huber } 84feeb8335SJoseph Huber // Lower priority constructors are run before higher ones. The reverse is true 85feeb8335SJoseph Huber // for destructors. 86feeb8335SJoseph Huber llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; }); 87feeb8335SJoseph Huber llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; }); 88feeb8335SJoseph Huber 89feeb8335SJoseph Huber // Allocate host pinned memory to make these arrays visible to the GPU. 90feeb8335SJoseph Huber CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator( 91feeb8335SJoseph Huber ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr))); 92feeb8335SJoseph Huber uint64_t global_size = 0; 93feeb8335SJoseph Huber 94feeb8335SJoseph Huber // Get the address of the global and then store the address of the constructor 95feeb8335SJoseph Huber // function to call in the constructor array. 96feeb8335SJoseph Huber CUdeviceptr *dev_ctors_start = dev_memory; 97feeb8335SJoseph Huber CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size(); 98feeb8335SJoseph Huber for (uint64_t i = 0; i < ctors.size(); ++i) { 99feeb8335SJoseph Huber CUdeviceptr dev_ptr; 100feeb8335SJoseph Huber if (CUresult err = 101feeb8335SJoseph Huber cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first)) 102feeb8335SJoseph Huber handle_error(err); 103feeb8335SJoseph Huber if (CUresult err = 104feeb8335SJoseph Huber cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t))) 105feeb8335SJoseph Huber handle_error(err); 106feeb8335SJoseph Huber } 107feeb8335SJoseph Huber 108feeb8335SJoseph Huber // Get the address of the global and then store the address of the destructor 109feeb8335SJoseph Huber // function to call in the destructor array. 110feeb8335SJoseph Huber CUdeviceptr *dev_dtors_start = dev_ctors_end; 111feeb8335SJoseph Huber CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size(); 112feeb8335SJoseph Huber for (uint64_t i = 0; i < dtors.size(); ++i) { 113feeb8335SJoseph Huber CUdeviceptr dev_ptr; 114feeb8335SJoseph Huber if (CUresult err = 115feeb8335SJoseph Huber cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first)) 116feeb8335SJoseph Huber handle_error(err); 117feeb8335SJoseph Huber if (CUresult err = 118feeb8335SJoseph Huber cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t))) 119feeb8335SJoseph Huber handle_error(err); 120feeb8335SJoseph Huber } 121feeb8335SJoseph Huber 122feeb8335SJoseph Huber // Obtain the address of the pointers the startup implementation uses to 123feeb8335SJoseph Huber // iterate the constructors and destructors. 124feeb8335SJoseph Huber CUdeviceptr init_start; 125feeb8335SJoseph Huber if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary, 126feeb8335SJoseph Huber "__init_array_start")) 127feeb8335SJoseph Huber handle_error(err); 128feeb8335SJoseph Huber CUdeviceptr init_end; 129feeb8335SJoseph Huber if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary, 130feeb8335SJoseph Huber "__init_array_end")) 131feeb8335SJoseph Huber handle_error(err); 132feeb8335SJoseph Huber CUdeviceptr fini_start; 133feeb8335SJoseph Huber if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary, 134feeb8335SJoseph Huber "__fini_array_start")) 135feeb8335SJoseph Huber handle_error(err); 136feeb8335SJoseph Huber CUdeviceptr fini_end; 137feeb8335SJoseph Huber if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary, 138feeb8335SJoseph Huber "__fini_array_end")) 139feeb8335SJoseph Huber handle_error(err); 140feeb8335SJoseph Huber 141feeb8335SJoseph Huber // Copy the pointers to the newly written array to the symbols so the startup 142feeb8335SJoseph Huber // implementation can iterate them. 143feeb8335SJoseph Huber if (CUresult err = 144feeb8335SJoseph Huber cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t))) 145feeb8335SJoseph Huber handle_error(err); 146feeb8335SJoseph Huber if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t))) 147feeb8335SJoseph Huber handle_error(err); 148feeb8335SJoseph Huber if (CUresult err = 149feeb8335SJoseph Huber cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t))) 150feeb8335SJoseph Huber handle_error(err); 151feeb8335SJoseph Huber if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t))) 152feeb8335SJoseph Huber handle_error(err); 153feeb8335SJoseph Huber 154feeb8335SJoseph Huber return dev_memory; 155feeb8335SJoseph Huber } 156feeb8335SJoseph Huber 157feeb8335SJoseph Huber void print_kernel_resources(CUmodule binary, const char *kernel_name) { 158feeb8335SJoseph Huber CUfunction function; 159feeb8335SJoseph Huber if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) 160feeb8335SJoseph Huber handle_error(err); 161feeb8335SJoseph Huber int num_regs; 162feeb8335SJoseph Huber if (CUresult err = 163feeb8335SJoseph Huber cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function)) 164feeb8335SJoseph Huber handle_error(err); 165feeb8335SJoseph Huber printf("Executing kernel %s:\n", kernel_name); 166feeb8335SJoseph Huber printf("%6s registers: %d\n", kernel_name, num_regs); 167feeb8335SJoseph Huber } 168feeb8335SJoseph Huber 169feeb8335SJoseph Huber template <typename args_t> 170b4d49fb5SJoseph Huber CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server, 171b4d49fb5SJoseph Huber const LaunchParameters ¶ms, const char *kernel_name, 172b4d49fb5SJoseph Huber args_t kernel_args, bool print_resource_usage) { 173feeb8335SJoseph Huber // look up the '_start' kernel in the loaded module. 174feeb8335SJoseph Huber CUfunction function; 175feeb8335SJoseph Huber if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) 176feeb8335SJoseph Huber handle_error(err); 177feeb8335SJoseph Huber 178feeb8335SJoseph Huber // Set up the arguments to the '_start' kernel on the GPU. 179feeb8335SJoseph Huber uint64_t args_size = sizeof(args_t); 180feeb8335SJoseph Huber void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args, 181feeb8335SJoseph Huber CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, 182feeb8335SJoseph Huber CU_LAUNCH_PARAM_END}; 183b4d49fb5SJoseph Huber if (print_resource_usage) 184b4d49fb5SJoseph Huber print_kernel_resources(binary, kernel_name); 185feeb8335SJoseph Huber 186b4d49fb5SJoseph Huber // Initialize a non-blocking CUDA stream to allocate memory if needed. 187b4d49fb5SJoseph Huber // This needs to be done on a separate stream or else it will deadlock 188b4d49fb5SJoseph Huber // with the executing kernel. 189feeb8335SJoseph Huber CUstream memory_stream; 190feeb8335SJoseph Huber if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING)) 191feeb8335SJoseph Huber handle_error(err); 192feeb8335SJoseph Huber 193b4d49fb5SJoseph Huber std::atomic<bool> finished = false; 194b4d49fb5SJoseph Huber std::thread server_thread( 195b4d49fb5SJoseph Huber [](std::atomic<bool> *finished, rpc::Server *server, 196b4d49fb5SJoseph Huber CUstream memory_stream) { 197b4d49fb5SJoseph Huber auto malloc_handler = [&](size_t size) -> void * { 198feeb8335SJoseph Huber CUdeviceptr dev_ptr; 199feeb8335SJoseph Huber if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream)) 2008c6a6f1aSJoseph Huber dev_ptr = 0UL; 201feeb8335SJoseph Huber 202feeb8335SJoseph Huber // Wait until the memory allocation is complete. 203feeb8335SJoseph Huber while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY) 204feeb8335SJoseph Huber ; 205b4d49fb5SJoseph Huber return reinterpret_cast<void *>(dev_ptr); 206feeb8335SJoseph Huber }; 207b4d49fb5SJoseph Huber 208b4d49fb5SJoseph Huber auto free_handler = [&](void *ptr) -> void { 209b4d49fb5SJoseph Huber if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), 210b4d49fb5SJoseph Huber memory_stream)) 211feeb8335SJoseph Huber handle_error(err); 212feeb8335SJoseph Huber }; 213feeb8335SJoseph Huber 214b4d49fb5SJoseph Huber uint32_t index = 0; 215ee57a685SJoseph Huber while (!*finished) { 216b4d49fb5SJoseph Huber index = 217b4d49fb5SJoseph Huber handle_server<32>(*server, index, malloc_handler, free_handler); 218ee57a685SJoseph Huber } 219ee57a685SJoseph Huber }, 220b4d49fb5SJoseph Huber &finished, &server, memory_stream); 221ee57a685SJoseph Huber 222feeb8335SJoseph Huber // Call the kernel with the given arguments. 223feeb8335SJoseph Huber if (CUresult err = cuLaunchKernel( 224feeb8335SJoseph Huber function, params.num_blocks_x, params.num_blocks_y, 225feeb8335SJoseph Huber params.num_blocks_z, params.num_threads_x, params.num_threads_y, 226feeb8335SJoseph Huber params.num_threads_z, 0, stream, nullptr, args_config)) 227feeb8335SJoseph Huber handle_error(err); 228feeb8335SJoseph Huber 229ee57a685SJoseph Huber if (CUresult err = cuStreamSynchronize(stream)) 230feeb8335SJoseph Huber handle_error(err); 231feeb8335SJoseph Huber 232ee57a685SJoseph Huber finished = true; 233b4d49fb5SJoseph Huber if (server_thread.joinable()) 234b4d49fb5SJoseph Huber server_thread.join(); 235feeb8335SJoseph Huber 236feeb8335SJoseph Huber return CUDA_SUCCESS; 237feeb8335SJoseph Huber } 238feeb8335SJoseph Huber 2395e326983SJoseph Huber int load(int argc, const char **argv, const char **envp, void *image, 2405e326983SJoseph Huber size_t size, const LaunchParameters ¶ms, 2415e326983SJoseph Huber bool print_resource_usage) { 242feeb8335SJoseph Huber if (CUresult err = cuInit(0)) 243feeb8335SJoseph Huber handle_error(err); 244feeb8335SJoseph Huber // Obtain the first device found on the system. 245feeb8335SJoseph Huber uint32_t device_id = 0; 246feeb8335SJoseph Huber CUdevice device; 247feeb8335SJoseph Huber if (CUresult err = cuDeviceGet(&device, device_id)) 248feeb8335SJoseph Huber handle_error(err); 249feeb8335SJoseph Huber 250feeb8335SJoseph Huber // Initialize the CUDA context and claim it for this execution. 251feeb8335SJoseph Huber CUcontext context; 252feeb8335SJoseph Huber if (CUresult err = cuDevicePrimaryCtxRetain(&context, device)) 253feeb8335SJoseph Huber handle_error(err); 254feeb8335SJoseph Huber if (CUresult err = cuCtxSetCurrent(context)) 255feeb8335SJoseph Huber handle_error(err); 256feeb8335SJoseph Huber 257feeb8335SJoseph Huber // Increase the stack size per thread. 258feeb8335SJoseph Huber // TODO: We should allow this to be passed in so only the tests that require a 259feeb8335SJoseph Huber // larger stack can specify it to save on memory usage. 260feeb8335SJoseph Huber if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024)) 261feeb8335SJoseph Huber handle_error(err); 262feeb8335SJoseph Huber 263feeb8335SJoseph Huber // Initialize a non-blocking CUDA stream to execute the kernel. 264feeb8335SJoseph Huber CUstream stream; 265feeb8335SJoseph Huber if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)) 266feeb8335SJoseph Huber handle_error(err); 267feeb8335SJoseph Huber 268feeb8335SJoseph Huber // Load the image into a CUDA module. 269feeb8335SJoseph Huber CUmodule binary; 270feeb8335SJoseph Huber if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr)) 271feeb8335SJoseph Huber handle_error(err); 272feeb8335SJoseph Huber 273feeb8335SJoseph Huber // Allocate pinned memory on the host to hold the pointer array for the 274feeb8335SJoseph Huber // copied argv and allow the GPU device to access it. 275feeb8335SJoseph Huber auto allocator = [&](uint64_t size) -> void * { 276feeb8335SJoseph Huber void *dev_ptr; 277feeb8335SJoseph Huber if (CUresult err = cuMemAllocHost(&dev_ptr, size)) 278feeb8335SJoseph Huber handle_error(err); 279feeb8335SJoseph Huber return dev_ptr; 280feeb8335SJoseph Huber }; 281feeb8335SJoseph Huber 282feeb8335SJoseph Huber auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary); 283feeb8335SJoseph Huber if (!memory_or_err) 284feeb8335SJoseph Huber handle_error(toString(memory_or_err.takeError()).c_str()); 285feeb8335SJoseph Huber 286feeb8335SJoseph Huber void *dev_argv = copy_argument_vector(argc, argv, allocator); 287feeb8335SJoseph Huber if (!dev_argv) 288feeb8335SJoseph Huber handle_error("Failed to allocate device argv"); 289feeb8335SJoseph Huber 290feeb8335SJoseph Huber // Allocate pinned memory on the host to hold the pointer array for the 291feeb8335SJoseph Huber // copied environment array and allow the GPU device to access it. 292feeb8335SJoseph Huber void *dev_envp = copy_environment(envp, allocator); 293feeb8335SJoseph Huber if (!dev_envp) 294feeb8335SJoseph Huber handle_error("Failed to allocate device environment"); 295feeb8335SJoseph Huber 296feeb8335SJoseph Huber // Allocate space for the return pointer and initialize it to zero. 297feeb8335SJoseph Huber CUdeviceptr dev_ret; 298feeb8335SJoseph Huber if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int))) 299feeb8335SJoseph Huber handle_error(err); 300feeb8335SJoseph Huber if (CUresult err = cuMemsetD32(dev_ret, 0, 1)) 301feeb8335SJoseph Huber handle_error(err); 302feeb8335SJoseph Huber 303feeb8335SJoseph Huber uint32_t warp_size = 32; 304b4d49fb5SJoseph Huber void *rpc_buffer = nullptr; 305b4d49fb5SJoseph Huber if (CUresult err = cuMemAllocHost( 306b4d49fb5SJoseph Huber &rpc_buffer, 307b4d49fb5SJoseph Huber rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT))) 308feeb8335SJoseph Huber handle_error(err); 309b4d49fb5SJoseph Huber rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer); 310b4d49fb5SJoseph Huber rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer); 311feeb8335SJoseph Huber 312feeb8335SJoseph Huber // Initialize the RPC client on the device by copying the local data to the 313feeb8335SJoseph Huber // device's internal pointer. 314feeb8335SJoseph Huber CUdeviceptr rpc_client_dev = 0; 315feeb8335SJoseph Huber uint64_t client_ptr_size = sizeof(void *); 316feeb8335SJoseph Huber if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size, 317*89d8e700SJoseph Huber binary, "__llvm_rpc_client")) 318feeb8335SJoseph Huber handle_error(err); 319feeb8335SJoseph Huber 320*89d8e700SJoseph Huber if (CUresult err = cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client))) 321feeb8335SJoseph Huber handle_error(err); 322feeb8335SJoseph Huber 323feeb8335SJoseph Huber LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; 324feeb8335SJoseph Huber begin_args_t init_args = {argc, dev_argv, dev_envp}; 325feeb8335SJoseph Huber if (CUresult err = 326b4d49fb5SJoseph Huber launch_kernel(binary, stream, server, single_threaded_params, 327feeb8335SJoseph Huber "_begin", init_args, print_resource_usage)) 328feeb8335SJoseph Huber handle_error(err); 329feeb8335SJoseph Huber 330feeb8335SJoseph Huber start_args_t args = {argc, dev_argv, dev_envp, 331feeb8335SJoseph Huber reinterpret_cast<void *>(dev_ret)}; 332b4d49fb5SJoseph Huber if (CUresult err = launch_kernel(binary, stream, server, params, "_start", 333feeb8335SJoseph Huber args, print_resource_usage)) 334feeb8335SJoseph Huber handle_error(err); 335feeb8335SJoseph Huber 336feeb8335SJoseph Huber // Copy the return value back from the kernel and wait. 337feeb8335SJoseph Huber int host_ret = 0; 338feeb8335SJoseph Huber if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int))) 339feeb8335SJoseph Huber handle_error(err); 340feeb8335SJoseph Huber 341feeb8335SJoseph Huber if (CUresult err = cuStreamSynchronize(stream)) 342feeb8335SJoseph Huber handle_error(err); 343feeb8335SJoseph Huber 344feeb8335SJoseph Huber end_args_t fini_args = {host_ret}; 345feeb8335SJoseph Huber if (CUresult err = 346b4d49fb5SJoseph Huber launch_kernel(binary, stream, server, single_threaded_params, "_end", 347b4d49fb5SJoseph Huber fini_args, print_resource_usage)) 348feeb8335SJoseph Huber handle_error(err); 349feeb8335SJoseph Huber 350feeb8335SJoseph Huber // Free the memory allocated for the device. 351feeb8335SJoseph Huber if (CUresult err = cuMemFreeHost(*memory_or_err)) 352feeb8335SJoseph Huber handle_error(err); 353feeb8335SJoseph Huber if (CUresult err = cuMemFree(dev_ret)) 354feeb8335SJoseph Huber handle_error(err); 355feeb8335SJoseph Huber if (CUresult err = cuMemFreeHost(dev_argv)) 356feeb8335SJoseph Huber handle_error(err); 357b4d49fb5SJoseph Huber if (CUresult err = cuMemFreeHost(rpc_buffer)) 358feeb8335SJoseph Huber handle_error(err); 359feeb8335SJoseph Huber 360feeb8335SJoseph Huber // Destroy the context and the loaded binary. 361feeb8335SJoseph Huber if (CUresult err = cuModuleUnload(binary)) 362feeb8335SJoseph Huber handle_error(err); 363feeb8335SJoseph Huber if (CUresult err = cuDevicePrimaryCtxRelease(device)) 364feeb8335SJoseph Huber handle_error(err); 365feeb8335SJoseph Huber return host_ret; 366feeb8335SJoseph Huber } 367