//===-- Loader Implementation for NVPTX devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's start code and ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#include "cuda.h"

#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <vector>

using namespace llvm;
using namespace object;

// Call sites below use the 'handle_error' macro, presumably provided by
// "Loader.h", which forwards __FILE__ and __LINE__ to this implementation.
static void handle_error_impl(const char *file, int32_t line, CUresult err) {
  if (err == CUDA_SUCCESS)
    return;

  const char *err_str = nullptr;
  CUresult result = cuGetErrorString(err, &err_str);
  if (result != CUDA_SUCCESS)
    fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line);
  else
    fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str);
  exit(1);
}

// Gets the names of all the globals that contain functions to initialize or
// deinitialize. We need to do this manually because the NVPTX toolchain does
// not contain the necessary binary manipulation tools.
template <typename Alloc>
Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
                                     Alloc allocator, CUmodule binary) {
  auto mem_buffer = MemoryBuffer::getMemBuffer(
      StringRef(reinterpret_cast<const char *>(image), size), "image",
      /*RequiresNullTerminator=*/false);
  Expected<ELF64LEObjectFile> elf_or_err =
      ELF64LEObjectFile::create(*mem_buffer);
  if (!elf_or_err)
    handle_error(toString(elf_or_err.takeError()).c_str());

  std::vector<std::pair<const char *, uint16_t>> ctors;
  std::vector<std::pair<const char *, uint16_t>> dtors;
  // CUDA has no way to iterate over all the symbols, so we need to inspect
  // the ELF directly using the LLVM libraries.
  for (const auto &symbol : elf_or_err->symbols()) {
    auto name_or_err = symbol.getName();
    if (!name_or_err)
      handle_error(toString(name_or_err.takeError()).c_str());

    // Search for all symbols that contain a constructor or destructor.
    if (!name_or_err->starts_with("__init_array_object_") &&
        !name_or_err->starts_with("__fini_array_object_"))
      continue;

    uint16_t priority;
    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
      handle_error("Invalid priority for constructor or destructor");

    if (name_or_err->starts_with("__init"))
      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
    else
      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
  }
  // Lower priority constructors are run before higher priority ones. The
  // reverse is true for destructors.
  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
  llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });

  // Allocate host pinned memory to make these arrays visible to the GPU.
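  // Both pointer arrays live in this one allocation: the ctor pointers occupy
  // the first ctors.size() slots and the dtor pointers follow immediately
  // after, so the single cuMemFreeHost() in 'load' reclaims both at once.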
  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
  uint64_t global_size = 0;

  // Get the address of the global and then store the address of the
  // constructor function to call in the constructor array.
  CUdeviceptr *dev_ctors_start = dev_memory;
  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
  for (uint64_t i = 0; i < ctors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Get the address of the global and then store the address of the
  // destructor function to call in the destructor array.
  CUdeviceptr *dev_dtors_start = dev_ctors_end;
  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
  for (uint64_t i = 0; i < dtors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Obtain the address of the pointers the startup implementation uses to
  // iterate the constructors and destructors.
  CUdeviceptr init_start;
  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
                                       "__init_array_start"))
    handle_error(err);
  CUdeviceptr init_end;
  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
                                       "__init_array_end"))
    handle_error(err);
  CUdeviceptr fini_start;
  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
                                       "__fini_array_start"))
    handle_error(err);
  CUdeviceptr fini_end;
  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
                                       "__fini_array_end"))
    handle_error(err);

  // Copy the pointers to the newly written array to the symbols so the
  // startup implementation can iterate them.
  if (CUresult err =
          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
    handle_error(err);

  return dev_memory;
}

void print_kernel_resources(CUmodule binary, const char *kernel_name) {
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
    handle_error(err);
  int num_regs;
  if (CUresult err =
          cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
    handle_error(err);
  printf("Executing kernel %s:\n", kernel_name);
  printf("%6s registers: %d\n", kernel_name, num_regs);
}

template <typename args_t>
CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
                       const LaunchParameters &params, const char *kernel_name,
                       args_t kernel_args, bool print_resource_usage) {
  // Look up the requested kernel in the loaded module.
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
    handle_error(err);

  // Set up the arguments to the kernel on the GPU.
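  // The CU_LAUNCH_PARAM_* triple passed via cuLaunchKernel's 'extra' argument
  // makes the driver copy 'args_size' bytes starting at '&kernel_args'
  // directly into the kernel's parameter space, forwarding the whole argument
  // struct in one shot instead of through a per-argument 'kernelParams' array
  // (a hypothetical sketch of that alternative form, not used here, would be
  // 'void *params[] = {&arg0, &arg1}; cuLaunchKernel(..., params, nullptr)').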
  uint64_t args_size = sizeof(args_t);
  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                         CU_LAUNCH_PARAM_END};
  if (print_resource_usage)
    print_kernel_resources(binary, kernel_name);

  // Initialize a non-blocking CUDA stream to allocate memory if needed.
  // This needs to be done on a separate stream or else it will deadlock
  // with the executing kernel.
  CUstream memory_stream;
  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         CUstream memory_stream) {
        auto malloc_handler = [&](size_t size) -> void * {
          CUdeviceptr dev_ptr;
          if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
            dev_ptr = 0UL;

          // Wait until the memory allocation is complete.
          while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
            ;
          return reinterpret_cast<void *>(dev_ptr);
        };

        auto free_handler = [&](void *ptr) -> void {
          if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr),
                                            memory_stream))
            handle_error(err);
        };

        uint32_t index = 0;
        while (!*finished) {
          index =
              handle_server<32>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, memory_stream);

  // Call the kernel with the given arguments.
  if (CUresult err = cuLaunchKernel(
          function, params.num_blocks_x, params.num_blocks_y,
          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
          params.num_threads_z, 0, stream, nullptr, args_config))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);

  finished = true;
  if (server_thread.joinable())
    server_thread.join();

  return CUDA_SUCCESS;
}

int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  if (CUresult err = cuInit(0))
    handle_error(err);
  // Obtain the first device found on the system.
  uint32_t device_id = 0;
  CUdevice device;
  if (CUresult err = cuDeviceGet(&device, device_id))
    handle_error(err);

  // Initialize the CUDA context and claim it for this execution.
  CUcontext context;
  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
    handle_error(err);
  if (CUresult err = cuCtxSetCurrent(context))
    handle_error(err);

  // Increase the stack size per thread.
  // TODO: We should allow this to be passed in so only the tests that require
  // a larger stack can specify it to save on memory usage.
  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
    handle_error(err);

  // Initialize a non-blocking CUDA stream to execute the kernel.
  CUstream stream;
  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Load the image into a CUDA module.
  CUmodule binary;
  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
    handle_error(err);

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied argv and allow the GPU device to access it.
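  // cuMemAllocHost returns page-locked host memory, and with CUDA's unified
  // virtual addressing (the default on 64-bit platforms) the same pointer is
  // valid on the device. That is why the raw host pointers produced by this
  // allocator can be handed straight to the kernels below.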
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };

  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
  if (!memory_or_err)
    handle_error(toString(memory_or_err.takeError()).c_str());

  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU device to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  CUdeviceptr dev_ret;
  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
    handle_error(err);
  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
    handle_error(err);

  uint32_t warp_size = 32;
  void *rpc_buffer = nullptr;
  if (CUresult err = cuMemAllocHost(
          &rpc_buffer,
          rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
    handle_error(err);
  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  CUdeviceptr rpc_client_dev = 0;
  uint64_t client_ptr_size = sizeof(void *);
  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
                                       binary, "__llvm_rpc_client"))
    handle_error(err);

  if (CUresult err = cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client)))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (CUresult err =
          launch_kernel(binary, stream, server, single_threaded_params,
                        "_begin", init_args, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp,
                       reinterpret_cast<void *>(dev_ret)};
  if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
                                   args, print_resource_usage))
    handle_error(err);

  // Copy the return value back from the kernel and wait.
  int host_ret = 0;
  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);

  end_args_t fini_args = {host_ret};
  if (CUresult err =
          launch_kernel(binary, stream, server, single_threaded_params, "_end",
                        fini_args, print_resource_usage))
    handle_error(err);

  // Free the memory allocated for the device.
  if (CUresult err = cuMemFreeHost(*memory_or_err))
    handle_error(err);
  if (CUresult err = cuMemFree(dev_ret))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(dev_argv))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(rpc_buffer))
    handle_error(err);

  // Unload the binary and release the primary device context.
  if (CUresult err = cuModuleUnload(binary))
    handle_error(err);
  if (CUresult err = cuDevicePrimaryCtxRelease(device))
    handle_error(err);
  return host_ret;
}
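// A minimal usage sketch for this loader (hypothetical caller; the generic
// loader frontend is assumed to supply the linked NVPTX image and the launch
// geometry):
//
//   LaunchParameters params{};
//   params.num_blocks_x = params.num_blocks_y = params.num_blocks_z = 1;
//   params.num_threads_x = 32; // A single warp.
//   params.num_threads_y = params.num_threads_z = 1;
//   int ret = load(argc, argv, envp, image, size, params,
//                  /*print_resource_usage=*/false);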