//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's startup code and will ultimately call
// the 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"

#include "llvm/Frontend/Offloading/Utility.h"

#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <tuple>
#include <utility>

// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};

/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
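  // Returning HSA_STATUS_INFO_BREAK from the callback stops the iteration as
  // soon as a match is found; handle_error_impl above treats that status as
  // success rather than a failure.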
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr,
          "Printing kernel resources on AMDGPU is not supported yet.\n");
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc::Server &server,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           uint32_t wavefront_size, bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
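  // The kernarg segment size queried above covers both the explicit arguments
  // and the implicit arguments appended after them, so one allocation holds
  // the full args_t followed by the implicit_args_t defined at the top of
  // this file.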
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue, spinning until a slot is free if the
  // queue is full.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device using the requested grid
  // and workgroup dimensions.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
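  // The release store on the header is what publishes the packet: the packet
  // processor may start reading the packet as soon as it observes a valid
  // header, so every other field must be written first.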
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         uint32_t wavefront_size, hsa_agent_t dev_agent,
         hsa_amd_memory_pool_t coarsegrained_pool) {
        // Register RPC callbacks for the malloc and free functions on HSA.
        auto malloc_handler = [&](size_t size) -> void * {
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          return dev_ptr;
        };

        auto free_handler = [](void *ptr) -> void {
          if (hsa_status_t err =
                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
            handle_error(err);
        };

        uint32_t index = 0;
        while (!*finished) {
          if (wavefront_size == 32)
            index =
                handle_server<32>(*server, index, malloc_handler, free_handler);
          else
            index =
                handle_server<64>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);

  // Wait until the kernel has completed execution on the device. The server
  // thread spawned above handles any RPC work the device submits in the
  // meantime.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  finished = true;
  if (server_thread.joinable())
    server_thread.join();

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
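  // Returning an error from this handler for a memory fault lets the runtime
  // treat the fault as fatal instead of leaving the completion-signal waits
  // below spinning forever.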
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // The AMDGPU target can change its wavefront size. There currently isn't a
  // good way to look this up through the HSA API so we use the LLVM interface.
  uint16_t abi_version;
  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
    handle_error(llvm::toString(std::move(err)).c_str());
  }

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
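  // Fine-grained buffers live in host memory that the device can access
  // directly once hsa_amd_agents_allow_access grants it, so the copied
  // strings do not need a second device-side copy.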
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Determine the largest wavefront size used by any kernel in the image; the
  // fine-grained memory shared by the RPC server and client is sized for it.
  uint32_t wavefront_size =
      llvm::max_element(info_map, [](auto &&x, auto &&y) {
        return x.second.WavefrontSize < y.second.WavefrontSize;
      })->second.WavefrontSize;

  // Set up the RPC server.
  void *rpc_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          finegrained_pool,
          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
          /*flags=*/0, &rpc_buffer))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);

  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent,
                     sizeof(rpc::Client)))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If
  // the clock frequency symbol is missing, there is no work to do.
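  // hsa_memcpy above requires its host-side operand to be pinned or allocated
  // through HSA, which is why the staging buffer for the frequency below comes
  // from the fine-grained pool instead of the stack.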
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_begin.kd", init_args,
          info_map["_begin"].WavefrontSize, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
          print_resource_usage))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_end.kd", fini_args,
          info_map["_end"].WavefrontSize, print_resource_usage))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}
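
// A minimal sketch of how a driver could invoke this loader. The file reading
// below is illustrative only; the real command-line handling lives in the
// shared loader utility, and the image is normally produced by linking with
// the device startup code that provides '_begin', '_start', and '_end'.
//
//   #include "Loader.h"
//
//   #include <fstream>
//   #include <iterator>
//   #include <vector>
//
//   int main(int argc, const char **argv, const char **envp) {
//     // Read the device image from the path given as the first argument.
//     std::ifstream file(argv[1], std::ios::binary);
//     std::vector<char> image((std::istreambuf_iterator<char>(file)),
//                             std::istreambuf_iterator<char>());
//
//     // Launch a single thread in a single block by default.
//     LaunchParameters params = {1, 1, 1, 1, 1, 1};
//     return load(argc - 1, argv + 1, envp, image.data(), image.size(),
//                 params, /*print_resource_usage=*/false);
//   }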