//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <tuple>
#include <utility>

// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};

/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
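  // NOTE: hsa_iterate_agents invokes the callback once per agent and stops
  // early as soon as the callback returns anything other than
  // HSA_STATUS_SUCCESS. Returning HSA_STATUS_INFO_BREAK below ends the scan
  // once a match is found; handle_error_impl above treats it as a success.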
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr, "Printing kernel resources is not supported on AMDGPU yet.\n");
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc_device_t device,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Register RPC callbacks for the malloc and free functions on HSA.
  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
  rpc_register_callback(
      device, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
          uint64_t size = buffer->data[0];
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(pool, size,
                                               /*flags=*/0, &dev_ptr))
            handle_error(err);
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &tuple);
  rpc_register_callback(
      device, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *) {
          if (hsa_status_t err = hsa_amd_memory_pool_free(
                  reinterpret_cast<void *>(buffer->data[0])))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      nullptr);

  // Retrieve different properties of the kernel symbol used for launch.
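  // NOTE: These values come from the kernel descriptor that the compiler
  // emitted into the code object. The kernarg segment size covers both the
  // explicit arguments and the trailing implicit arguments (see
  // implicit_args_t above), which is why args_size may exceed sizeof(args_t).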
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values.
  uint32_t dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
                  (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device using the provided launch
  // parameters.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
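  // NOTE: The signal is created with an initial value of 1; the packet
  // processor decrements it to 0 once the dispatch has finished, which is the
  // condition the wait loop below checks for.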
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC server for work submitted by the device's client.
  while (hsa_signal_wait_scacquire(
             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
    if (rpc_status_t err = rpc_handle_server(device))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device))
    handle_error(err);

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params, bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback that is invoked when the device encounters a memory
  // fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
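  // NOTE: The sequence below follows the standard HSA loading flow:
  // deserialize the raw image into a code object, create an empty executable,
  // load the code object into it for the chosen agent, then freeze it. Kernel
  // and variable addresses should only be queried once the executable has
  // been frozen.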
  hsa_code_object_t object;
  if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  if (hsa_status_t err =
          hsa_executable_load_code_object(executable, dev_agent, object, ""))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object, it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);

  // Set up the RPC server.
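  // NOTE: rpc_server_init allocates the shared communication buffer through
  // the rpc_alloc callback below, so it lands in fine-grained memory that the
  // host and the device agent can access concurrently. The wavefront size is
  // passed along because the RPC protocol keeps one slot per lane.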
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  rpc_device_t device;
  if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

  // Register callbacks for the RPC unit tests.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device);
  else
    handle_error("Invalid wavefront size");

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);

  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(const_cast<void *>(rpc_get_client_buffer(device)),
                              rpc_get_client_size(),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(
          const_cast<void *>(rpc_get_client_buffer(device))))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If the
  // 'clock_freq' symbol is missing, there is nothing to do.
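  // NOTE: The symbol lookup below is allowed to fail, presumably because only
  // images that reference the libc clock support pull in the
  // '__llvm_libc_clock_freq' global; a failed lookup is therefore not treated
  // as an error.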
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {

    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the minimum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       single_threaded_params, "_begin.kd",
                                       init_args, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          device, params, "_start.kd", args, print_resource_usage))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       single_threaded_params, "_end.kd",
                                       fini_args, print_resource_usage))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
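  // NOTE: Teardown happens in roughly the reverse order of construction: the
  // remaining pool allocations are released first, then the queue, the
  // executable, and the code object it was loaded from, and finally the HSA
  // runtime itself is shut down.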
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_code_object_destroy(object))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}
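// Example (illustrative sketch only, not part of this file): a host-side
// driver that owns the image bytes would call into this loader roughly as
// follows, where 'image' and 'size' come from reading the device ELF off disk
// and the LaunchParameters fields are the per-dimension thread and block
// counts:
//
//   LaunchParameters params = {1, 1, 1, 1, 1, 1}; // single-threaded launch
//   int status = load(argc, argv, envp, image, size, params,
//                     /*print_resource_usage=*/false);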