//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <tuple>
#include <utility>

// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};

/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
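  // Returning HSA_STATUS_INFO_BREAK ends the iteration early once a match is
  // found; handle_error_impl above treats it as a success code.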
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr, "Kernel resource usage on AMDGPU is not supported yet.\n");
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc_device_t device,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Register RPC callbacks for the malloc and free functions on HSA.
  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
  rpc_register_callback(
      device, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
          uint64_t size = buffer->data[0];
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &tuple);
  rpc_register_callback(
      device, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *) {
          if (hsa_status_t err = hsa_amd_memory_pool_free(
                  reinterpret_cast<void *>(buffer->data[0])))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      nullptr);

  // Retrieve different properties of the kernel symbol used for launch.
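  // Note that the kernarg segment size reported here covers both the explicit
  // arguments and the implicit arguments appended after them.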
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue. The queue size is a power of two, so
  // masking the monotonically increasing packet id yields the ring index.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device.
  uint16_t setup = dims << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
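  // The signal starts at one and is decremented to zero by the packet
  // processor once the kernel finishes; the wait loop below checks for this.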
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  // Start a separate thread to service RPC requests from the device while the
  // kernel runs.
  std::atomic<bool> finished = false;
  std::thread server(
      [](std::atomic<bool> *finished, rpc_device_t device) {
        while (!*finished) {
          if (rpc_status_t err = rpc_handle_server(device))
            handle_error(err);
        }
      },
      &finished, device);

  // Wait until the kernel has completed execution on the device. The server
  // thread handles any RPC work the device requests in the meantime.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  finished = true;
  if (server.joinable())
    server.join();

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
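  // The GPU agent executes the loaded kernels while the CPU agent owns the
  // host-side memory pools used for staging data and kernel arguments.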
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
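  // The '_start' kernel writes the program's return value through this
  // pointer; it is copied back to the host once execution finishes.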
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);

  // Set up the RPC server.
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  rpc_device_t device;
  if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

  // Register callbacks for the RPC unit tests.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device);
  else
    handle_error("Invalid wavefront size");

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);

  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(const_cast<void *>(rpc_get_client_buffer(device)),
                              rpc_get_client_size(),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(
          const_cast<void *>(rpc_get_client_buffer(device))))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If
  // the clock_freq symbol is missing, there is no work to do.
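  // Both lookups below fail gracefully: an image that does not use the clock
  // may simply omit the symbol, so neither failure is treated as fatal.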
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {

    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       single_threaded_params, "_begin.kd",
                                       init_args, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          device, params, "_start.kd", args, print_resource_usage))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       single_threaded_params, "_end.kd",
                                       fini_args, print_resource_usage))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}
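// A minimal usage sketch for this entry point. The 'read_image_file' helper is
// hypothetical and only illustrates where 'image' and 'size' come from; the
// LaunchParameters initializer mirrors the single-threaded one used above:
//
//   auto [image, size] = read_image_file("a.out");
//   LaunchParameters params = {1, 1, 1, 1, 1, 1};
//   int ret = load(argc, argv, envp, image, size, params,
//                  /*print_resource_usage=*/false);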