//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

#include "llvm/Frontend/Offloading/Utility.h"

#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <tuple>
#include <utility>

// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};

/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
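  // Returning HSA_STATUS_INFO_BREAK from the callback stops the iteration
  // early once a suitable agent is found; handle_error_impl above treats that
  // status as success.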
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr,
          "Printing kernel resource usage is not supported on AMDGPU yet.\n");
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc::Server &server,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           uint32_t wavefront_size, bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
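  // The kernarg segment size reported by HSA already accounts for the implicit
  // arguments, so this single allocation holds both the explicit and the
  // implicit argument data.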
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device. We currently only launch
  // with one thread on the device, forcing the rest of the wavefront to be
  // masked off.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
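  // The header must be written last and with release semantics so that the
  // packet processor never observes a partially initialized packet.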
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         uint32_t wavefront_size, hsa_agent_t dev_agent,
         hsa_amd_memory_pool_t coarsegrained_pool) {
        // Register RPC callbacks for the malloc and free functions on HSA.
        auto malloc_handler = [&](size_t size) -> void * {
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          return dev_ptr;
        };

        auto free_handler = [](void *ptr) -> void {
          if (hsa_status_t err =
                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
            handle_error(err);
        };

        uint32_t index = 0;
        while (!*finished) {
          if (wavefront_size == 32)
            index =
                handle_server<32>(*server, index, malloc_handler, free_handler);
          else
            index =
                handle_server<64>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  finished = true;
  if (server_thread.joinable())
    server_thread.join();

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
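  // The handler below reports an error status for GPU memory faults rather
  // than ignoring them, so a faulting kernel fails loudly on the host.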
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // The AMDGPU target can change its wavefront size. There currently isn't a
  // good way to look this up through the HSA API so we use the LLVM interface.
  uint16_t abi_version;
  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
    handle_error(llvm::toString(std::move(err)).c_str());
  }

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
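  // The allocator below is reused for the environment copy that follows; both
  // arrays live in fine-grained host memory the device can read directly.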
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size =
      llvm::max_element(info_map, [](auto &&x, auto &&y) {
        return x.second.WavefrontSize < y.second.WavefrontSize;
      })->second.WavefrontSize;

  // Set up the RPC server.
  void *rpc_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          finegrained_pool,
          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
          /*flags=*/0, &rpc_buffer))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);

  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, "__llvm_libc_rpc_client", &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);

  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, sizeof(rpc::Client)))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If the
  // clock_freq symbol is missing, there is no work to do.
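  // The frequency is staged through fine-grained host memory because, as noted
  // above, hsa_memcpy requires its source to be HSA-accessible.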
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_begin.kd", init_args,
          info_map["_begin"].WavefrontSize, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
          print_resource_usage))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_end.kd", fini_args,
          info_map["_end"].WavefrontSize, print_resource_usage))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}