//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be provided
// by the device application start code and ultimately call the 'main'
// function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"

#include "llvm/Frontend/Offloading/Utility.h"

#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <tuple>
#include <utility>

// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};

/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.
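///
/// A minimal usage sketch (the callback body here is hypothetical); the
/// convenience wrappers below expand to calls of this shape:
///   auto cb = [&](hsa_agent_t agent) -> hsa_status_t {
///     // Inspect 'agent'; return HSA_STATUS_INFO_BREAK to stop iterating.
///     return HSA_STATUS_SUCCESS;
///   };
///   hsa_status_t err = iterate<hsa_agent_t>(hsa_iterate_agents, cb);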
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
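      // Only GPU agents need this check; any other matching agent (such as
      // the host CPU) is accepted directly.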
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr,
          "Printing kernel resources is not supported on AMDGPU yet.\n");
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc::Server &server,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           uint32_t wavefront_size, bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Retrieve different properties of the kernel symbol used for launch.
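  // The reported kernarg segment size is assumed to include the trailing
  // implicit arguments; the code below zero-fills the whole buffer and writes
  // them directly after the explicit arguments.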
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU agent
  // to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device. We currently only launch
  // with one thread on the device, forcing the rest of the wavefront to be
  // masked off.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);

  // Initialize the packet header and set the doorbell signal to begin execution
  // by the HSA runtime.
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         uint32_t wavefront_size, hsa_agent_t dev_agent,
         hsa_amd_memory_pool_t coarsegrained_pool) {
        // Register RPC callbacks for the malloc and free functions on HSA.
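        // Device-side malloc is served by allocating from the coarse-grained
        // device pool and granting the GPU agent access to the result.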
        auto malloc_handler = [&](size_t size) -> void * {
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          return dev_ptr;
        };

        auto free_handler = [](void *ptr) -> void {
          if (hsa_status_t err =
                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
            handle_error(err);
        };

        uint32_t index = 0;
        while (!*finished) {
          if (wavefront_size == 32)
            index =
                handle_server<32>(*server, index, malloc_handler, free_handler);
          else
            index =
                handle_server<64>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);

  // Wait until the kernel has completed execution on the device. The RPC
  // server thread spawned above handles requests from the device concurrently.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  finished = true;
  if (server_thread.joinable())
    server_thread.join();

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
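  // The signal starts at one and is decremented when the asynchronous copy
  // completes, so the wait loop below blocks until the data has landed.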
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features do
  // not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // The AMDGPU target can change its wavefront size. There currently isn't a
  // good way to look this up through the HSA API so we use the LLVM interface.
  uint16_t abi_version;
  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
    handle_error(llvm::toString(std::move(err)).c_str());
  }

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
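  // The same allocator is reused below for the environment pointers so both
  // arrays are directly visible to the device.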
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Use the largest wavefront size of any kernel in the image to size the
  // fine-grained memory shared by the RPC server and client.
  uint32_t wavefront_size =
      llvm::max_element(info_map, [](auto &&x, auto &&y) {
        return x.second.WavefrontSize < y.second.WavefrontSize;
      })->second.WavefrontSize;

  // Set up the RPC server.
  void *rpc_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          finegrained_pool,
          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
          /*flags=*/0, &rpc_buffer))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);

  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
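  // The host-side client object is locked so it can serve as the source of a
  // DMA transfer into the device's '__llvm_rpc_client' symbol.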
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent,
                     sizeof(rpc::Client)))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
  // If the clock_freq symbol is missing, there is no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_begin.kd", init_args,
          info_map["_begin"].WavefrontSize, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
          print_resource_usage))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_end.kd", fini_args,
          info_map["_end"].WavefrontSize, print_resource_usage))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}