xref: /llvm-project/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp (revision a2fc276ed2556c5da59f8b039bbb6d97f3003134)
1feeb8335SJoseph Huber //===-- Loader Implementation for AMDHSA devices --------------------------===//
2feeb8335SJoseph Huber //
3feeb8335SJoseph Huber // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4feeb8335SJoseph Huber // See https://llvm.org/LICENSE.txt for license information.
5feeb8335SJoseph Huber // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6feeb8335SJoseph Huber //
7feeb8335SJoseph Huber //===----------------------------------------------------------------------===//
8feeb8335SJoseph Huber //
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel which should be provided
// by the device application start code and will ultimately call the 'main'
// function.
13feeb8335SJoseph Huber //
14feeb8335SJoseph Huber //===----------------------------------------------------------------------===//
15feeb8335SJoseph Huber 
16feeb8335SJoseph Huber #include "Loader.h"
17feeb8335SJoseph Huber 
18feeb8335SJoseph Huber #include "hsa/hsa.h"
19feeb8335SJoseph Huber #include "hsa/hsa_ext_amd.h"
20feeb8335SJoseph Huber 
2138049dc8SJoseph Huber #include "llvm/Frontend/Offloading/Utility.h"
2238049dc8SJoseph Huber 
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <tuple>
#include <utility>
30feeb8335SJoseph Huber 
// The implicit arguments of COV5 AMDGPU kernels. The backend places this block
// directly after the explicit kernel arguments in the kernarg segment. Only
// the members this loader initializes are named; the `Unused*` arrays are
// padding that is zero-filled before launch.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};

// Lock down the ABI layout so an accidental edit cannot silently shift the
// offsets the device-side code expects.
static_assert(sizeof(implicit_args_t) == 256,
              "implicit_args_t must span the 256 byte code object v5 "
              "implicit argument area");
static_assert(offsetof(implicit_args_t, grid_dims) == 64,
              "grid_dims must sit at the offset defined by code object v5");
43feeb8335SJoseph Huber 
44feeb8335SJoseph Huber /// Print the error code and exit if \p code indicates an error.
45feeb8335SJoseph Huber static void handle_error_impl(const char *file, int32_t line,
46feeb8335SJoseph Huber                               hsa_status_t code) {
47feeb8335SJoseph Huber   if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
48feeb8335SJoseph Huber     return;
49feeb8335SJoseph Huber 
50feeb8335SJoseph Huber   const char *desc;
51feeb8335SJoseph Huber   if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
52feeb8335SJoseph Huber     desc = "Unknown error";
53feeb8335SJoseph Huber   fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
54feeb8335SJoseph Huber   exit(EXIT_FAILURE);
55feeb8335SJoseph Huber }
56feeb8335SJoseph Huber 
57feeb8335SJoseph Huber /// Generic interface for iterating using the HSA callbacks.
58feeb8335SJoseph Huber template <typename elem_ty, typename func_ty, typename callback_ty>
59feeb8335SJoseph Huber hsa_status_t iterate(func_ty func, callback_ty cb) {
60feeb8335SJoseph Huber   auto l = [](elem_ty elem, void *data) -> hsa_status_t {
61feeb8335SJoseph Huber     callback_ty *unwrapped = static_cast<callback_ty *>(data);
62feeb8335SJoseph Huber     return (*unwrapped)(elem);
63feeb8335SJoseph Huber   };
64feeb8335SJoseph Huber   return func(l, static_cast<void *>(&cb));
65feeb8335SJoseph Huber }
66feeb8335SJoseph Huber 
67feeb8335SJoseph Huber /// Generic interface for iterating using the HSA callbacks.
68feeb8335SJoseph Huber template <typename elem_ty, typename func_ty, typename func_arg_ty,
69feeb8335SJoseph Huber           typename callback_ty>
70feeb8335SJoseph Huber hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
71feeb8335SJoseph Huber   auto l = [](elem_ty elem, void *data) -> hsa_status_t {
72feeb8335SJoseph Huber     callback_ty *unwrapped = static_cast<callback_ty *>(data);
73feeb8335SJoseph Huber     return (*unwrapped)(elem);
74feeb8335SJoseph Huber   };
75feeb8335SJoseph Huber   return func(func_arg, l, static_cast<void *>(&cb));
76feeb8335SJoseph Huber }
77feeb8335SJoseph Huber 
/// Iterate through all available agents, invoking \p callback on each one.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}
83feeb8335SJoseph Huber 
/// Iterate through all available memory pools belonging to \p agent, invoking
/// \p cb on each one.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}
90feeb8335SJoseph Huber 
91feeb8335SJoseph Huber template <hsa_device_type_t flag>
92feeb8335SJoseph Huber hsa_status_t get_agent(hsa_agent_t *output_agent) {
93feeb8335SJoseph Huber   // Find the first agent with a matching device type.
94feeb8335SJoseph Huber   auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
95feeb8335SJoseph Huber     hsa_device_type_t type;
96feeb8335SJoseph Huber     hsa_status_t status =
97feeb8335SJoseph Huber         hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
98feeb8335SJoseph Huber     if (status != HSA_STATUS_SUCCESS)
99feeb8335SJoseph Huber       return status;
100feeb8335SJoseph Huber 
101feeb8335SJoseph Huber     if (type == flag) {
102feeb8335SJoseph Huber       // Ensure that a GPU agent supports kernel dispatch packets.
103feeb8335SJoseph Huber       if (type == HSA_DEVICE_TYPE_GPU) {
104feeb8335SJoseph Huber         hsa_agent_feature_t features;
105feeb8335SJoseph Huber         status =
106feeb8335SJoseph Huber             hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
107feeb8335SJoseph Huber         if (status != HSA_STATUS_SUCCESS)
108feeb8335SJoseph Huber           return status;
109feeb8335SJoseph Huber         if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
110feeb8335SJoseph Huber           *output_agent = hsa_agent;
111feeb8335SJoseph Huber       } else {
112feeb8335SJoseph Huber         *output_agent = hsa_agent;
113feeb8335SJoseph Huber       }
114feeb8335SJoseph Huber       return HSA_STATUS_INFO_BREAK;
115feeb8335SJoseph Huber     }
116feeb8335SJoseph Huber     return HSA_STATUS_SUCCESS;
117feeb8335SJoseph Huber   };
118feeb8335SJoseph Huber 
119feeb8335SJoseph Huber   return iterate_agents(cb);
120feeb8335SJoseph Huber }
121feeb8335SJoseph Huber 
/// Print the resource usage of \p kernel_name. Querying kernel resources is
/// not implemented for AMDGPU yet, so this only emits a diagnostic. The
/// parameter is kept (and marked [[maybe_unused]] to silence warnings) so the
/// interface matches the other loader implementations.
void print_kernel_resources([[maybe_unused]] const char *kernel_name) {
  fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n");
}
125feeb8335SJoseph Huber 
126feeb8335SJoseph Huber /// Retrieve a global memory pool with a \p flag from the agent.
127feeb8335SJoseph Huber template <hsa_amd_memory_pool_global_flag_t flag>
128feeb8335SJoseph Huber hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
129feeb8335SJoseph Huber                                    hsa_amd_memory_pool_t *output_pool) {
130feeb8335SJoseph Huber   auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
131feeb8335SJoseph Huber     uint32_t flags;
132feeb8335SJoseph Huber     hsa_amd_segment_t segment;
133feeb8335SJoseph Huber     if (auto err = hsa_amd_memory_pool_get_info(
134feeb8335SJoseph Huber             memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
135feeb8335SJoseph Huber       return err;
136feeb8335SJoseph Huber     if (auto err = hsa_amd_memory_pool_get_info(
137feeb8335SJoseph Huber             memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
138feeb8335SJoseph Huber       return err;
139feeb8335SJoseph Huber 
140feeb8335SJoseph Huber     if (segment != HSA_AMD_SEGMENT_GLOBAL)
141feeb8335SJoseph Huber       return HSA_STATUS_SUCCESS;
142feeb8335SJoseph Huber 
143feeb8335SJoseph Huber     if (flags & flag)
144feeb8335SJoseph Huber       *output_pool = memory_pool;
145feeb8335SJoseph Huber 
146feeb8335SJoseph Huber     return HSA_STATUS_SUCCESS;
147feeb8335SJoseph Huber   };
148feeb8335SJoseph Huber   return iterate_agent_memory_pools(agent, cb);
149feeb8335SJoseph Huber }
150feeb8335SJoseph Huber 
/// Launches the kernel named \p kernel_name from \p executable on
/// \p dev_agent and blocks until it finishes. While the kernel runs, a host
/// thread services RPC requests arriving through \p server. Returns the first
/// HSA error encountered while querying the kernel symbol, otherwise
/// HSA_STATUS_SUCCESS; allocation and signal failures abort the process
/// through handle_error.
template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc::Server &server,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           uint32_t wavefront_size, bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU agent
  // to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values. The
  // grid dimensionality counts each axis whose total extent is not one. The
  // implicit argument block is assumed to start directly after the explicit
  // arguments in the kernarg allocation.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue, spinning until a slot in the ring buffer
  // is free.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  // The queue size is a power of two, so masking maps the monotonically
  // increasing packet id onto a ring buffer index.
  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device. We currently only launch
  // with one thread on the device, forcing the rest of the wavefront to be
  // masked off.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  // Kernels with a dynamically sized stack get a fixed 16 KB allotment since
  // the true requirement is not known statically.
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);

  // Initialize the packet header and set the doorbell signal to begin execution
  // by the HSA runtime. The release store on the header publishes the fully
  // initialized packet to the packet processor.
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  // Run the RPC server on a separate thread so this thread can block on the
  // completion signal while device requests are still serviced.
  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         uint32_t wavefront_size, hsa_agent_t dev_agent,
         hsa_amd_memory_pool_t coarsegrained_pool) {
        // Register RPC callbacks for the malloc and free functions on HSA.
        // Device-side malloc is served from the device-local coarse-grained
        // pool; a failed allocation yields a null pointer to the device.
        auto malloc_handler = [&](size_t size) -> void * {
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          return dev_ptr;
        };

        auto free_handler = [](void *ptr) -> void {
          if (hsa_status_t err =
                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
            handle_error(err);
        };

        // Poll the server until the kernel completes, dispatching with the
        // lane count matching the target's wavefront size.
        uint32_t index = 0;
        while (!*finished) {
          if (wavefront_size == 32)
            index =
                handle_server<32>(*server, index, malloc_handler, free_handler);
          else
            index =
                handle_server<64>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  // Tell the RPC thread to stop and wait for it to exit before tearing down
  // the resources it uses.
  finished = true;
  if (server_thread.joinable())
    server_thread.join();

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}
308feeb8335SJoseph Huber 
309feeb8335SJoseph Huber /// Copies data from the source agent to the destination agent. The source
310feeb8335SJoseph Huber /// memory must first be pinned explicitly or allocated via HSA.
311feeb8335SJoseph Huber static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
312feeb8335SJoseph Huber                                const void *src, hsa_agent_t src_agent,
313feeb8335SJoseph Huber                                uint64_t size) {
314feeb8335SJoseph Huber   // Create a memory signal to copy information between the host and device.
315feeb8335SJoseph Huber   hsa_signal_t memory_signal;
316feeb8335SJoseph Huber   if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
317feeb8335SJoseph Huber     return err;
318feeb8335SJoseph Huber 
319feeb8335SJoseph Huber   if (hsa_status_t err = hsa_amd_memory_async_copy(
320feeb8335SJoseph Huber           dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
321feeb8335SJoseph Huber     return err;
322feeb8335SJoseph Huber 
323feeb8335SJoseph Huber   while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
324feeb8335SJoseph Huber                                    UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
325feeb8335SJoseph Huber     ;
326feeb8335SJoseph Huber 
327feeb8335SJoseph Huber   if (hsa_status_t err = hsa_signal_destroy(memory_signal))
328feeb8335SJoseph Huber     return err;
329feeb8335SJoseph Huber 
330feeb8335SJoseph Huber   return HSA_STATUS_SUCCESS;
331feeb8335SJoseph Huber }
332feeb8335SJoseph Huber 
3335e326983SJoseph Huber int load(int argc, const char **argv, const char **envp, void *image,
3345e326983SJoseph Huber          size_t size, const LaunchParameters &params,
3355e326983SJoseph Huber          bool print_resource_usage) {
336feeb8335SJoseph Huber   // Initialize the HSA runtime used to communicate with the device.
337feeb8335SJoseph Huber   if (hsa_status_t err = hsa_init())
338feeb8335SJoseph Huber     handle_error(err);
339feeb8335SJoseph Huber 
340feeb8335SJoseph Huber   // Register a callback when the device encounters a memory fault.
341feeb8335SJoseph Huber   if (hsa_status_t err = hsa_amd_register_system_event_handler(
342feeb8335SJoseph Huber           [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
343feeb8335SJoseph Huber             if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
344feeb8335SJoseph Huber               return HSA_STATUS_ERROR;
345feeb8335SJoseph Huber             return HSA_STATUS_SUCCESS;
346feeb8335SJoseph Huber           },
347feeb8335SJoseph Huber           nullptr))
348feeb8335SJoseph Huber     handle_error(err);
349feeb8335SJoseph Huber 
350feeb8335SJoseph Huber   // Obtain a single agent for the device and host to use the HSA memory model.
351feeb8335SJoseph Huber   hsa_agent_t dev_agent;
352feeb8335SJoseph Huber   hsa_agent_t host_agent;
353feeb8335SJoseph Huber   if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
354feeb8335SJoseph Huber     handle_error(err);
355feeb8335SJoseph Huber   if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
356feeb8335SJoseph Huber     handle_error(err);
357feeb8335SJoseph Huber 
358feeb8335SJoseph Huber   // Load the code object's ISA information and executable data segments.
359676a1e66SJoseph Huber   hsa_code_object_reader_t reader;
360676a1e66SJoseph Huber   if (hsa_status_t err =
361676a1e66SJoseph Huber           hsa_code_object_reader_create_from_memory(image, size, &reader))
362feeb8335SJoseph Huber     handle_error(err);
363feeb8335SJoseph Huber 
364feeb8335SJoseph Huber   hsa_executable_t executable;
365feeb8335SJoseph Huber   if (hsa_status_t err = hsa_executable_create_alt(
366feeb8335SJoseph Huber           HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
367feeb8335SJoseph Huber           &executable))
368feeb8335SJoseph Huber     handle_error(err);
369feeb8335SJoseph Huber 
370676a1e66SJoseph Huber   hsa_loaded_code_object_t object;
371676a1e66SJoseph Huber   if (hsa_status_t err = hsa_executable_load_agent_code_object(
372676a1e66SJoseph Huber           executable, dev_agent, reader, "", &object))
373feeb8335SJoseph Huber     handle_error(err);
374feeb8335SJoseph Huber 
375feeb8335SJoseph Huber   // No modifications to the executable are allowed  after this point.
376feeb8335SJoseph Huber   if (hsa_status_t err = hsa_executable_freeze(executable, ""))
377feeb8335SJoseph Huber     handle_error(err);
378feeb8335SJoseph Huber 
379feeb8335SJoseph Huber   // Check the validity of the loaded executable. If the agents ISA features do
380feeb8335SJoseph Huber   // not match the executable's code object it will fail here.
381feeb8335SJoseph Huber   uint32_t result;
382feeb8335SJoseph Huber   if (hsa_status_t err = hsa_executable_validate(executable, &result))
383feeb8335SJoseph Huber     handle_error(err);
384feeb8335SJoseph Huber   if (result)
385feeb8335SJoseph Huber     handle_error(HSA_STATUS_ERROR);
386feeb8335SJoseph Huber 
387676a1e66SJoseph Huber   if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
388676a1e66SJoseph Huber     handle_error(err);
389676a1e66SJoseph Huber 
  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
394feeb8335SJoseph Huber   hsa_amd_memory_pool_t kernargs_pool;
395feeb8335SJoseph Huber   hsa_amd_memory_pool_t finegrained_pool;
396feeb8335SJoseph Huber   hsa_amd_memory_pool_t coarsegrained_pool;
397feeb8335SJoseph Huber   if (hsa_status_t err =
398feeb8335SJoseph Huber           get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
399feeb8335SJoseph Huber               host_agent, &kernargs_pool))
400feeb8335SJoseph Huber     handle_error(err);
401feeb8335SJoseph Huber   if (hsa_status_t err =
402feeb8335SJoseph Huber           get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
403feeb8335SJoseph Huber               host_agent, &finegrained_pool))
404feeb8335SJoseph Huber     handle_error(err);
405feeb8335SJoseph Huber   if (hsa_status_t err =
406feeb8335SJoseph Huber           get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
407feeb8335SJoseph Huber               dev_agent, &coarsegrained_pool))
408feeb8335SJoseph Huber     handle_error(err);
409feeb8335SJoseph Huber 
41038049dc8SJoseph Huber   // The AMDGPU target can change its wavefront size. There currently isn't a
41138049dc8SJoseph Huber   // good way to look this up through the HSA API so we use the LLVM interface.
41238049dc8SJoseph Huber   uint16_t abi_version;
41338049dc8SJoseph Huber   llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
41438049dc8SJoseph Huber   llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
41538049dc8SJoseph Huber   if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
41638049dc8SJoseph Huber           llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
41738049dc8SJoseph Huber     handle_error(llvm::toString(std::move(err)).c_str());
41838049dc8SJoseph Huber   }
41938049dc8SJoseph Huber 
420feeb8335SJoseph Huber   // Allocate fine-grained memory on the host to hold the pointer array for the
421feeb8335SJoseph Huber   // copied argv and allow the GPU agent to access it.
422feeb8335SJoseph Huber   auto allocator = [&](uint64_t size) -> void * {
423feeb8335SJoseph Huber     void *dev_ptr = nullptr;
424feeb8335SJoseph Huber     if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
425feeb8335SJoseph Huber                                                         /*flags=*/0, &dev_ptr))
426feeb8335SJoseph Huber       handle_error(err);
427feeb8335SJoseph Huber     hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
428feeb8335SJoseph Huber     return dev_ptr;
429feeb8335SJoseph Huber   };
430feeb8335SJoseph Huber   void *dev_argv = copy_argument_vector(argc, argv, allocator);
431feeb8335SJoseph Huber   if (!dev_argv)
432feeb8335SJoseph Huber     handle_error("Failed to allocate device argv");
433feeb8335SJoseph Huber 
434feeb8335SJoseph Huber   // Allocate fine-grained memory on the host to hold the pointer array for the
435feeb8335SJoseph Huber   // copied environment array and allow the GPU agent to access it.
436feeb8335SJoseph Huber   void *dev_envp = copy_environment(envp, allocator);
437feeb8335SJoseph Huber   if (!dev_envp)
438feeb8335SJoseph Huber     handle_error("Failed to allocate device environment");
439feeb8335SJoseph Huber 
440feeb8335SJoseph Huber   // Allocate space for the return pointer and initialize it to zero.
441feeb8335SJoseph Huber   void *dev_ret;
442feeb8335SJoseph Huber   if (hsa_status_t err =
443feeb8335SJoseph Huber           hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
444feeb8335SJoseph Huber                                        /*flags=*/0, &dev_ret))
445feeb8335SJoseph Huber     handle_error(err);
446feeb8335SJoseph Huber   hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
447feeb8335SJoseph Huber 
448feeb8335SJoseph Huber   // Allocate finegrained memory for the RPC server and client to share.
44938049dc8SJoseph Huber   uint32_t wavefront_size =
45038049dc8SJoseph Huber       llvm::max_element(info_map, [](auto &&x, auto &&y) {
45138049dc8SJoseph Huber         return x.second.WavefrontSize < y.second.WavefrontSize;
45238049dc8SJoseph Huber       })->second.WavefrontSize;
453feeb8335SJoseph Huber 
454feeb8335SJoseph Huber   // Set up the RPC server.
455b4d49fb5SJoseph Huber   void *rpc_buffer;
456b4d49fb5SJoseph Huber   if (hsa_status_t err = hsa_amd_memory_pool_allocate(
457b4d49fb5SJoseph Huber           finegrained_pool,
458b4d49fb5SJoseph Huber           rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
459b4d49fb5SJoseph Huber           /*flags=*/0, &rpc_buffer))
460feeb8335SJoseph Huber     handle_error(err);
461b4d49fb5SJoseph Huber   hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);
462feeb8335SJoseph Huber 
463b4d49fb5SJoseph Huber   rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
464b4d49fb5SJoseph Huber   rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
465feeb8335SJoseph Huber 
  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer. First locate the device-side symbol that holds
  // the client object.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym))
    handle_error(err);

  // Resolve the symbol to the device address of the client variable.
  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Pin the host-side client object so the runtime can DMA from it. The
  // returned 'rpc_client_buffer' is the agent-accessible alias of '&client'.
  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent,
                     sizeof(rpc::Client)))
    handle_error(err);

  // Unpin the host memory now that the copy has completed.
  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
  // If the clock_freq symbol is missing, no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    // Stage the frequency in fine-grained host memory visible to the device.
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    // NOTE(review): return status ignored here, matching the other
    // allow_access calls in this file; also 'host_clock_freq' is never freed
    // with hsa_amd_memory_pool_free like the other pool allocations below —
    // looks like a small one-shot leak, confirm intent.
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    // Query the timestamp frequency; on failure we silently skip the copy and
    // leave the device-side default in place.
    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      // Resolve the device address of the clock-frequency variable.
      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      // Publish the queried frequency to the device-side symbol.
      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  // Run the '_begin' kernel (global constructors / startup code) on a single
  // thread before handing control to the user's 'main' via '_start'.
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_begin.kd", init_args,
          info_map["_begin"].WavefrontSize, print_resource_usage))
    handle_error(err);

  // Launch the '_start' kernel with the user-requested launch dimensions;
  // 'dev_ret' receives the program's exit code on the device.
  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
          print_resource_usage))
    handle_error(err);

  // Copy the device-side exit code back to host-visible memory.
  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  // Run the '_end' kernel (global destructors / atexit handlers) with the
  // final return value, again single-threaded.
  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_end.kd", fini_args,
          info_map["_end"].WavefrontSize, print_resource_usage))
    handle_error(err);

  // Release the RPC shared buffer now that no more kernels will use it.
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  // Tear down the command queue, the loaded executable, and the HSA runtime.
  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}
595