//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's startup code and ultimately calls the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

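// Depending on the ROCm installation, the HSA headers may live under an 'hsa/'
// subdirectory or at the top level, so probe both locations when possible.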
#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <tuple>
#include <utility>

// The implicit arguments of COV5 AMDGPU kernels. Only the fields the loader
// initializes are named; the 'Unused' byte arrays pad the struct to the fixed
// 256-byte implicit argument layout, keeping 'grid_dims' at its ABI offset.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};

/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.
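/// This adapts a stateful C++ callable to the C function pointer plus 'void *'
/// user-data pair that the HSA iteration APIs expect: the captureless lambda
/// converts to a function pointer and recovers the real callback from the
/// opaque data argument.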
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

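/// Returns the first agent whose device type matches the template \p flag.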
template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr, "Kernel resource printing is not supported on AMDGPU.\n");
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc_device_t device,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Register RPC callbacks for the malloc and free functions on HSA.
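  // The device requests an allocation by placing the size in the first slot of
  // the RPC buffer; the server allocates coarse-grained device memory and
  // returns the resulting pointer (or null on failure) in the same slot.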
  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
  rpc_register_callback(
      device, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
          uint64_t size = buffer->data[0];
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &tuple);
  rpc_register_callback(
      device, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *) {
          if (hsa_status_t err = hsa_amd_memory_pool_free(
                  reinterpret_cast<void *>(buffer->data[0])))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      nullptr);

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values.
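  // The implicit arguments live directly after the explicit ones in the kernel
  // argument buffer. The grid is one-dimensional unless the Y or Z extents are
  // non-trivial.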
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue.
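  // The write index increases monotonically, so spin until the device has
  // consumed enough packets for this slot to fall within the ring buffer.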
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

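  // HSA queue sizes are always powers of two, so masking the 64-bit packet id
  // wraps it into a valid index in the ring buffer.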
  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device using the provided launch
  // parameters.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
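  // The packet only becomes valid to the packet processor once its header type
  // changes from INVALID, so every other field must be written before the
  // header and setup words are published in a single 32-bit release store.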
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

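  // Spawn a thread to service RPC requests (e.g. printing or allocation) made
  // by the device while the kernel runs.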
  std::atomic<bool> finished = false;
  std::thread server(
      [](std::atomic<bool> *finished, rpc_device_t device) {
        while (!*finished) {
          if (rpc_status_t err = rpc_handle_server(device))
            handle_error(err);
        }
      },
      &finished, device);

  // Wait until the kernel has completed execution on the device while the
  // server thread handles any RPC requests made by the client.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  finished = true;
  if (server.joinable())
    server.join();

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback for when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single device agent and a single host agent to use with the HSA
  // memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool holds the arguments passed to
  // launched kernels.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // The RPC server and client share fine-grained memory whose layout depends
  // on the device's wavefront size.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);

  // Set up the RPC server.
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  rpc_device_t device;
  if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

  // Register callbacks for the RPC unit tests.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device);
  else
    handle_error("Invalid wavefront size");

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
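  // The device-side symbol holds a pointer to the client's storage, so first
  // copy that pointer back to the host, then copy the initialized client
  // object into the device memory it references.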
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);

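  // Pin the host copy of the client state so it can serve as the source of a
  // DMA transfer to the device.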
  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(const_cast<void *>(rpc_get_client_buffer(device)),
                              rpc_get_client_size(),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(
          const_cast<void *>(rpc_get_client_buffer(device))))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If the
  // clock_freq symbol is missing, there is no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {

    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

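  // Run the '_begin' kernel with a single thread to perform initialization,
  // then run '_start' (which is expected to call 'main') with the user's
  // launch parameters, and finally '_end' with the returned value.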
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       single_threaded_params, "_begin.kd",
                                       init_args, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          device, params, "_start.kd", args, print_resource_usage))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       single_threaded_params, "_end.kd",
                                       fini_args, print_resource_usage))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}