xref: /llvm-project/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp (revision feeb8335a010c90071b65618634cf775d2d720a4)
1 //===-- Loader Implementation for AMDHSA devices --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a simple loader to run images supporting the AMDHSA
10 // architecture. The file launches the '_start' kernel which should be provided
11 // by the device application start code and will ultimately call the 'main'
12 // function.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "Loader.h"
17 
18 #if defined(__has_include)
19 #if __has_include("hsa/hsa.h")
20 #include "hsa/hsa.h"
21 #include "hsa/hsa_ext_amd.h"
22 #elif __has_include("hsa.h")
23 #include "hsa.h"
24 #include "hsa_ext_amd.h"
25 #endif
26 #else
27 #include "hsa/hsa.h"
28 #include "hsa/hsa_ext_amd.h"
29 #endif
30 
31 #include <cstdio>
32 #include <cstdlib>
33 #include <cstring>
34 #include <tuple>
35 #include <utility>
36 
// The implicit arguments of COV5 AMDGPU kernels.
//
// NOTE(review): This struct mirrors the fixed implicit-kernarg layout the
// device code reads, so the field offsets and the padding byte counts must not
// change. The `Unused` arrays stand in for implicit arguments this loader does
// not initialize.
struct implicit_args_t {
  uint32_t grid_size_x;      // Total work-items in the X dimension.
  uint32_t grid_size_y;      // Total work-items in the Y dimension.
  uint32_t grid_size_z;      // Total work-items in the Z dimension.
  uint16_t workgroup_size_x; // Work-items per workgroup in X.
  uint16_t workgroup_size_y; // Work-items per workgroup in Y.
  uint16_t workgroup_size_z; // Work-items per workgroup in Z.
  uint8_t Unused0[46];       // Padding up to the grid_dims field.
  uint16_t grid_dims;        // Number of dimensions used by the launch (1-3).
  uint8_t Unused1[190];      // Padding out to the full implicit-args size.
};
49 
50 /// Print the error code and exit if \p code indicates an error.
51 static void handle_error_impl(const char *file, int32_t line,
52                               hsa_status_t code) {
53   if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
54     return;
55 
56   const char *desc;
57   if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
58     desc = "Unknown error";
59   fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
60   exit(EXIT_FAILURE);
61 }
62 
63 /// Generic interface for iterating using the HSA callbacks.
64 template <typename elem_ty, typename func_ty, typename callback_ty>
65 hsa_status_t iterate(func_ty func, callback_ty cb) {
66   auto l = [](elem_ty elem, void *data) -> hsa_status_t {
67     callback_ty *unwrapped = static_cast<callback_ty *>(data);
68     return (*unwrapped)(elem);
69   };
70   return func(l, static_cast<void *>(&cb));
71 }
72 
73 /// Generic interface for iterating using the HSA callbacks.
74 template <typename elem_ty, typename func_ty, typename func_arg_ty,
75           typename callback_ty>
76 hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
77   auto l = [](elem_ty elem, void *data) -> hsa_status_t {
78     callback_ty *unwrapped = static_cast<callback_ty *>(data);
79     return (*unwrapped)(elem);
80   };
81   return func(func_arg, l, static_cast<void *>(&cb));
82 }
83 
84 /// Iterate through all availible agents.
85 template <typename callback_ty>
86 hsa_status_t iterate_agents(callback_ty callback) {
87   return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
88 }
89 
90 /// Iterate through all availible memory pools.
91 template <typename callback_ty>
92 hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
93   return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
94                                         agent, cb);
95 }
96 
97 template <hsa_device_type_t flag>
98 hsa_status_t get_agent(hsa_agent_t *output_agent) {
99   // Find the first agent with a matching device type.
100   auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
101     hsa_device_type_t type;
102     hsa_status_t status =
103         hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
104     if (status != HSA_STATUS_SUCCESS)
105       return status;
106 
107     if (type == flag) {
108       // Ensure that a GPU agent supports kernel dispatch packets.
109       if (type == HSA_DEVICE_TYPE_GPU) {
110         hsa_agent_feature_t features;
111         status =
112             hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
113         if (status != HSA_STATUS_SUCCESS)
114           return status;
115         if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
116           *output_agent = hsa_agent;
117       } else {
118         *output_agent = hsa_agent;
119       }
120       return HSA_STATUS_INFO_BREAK;
121     }
122     return HSA_STATUS_SUCCESS;
123   };
124 
125   return iterate_agents(cb);
126 }
127 
/// Placeholder for kernel resource-usage reporting, which has not been
/// implemented for AMDGPU targets yet. Emits a notice on stderr instead.
void print_kernel_resources(const char *kernel_name) {
  (void)kernel_name; // Unused until real reporting is implemented.
  fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n");
}
131 
132 /// Retrieve a global memory pool with a \p flag from the agent.
133 template <hsa_amd_memory_pool_global_flag_t flag>
134 hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
135                                    hsa_amd_memory_pool_t *output_pool) {
136   auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
137     uint32_t flags;
138     hsa_amd_segment_t segment;
139     if (auto err = hsa_amd_memory_pool_get_info(
140             memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
141       return err;
142     if (auto err = hsa_amd_memory_pool_get_info(
143             memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
144       return err;
145 
146     if (segment != HSA_AMD_SEGMENT_GLOBAL)
147       return HSA_STATUS_SUCCESS;
148 
149     if (flags & flag)
150       *output_pool = memory_pool;
151 
152     return HSA_STATUS_SUCCESS;
153   };
154   return iterate_agent_memory_pools(agent, cb);
155 }
156 
157 template <typename args_t>
158 hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
159                            hsa_amd_memory_pool_t kernargs_pool,
160                            hsa_amd_memory_pool_t coarsegrained_pool,
161                            hsa_queue_t *queue, rpc_device_t device,
162                            const LaunchParameters &params,
163                            const char *kernel_name, args_t kernel_args,
164                            bool print_resource_usage) {
165   // Look up the kernel in the loaded executable.
166   hsa_executable_symbol_t symbol;
167   if (hsa_status_t err = hsa_executable_get_symbol_by_name(
168           executable, kernel_name, &dev_agent, &symbol))
169     return err;
170 
171   // Register RPC callbacks for the malloc and free functions on HSA.
172   auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
173   rpc_register_callback(
174       device, RPC_MALLOC,
175       [](rpc_port_t port, void *data) {
176         auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
177           auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
178           uint64_t size = buffer->data[0];
179           void *dev_ptr = nullptr;
180           if (hsa_status_t err =
181                   hsa_amd_memory_pool_allocate(pool, size,
182                                                /*flags=*/0, &dev_ptr))
183             handle_error(err);
184           hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
185           buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
186         };
187         rpc_recv_and_send(port, malloc_handler, data);
188       },
189       &tuple);
190   rpc_register_callback(
191       device, RPC_FREE,
192       [](rpc_port_t port, void *data) {
193         auto free_handler = [](rpc_buffer_t *buffer, void *) {
194           if (hsa_status_t err = hsa_amd_memory_pool_free(
195                   reinterpret_cast<void *>(buffer->data[0])))
196             handle_error(err);
197         };
198         rpc_recv_and_send(port, free_handler, data);
199       },
200       nullptr);
201 
202   // Retrieve different properties of the kernel symbol used for launch.
203   uint64_t kernel;
204   uint32_t args_size;
205   uint32_t group_size;
206   uint32_t private_size;
207   bool dynamic_stack;
208 
209   std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
210       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
211       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
212       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
213       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
214       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
215 
216   for (auto &[info, value] : symbol_infos)
217     if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
218       return err;
219 
220   // Allocate space for the kernel arguments on the host and allow the GPU agent
221   // to access it.
222   void *args;
223   if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
224                                                       /*flags=*/0, &args))
225     handle_error(err);
226   hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);
227 
228   // Initialize all the arguments (explicit and implicit) to zero, then set the
229   // explicit arguments to the values created above.
230   std::memset(args, 0, args_size);
231   std::memcpy(args, &kernel_args, sizeof(args_t));
232 
233   // Initialize the necessary implicit arguments to the proper values.
234   bool dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
235               (params.num_blocks_z * params.num_threads_z != 1);
236   implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
237       reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
238   implicit_args->grid_dims = dims;
239   implicit_args->grid_size_x = params.num_blocks_x;
240   implicit_args->grid_size_y = params.num_blocks_y;
241   implicit_args->grid_size_z = params.num_blocks_z;
242   implicit_args->workgroup_size_x = params.num_threads_x;
243   implicit_args->workgroup_size_y = params.num_threads_y;
244   implicit_args->workgroup_size_z = params.num_threads_z;
245 
246   // Obtain a packet from the queue.
247   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
248   while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
249     ;
250 
251   const uint32_t mask = queue->size - 1;
252   hsa_kernel_dispatch_packet_t *packet =
253       static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
254       (packet_id & mask);
255 
256   // Set up the packet for exeuction on the device. We currently only launch
257   // with one thread on the device, forcing the rest of the wavefront to be
258   // masked off.
259   uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
260   packet->workgroup_size_x = params.num_threads_x;
261   packet->workgroup_size_y = params.num_threads_y;
262   packet->workgroup_size_z = params.num_threads_z;
263   packet->reserved0 = 0;
264   packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
265   packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
266   packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
267   packet->private_segment_size =
268       dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
269   packet->group_segment_size = group_size;
270   packet->kernel_object = kernel;
271   packet->kernarg_address = args;
272   packet->reserved2 = 0;
273   // Create a signal to indicate when this packet has been completed.
274   if (hsa_status_t err =
275           hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
276     handle_error(err);
277 
278   if (print_resource_usage)
279     print_kernel_resources(kernel_name);
280 
281   // Initialize the packet header and set the doorbell signal to begin execution
282   // by the HSA runtime.
283   uint16_t header =
284       (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
285       (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
286       (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
287   uint32_t header_word = header | (setup << 16u);
288   __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
289   hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
290 
291   // Wait until the kernel has completed execution on the device. Periodically
292   // check the RPC client for work to be performed on the server.
293   while (hsa_signal_wait_scacquire(
294              packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
295              /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
296     if (rpc_status_t err = rpc_handle_server(device))
297       handle_error(err);
298 
299   // Handle the server one more time in case the kernel exited with a pending
300   // send still in flight.
301   if (rpc_status_t err = rpc_handle_server(device))
302     handle_error(err);
303 
304   // Destroy the resources acquired to launch the kernel and return.
305   if (hsa_status_t err = hsa_amd_memory_pool_free(args))
306     handle_error(err);
307   if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
308     handle_error(err);
309 
310   return HSA_STATUS_SUCCESS;
311 }
312 
313 /// Copies data from the source agent to the destination agent. The source
314 /// memory must first be pinned explicitly or allocated via HSA.
315 static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
316                                const void *src, hsa_agent_t src_agent,
317                                uint64_t size) {
318   // Create a memory signal to copy information between the host and device.
319   hsa_signal_t memory_signal;
320   if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
321     return err;
322 
323   if (hsa_status_t err = hsa_amd_memory_async_copy(
324           dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
325     return err;
326 
327   while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
328                                    UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
329     ;
330 
331   if (hsa_status_t err = hsa_signal_destroy(memory_signal))
332     return err;
333 
334   return HSA_STATUS_SUCCESS;
335 }
336 
/// Load the device \p image of \p size bytes onto the first available HSA GPU
/// agent and run it to completion. This sets up the HSA runtime, memory pools,
/// and the RPC server, copies argv/envp and an output slot to the device, then
/// launches the '_begin', '_start', and '_end' kernels in order and returns
/// the integer the device program wrote through its return pointer.
int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params, bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault; turning it
  // into an error status aborts the loader instead of hanging.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_t object;
  if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  if (hsa_status_t err =
          hsa_executable_load_code_object(executable, dev_agent, object, ""))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features do
  // not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  // NOTE(review): the status of hsa_amd_memory_fill is not checked here —
  // confirm that an unchecked failure is acceptable.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Query the wavefront size, which determines the RPC lane count below.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);

  // Set up the RPC server. The allocator hands out fine-grained host memory
  // that both the host and the device agent can access.
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  rpc_device_t device;
  if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

  // Register callbacks for the RPC unit tests. Only wave32 and wave64 devices
  // are supported.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device);
  else
    handle_error("Invalid wavefront size");

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  // Staging slot on the host for the device-side client pointer.
  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);

  // Pin the host-side client buffer so it can be used as a DMA source.
  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(const_cast<void *>(rpc_get_client_buffer(device)),
                              rpc_get_client_size(),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  // Release the pin and the staging slot now that the copy has completed.
  if (hsa_status_t err = hsa_amd_memory_unlock(
          const_cast<void *>(rpc_get_client_buffer(device))))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
  // If the clock_freq symbol is missing, no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {

    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    // A failed frequency query is treated as "leave the symbol untouched".
    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
    // NOTE(review): host_clock_freq is not freed on this path — presumably
    // reclaimed at hsa_shut_down; confirm this leak is intentional.
  }

  // Obtain a queue with the minimum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  // Run the '_begin' constructor kernel single-threaded before 'main' starts.
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       single_threaded_params, "_begin.kd",
                                       init_args, print_resource_usage))
    handle_error(err);

  // Run the user's '_start' kernel with the caller-provided launch geometry.
  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          device, params, "_start.kd", args, print_resource_usage))
    handle_error(err);

  // Copy the return value the program produced back to host memory.
  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  // Run the '_end' destructor kernel with the program's exit code.
  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       single_threaded_params, "_end.kd",
                                       fini_args, print_resource_usage))
    handle_error(err);

  // Tear down the RPC server, freeing its buffers through the HSA pool.
  if (rpc_status_t err = rpc_server_shutdown(
          device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_code_object_destroy(object))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}
616