xref: /llvm-project/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp (revision 38049dc8eef0dca7e82c25e4012228a9a135e255)
1 //===-- Loader Implementation for AMDHSA devices --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's start code and will ultimately call
// the 'main' function.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "Loader.h"
17 
18 #if defined(__has_include)
19 #if __has_include("hsa/hsa.h")
20 #include "hsa/hsa.h"
21 #include "hsa/hsa_ext_amd.h"
22 #elif __has_include("hsa.h")
23 #include "hsa.h"
24 #include "hsa_ext_amd.h"
25 #endif
26 #else
27 #include "hsa/hsa.h"
28 #include "hsa/hsa_ext_amd.h"
29 #endif
30 
31 #include "llvm/Frontend/Offloading/Utility.h"
32 
#include <atomic>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <tuple>
#include <utility>
40 
// The implicit arguments of COV5 AMDGPU kernels. Only the fields this loader
// writes are named; the 'Unused' padding members exist solely to keep the
// named fields at their ABI-mandated byte offsets within the 256-byte block.
struct implicit_args_t {
  uint32_t grid_size_x;      // Grid size in the X dimension.
  uint32_t grid_size_y;      // Grid size in the Y dimension.
  uint32_t grid_size_z;      // Grid size in the Z dimension.
  uint16_t workgroup_size_x; // Workgroup size in the X dimension.
  uint16_t workgroup_size_y; // Workgroup size in the Y dimension.
  uint16_t workgroup_size_z; // Workgroup size in the Z dimension.
  uint8_t Unused0[46];
  uint16_t grid_dims;        // Number of grid dimensions in use (1-3).
  uint8_t Unused1[190];
};

// Guard the hand-maintained padding: the COV5 implicit argument block is 256
// bytes with 'grid_dims' at byte offset 64. A stray edit to the padding would
// otherwise silently corrupt the kernel's implicit arguments.
static_assert(sizeof(implicit_args_t) == 256,
              "implicit_args_t must be 256 bytes to match the COV5 ABI");
static_assert(offsetof(implicit_args_t, grid_dims) == 64,
              "grid_dims must sit at byte offset 64 per the COV5 ABI");
53 
54 /// Print the error code and exit if \p code indicates an error.
55 static void handle_error_impl(const char *file, int32_t line,
56                               hsa_status_t code) {
57   if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
58     return;
59 
60   const char *desc;
61   if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
62     desc = "Unknown error";
63   fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
64   exit(EXIT_FAILURE);
65 }
66 
67 /// Generic interface for iterating using the HSA callbacks.
68 template <typename elem_ty, typename func_ty, typename callback_ty>
69 hsa_status_t iterate(func_ty func, callback_ty cb) {
70   auto l = [](elem_ty elem, void *data) -> hsa_status_t {
71     callback_ty *unwrapped = static_cast<callback_ty *>(data);
72     return (*unwrapped)(elem);
73   };
74   return func(l, static_cast<void *>(&cb));
75 }
76 
77 /// Generic interface for iterating using the HSA callbacks.
78 template <typename elem_ty, typename func_ty, typename func_arg_ty,
79           typename callback_ty>
80 hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
81   auto l = [](elem_ty elem, void *data) -> hsa_status_t {
82     callback_ty *unwrapped = static_cast<callback_ty *>(data);
83     return (*unwrapped)(elem);
84   };
85   return func(func_arg, l, static_cast<void *>(&cb));
86 }
87 
/// Iterate through all available agents, invoking \p callback on each one.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  // Forward to the generic iterator specialized for agent handles.
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}
93 
/// Iterate through all available memory pools of \p agent, invoking \p cb on
/// each one.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  // Forward to the generic iterator specialized for memory pool handles.
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}
100 
101 template <hsa_device_type_t flag>
102 hsa_status_t get_agent(hsa_agent_t *output_agent) {
103   // Find the first agent with a matching device type.
104   auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
105     hsa_device_type_t type;
106     hsa_status_t status =
107         hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
108     if (status != HSA_STATUS_SUCCESS)
109       return status;
110 
111     if (type == flag) {
112       // Ensure that a GPU agent supports kernel dispatch packets.
113       if (type == HSA_DEVICE_TYPE_GPU) {
114         hsa_agent_feature_t features;
115         status =
116             hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
117         if (status != HSA_STATUS_SUCCESS)
118           return status;
119         if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
120           *output_agent = hsa_agent;
121       } else {
122         *output_agent = hsa_agent;
123       }
124       return HSA_STATUS_INFO_BREAK;
125     }
126     return HSA_STATUS_SUCCESS;
127   };
128 
129   return iterate_agents(cb);
130 }
131 
/// Stub for reporting per-kernel resource usage. \p kernel_name is currently
/// unused because resource queries are not implemented for AMDGPU yet; only a
/// diagnostic is emitted.
void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n");
}
135 
136 /// Retrieve a global memory pool with a \p flag from the agent.
137 template <hsa_amd_memory_pool_global_flag_t flag>
138 hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
139                                    hsa_amd_memory_pool_t *output_pool) {
140   auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
141     uint32_t flags;
142     hsa_amd_segment_t segment;
143     if (auto err = hsa_amd_memory_pool_get_info(
144             memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
145       return err;
146     if (auto err = hsa_amd_memory_pool_get_info(
147             memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
148       return err;
149 
150     if (segment != HSA_AMD_SEGMENT_GLOBAL)
151       return HSA_STATUS_SUCCESS;
152 
153     if (flags & flag)
154       *output_pool = memory_pool;
155 
156     return HSA_STATUS_SUCCESS;
157   };
158   return iterate_agent_memory_pools(agent, cb);
159 }
160 
/// Look up \p kernel_name in \p executable and launch it on \p queue with the
/// grid configuration from \p params, blocking until the kernel completes.
/// The explicit arguments \p kernel_args are copied into a kernarg buffer
/// allocated from \p kernargs_pool, followed by the COV5 implicit arguments.
/// While the kernel runs, a helper thread services RPC requests through
/// \p server, satisfying device 'malloc' requests from \p coarsegrained_pool.
/// Returns an HSA error from symbol lookup or symbol queries; most other
/// failures terminate the process via handle_error.
template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc::Server &server,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           uint32_t wavefront_size, bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU agent
  // to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values. The
  // implicit block sits immediately after the explicit arguments; this assumes
  // the reported kernarg segment size covers both regions — TODO confirm for
  // all code object versions in use.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  // NOTE(review): the implicit grid_size_* fields are set to the block counts
  // here, while the dispatch packet below uses blocks * threads — confirm this
  // is what the device-side start code expects.
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue, busy-waiting for a slot to free up if the
  // ring buffer is currently full.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  // The queue size is a power of two, so masking the id yields the slot.
  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the dispatch packet for execution on the device using the grid and
  // workgroup dimensions requested by the caller.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  // Kernels with a dynamic call stack get a fixed 16 KiB private segment
  // since their true requirement cannot be known statically.
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);

  // Initialize the packet header and set the doorbell signal to begin execution
  // by the HSA runtime. The release store makes the fully-populated packet
  // body visible before the header's type field changes, as the runtime may
  // begin processing the packet as soon as the header is written.
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  // Run the RPC server on a helper thread until the kernel finishes, so that
  // device-initiated requests are serviced while the host blocks on the
  // completion signal below.
  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         uint32_t wavefront_size, hsa_agent_t dev_agent,
         hsa_amd_memory_pool_t coarsegrained_pool) {
        // Register RPC callbacks for the malloc and free functions on HSA.
        // On allocation failure the device receives a null pointer.
        auto malloc_handler = [&](size_t size) -> void * {
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          return dev_ptr;
        };

        auto free_handler = [](void *ptr) -> void {
          if (hsa_status_t err =
                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
            handle_error(err);
        };

        // The RPC buffer layout depends on the wavefront size, so dispatch to
        // the matching server instantiation.
        uint32_t index = 0;
        while (!*finished) {
          if (wavefront_size == 32)
            index =
                handle_server<32>(*server, index, malloc_handler, free_handler);
          else
            index =
                handle_server<64>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);

  // Wait until the kernel has completed execution on the device, indicated by
  // the runtime decrementing the completion signal to zero.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  // Stop the RPC server thread now that the kernel is done.
  finished = true;
  if (server_thread.joinable())
    server_thread.join();

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}
318 
319 /// Copies data from the source agent to the destination agent. The source
320 /// memory must first be pinned explicitly or allocated via HSA.
321 static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
322                                const void *src, hsa_agent_t src_agent,
323                                uint64_t size) {
324   // Create a memory signal to copy information between the host and device.
325   hsa_signal_t memory_signal;
326   if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
327     return err;
328 
329   if (hsa_status_t err = hsa_amd_memory_async_copy(
330           dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
331     return err;
332 
333   while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
334                                    UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
335     ;
336 
337   if (hsa_status_t err = hsa_signal_destroy(memory_signal))
338     return err;
339 
340   return HSA_STATUS_SUCCESS;
341 }
342 
/// Load the device image \p image of \p size bytes onto the first available
/// HSA GPU agent and run its '_begin', '_start', and '_end' kernels, passing
/// copies of \p argv and \p envp to the device. The grid configuration for
/// '_start' comes from \p params; '_begin' and '_end' run single-threaded.
/// Returns the value the device wrote to its return pointer. Any HSA failure
/// terminates the process via handle_error.
int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault, turning
  // the fault into an HSA error instead of a silent hang.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  // The reader is no longer needed once the executable is frozen.
  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // The AMDGPU target can change its wavefront size. There currently isn't a
  // good way to look this up through the HSA API so we use the LLVM interface
  // to parse the per-kernel metadata out of the image itself.
  uint16_t abi_version;
  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
    handle_error(llvm::toString(std::move(err)).c_str());
  }

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  // NOTE(review): the fill status is ignored here — confirm a failure is
  // acceptable to drop, or route it through handle_error.
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Pick the largest wavefront size used by any kernel in the image; the RPC
  // buffer must be sized for the widest possible warp/wavefront.
  uint32_t wavefront_size =
      llvm::max_element(info_map, [](auto &&x, auto &&y) {
        return x.second.WavefrontSize < y.second.WavefrontSize;
      })->second.WavefrontSize;

  // Set up the RPC server in fine-grained memory shared between the RPC
  // server (host) and client (device).
  void *rpc_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          finegrained_pool,
          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
          /*flags=*/0, &rpc_buffer))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);

  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, "__llvm_libc_rpc_client", &dev_agent, &rpc_client_sym))
    handle_error(err);

  // Staging buffer on the host for the device-side client pointer.
  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);

  // Pin the stack-resident client object so HSA can DMA from it.
  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, sizeof(rpc::Client)))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
  // If the clock_freq symbol is missing, no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    // Only copy the frequency over if the agent actually reports one.
    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  // Run the global constructors via the '_begin' kernel, single-threaded.
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_begin.kd", init_args,
          info_map["_begin"].WavefrontSize, print_resource_usage))
    handle_error(err);

  // Run the application entry point via '_start' with the caller's grid.
  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
          print_resource_usage))
    handle_error(err);

  // Copy the return value out of device memory via a pinned host buffer.
  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  // Run the global destructors via the '_end' kernel, single-threaded.
  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_end.kd", fini_args,
          info_map["_end"].WavefrontSize, print_resource_usage))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}
619