/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* Additional definitions not in HSA 1.1.
   FIXME: this needs to be updated in hsa.h for upstream, but the only source
          right now is the ROCr source which may cause license issues.  */
#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002

/* These probably won't be in elf.h for a while.  */
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define reserved		12
#define R_AMDGPU_RELATIVE64	13	/* B + A  */

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>
/* Implementation of secure_getenv() for targets where it is not provided but
   we at least have the means to test the real and effective IDs.  */
92 
93 static char *
94 secure_getenv (const char *name)
95 {
96   if ((getuid () == geteuid ()) && (getgid () == getegid ()))
97     return getenv (name);
98   else
99     return NULL;
100 }
101 
102 #else
103 #define secure_getenv getenv
104 #endif
105 #endif
106 
107 /* }}}  */
108 /* {{{ Types  */
109 
110 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
111 
112 struct gcn_thread
113 {
114   /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
115   int async;
116 };
117 
/* As the HSA runtime is dlopened, the following structure defines the
   function pointers that the plug-in uses from it.  */
120 
121 struct hsa_runtime_fn_info
122 {
123   /* HSA runtime.  */
124   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
125 					const char **status_string);
126   hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
127 					  void *value);
128   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
129 					 hsa_agent_info_t attribute,
130 					 void *value);
131   hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
132 				      hsa_isa_info_t attribute,
133 				      uint32_t index,
134 				      void *value);
135   hsa_status_t (*hsa_init_fn) (void);
136   hsa_status_t (*hsa_iterate_agents_fn)
137     (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
138   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
139 					  hsa_region_info_t attribute,
140 					  void *value);
141   hsa_status_t (*hsa_queue_create_fn)
142     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
143      void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
144      void *data, uint32_t private_segment_size,
145      uint32_t group_segment_size, hsa_queue_t **queue);
146   hsa_status_t (*hsa_agent_iterate_regions_fn)
147     (hsa_agent_t agent,
148      hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
149   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
150   hsa_status_t (*hsa_executable_create_fn)
151     (hsa_profile_t profile, hsa_executable_state_t executable_state,
152      const char *options, hsa_executable_t *executable);
153   hsa_status_t (*hsa_executable_global_variable_define_fn)
154     (hsa_executable_t executable, const char *variable_name, void *address);
155   hsa_status_t (*hsa_executable_load_code_object_fn)
156     (hsa_executable_t executable, hsa_agent_t agent,
157      hsa_code_object_t code_object, const char *options);
158   hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
159 					   const char *options);
160   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
161 					uint32_t num_consumers,
162 					const hsa_agent_t *consumers,
163 					hsa_signal_t *signal);
164   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
165 					  void **ptr);
166   hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
167 					      hsa_access_permission_t access);
168   hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
169   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
170   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
171   hsa_status_t (*hsa_executable_get_symbol_fn)
172     (hsa_executable_t executable, const char *module_name,
173      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
174      hsa_executable_symbol_t *symbol);
175   hsa_status_t (*hsa_executable_symbol_get_info_fn)
176     (hsa_executable_symbol_t executable_symbol,
177      hsa_executable_symbol_info_t attribute, void *value);
178   hsa_status_t (*hsa_executable_iterate_symbols_fn)
179     (hsa_executable_t executable,
180      hsa_status_t (*callback)(hsa_executable_t executable,
181 			      hsa_executable_symbol_t symbol, void *data),
182      void *data);
183   uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
184 						    uint64_t value);
185   uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
186   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
187 				       hsa_signal_value_t value);
188   void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
189 				       hsa_signal_value_t value);
190   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
191     (hsa_signal_t signal, hsa_signal_condition_t condition,
192      hsa_signal_value_t compare_value, uint64_t timeout_hint,
193      hsa_wait_state_t wait_state_hint);
194   hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
195   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
196 
197   hsa_status_t (*hsa_code_object_deserialize_fn)
198     (void *serialized_code_object, size_t serialized_code_object_size,
199      const char *options, hsa_code_object_t *code_object);
200 };
201 
/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
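
/* Example (illustrative only, not used by the plugin): a one-dimensional
   launch of 256 work-items in work-groups of 64 work-items would be
   described like this.  */

static const struct GOMP_kernel_launch_attributes example_kla
  __attribute__ ((unused)) =
{
  1,			/* ndim  */
  {256, 1, 1},		/* gdims  */
  {64, 1, 1}		/* wdims  */
};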

/* Collection of information needed for a dispatch of a kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for passing kernel arguments.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};
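
/* Illustrative sketch (a hypothetical helper, not part of the plugin): the
   shape of the host-side consumer loop implied by the ring buffer above.
   NEXT_OUTPUT is advanced by the device and CONSUMED by the host; a slot is
   only valid once the device has set its WRITTEN flag.  The real consumer
   additionally decodes TYPE to pick the union member that accompanies MSG,
   and handles wrap-around of the 1024-entry queue.  */

static void __attribute__ ((unused))
example_drain_console (struct kernargs *kernargs)
{
  struct output *data = &kernargs->output_data;
  while (data->consumed < data->next_output && data->consumed < 1024)
    {
      struct printf_data *entry = &data->queue[data->consumed];
      if (!entry->written)
	break;	/* Reserved by the device, but not yet filled in.  */
      fwrite (entry->msg, 1, strnlen (entry->msg, sizeof entry->msg), stderr);
      entry->written = 0;
      ++data->consumed;
    }
}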

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  bool free_src;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};
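
/* Illustrative sketch (a hypothetical helper, not part of the plugin): the
   handshake that the placeholder fields support.  A waiter blocks on COND
   until the queue-drain thread marks the entry EXECUTED and signals it.  */

static void __attribute__ ((unused))
example_wait_placeholder (struct placeholder *placeholderp)
{
  pthread_mutex_lock (&placeholderp->mutex);
  while (!placeholderp->executed)
    pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
  pthread_mutex_unlock (&placeholderp->mutex);
}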

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
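
/* Illustrative sketch (a hypothetical helper, not part of the plugin): how
   the fixed-size ring buffer above is indexed.  QUEUE_FIRST is the oldest
   live entry, QUEUE_N the number of live entries, and indices wrap modulo
   ASYNC_QUEUE_SIZE; MUTEX must be held while touching these fields.  */

static __attribute__ ((unused)) struct queue_entry *
example_queue_last (struct goacc_asyncqueue *aq)
{
  if (aq->queue_n == 0)
    return NULL;
  int last = (aq->queue_first + aq->queue_n - 1) % ASYNC_QUEUE_SIZE;
  return &aq->queue[last];
}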

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

387    loadable module.  These are passed the libgomp via static constructors.  */
388 

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
} EF_AMDGPU_MACH;

static const int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The module the kernel belongs to.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};
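
/* Illustrative sketch (hypothetical, not the plugin's loader code): the heap
   is allocated as sizeof (struct heap) plus N bytes of device memory, SIZE
   is filled in host-side, and the device address ends up in the kernargs
   HEAP_PTR field so that Newlib's malloc can carve allocations from DATA.  */

static void __attribute__ ((unused))
example_init_heap (struct heap *heap, int64_t usable_bytes)
{
  heap->size = usable_bytes;	/* Bytes available in heap->data[].  */
}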

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag deciding whether to print information about what is going on to
   stderr.  Set in init_environment_variables from the environment.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the HSA runtime shared library that is dlopened by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (which can
   be a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG is set in the environment.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG is set in the environment.  */

#define DEBUG_FLUSH()				\
  do {						\
    if (debug)					\
      fflush (stderr);				\
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG is set in
   the environment.  */

#define DEBUG_LOG(prefix, ...)			\
  do						\
    {						\
      DEBUG_PRINT (prefix);			\
      DEBUG_PRINT (__VA_ARGS__);		\
      DEBUG_FLUSH ();				\
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message, and return FALSE
   so that error handling can be propagated to the plugin's caller.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump information about the available hardware.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about the available hardware.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void*)kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debugging and other configuration flags according to the
   environment variables.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }
}
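
/* Example usage (illustrative): the variables read above permit runs such as

     GCN_DEBUG=1 GCN_HEAP_SIZE=268435456 GCN_NUM_TEAMS=4 GCN_NUM_THREADS=4 a.out

   Note that atoi/atol do not diagnose malformed input: a non-numeric value
   silently yields zero, which selects the default behaviour.  */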

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Helper function for find_executable_symbol.  */

static hsa_status_t
find_executable_symbol_1 (hsa_executable_t executable,
			  hsa_executable_symbol_t symbol,
			  void *data)
{
  hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
  *res = symbol;
  return HSA_STATUS_INFO_BREAK;
}

/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  If not
   found, return false.  */

static bool
find_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t *symbol)
{
  hsa_status_t status;

  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 find_executable_symbol_1,
						 symbol);
  if (status != HSA_STATUS_INFO_BREAK)
    {
      hsa_error ("Could not find executable symbol", status);
      return false;
    }

  return true;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME: do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If no launch attributes are found, fill
   DEF with default values and store DEF into *RESULT; otherwise store the
   attributes found in INPUT there.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1; /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
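
/* Worked example (hypothetical values): for a request of four teams with the
   default thread limit, libgomp would pass something like the following, from
   which the loop above extracts ID and VAL and, on a non-Vega device, fills
   DEF with gdims = {4, 64, 16} and wdims = {1, 64, 16}.  */

static void *example_target_args[] __attribute__ ((unused)) =
{
  (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
			| GOMP_TARGET_ARG_NUM_TEAMS
			| (4 << GOMP_TARGET_ARG_VALUE_SHIFT)),
  NULL
};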

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t *packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
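
/* Example of the intended use (a sketch; the plugin's dispatch code follows
   this pattern): the first 32 bits of an AQL packet hold the 16-bit header
   and the 16-bit setup field, and the packet processor may begin executing
   the packet as soon as they are written, so both must become visible in a
   single release store:

     uint16_t header
       = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
     uint16_t setup = (uint16_t) kla->ndim
		      << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
     packet_store_release ((uint32_t *) packet, header, setup);  */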

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}

/* Return true if the agent is a GPU that can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign its
   handle to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int)major,
			  (unsigned short int)minor);
  if (printed >= (int) len)
    GCN_WARNING ("HSA runtime version string was truncated.  "
		 "Version %hu.%hu is too long.", (unsigned short int)major,
		 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
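
/* Illustrative sketch (a hypothetical helper; agent initialization performs
   this scan): the callbacks above are driven by hsa_agent_iterate_regions
   and stop the iteration with HSA_STATUS_INFO_BREAK once a matching region
   is found, so a successful search reports INFO_BREAK rather than
   HSA_STATUS_SUCCESS.  */

static bool __attribute__ ((unused))
example_find_kernarg_region (struct agent_info *agent)
{
  hsa_status_t status
    = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					    get_kernarg_memory_region,
					    &agent->kernarg_region);
  return status == HSA_STATUS_INFO_BREAK;
}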

/* Return the ISA field from the e_flags of a GCN ELF image.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

static const char *gcn_gfx803_s = "gfx803";
static const char *gcn_gfx900_s = "gfx900";
static const char *gcn_gfx906_s = "gfx906";
static const int gcn_isa_name_len = 6;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char *
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char *
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa)
{
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  return -1;
}

/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   Allocating an arena also costs performance, albeit on the host side, so
   this function will reuse an existing arena if a large enough one is idle.
   The arena is released, but not deallocated, when the kernel exits.  */

static void *
get_team_arena (struct agent_info *agent, int num_teams)
{
  struct team_arena_list **next_ptr = &agent->team_arena_list;
  struct team_arena_list *item;

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->num_teams < num_teams)
	continue;

      if (pthread_mutex_trylock (&item->in_use))
	continue;

      return item->arena;
    }

  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);

1718   if (pthread_mutex_lock (&agent->team_arena_write_lock))
1719     {
1720       GOMP_PLUGIN_error ("Could not lock a GCN team arena write mutex");
1721       return NULL;
1722     }
1723   item = GOMP_PLUGIN_malloc (sizeof (*item));
1724   item->num_teams = num_teams;
1725   item->next = NULL;
1726   *next_ptr = item;
1727 
1728   if (pthread_mutex_init (&item->in_use, NULL))
1729     {
1730       GOMP_PLUGIN_error ("Failed to initialize a GCN team arena mutex");
1731       return NULL;
1732     }
1733   if (pthread_mutex_lock (&item->in_use))
1734     {
1735       GOMP_PLUGIN_error ("Could not lock a GCN team arena mutex");
1736       return NULL;
1737     }
1738   if (pthread_mutex_unlock (&agent->team_arena_write_lock))
1739     {
1740       GOMP_PLUGIN_error ("Could not unlock a GCN team arena write mutex");
1741       return NULL;
1742     }
1743 
1744   const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
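  /* Presumably the device-side runtime carves the allocation below into one
     TEAM_ARENA_SIZE slice per team, hence the num_teams multiplier.  */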
1745   hsa_status_t status;
1746   status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1747 					   TEAM_ARENA_SIZE*num_teams,
1748 					   &item->arena);
1749   if (status != HSA_STATUS_SUCCESS)
1750     hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1751   status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1752 					       HSA_ACCESS_PERMISSION_RW);
1753   if (status != HSA_STATUS_SUCCESS)
1754     hsa_fatal ("Could not assign arena memory to device", status);
1755 
1756   return item->arena;
1757 }
1758 
1759 /* Mark a team arena available for reuse.  */
1760 
1761 static void
1762 release_team_arena (struct agent_info* agent, void *arena)
1763 {
1764   struct team_arena_list *item;
1765 
1766   for (item = agent->team_arena_list; item; item = item->next)
1767     {
1768       if (item->arena == arena)
1769 	{
1770 	  if (pthread_mutex_unlock (&item->in_use))
1771 	    GOMP_PLUGIN_error ("Could not unlock a GCN team arena mutex");
1772 	  return;
1773 	}
1774     }
1775   GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1776 }
1777 
1778 /* Clean up all the allocated team arenas.  */
1779 
1780 static bool
1781 destroy_team_arenas (struct agent_info *agent)
1782 {
1783   struct team_arena_list *item, *next;
1784 
1785   for (item = agent->team_arena_list; item; item = next)
1786     {
1787       next = item->next;
1788       hsa_fns.hsa_memory_free_fn (item->arena);
1789       if (pthread_mutex_destroy (&item->in_use))
1790 	{
1791 	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1792 	  return false;
1793 	}
1794       free (item);
1795     }
1796   agent->team_arena_list = NULL;
1797 
1798   return true;
1799 }
1800 
1801 /* Allocate memory on a specified device.  */
1802 
1803 static void *
1804 alloc_by_agent (struct agent_info *agent, size_t size)
1805 {
1806   GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1807 
1808   /* Zero-size allocations are invalid, so in order to return a valid pointer
1809      we need to pass a valid size.  One source of zero-size allocations is
1810      kernargs for kernels that have no inputs or outputs (the kernel may
1811      only use console output, for example).  */
1812   if (size == 0)
1813     size = 4;
1814 
1815   void *ptr;
1816   hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1817 							size, &ptr);
1818   if (status != HSA_STATUS_SUCCESS)
1819     {
1820       hsa_error ("Could not allocate device memory", status);
1821       return NULL;
1822     }
1823 
1824   status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1825 					       HSA_ACCESS_PERMISSION_RW);
1826   if (status != HSA_STATUS_SUCCESS)
1827     {
1828       hsa_error ("Could not assign data memory to device", status);
1829       return NULL;
1830     }
1831 
1832   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1833   bool profiling_dispatch_p
1834     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1835   if (profiling_dispatch_p)
1836     {
1837       acc_prof_info *prof_info = thr->prof_info;
1838       acc_event_info data_event_info;
1839       acc_api_info *api_info = thr->api_info;
1840 
1841       prof_info->event_type = acc_ev_alloc;
1842 
1843       data_event_info.data_event.event_type = prof_info->event_type;
1844       data_event_info.data_event.valid_bytes
1845 	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
1846       data_event_info.data_event.parent_construct
1847 	= acc_construct_parallel;
1848       data_event_info.data_event.implicit = 1;
1849       data_event_info.data_event.tool_info = NULL;
1850       data_event_info.data_event.var_name = NULL;
1851       data_event_info.data_event.bytes = size;
1852       data_event_info.data_event.host_ptr = NULL;
1853       data_event_info.data_event.device_ptr = (void *) ptr;
1854 
1855       api_info->device_api = acc_device_api_other;
1856 
1857       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1858 					    api_info);
1859     }
1860 
1861   return ptr;
1862 }
1863 
1864 /* Create kernel dispatch data structure for given KERNEL, along with
1865    the necessary device signals and memory allocations.  */
1866 
1867 static struct kernel_dispatch *
1868 create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1869 {
1870   struct agent_info *agent = kernel->agent;
1871   struct kernel_dispatch *shadow
1872     = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1873 
1874   shadow->agent = kernel->agent;
1875   shadow->object = kernel->object;
1876 
1877   hsa_signal_t sync_signal;
1878   hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1879   if (status != HSA_STATUS_SUCCESS)
1880     hsa_fatal ("Error creating the GCN sync signal", status);
1881 
1882   shadow->signal = sync_signal.handle;
1883   shadow->private_segment_size = kernel->private_segment_size;
1884   shadow->group_segment_size = kernel->group_segment_size;
1885 
1886   /* We expect kernels to request a single pointer, explicitly, and the
1887      rest of struct kernargs, implicitly.  If they request anything else
1888      then something is wrong.  */
1889   if (kernel->kernarg_segment_size > 8)
1890     {
1891       GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1892       return NULL;
1893     }
1894 
1895   status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1896 					   sizeof (struct kernargs),
1897 					   &shadow->kernarg_address);
1898   if (status != HSA_STATUS_SUCCESS)
1899     hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1900   struct kernargs *kernargs = shadow->kernarg_address;
1901 
1902   /* Zero-initialize the output_data (minimum needed).  */
1903   kernargs->out_ptr = (int64_t)&kernargs->output_data;
1904   kernargs->output_data.next_output = 0;
1905   for (unsigned i = 0;
1906        i < (sizeof (kernargs->output_data.queue)
1907 	    / sizeof (kernargs->output_data.queue[0]));
1908        i++)
1909     kernargs->output_data.queue[i].written = 0;
1910   kernargs->output_data.consumed = 0;
1911 
1912   /* Pass in the heap location.  */
1913   kernargs->heap_ptr = (int64_t)kernel->module->heap;
1914 
1915   /* Create an arena.  */
1916   if (kernel->kind == KIND_OPENMP)
1917     kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1918   else
1919     kernargs->arena_ptr = 0;
1920 
1921   /* Ensure we can recognize unset return values.  */
1922   kernargs->output_data.return_value = 0xcafe0000;
1923 
1924   return shadow;
1925 }
1926 
1927 /* Output any data written to console output from the kernel.  It is expected
1928    that this function is polled during kernel execution.
1929 
1930    We print all entries from the last item printed to the next entry without
1931    a "written" flag.  If the "final" flag is set then it'll continue right to
1932    the end.
1933 
1934    The print buffer is circular, but the from and to locations don't wrap when
1935    the buffer does, so the output limit is UINT_MAX.  The target blocks on
1936    output when the buffer is full.  */
1937 
1938 static void
1939 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1940 		bool final)
1941 {
1942   unsigned int limit = (sizeof (kernargs->output_data.queue)
1943 			/ sizeof (kernargs->output_data.queue[0]));
1944 
1945   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1946 				       __ATOMIC_ACQUIRE);
1947   unsigned int to = kernargs->output_data.next_output;
1948 
1949   if (from > to)
1950     {
1951       /* Overflow.  */
1952       if (final)
1953 	printf ("GCN print buffer overflowed.\n");
1954       return;
1955     }
1956 
1957   unsigned int i;
1958   for (i = from; i < to; i++)
1959     {
1960       struct printf_data *data = &kernargs->output_data.queue[i%limit];
1961 
1962       if (!data->written && !final)
1963 	break;
1964 
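      /* The type field selects the payload: 0 = integer, 1 = double,
	 2 = string plus newline, 3 = string without newline.  Each value is
	 printed after the fixed message prefix in data->msg.  */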
1965       switch (data->type)
1966 	{
1967 	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1968 	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1969 	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1970 	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1971 	default: printf ("GCN print buffer error!\n"); break;
1972 	}
1973       data->written = 0;
1974       __atomic_store_n (&kernargs->output_data.consumed, i+1,
1975 			__ATOMIC_RELEASE);
1976     }
1977   fflush (stdout);
1978 }
1979 
1980 /* Release data structure created for a kernel dispatch in SHADOW argument,
1981    and clean up the signal and memory allocations.  */
1982 
1983 static void
1984 release_kernel_dispatch (struct kernel_dispatch *shadow)
1985 {
1986   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1987 
1988   struct kernargs *kernargs = shadow->kernarg_address;
1989   void *arena = (void *)kernargs->arena_ptr;
1990   if (arena)
1991     release_team_arena (shadow->agent, arena);
1992 
1993   hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1994 
1995   hsa_signal_t s;
1996   s.handle = shadow->signal;
1997   hsa_fns.hsa_signal_destroy_fn (s);
1998 
1999   free (shadow);
2000 }
2001 
2002 /* Extract the properties from a kernel binary.  */
2003 
2004 static void
2005 init_kernel_properties (struct kernel_info *kernel)
2006 {
2007   hsa_status_t status;
2008   struct agent_info *agent = kernel->agent;
2009   hsa_executable_symbol_t kernel_symbol;
2010   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2011 						 kernel->name, agent->id,
2012 						 0, &kernel_symbol);
2013   if (status != HSA_STATUS_SUCCESS)
2014     {
2015       hsa_warn ("Could not find symbol for kernel in the code object", status);
2016       fprintf (stderr, "kernel name not found: '%s'\n", kernel->name);
2017       dump_executable_symbols (agent->executable);
2018       goto failure;
2019     }
2020   GCN_DEBUG ("Located kernel %s\n", kernel->name);
2021   status = hsa_fns.hsa_executable_symbol_get_info_fn
2022     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2023   if (status != HSA_STATUS_SUCCESS)
2024     hsa_fatal ("Could not extract a kernel object from its symbol", status);
2025   status = hsa_fns.hsa_executable_symbol_get_info_fn
2026     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2027      &kernel->kernarg_segment_size);
2028   if (status != HSA_STATUS_SUCCESS)
2029     hsa_fatal ("Could not get info about kernel argument size", status);
2030   status = hsa_fns.hsa_executable_symbol_get_info_fn
2031     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2032      &kernel->group_segment_size);
2033   if (status != HSA_STATUS_SUCCESS)
2034     hsa_fatal ("Could not get info about kernel group segment size", status);
2035   status = hsa_fns.hsa_executable_symbol_get_info_fn
2036     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2037      &kernel->private_segment_size);
2038   if (status != HSA_STATUS_SUCCESS)
2039     hsa_fatal ("Could not get info about kernel private segment size",
2040 	       status);
2041 
2042   /* The kernel type is not known until something tries to launch it.  */
2043   kernel->kind = KIND_UNKNOWN;
2044 
2045   GCN_DEBUG ("Kernel structure for %s fully initialized with "
2046 	     "the following segment sizes:\n", kernel->name);
2047   GCN_DEBUG ("  group_segment_size: %u\n",
2048 	     (unsigned) kernel->group_segment_size);
2049   GCN_DEBUG ("  private_segment_size: %u\n",
2050 	     (unsigned) kernel->private_segment_size);
2051   GCN_DEBUG ("  kernarg_segment_size: %u\n",
2052 	     (unsigned) kernel->kernarg_segment_size);
2053   return;
2054 
2055 failure:
2056   kernel->initialization_failed = true;
2057 }
2058 
2059 /* Do all the work that is necessary before running KERNEL for the first time.
2060    The function assumes the program has been created, finalized and frozen by
2061    create_and_finalize_hsa_program.  */
2062 
2063 static void
2064 init_kernel (struct kernel_info *kernel)
2065 {
2066   if (pthread_mutex_lock (&kernel->init_mutex))
2067     GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2068   if (kernel->initialized)
2069     {
2070       if (pthread_mutex_unlock (&kernel->init_mutex))
2071 	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2072 			   "mutex");
2073 
2074       return;
2075     }
2076 
2077   init_kernel_properties (kernel);
2078 
2079   if (!kernel->initialization_failed)
2080     {
2081       GCN_DEBUG ("\n");
2082 
2083       kernel->initialized = true;
2084     }
2085   if (pthread_mutex_unlock (&kernel->init_mutex))
2086     GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2087 		       "mutex");
2088 }
2089 
2090 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2091    launch attributes from KLA.
2092 
2093    MODULE_LOCKED indicates that the caller already holds the lock and
2094    run_kernel need not lock it again.
2095    If AQ is NULL then agent->sync_queue will be used.  */
2096 
2097 static void
2098 run_kernel (struct kernel_info *kernel, void *vars,
2099 	    struct GOMP_kernel_launch_attributes *kla,
2100 	    struct goacc_asyncqueue *aq, bool module_locked)
2101 {
2102   GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2103 	     kernel->description->vpgr_count);
2104 
2105   /* Reduce the number of threads/workers if there are insufficient
2106      VGPRs available to run the kernels together.  */
2107   if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2108     {
2109       int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
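      /* This assumes GCN hardware limits: VGPRs are allocated in granules
	 of four, each SIMD unit has a 256-entry VGPR file, and a compute
	 unit contains four SIMDs, giving at most (256 / granule) * 4
	 wavefronts per team.  */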
2110       int max_threads = (256 / granulated_vgprs) * 4;
2111       if (kla->gdims[2] > max_threads)
2112 	{
2113 	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2114 		       " per team/gang - reducing to %d threads/workers.\n",
2115 		       kla->gdims[2], max_threads);
2116 	  kla->gdims[2] = max_threads;
2117 	}
2118     }
2119 
2120   GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2121 	     (aq ? aq->id : 0));
2122   GCN_DEBUG ("GCN launch attribs: gdims:[");
2123   int i;
2124   for (i = 0; i < kla->ndim; ++i)
2125     {
2126       if (i)
2127 	DEBUG_PRINT (", ");
2128       DEBUG_PRINT ("%u", kla->gdims[i]);
2129     }
2130   DEBUG_PRINT ("], normalized gdims:[");
2131   for (i = 0; i < kla->ndim; ++i)
2132     {
2133       if (i)
2134 	DEBUG_PRINT (", ");
2135       DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2136     }
2137   DEBUG_PRINT ("], wdims:[");
2138   for (i = 0; i < kla->ndim; ++i)
2139     {
2140       if (i)
2141 	DEBUG_PRINT (", ");
2142       DEBUG_PRINT ("%u", kla->wdims[i]);
2143     }
2144   DEBUG_PRINT ("]\n");
2145   DEBUG_FLUSH ();
2146 
2147   struct agent_info *agent = kernel->agent;
2148   if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2149     GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2150 
2151   if (!agent->initialized)
2152     GOMP_PLUGIN_fatal ("Agent must be initialized");
2153 
2154   if (!kernel->initialized)
2155     GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2156 
2157   hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2158 
2159   uint64_t index
2160     = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2161   GCN_DEBUG ("Got AQL index %llu\n", (unsigned long long) index);
2162 
2163   /* Wait until the queue is not full before writing the packet.   */
2164   while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2165 	 >= command_q->size)
2166     ;
2167 
2168   /* Do not allow the dimensions to be overridden when running
2169      constructors or destructors.  */
2170   int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2171   int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2172 
2173   hsa_kernel_dispatch_packet_t *packet;
2174   packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2175 	   + index % command_q->size;
2176 
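  /* Clear everything except the first 32 bits: the 16-bit header and setup
     words are written last, atomically, by packet_store_release below,
     because writing the header is what hands the packet to the device.
     Note that "a ?: b" below is the GNU extension that yields A if it is
     non-zero, and B otherwise.  */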
2177   memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2178   packet->grid_size_x = override_x ? : kla->gdims[0];
2179   packet->workgroup_size_x = get_group_size (kla->ndim,
2180 					     packet->grid_size_x,
2181 					     kla->wdims[0]);
2182 
2183   if (kla->ndim >= 2)
2184     {
2185       packet->grid_size_y = kla->gdims[1];
2186       packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2187 						 kla->wdims[1]);
2188     }
2189   else
2190     {
2191       packet->grid_size_y = 1;
2192       packet->workgroup_size_y = 1;
2193     }
2194 
2195   if (kla->ndim == 3)
2196     {
2197       packet->grid_size_z = limit_worker_threads (override_z
2198 						  ? : kla->gdims[2]);
2199       packet->workgroup_size_z = get_group_size (kla->ndim,
2200 						 packet->grid_size_z,
2201 						 kla->wdims[2]);
2202     }
2203   else
2204     {
2205       packet->grid_size_z = 1;
2206       packet->workgroup_size_z = 1;
2207     }
2208 
2209   GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2210 	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2211 	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2212 	     packet->grid_size_x / packet->workgroup_size_x,
2213 	     packet->grid_size_y / packet->workgroup_size_y,
2214 	     packet->grid_size_z / packet->workgroup_size_z,
2215 	     packet->workgroup_size_x, packet->workgroup_size_y,
2216 	     packet->workgroup_size_z);
2217 
2218   struct kernel_dispatch *shadow
2219     = create_kernel_dispatch (kernel, packet->grid_size_x);
2220   shadow->queue = command_q;
2221 
2222   if (debug)
2223     {
2224       fprintf (stderr, "\nKernel has following dependencies:\n");
2225       print_kernel_dispatch (shadow, 2);
2226     }
2227 
2228   packet->private_segment_size = kernel->private_segment_size;
2229   packet->group_segment_size = kernel->group_segment_size;
2230   packet->kernel_object = kernel->object;
2231   packet->kernarg_address = shadow->kernarg_address;
2232   hsa_signal_t s;
2233   s.handle = shadow->signal;
2234   packet->completion_signal = s;
2235   hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2236   memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2237 
2238   GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2239 
2240   uint16_t header;
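  /* Compose the AQL packet header: a kernel dispatch with system-scope
     acquire and release fences, so that device memory is coherent with the
     host before and after the kernel runs.  */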
2241   header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2242   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2243   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2244 
2245   GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2246 	     agent->device_id);
2247 
2248   packet_store_release ((uint32_t *) packet, header,
2249 			(uint16_t) kla->ndim
2250 			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2251 
2252   hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2253 				       index);
2254 
2255   GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2256 
2257   /* Root signal waits with 1ms timeout.  */
2258   while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2259 					     1000 * 1000,
2260 					     HSA_WAIT_STATE_BLOCKED) != 0)
2261     {
2262       console_output (kernel, shadow->kernarg_address, false);
2263     }
2264   console_output (kernel, shadow->kernarg_address, true);
2265 
2266   struct kernargs *kernargs = shadow->kernarg_address;
2267   unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2268 
2269   release_kernel_dispatch (shadow);
2270 
2271   if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2272     GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2273 
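  /* Decode the return value.  create_kernel_dispatch seeded it with
     0xcafe0000, so 0xcafe in the top half means the kernel never called
     exit.  A top half of 0xffff (written by the device-side exit support,
     as the decoding below implies) carries a signal number in bits 8-15
     and an exit status in bits 0-7.  */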
2274   unsigned int upper = (return_value & ~0xffff) >> 16;
2275   if (upper == 0xcafe)
2276     ; /* exit not called; normal termination.  */
2277   else if (upper == 0xffff)
2278     ; /* exit called.  */
2279   else
2280     {
2281       GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2282 			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2283 			 return_value);
2284       abort ();
2285     }
2286 
2287   if (upper == 0xffff)
2288     {
2289       unsigned int signal = (return_value >> 8) & 0xff;
2290 
2291       if (signal == SIGABRT)
2292 	{
2293 	  GCN_WARNING ("GCN Kernel aborted\n");
2294 	  abort ();
2295 	}
2296       else if (signal != 0)
2297 	{
2298 	  GCN_WARNING ("GCN Kernel received unknown signal\n");
2299 	  abort ();
2300 	}
2301 
2302       GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2303       exit (return_value & 0xff);
2304     }
2305 }
2306 
2307 /* }}}  */
2308 /* {{{ Load/Unload  */
2309 
2310 /* Initialize KERNEL from D and other parameters.  Return true on success. */
2311 
2312 static bool
2313 init_basic_kernel_info (struct kernel_info *kernel,
2314 			struct hsa_kernel_description *d,
2315 			struct agent_info *agent,
2316 			struct module_info *module)
2317 {
2318   kernel->agent = agent;
2319   kernel->module = module;
2320   kernel->name = d->name;
2321   kernel->description = d;
2322   if (pthread_mutex_init (&kernel->init_mutex, NULL))
2323     {
2324       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2325       return false;
2326     }
2327   return true;
2328 }
2329 
2330 /* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true.  If
2331    not found, return false.  */
2332 
2333 static bool
2334 find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
2335 		  struct module_info *module, Elf64_Ehdr *image,
2336 		  Elf64_Shdr *sections)
2337 {
2338   bool res = false;
2339 
2340   hsa_status_t status;
2341 
2342   hsa_executable_symbol_t symbol;
2343   if (!find_executable_symbol (agent->executable, &symbol))
2344     return false;
2345 
2346   status = hsa_fns.hsa_executable_symbol_get_info_fn
2347     (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
2348   if (status != HSA_STATUS_SUCCESS)
2349     {
2350       hsa_error ("Could not extract symbol address", status);
2351       return false;
2352     }
2353 
2354   char *symbol_name = get_executable_symbol_name (symbol);
2355   if (symbol_name == NULL)
2356     return false;
2357 
2358   /* Find the kernel function in ELF, and calculate actual load offset.  */
2359   for (int i = 0; i < image->e_shnum; i++)
2360     if (sections[i].sh_type == SHT_SYMTAB)
2361       {
2362 	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
2363 	char *strings = (char *)image + strtab->sh_offset;
2364 
2365 	for (size_t offset = 0;
2366 	     offset < sections[i].sh_size;
2367 	     offset += sections[i].sh_entsize)
2368 	  {
2369 	    Elf64_Sym *sym = (Elf64_Sym*)((char*)image
2370 					  + sections[i].sh_offset
2371 					  + offset);
2372 	    if (strcmp (symbol_name, strings + sym->st_name) == 0)
2373 	      {
2374 		*load_offset -= sym->st_value;
2375 		res = true;
2376 		break;
2377 	      }
2378 	  }
2379       }
2380 
2381   free (symbol_name);
2382   return res;
2383 }
2384 
2385 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2386 
2387 static bool
2388 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2389 {
2390   int isa_field = elf_gcn_isa_field (image);
2391   const char* isa_s = isa_hsa_name (isa_field);
2392   if (!isa_s)
2393     {
2394       hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2395       return false;
2396     }
2397 
2398   if (isa_field != agent->device_isa)
2399     {
2400       char msg[120];
2401       const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2402       const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2403       assert (agent_isa_s);
2404       assert (agent_isa_gcc_s);
2405 
2406       snprintf (msg, sizeof msg,
2407 		"GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2408 		"Try to recompile with '-foffload=-march=%s'.\n",
2409 		isa_s, agent_isa_s, agent_isa_gcc_s);
2410 
2411       hsa_error (msg, HSA_STATUS_ERROR);
2412       return false;
2413     }
2414 
2415   return true;
2416 }
2417 
2418 /* Create and finalize the program consisting of all loaded modules.  */
2419 
2420 static bool
2421 create_and_finalize_hsa_program (struct agent_info *agent)
2422 {
2423   hsa_status_t status;
2424   int reloc_count = 0;
2425   bool res = true;
2426   if (pthread_mutex_lock (&agent->prog_mutex))
2427     {
2428       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2429       return false;
2430     }
2431   if (agent->prog_finalized)
2432     goto final;
2433 
2434   status
2435     = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2436 					HSA_EXECUTABLE_STATE_UNFROZEN,
2437 					"", &agent->executable);
2438   if (status != HSA_STATUS_SUCCESS)
2439     {
2440       hsa_error ("Could not create GCN executable", status);
2441       goto fail;
2442     }
2443 
2444   /* Load any GCN modules.  */
2445   struct module_info *module = agent->module;
2446   if (module)
2447     {
2448       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2449 
2450       if (!isa_matches_agent (agent, image))
2451 	goto fail;
2452 
2453       /* Hide relocations from the HSA runtime loader.
2454 	 Keep a copy of the unmodified section headers to use later.  */
2455       Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
2456 						  + image->e_shoff);
2457       for (int i = image->e_shnum - 1; i >= 0; i--)
2458 	{
2459 	  if (image_sections[i].sh_type == SHT_RELA
2460 	      || image_sections[i].sh_type == SHT_REL)
2461 	    /* Change section type to something harmless.  */
2462 	    image_sections[i].sh_type |= 0x80;
2463 	}
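      /* ORing in 0x80 produces a section type the HSA loader does not
	 recognize and therefore ignores; the relocation fix-up pass below
	 finds these sections again by matching SHT_RELA | 0x80.  */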
2464 
2465       hsa_code_object_t co = { 0 };
2466       status = hsa_fns.hsa_code_object_deserialize_fn
2467 	(module->image_desc->gcn_image->image,
2468 	 module->image_desc->gcn_image->size,
2469 	 NULL, &co);
2470       if (status != HSA_STATUS_SUCCESS)
2471 	{
2472 	  hsa_error ("Could not deserialize GCN code object", status);
2473 	  goto fail;
2474 	}
2475 
2476       status = hsa_fns.hsa_executable_load_code_object_fn
2477 	(agent->executable, agent->id, co, "");
2478       if (status != HSA_STATUS_SUCCESS)
2479 	{
2480 	  hsa_error ("Could not load GCN code object", status);
2481 	  goto fail;
2482 	}
2483 
2484       if (!module->heap)
2485 	{
2486 	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2487 						   gcn_kernel_heap_size,
2488 						   (void**)&module->heap);
2489 	  if (status != HSA_STATUS_SUCCESS)
2490 	    {
2491 	      hsa_error ("Could not allocate memory for GCN heap", status);
2492 	      goto fail;
2493 	    }
2494 
2495 	  status = hsa_fns.hsa_memory_assign_agent_fn
2496 			(module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2497 	  if (status != HSA_STATUS_SUCCESS)
2498 	    {
2499 	      hsa_error ("Could not assign GCN heap memory to device", status);
2500 	      goto fail;
2501 	    }
2502 
2503 	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2504 				      &gcn_kernel_heap_size,
2505 				      sizeof (gcn_kernel_heap_size));
2506 	}
2507 
2508     }
2509 
2510   if (debug)
2511     dump_executable_symbols (agent->executable);
2512 
2513   status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2514   if (status != HSA_STATUS_SUCCESS)
2515     {
2516       hsa_error ("Could not freeze the GCN executable", status);
2517       goto fail;
2518     }
2519 
2520   if (agent->module)
2521     {
2522       struct module_info *module = agent->module;
2523       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2524       Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
2525 
2526       Elf64_Addr load_offset;
2527       if (!find_load_offset (&load_offset, agent, module, image, sections))
2528 	goto fail;
2529 
2530       /* Record the physical load address range.
2531 	 We need this for data copies later.  */
2532       Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
2533       Elf64_Addr low = ~0, high = 0;
2534       for (int i = 0; i < image->e_phnum; i++)
2535 	if (segments[i].p_memsz > 0)
2536 	  {
2537 	    if (segments[i].p_paddr < low)
2538 	      low = segments[i].p_paddr;
2539 	    if (segments[i].p_paddr > high)
2540 	      high = segments[i].p_paddr + segments[i].p_memsz - 1;
2541 	  }
2542       module->phys_address_start = low + load_offset;
2543       module->phys_address_end = high + load_offset;
2544 
2545       /* Find the dynamic symbol table.  */
2546       Elf64_Shdr *dynsym = NULL;
2547       for (int i = 0; i < image->e_shnum; i++)
2548 	if (sections[i].sh_type == SHT_DYNSYM)
2549 	  {
2550 	    dynsym = &sections[i];
2551 	    break;
2552 	  }
2553 
2554       /* Fix up relocations.  */
2555       for (int i = 0; i < image->e_shnum; i++)
2556 	{
2557 	  if (sections[i].sh_type == (SHT_RELA | 0x80))
2558 	    for (size_t offset = 0;
2559 		 offset < sections[i].sh_size;
2560 		 offset += sections[i].sh_entsize)
2561 	      {
2562 		Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
2563 						  + sections[i].sh_offset
2564 						  + offset);
2565 		Elf64_Sym *sym =
2566 		  (dynsym
2567 		   ? (Elf64_Sym*)((char*)image
2568 				  + dynsym->sh_offset
2569 				  + (dynsym->sh_entsize
2570 				     * ELF64_R_SYM (reloc->r_info)))
2571 		   : NULL);
2572 
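		/* Standard ELF relocation inputs: S = symbol value,
		   A = addend, P = place (the address being patched) and
		   B = the image base, i.e. the load offset.  */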
2573 		int64_t S = (sym ? sym->st_value : 0);
2574 		int64_t P = reloc->r_offset + load_offset;
2575 		int64_t A = reloc->r_addend;
2576 		int64_t B = load_offset;
2577 		int64_t V, size;
2578 		switch (ELF64_R_TYPE (reloc->r_info))
2579 		  {
2580 		  case R_AMDGPU_ABS32_LO:
2581 		    V = (S + A) & 0xFFFFFFFF;
2582 		    size = 4;
2583 		    break;
2584 		  case R_AMDGPU_ABS32_HI:
2585 		    V = (S + A) >> 32;
2586 		    size = 4;
2587 		    break;
2588 		  case R_AMDGPU_ABS64:
2589 		    V = S + A;
2590 		    size = 8;
2591 		    break;
2592 		  case R_AMDGPU_REL32:
2593 		    V = S + A - P;
2594 		    size = 4;
2595 		    break;
2596 		  case R_AMDGPU_REL64:
2597 		    /* FIXME
2598 		       LLD seems to emit REL64 where the assembler has
2599 		       ABS64.  This is clearly wrong because it's not what the
2600 		       compiler is expecting.  Let's assume, for now, that
2601 		       it's a bug.  In any case, GCN kernels are always self-
2602 		       contained and therefore relative relocations will have
2603 		       been resolved already, so this should be a safe
2604 		       workaround.  */
2605 		    V = S + A/* - P*/;
2606 		    size = 8;
2607 		    break;
2608 		  case R_AMDGPU_ABS32:
2609 		    V = S + A;
2610 		    size = 4;
2611 		    break;
2612 		    /* TODO R_AMDGPU_GOTPCREL */
2613 		    /* TODO R_AMDGPU_GOTPCREL32_LO */
2614 		    /* TODO R_AMDGPU_GOTPCREL32_HI */
2615 		  case R_AMDGPU_REL32_LO:
2616 		    V = (S + A - P) & 0xFFFFFFFF;
2617 		    size = 4;
2618 		    break;
2619 		  case R_AMDGPU_REL32_HI:
2620 		    V = (S + A - P) >> 32;
2621 		    size = 4;
2622 		    break;
2623 		  case R_AMDGPU_RELATIVE64:
2624 		    V = B + A;
2625 		    size = 8;
2626 		    break;
2627 		  default:
2628 		    fprintf (stderr, "Error: unsupported relocation type.\n");
2629 		    exit (1);
2630 		  }
2631 		status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
2632 		if (status != HSA_STATUS_SUCCESS)
2633 		  {
2634 		    hsa_error ("Failed to fix up relocation", status);
2635 		    goto fail;
2636 		  }
2637 		reloc_count++;
2638 	      }
2639 	}
2640     }
2641 
2642   GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
2643 	     agent->device_id, reloc_count);
2644 
2645 final:
2646   agent->prog_finalized = true;
2647 
2648   if (pthread_mutex_unlock (&agent->prog_mutex))
2649     {
2650       GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2651       res = false;
2652     }
2653 
2654   return res;
2655 
2656 fail:
2657   res = false;
2658   goto final;
2659 }
2660 
2661 /* Free the HSA program in agent and everything associated with it and set
2662    agent->prog_finalized and the initialized flags of all kernels to false.
2663    Return TRUE on success.  */
2664 
2665 static bool
2666 destroy_hsa_program (struct agent_info *agent)
2667 {
2668   if (!agent->prog_finalized)
2669     return true;
2670 
2671   hsa_status_t status;
2672 
2673   GCN_DEBUG ("Destroying the current GCN program.\n");
2674 
2675   status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2676   if (status != HSA_STATUS_SUCCESS)
2677     return hsa_error ("Could not destroy GCN executable", status);
2678 
2679   if (agent->module)
2680     {
2681       int i;
2682       for (i = 0; i < agent->module->kernel_count; i++)
2683 	agent->module->kernels[i].initialized = false;
2684 
2685       if (agent->module->heap)
2686 	{
2687 	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
2688 	  agent->module->heap = NULL;
2689 	}
2690     }
2691   agent->prog_finalized = false;
2692   return true;
2693 }
2694 
2695 /* Deinitialize all information associated with MODULE and kernels within
2696    it.  Return TRUE on success.  */
2697 
2698 static bool
2699 destroy_module (struct module_info *module, bool locked)
2700 {
2701   /* Run destructors before destroying module.  */
2702   struct GOMP_kernel_launch_attributes kla =
2703     { 3,
2704       /* Grid size.  */
2705       { 1, 64, 1 },
2706       /* Work-group size.  */
2707       { 1, 64, 1 }
2708     };
2709 
2710   if (module->fini_array_func)
2711     {
2712       init_kernel (module->fini_array_func);
2713       run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2714     }
2715   module->constructors_run_p = false;
2716 
2717   int i;
2718   for (i = 0; i < module->kernel_count; i++)
2719     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2720       {
2721 	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2722 			   "mutex");
2723 	return false;
2724       }
2725 
2726   return true;
2727 }
2728 
2729 /* }}}  */
2730 /* {{{ Async  */
2731 
2732 /* Callback of dispatch queues to report errors.  */
2733 
2734 static void
2735 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2736 {
2737   struct queue_entry *entry = &aq->queue[index];
2738 
2739   switch (entry->type)
2740     {
2741     case KERNEL_LAUNCH:
2742       if (DEBUG_QUEUES)
2743 	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2744 		   aq->agent->device_id, aq->id, index);
2745       run_kernel (entry->u.launch.kernel,
2746 		  entry->u.launch.vars,
2747 		  &entry->u.launch.kla, aq, false);
2748       if (DEBUG_QUEUES)
2749 	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2750 		   aq->agent->device_id, aq->id, index);
2751       break;
2752 
2753     case CALLBACK:
2754       if (DEBUG_QUEUES)
2755 	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2756 		   aq->agent->device_id, aq->id, index);
2757       entry->u.callback.fn (entry->u.callback.data);
2758       if (DEBUG_QUEUES)
2759 	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2760 		   aq->agent->device_id, aq->id, index);
2761       break;
2762 
2763     case ASYNC_WAIT:
2764       {
2765 	/* FIXME: is it safe to access a placeholder that may already have
2766 	   been executed?  */
2767         struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2768 
2769 	if (DEBUG_QUEUES)
2770           GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2771 		     aq->agent->device_id, aq->id, index);
2772 
2773 	pthread_mutex_lock (&placeholderp->mutex);
2774 
2775 	while (!placeholderp->executed)
2776           pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2777 
2778 	pthread_mutex_unlock (&placeholderp->mutex);
2779 
2780 	if (pthread_cond_destroy (&placeholderp->cond))
2781 	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2782 
2783 	if (pthread_mutex_destroy (&placeholderp->mutex))
2784 	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2785 
2786 	if (DEBUG_QUEUES)
2787           GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2788 		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2789       }
2790       break;
2791 
2792     case ASYNC_PLACEHOLDER:
2793       pthread_mutex_lock (&entry->u.placeholder.mutex);
2794       entry->u.placeholder.executed = 1;
2795       pthread_cond_signal (&entry->u.placeholder.cond);
2796       pthread_mutex_unlock (&entry->u.placeholder.mutex);
2797       break;
2798 
2799     default:
2800       GOMP_PLUGIN_fatal ("Unknown queue element");
2801     }
2802 }
2803 
2804 /* This function is run as a thread to service an async queue in the
2805    background.  It runs continuously until the stop flag is set.  */
2806 
2807 static void *
2808 drain_queue (void *thread_arg)
2809 {
2810   struct goacc_asyncqueue *aq = thread_arg;
2811 
2812   if (DRAIN_QUEUE_SYNCHRONOUS_P)
2813     {
2814       aq->drain_queue_stop = 2;
2815       return NULL;
2816     }
2817 
2818   pthread_mutex_lock (&aq->mutex);
2819 
2820   while (true)
2821     {
2822       if (aq->drain_queue_stop)
2823 	break;
2824 
2825       if (aq->queue_n > 0)
2826 	{
2827 	  pthread_mutex_unlock (&aq->mutex);
2828 	  execute_queue_entry (aq, aq->queue_first);
2829 
2830 	  pthread_mutex_lock (&aq->mutex);
2831 	  aq->queue_first = ((aq->queue_first + 1)
2832 			     % ASYNC_QUEUE_SIZE);
2833 	  aq->queue_n--;
2834 
2835 	  if (DEBUG_THREAD_SIGNAL)
2836 	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2837 		       aq->agent->device_id, aq->id);
2838 	  pthread_cond_broadcast (&aq->queue_cond_out);
2839 	  pthread_mutex_unlock (&aq->mutex);
2840 
2841 	  if (DEBUG_QUEUES)
2842 	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2843 		       aq->id);
2844 	  pthread_mutex_lock (&aq->mutex);
2845 	}
2846       else
2847 	{
2848 	  if (DEBUG_THREAD_SLEEP)
2849 	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2850 		       aq->agent->device_id, aq->id);
2851 	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2852 	  if (DEBUG_THREAD_SLEEP)
2853 	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2854 		       aq->agent->device_id, aq->id);
2855 	}
2856     }
2857 
2858   aq->drain_queue_stop = 2;
2859   if (DEBUG_THREAD_SIGNAL)
2860     GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2861 	       aq->agent->device_id, aq->id);
2862   pthread_cond_broadcast (&aq->queue_cond_out);
2863   pthread_mutex_unlock (&aq->mutex);
2864 
2865   GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2866   return NULL;
2867 }
2868 
2869 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2870    is not usually the case.  This is just a debug tool.  */
2871 
2872 static void
2873 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2874 {
2875   pthread_mutex_lock (&aq->mutex);
2876 
2877   while (aq->queue_n > 0)
2878     {
2879       execute_queue_entry (aq, aq->queue_first);
2880 
2881       aq->queue_first = ((aq->queue_first + 1)
2882 			 % ASYNC_QUEUE_SIZE);
2883       aq->queue_n--;
2884     }
2885 
2886   pthread_mutex_unlock (&aq->mutex);
2887 }
2888 
2889 /* Block the current thread until an async queue is writable.  The aq->mutex
2890    lock should be held on entry, and remains locked on exit.  */
2891 
2892 static void
2893 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2894 {
2895   if (aq->queue_n == ASYNC_QUEUE_SIZE)
2896     {
2897       /* Queue is full.  Wait for it to not be full.  */
2898       while (aq->queue_n == ASYNC_QUEUE_SIZE)
2899 	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2900     }
2901 }
2902 
2903 /* Request an asynchronous kernel launch on the specified queue.  This
2904    may block if the queue is full, but returns without waiting for the
2905    kernel to run.  */
2906 
2907 static void
2908 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2909 		   void *vars, struct GOMP_kernel_launch_attributes *kla)
2910 {
2911   assert (aq->agent == kernel->agent);
2912 
2913   pthread_mutex_lock (&aq->mutex);
2914 
2915   wait_for_queue_nonfull (aq);
2916 
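  /* The queue is a circular buffer: live entries occupy queue_first
     .. queue_first + queue_n - 1 (mod ASYNC_QUEUE_SIZE), so the next free
     slot immediately follows them.  */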
2917   int queue_last = ((aq->queue_first + aq->queue_n)
2918 		    % ASYNC_QUEUE_SIZE);
2919   if (DEBUG_QUEUES)
2920     GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2921 	       aq->id, queue_last);
2922 
2923   aq->queue[queue_last].type = KERNEL_LAUNCH;
2924   aq->queue[queue_last].u.launch.kernel = kernel;
2925   aq->queue[queue_last].u.launch.vars = vars;
2926   aq->queue[queue_last].u.launch.kla = *kla;
2927 
2928   aq->queue_n++;
2929 
2930   if (DEBUG_THREAD_SIGNAL)
2931     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2932 	       aq->agent->device_id, aq->id);
2933   pthread_cond_signal (&aq->queue_cond_in);
2934 
2935   pthread_mutex_unlock (&aq->mutex);
2936 }
2937 
2938 /* Request an asynchronous callback on the specified queue.  The callback
2939    function will be called, with the given opaque data, from the appropriate
2940    async thread, when all previous items on that queue are complete.  */
2941 
2942 static void
2943 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2944 		     void *data)
2945 {
2946   pthread_mutex_lock (&aq->mutex);
2947 
2948   wait_for_queue_nonfull (aq);
2949 
2950   int queue_last = ((aq->queue_first + aq->queue_n)
2951 		    % ASYNC_QUEUE_SIZE);
2952   if (DEBUG_QUEUES)
2953     GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2954 	       aq->id, queue_last);
2955 
2956   aq->queue[queue_last].type = CALLBACK;
2957   aq->queue[queue_last].u.callback.fn = fn;
2958   aq->queue[queue_last].u.callback.data = data;
2959 
2960   aq->queue_n++;
2961 
2962   if (DEBUG_THREAD_SIGNAL)
2963     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2964 	       aq->agent->device_id, aq->id);
2965   pthread_cond_signal (&aq->queue_cond_in);
2966 
2967   pthread_mutex_unlock (&aq->mutex);
2968 }
2969 
2970 /* Request that a given async thread wait for another thread (unspecified) to
2971    reach the given placeholder.  The wait will occur when all previous entries
2972    on the queue are complete.  A placeholder is effectively a kind of signal
2973    which simply sets a flag when encountered in a queue.  */
2974 
2975 static void
2976 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2977 		      struct placeholder *placeholderp)
2978 {
2979   pthread_mutex_lock (&aq->mutex);
2980 
2981   wait_for_queue_nonfull (aq);
2982 
2983   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2984   if (DEBUG_QUEUES)
2985     GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2986 	       aq->id, queue_last);
2987 
2988   aq->queue[queue_last].type = ASYNC_WAIT;
2989   aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2990 
2991   aq->queue_n++;
2992 
2993   if (DEBUG_THREAD_SIGNAL)
2994     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2995 	       aq->agent->device_id, aq->id);
2996   pthread_cond_signal (&aq->queue_cond_in);
2997 
2998   pthread_mutex_unlock (&aq->mutex);
2999 }
3000 
3001 /* Add a placeholder into an async queue.  When the async thread reaches the
3002    placeholder it will set the "executed" flag to true and continue.
3003    Another thread may be waiting on this thread reaching the placeholder.  */
3004 
3005 static struct placeholder *
3006 queue_push_placeholder (struct goacc_asyncqueue *aq)
3007 {
3008   struct placeholder *placeholderp;
3009 
3010   pthread_mutex_lock (&aq->mutex);
3011 
3012   wait_for_queue_nonfull (aq);
3013 
3014   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
3015   if (DEBUG_QUEUES)
3016     GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
3017 	       aq->id, queue_last);
3018 
3019   aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
3020   placeholderp = &aq->queue[queue_last].u.placeholder;
3021 
3022   if (pthread_mutex_init (&placeholderp->mutex, NULL))
3023     {
3024       pthread_mutex_unlock (&aq->mutex);
3025       GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
3026     }
3027 
3028   if (pthread_cond_init (&placeholderp->cond, NULL))
3029     {
3030       pthread_mutex_unlock (&aq->mutex);
3031       GOMP_PLUGIN_error ("Failed to initialize serialization cond");
3032     }
3033 
3034   placeholderp->executed = 0;
3035 
3036   aq->queue_n++;
3037 
3038   if (DEBUG_THREAD_SIGNAL)
3039     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
3040 	       aq->agent->device_id, aq->id);
3041   pthread_cond_signal (&aq->queue_cond_in);
3042 
3043   pthread_mutex_unlock (&aq->mutex);
3044 
3045   return placeholderp;
3046 }
3047 
3048 /* Signal an asynchronous thread to terminate, and wait for it to do so.  */
3049 
3050 static void
3051 finalize_async_thread (struct goacc_asyncqueue *aq)
3052 {
3053   pthread_mutex_lock (&aq->mutex);
3054   if (aq->drain_queue_stop == 2)
3055     {
3056       pthread_mutex_unlock (&aq->mutex);
3057       return;
3058     }
3059 
3060   aq->drain_queue_stop = 1;
3061 
3062   if (DEBUG_THREAD_SIGNAL)
3063     GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3064 	       aq->agent->device_id, aq->id);
3065   pthread_cond_signal (&aq->queue_cond_in);
3066 
3067   while (aq->drain_queue_stop != 2)
3068     {
3069       if (DEBUG_THREAD_SLEEP)
3070 	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3071 		   " to sleep\n", aq->agent->device_id, aq->id);
3072       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3073       if (DEBUG_THREAD_SLEEP)
3074 	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
3075 		   aq->agent->device_id, aq->id);
3076     }
3077 
3078   GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
3079 	     aq->id);
3080   pthread_mutex_unlock (&aq->mutex);
3081 
3082   int err = pthread_join (aq->thread_drain_queue, NULL);
3083   if (err != 0)
3084     GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3085 		       aq->agent->device_id, aq->id, strerror (err));
3086   GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
3087 }
3088 
3089 /* Set up an async queue for OpenMP.  There will be only one.  The
3090    implementation simply uses an OpenACC async queue.
3091    FIXME: is this thread-safe if two threads call this function?  */
3092 
3093 static void
3094 maybe_init_omp_async (struct agent_info *agent)
3095 {
3096   if (!agent->omp_async_queue)
3097     agent->omp_async_queue
3098       = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
3099 }
3100 
3101 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3102    copies from read-only pages.  */
3103 
3104 static void
3105 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3106 {
3107   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3108 
3109   if (status == HSA_STATUS_SUCCESS)
3110     return;
3111 
3112   /* It appears that the copy fails if the source data is in a read-only page.
3113      We can't detect that easily, so try copying the data to a temporary buffer
3114      and doing the copy again if we got an error above.  */
3115 
3116   GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3117 	       "[%p:+%d]\n", (void *) src, (int) len);
3118 
3119   void *src_copy = malloc (len);
  if (!src_copy)
    {
      GOMP_PLUGIN_error ("memory allocation failed");
      return;
    }
3120   memcpy (src_copy, src, len);
3121   status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3122   free (src_copy);
3123   if (status != HSA_STATUS_SUCCESS)
3124     GOMP_PLUGIN_error ("memory copy failed");
3125 }
3126 
3127 /* Copy data to or from a device.  This is intended for use as an async
3128    callback event.  */
3129 
3130 static void
3131 copy_data (void *data_)
3132 {
3133   struct copy_data *data = (struct copy_data *)data_;
3134   GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3135 	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
3136 	     data->dst);
3137   hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3138   if (data->free_src)
3139     free ((void *) data->src);
3140   free (data);
3141 }
3142 
3143 /* Free device data.  This is intended for use as an async callback event.  */
3144 
3145 static void
3146 gomp_offload_free (void *ptr)
3147 {
3148   GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
3149   GOMP_OFFLOAD_free (0, ptr);
3150 }
3151 
3152 /* Request an asynchronous data copy, to or from a device, on a given queue.
3153    The event will be registered as a callback.  If FREE_SRC is true
3154    then the source data will be freed following the copy.  */
3155 
3156 static void
3157 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3158 		 size_t len, bool free_src)
3159 {
3160   if (DEBUG_QUEUES)
3161     GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3162 	       aq->agent->device_id, aq->id, len, src, dst);
3163   struct copy_data *data
3164     = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3165   data->dst = dst;
3166   data->src = src;
3167   data->len = len;
3168   data->free_src = free_src;
3169   data->aq = aq;
3170   queue_push_callback (aq, copy_data, data);
3171 }
3172 
3173 /* Return true if the given queue is currently empty.  */
3174 
3175 static int
3176 queue_empty (struct goacc_asyncqueue *aq)
3177 {
3178   pthread_mutex_lock (&aq->mutex);
3179   int res = aq->queue_n == 0 ? 1 : 0;
3180   pthread_mutex_unlock (&aq->mutex);
3181 
3182   return res;
3183 }
3184 
3185 /* Wait for a given queue to become empty.  This implements an OpenACC wait
3186    directive.  */
3187 
3188 static void
3189 wait_queue (struct goacc_asyncqueue *aq)
3190 {
3191   if (DRAIN_QUEUE_SYNCHRONOUS_P)
3192     {
3193       drain_queue_synchronous (aq);
3194       return;
3195     }
3196 
3197   pthread_mutex_lock (&aq->mutex);
3198 
3199   while (aq->queue_n > 0)
3200     {
3201       if (DEBUG_THREAD_SLEEP)
3202 	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3203 		   aq->agent->device_id, aq->id);
3204       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3205       if (DEBUG_THREAD_SLEEP)
3206 	GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
3207 		   aq->id);
3208     }
3209 
3210   pthread_mutex_unlock (&aq->mutex);
3211   GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3212 }
3213 
3214 /* }}}  */
3215 /* {{{ OpenACC support  */
3216 
3217 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
3218 
3219 static void
3220 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3221 	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3222 	  struct goacc_asyncqueue *aq)
3223 {
3224   if (!GOMP_OFFLOAD_can_run (kernel))
3225     GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3226 
3227   /* If we get here then this must be an OpenACC kernel.  */
3228   kernel->kind = KIND_OPENACC;
3229 
3230   /* devaddrs must be double-indirect on the target.  */
3231   void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3232   for (size_t i = 0; i < mapnum; i++)
3233     hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3234 				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3235 				sizeof (void *));
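  /* ind_da now resides in device memory and holds one address per mapping
     (the device address when present, otherwise the host address); the
     kernel receives only the single pointer to this array.  */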
3236 
3237   struct hsa_kernel_description *hsa_kernel_desc = NULL;
3238   for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3239     {
3240       struct hsa_kernel_description *d
3241 	= &kernel->module->image_desc->kernel_infos[i];
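      /* Pointer comparison suffices here: kernel->name aliases the name
	 string in the image descriptor (see init_basic_kernel_info).  */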
3242       if (d->name == kernel->name)
3243 	{
3244 	  hsa_kernel_desc = d;
3245 	  break;
3246 	}
3247     }
3248 
3249   /* We may have statically-determined dimensions in
3250      hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3251      invocation at runtime in dims[].  We allow static dimensions to take
3252      priority over dynamic dimensions when present (non-zero).  */
3253   if (hsa_kernel_desc->oacc_dims[0] > 0)
3254     dims[0] = hsa_kernel_desc->oacc_dims[0];
3255   if (hsa_kernel_desc->oacc_dims[1] > 0)
3256     dims[1] = hsa_kernel_desc->oacc_dims[1];
3257   if (hsa_kernel_desc->oacc_dims[2] > 0)
3258     dims[2] = hsa_kernel_desc->oacc_dims[2];
3259 
3260   /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3261      There isn't really a correct answer for this without a clue about the
3262      problem size, so let's do a reasonable number of single-worker gangs.
3263      64 gangs matches a typical Fiji device.  */
3264 
3265   /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3266      for the default number of workers.  */
3267   if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs.  */
3268   if (dims[1] == 0) dims[1] = 1;  /* Workers.  */
3269 
3270   /* The incoming dimensions are expressed in terms of gangs, workers, and
3271      vectors.  The HSA dimensions are expressed in terms of "work-items",
3272      which means multiples of vector lanes.
3273 
3274      The "grid size" specifies the size of the problem space, and the
3275      "work-group size" specifies how much of that we want a single compute
3276      unit to chew on at once.
3277 
3278      The three dimensions do not really correspond to hardware, but the
3279      important thing is that the HSA runtime will launch as many
3280      work-groups as it takes to process the entire grid, and each
3281      work-group will contain as many wave-fronts as it takes to process
3282      the work-items in that group.
3283 
3284      Essentially, as long as we set the Y dimension to 64 (the number of
3285      vector lanes in hardware), and the Z group size to the maximum (16),
3286      then we will get the gangs (X) and workers (Z) launched as we expect.
3287 
3288      The reason for the apparent reversal of vector and worker dimension
3289      order is to do with the way the run-time distributes work-items across
3290      v1 and v2.  */
3291   struct GOMP_kernel_launch_attributes kla =
3292     {3,
3293      /* Grid size.  */
3294      {dims[0], 64, dims[1]},
3295      /* Work-group size.  */
3296      {1,       64, 16}
3297     };
3298 
3299   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3300   acc_prof_info *prof_info = thr->prof_info;
3301   acc_event_info enqueue_launch_event_info;
3302   acc_api_info *api_info = thr->api_info;
3303   bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3304   if (profiling_dispatch_p)
3305     {
3306       prof_info->event_type = acc_ev_enqueue_launch_start;
3307 
3308       enqueue_launch_event_info.launch_event.event_type
3309 	= prof_info->event_type;
3310       enqueue_launch_event_info.launch_event.valid_bytes
3311 	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3312       enqueue_launch_event_info.launch_event.parent_construct
3313 	= acc_construct_parallel;
3314       enqueue_launch_event_info.launch_event.implicit = 1;
3315       enqueue_launch_event_info.launch_event.tool_info = NULL;
3316       enqueue_launch_event_info.launch_event.kernel_name
3317 	= (char *) kernel->name;
3318       enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3319       enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3320       enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3321 
3322       api_info->device_api = acc_device_api_other;
3323 
3324       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3325 	&enqueue_launch_event_info, api_info);
3326     }
3327 
3328   if (!async)
3329     {
3330       run_kernel (kernel, ind_da, &kla, NULL, false);
3331       gomp_offload_free (ind_da);
3332     }
3333   else
3334     {
3335       queue_push_launch (aq, kernel, ind_da, &kla);
3336       if (DEBUG_QUEUES)
3337 	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3338 		   aq->agent->device_id, aq->id, ind_da);
3339       queue_push_callback (aq, gomp_offload_free, ind_da);
3340     }
3341 
3342   if (profiling_dispatch_p)
3343     {
3344       prof_info->event_type = acc_ev_enqueue_launch_end;
3345       enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3346       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3347 					    &enqueue_launch_event_info,
3348 					    api_info);
3349     }
3350 }
3351 
3352 /* }}}  */
3353 /* {{{ Generic Plugin API  */
3354 
3355 /* Return the name of the accelerator, which is "gcn".  */
3356 
3357 const char *
3358 GOMP_OFFLOAD_get_name (void)
3359 {
3360   return "gcn";
3361 }
3362 
3363 /* Return the specific capabilities the HSA accelerator has.  */
3364 
3365 unsigned int
3366 GOMP_OFFLOAD_get_caps (void)
3367 {
3368   /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
3369   return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3370 	    | GOMP_OFFLOAD_CAP_OPENACC_200;
3371 }
3372 
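/* For reference, a sketch of how these bits are consumed (the policy lives
   in libgomp proper, not in this plugin): libgomp tests the returned mask
   before routing work to the device, roughly

     unsigned caps = GOMP_OFFLOAD_get_caps ();
     if (caps & GOMP_OFFLOAD_CAP_OPENACC_200)
       ...the device is usable from OpenACC...

   Leaving GOMP_OFFLOAD_CAP_SHARED_MEM unset means all host/device mappings
   go through explicit copies.  */
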
3373 /* Identify as GCN accelerator.  */
3374 
3375 int
3376 GOMP_OFFLOAD_get_type (void)
3377 {
3378   return OFFLOAD_TARGET_TYPE_GCN;
3379 }
3380 
3381 /* Return the libgomp version number we're compatible with.  There is
3382    no requirement for cross-version compatibility.  */
3383 
3384 unsigned
3385 GOMP_OFFLOAD_version (void)
3386 {
3387   return GOMP_VERSION;
3388 }
3389 
3390 /* Return the number of GCN devices on the system.  */
3391 
3392 int
3393 GOMP_OFFLOAD_get_num_devices (void)
3394 {
3395   if (!init_hsa_context ())
3396     return 0;
3397   return hsa_context.agent_count;
3398 }
3399 
3400 /* Initialize device (agent) number N so that it can be used for computation.
3401    Return TRUE on success.  */
3402 
3403 bool
3404 GOMP_OFFLOAD_init_device (int n)
3405 {
3406   if (!init_hsa_context ())
3407     return false;
3408   if (n >= hsa_context.agent_count)
3409     {
3410       GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3411       return false;
3412     }
3413   struct agent_info *agent = &hsa_context.agents[n];
3414 
3415   if (agent->initialized)
3416     return true;
3417 
3418   agent->device_id = n;
3419 
3420   if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3421     {
3422       GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3423       return false;
3424     }
3425   if (pthread_mutex_init (&agent->prog_mutex, NULL))
3426     {
3427       GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3428       return false;
3429     }
3430   if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3431     {
3432       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3433       return false;
3434     }
3435   if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3436     {
3437       GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3438       return false;
3439     }
3440   agent->async_queues = NULL;
3441   agent->omp_async_queue = NULL;
3442   agent->team_arena_list = NULL;
3443 
3444   uint32_t queue_size;
3445   hsa_status_t status;
3446   status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3447 					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3448 					  &queue_size);
3449   if (status != HSA_STATUS_SUCCESS)
3450     return hsa_error ("Error requesting maximum queue size of the GCN agent",
3451 		      status);
3452 
3453   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3454 					  &agent->name);
3455   if (status != HSA_STATUS_SUCCESS)
3456     return hsa_error ("Error querying the name of the agent", status);
3457 
3458   agent->device_isa = isa_code (agent->name);
3459   if (agent->device_isa < 0)
3460     return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3461 
3462   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3463 					  &agent->vendor_name);
3464   if (status != HSA_STATUS_SUCCESS)
3465     return hsa_error ("Error querying the vendor name of the agent", status);
3466 
3467   status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3468 					HSA_QUEUE_TYPE_MULTI,
3469 					hsa_queue_callback, NULL, UINT32_MAX,
3470 					UINT32_MAX, &agent->sync_queue);
3471   if (status != HSA_STATUS_SUCCESS)
3472     return hsa_error ("Error creating command queue", status);
3473 
3474   agent->kernarg_region.handle = (uint64_t) -1;
3475   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3476 						 get_kernarg_memory_region,
3477 						 &agent->kernarg_region);
3478   if (status != HSA_STATUS_SUCCESS
3479       && status != HSA_STATUS_INFO_BREAK)
3480     hsa_error ("Scanning memory regions failed", status);
3481   if (agent->kernarg_region.handle == (uint64_t) -1)
3482     {
3483       GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3484 			 "arguments");
3485       return false;
3486     }
3487   GCN_DEBUG ("Selected kernel arguments memory region:\n");
3488   dump_hsa_region (agent->kernarg_region, NULL);
3489 
3490   agent->data_region.handle = (uint64_t) -1;
3491   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3492 						 get_data_memory_region,
3493 						 &agent->data_region);
3494   if (status != HSA_STATUS_SUCCESS
3495       && status != HSA_STATUS_INFO_BREAK)
3496     hsa_error ("Scanning memory regions failed", status);
3497   if (agent->data_region.handle == (uint64_t) -1)
3498     {
3499       GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3500 			 "data");
3501       return false;
3502     }
3503   GCN_DEBUG ("Selected device data memory region:\n");
3504   dump_hsa_region (agent->data_region, NULL);
3505 
3506   GCN_DEBUG ("GCN agent %d initialized\n", n);
3507 
3508   agent->initialized = true;
3509   return true;
3510 }
3511 
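/* For illustration, a region-scan callback of the shape passed to
   hsa_agent_iterate_regions above has roughly this form (a hypothetical
   sketch; the real get_kernarg_memory_region is defined elsewhere in this
   file):

     static hsa_status_t
     example_find_kernarg (hsa_region_t region, void *data)
     {
       uint32_t flags;
       hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
                                       &flags);
       if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
         {
           *(hsa_region_t *) data = region;
           return HSA_STATUS_INFO_BREAK;
         }
       return HSA_STATUS_SUCCESS;
     }

   Returning HSA_STATUS_INFO_BREAK stops the iteration early, which is why
   the callers above accept that status alongside HSA_STATUS_SUCCESS.  */
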
3512 /* Load GCN object-code module described by struct gcn_image_desc in
3513    TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3514    If there are any constructors then run them.  */
3515 
3516 int
3517 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3518 			 struct addr_pair **target_table)
3519 {
3520   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3521     {
3522       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3523 			 " (expected %u, received %u)",
3524 			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3525       return -1;
3526     }
3527 
3528   struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3529   struct agent_info *agent;
3530   struct addr_pair *pair;
3531   struct module_info *module;
3532   struct kernel_info *kernel;
3533   int kernel_count = image_desc->kernel_count;
3534   unsigned var_count = image_desc->global_variable_count;
3535 
3536   agent = get_agent_info (ord);
3537   if (!agent)
3538     return -1;
3539 
3540   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3541     {
3542       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3543       return -1;
3544     }
3545   if (agent->prog_finalized
3546       && !destroy_hsa_program (agent))
3547     return -1;
3548 
3549   GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3550   GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
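  /* The "- 2" assumes that the _init_array and _fini_array pseudo-kernels
     are present in every image; they are not reported back to libgomp (see
     the end of this function).  */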
3551   pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
3552 			     * sizeof (struct addr_pair));
3553   *target_table = pair;
3554   module = (struct module_info *)
3555     GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3556 				+ kernel_count * sizeof (struct kernel_info));
3557   module->image_desc = image_desc;
3558   module->kernel_count = kernel_count;
3559   module->heap = NULL;
3560   module->constructors_run_p = false;
3561 
3562   kernel = &module->kernels[0];
3563 
3564   /* Initialize the basic state of each kernel in the image.  */
3565   for (unsigned i = 0; i < kernel_count; i++)
3566     {
3567       struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3568       if (!init_basic_kernel_info (kernel, d, agent, module))
3569 	return -1;
3570       if (strcmp (d->name, "_init_array") == 0)
3571 	module->init_array_func = kernel;
3572       else if (strcmp (d->name, "_fini_array") == 0)
3573 	module->fini_array_func = kernel;
3574       else
3575 	{
3576 	  pair->start = (uintptr_t) kernel;
3577 	  pair->end = (uintptr_t) (kernel + 1);
3578 	  pair++;
3579 	}
3580       kernel++;
3581     }
3582 
3583   agent->module = module;
3584   if (pthread_rwlock_unlock (&agent->module_rwlock))
3585     {
3586       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3587       return -1;
3588     }
3589 
3590   if (!create_and_finalize_hsa_program (agent))
3591     return -1;
3592 
3593   for (unsigned i = 0; i < var_count; i++)
3594     {
3595       struct global_var_info *v = &image_desc->global_variables[i];
3596       GCN_DEBUG ("Looking for variable %s\n", v->name);
3597 
3598       hsa_status_t status;
3599       hsa_executable_symbol_t var_symbol;
3600       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3601 						     v->name, agent->id,
3602 						     0, &var_symbol);
3603 
3604       if (status != HSA_STATUS_SUCCESS)
3605 	hsa_fatal ("Could not find symbol for variable in the code object",
3606 		   status);
3607 
3608       uint64_t var_addr;
3609       uint32_t var_size;
3610       status = hsa_fns.hsa_executable_symbol_get_info_fn
3611 	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
3612       if (status != HSA_STATUS_SUCCESS)
3613 	hsa_fatal ("Could not extract a variable from its symbol", status);
3614       status = hsa_fns.hsa_executable_symbol_get_info_fn
3615 	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
3616       if (status != HSA_STATUS_SUCCESS)
3617 	hsa_fatal ("Could not extract a variable size from its symbol", status);
3618 
3619       pair->start = var_addr;
3620       pair->end = var_addr + var_size;
3621       GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
3622 		 (void *)var_addr, var_size);
3623       pair++;
3624     }
3625 
3626   /* Run the constructors, if any, before any kernels can be launched.  */
3627   struct GOMP_kernel_launch_attributes kla =
3628     { 3,
3629       /* Grid size.  */
3630       { 1, 64, 1 },
3631       /* Work-group size.  */
3632       { 1, 64, 1 }
3633     };
3634 
3635   if (module->init_array_func)
3636     {
3637       init_kernel (module->init_array_func);
3638       run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3639     }
3640   module->constructors_run_p = true;
3641 
3642   /* Don't report kernels that libgomp need not know about.  */
3643   if (module->init_array_func)
3644     kernel_count--;
3645   if (module->fini_array_func)
3646     kernel_count--;
3647 
3648   return kernel_count + var_count;
3649 }
3650 
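/* A sketch of the table the function above hands back to libgomp: the first
   entries describe kernels, whose start/end bracket the plugin's own
   struct kernel_info; the remaining entries describe global variables,
   whose start/end delimit the variable's address range on the device.  The
   _init_array/_fini_array pseudo-kernels are omitted.  */
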
3651 /* Unload GCN object-code module described by struct gcn_image_desc in
3652    TARGET_DATA from agent number N.  Return TRUE on success.  */
3653 
3654 bool
3655 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3656 {
3657   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3658     {
3659       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3660 			 " (expected %u, received %u)",
3661 			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3662       return false;
3663     }
3664 
3665   struct agent_info *agent;
3666   agent = get_agent_info (n);
3667   if (!agent)
3668     return false;
3669 
3670   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3671     {
3672       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3673       return false;
3674     }
3675 
3676   if (!agent->module || agent->module->image_desc != target_data)
3677     {
3678       GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3679 			 "loaded before");
3680       return false;
3681     }
3682 
3683   if (!destroy_module (agent->module, true))
3684     return false;
3685   free (agent->module);
3686   agent->module = NULL;
3687   if (!destroy_hsa_program (agent))
3688     return false;
3689   if (pthread_rwlock_unlock (&agent->module_rwlock))
3690     {
3691       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3692       return false;
3693     }
3694   return true;
3695 }
3696 
3697 /* Deinitialize all information and status associated with agent number N.  We
3698    do not attempt any synchronization, assuming the user and libgomp will not
3699    attempt deinitialization of a device that is in any way being used at the
3700    same time.  Return TRUE on success.  */
3701 
3702 bool
3703 GOMP_OFFLOAD_fini_device (int n)
3704 {
3705   struct agent_info *agent = get_agent_info (n);
3706   if (!agent)
3707     return false;
3708 
3709   if (!agent->initialized)
3710     return true;
3711 
3712   if (agent->omp_async_queue)
3713     {
3714       GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3715       agent->omp_async_queue = NULL;
3716     }
3717 
3718   if (agent->module)
3719     {
3720       if (!destroy_module (agent->module, false))
3721 	return false;
3722       free (agent->module);
3723       agent->module = NULL;
3724     }
3725 
3726   if (!destroy_team_arenas (agent))
3727     return false;
3728 
3729   if (!destroy_hsa_program (agent))
3730     return false;
3731 
3732   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3733   if (status != HSA_STATUS_SUCCESS)
3734     return hsa_error ("Error destroying command queue", status);
3735 
3736   if (pthread_mutex_destroy (&agent->prog_mutex))
3737     {
3738       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3739       return false;
3740     }
3741   if (pthread_rwlock_destroy (&agent->module_rwlock))
3742     {
3743       GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3744       return false;
3745     }
3746 
3747   if (pthread_mutex_destroy (&agent->async_queues_mutex))
3748     {
3749       GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3750       return false;
3751     }
3752   if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3753     {
3754       GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3755       return false;
3756     }
3757   agent->initialized = false;
3758   return true;
3759 }
3760 
3761 /* Return true if the HSA runtime can run function FN_PTR.  */
3762 
3763 bool
3764 GOMP_OFFLOAD_can_run (void *fn_ptr)
3765 {
3766   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3767 
3768   init_kernel (kernel);
3769   if (kernel->initialization_failed)
3770     goto failure;
3771 
3772   return true;
3773 
3774 failure:
3775   if (suppress_host_fallback)
3776     GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3777   GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3778   return false;
3779 }
3780 
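/* Note that returning false above is not fatal unless suppress_host_fallback
   is set: libgomp reacts by running the host version of the offloaded code
   instead, so e.g. a "#pragma omp target" region still executes, just on
   the host CPU rather than the GPU.  */
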
3781 /* Allocate memory on device N.  */
3782 
3783 void *
3784 GOMP_OFFLOAD_alloc (int n, size_t size)
3785 {
3786   struct agent_info *agent = get_agent_info (n);
3787   return alloc_by_agent (agent, size);
3788 }
3789 
3790 /* Free memory from device N.  */
3791 
3792 bool
3793 GOMP_OFFLOAD_free (int device, void *ptr)
3794 {
3795   GCN_DEBUG ("Freeing memory on device %d\n", device);
3796 
3797   hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3798   if (status != HSA_STATUS_SUCCESS)
3799     {
3800       hsa_error ("Could not free device memory", status);
3801       return false;
3802     }
3803 
3804   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3805   bool profiling_dispatch_p
3806     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3807   if (profiling_dispatch_p)
3808     {
3809       acc_prof_info *prof_info = thr->prof_info;
3810       acc_event_info data_event_info;
3811       acc_api_info *api_info = thr->api_info;
3812 
3813       prof_info->event_type = acc_ev_free;
3814 
3815       data_event_info.data_event.event_type = prof_info->event_type;
3816       data_event_info.data_event.valid_bytes
3817 	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
3818       data_event_info.data_event.parent_construct
3819 	= acc_construct_parallel;
3820       data_event_info.data_event.implicit = 1;
3821       data_event_info.data_event.tool_info = NULL;
3822       data_event_info.data_event.var_name = NULL;
3823       data_event_info.data_event.bytes = 0;
3824       data_event_info.data_event.host_ptr = NULL;
3825       data_event_info.data_event.device_ptr = (void *) ptr;
3826 
3827       api_info->device_api = acc_device_api_other;
3828 
3829       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3830 					    api_info);
3831     }
3832 
3833   return true;
3834 }
3835 
3836 /* Copy data from DEVICE to host.  */
3837 
3838 bool
3839 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3840 {
3841   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3842 	     src, dst);
3843   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3844   if (status != HSA_STATUS_SUCCESS)
3845     return hsa_error ("memory copy failed", status);
3846   return true;
3847 }
3848 
3849 /* Copy data from host to DEVICE.  */
3850 
3851 bool
3852 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3853 {
3854   GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3855 	     device, dst);
3856   hsa_memory_copy_wrapper (dst, src, n);
3857   return true;
3858 }
3859 
3860 /* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
3861 
3862 bool
3863 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3864 {
3865   struct gcn_thread *thread_data = gcn_thread ();
3866 
3867   if (thread_data && !async_synchronous_p (thread_data->async))
3868     {
3869       struct agent_info *agent = get_agent_info (device);
3870       maybe_init_omp_async (agent);
3871       queue_push_copy (agent->omp_async_queue, dst, src, n, false);
3872       return true;
3873     }
3874 
3875   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3876 	     device, src, device, dst);
3877   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3878   if (status != HSA_STATUS_SUCCESS)
3879     return hsa_error ("memory copy failed", status);
3880   return true;
3881 }
3882 
3883 /* }}}  */
3884 /* {{{ OpenMP Plugin API  */
3885 
3886 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3887    in VARS as a parameter.  The kernel is identified by FN_PTR which must point
3888    to a kernel_info structure, and must have previously been loaded to the
3889    specified device.  */
3890 
3891 void
3892 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3893 {
3894   struct agent_info *agent = get_agent_info (device);
3895   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3896   struct GOMP_kernel_launch_attributes def;
3897   struct GOMP_kernel_launch_attributes *kla;
3898   assert (agent == kernel->agent);
3899 
3900   /* If we get here then the kernel must be OpenMP.  */
3901   kernel->kind = KIND_OPENMP;
3902 
3903   if (!parse_target_attributes (args, &def, &kla, agent))
3904     {
3905       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3906       return;
3907     }
3908   run_kernel (kernel, vars, kla, NULL, false);
3909 }
3910 
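/* For example (illustrative only): a construct such as

     #pragma omp target teams num_teams(4) thread_limit(64)

   arrives here with the requested sizes encoded in ARGS as
   GOMP_TARGET_ARG_NUM_TEAMS and GOMP_TARGET_ARG_THREAD_LIMIT entries, which
   parse_target_attributes decodes into the grid and work-group sizes of
   KLA.  */
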
3911 /* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
3912    GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3913    GOMP_PLUGIN_target_task_completion when it has finished.  */
3914 
3915 void
3916 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3917 			void **args, void *async_data)
3918 {
3919   GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3920   struct agent_info *agent = get_agent_info (device);
3921   struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3922   struct GOMP_kernel_launch_attributes def;
3923   struct GOMP_kernel_launch_attributes *kla;
3924   assert (agent == kernel->agent);
3925 
3926   /* If we get here then the kernel must be OpenMP.  */
3927   kernel->kind = KIND_OPENMP;
3928 
3929   if (!parse_target_attributes (args, &def, &kla, agent))
3930     {
3931       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3932       return;
3933     }
3934 
3935   maybe_init_omp_async (agent);
3936   queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3937   queue_push_callback (agent->omp_async_queue,
3938 		       GOMP_PLUGIN_target_task_completion, async_data);
3939 }
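
/* Because the launch and the completion callback are pushed onto the same
   queue, and the queue's drain thread executes entries strictly in order,
   GOMP_PLUGIN_target_task_completion cannot fire before the kernel has
   finished.  */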
3940 
3941 /* }}} */
3942 /* {{{ OpenACC Plugin API  */
3943 
3944 /* Run a synchronous OpenACC kernel.  The device number is inferred from the
3945    already-loaded KERNEL.  */
3946 
3947 void
3948 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3949 			   void **hostaddrs, void **devaddrs, unsigned *dims,
3950 			   void *targ_mem_desc)
3951 {
3952   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3953 
3954   gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3955 	    NULL);
3956 }
3957 
3958 /* Run an asynchronous OpenACC kernel on the specified queue.  */
3959 
3960 void
3961 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3962 				 void **hostaddrs, void **devaddrs,
3963 				 unsigned *dims, void *targ_mem_desc,
3964 				 struct goacc_asyncqueue *aq)
3965 {
3966   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3967 
3968   gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3969 	    aq);
3970 }
3971 
3972 /* Create a new asynchronous thread and queue for running future kernels.  */
3973 
3974 struct goacc_asyncqueue *
3975 GOMP_OFFLOAD_openacc_async_construct (int device)
3976 {
3977   struct agent_info *agent = get_agent_info (device);
3978 
3979   pthread_mutex_lock (&agent->async_queues_mutex);
3980 
3981   struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3982   aq->agent = agent;
3983   aq->prev = NULL;
3984   aq->next = agent->async_queues;
3985   if (aq->next)
3986     {
3987       aq->next->prev = aq;
3988       aq->id = aq->next->id + 1;
3989     }
3990   else
3991     aq->id = 1;
3992   agent->async_queues = aq;
3993 
3994   aq->queue_first = 0;
3995   aq->queue_n = 0;
3996   aq->drain_queue_stop = 0;
3997 
3998   if (pthread_mutex_init (&aq->mutex, NULL))
3999     {
4000       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
           pthread_mutex_unlock (&agent->async_queues_mutex);
4001       return NULL;
4002     }
4003   if (pthread_cond_init (&aq->queue_cond_in, NULL))
4004     {
4005       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
           pthread_mutex_unlock (&agent->async_queues_mutex);
4006       return NULL;
4007     }
4008   if (pthread_cond_init (&aq->queue_cond_out, NULL))
4009     {
4010       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
           pthread_mutex_unlock (&agent->async_queues_mutex);
4011       return NULL;
4012     }
4013 
4014   hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4015 						     ASYNC_QUEUE_SIZE,
4016 						     HSA_QUEUE_TYPE_MULTI,
4017 						     hsa_queue_callback, NULL,
4018 						     UINT32_MAX, UINT32_MAX,
4019 						     &aq->hsa_queue);
4020   if (status != HSA_STATUS_SUCCESS)
4021     hsa_fatal ("Error creating command queue", status);
4022 
4023   int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4024   if (err != 0)
4025     GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4026 		       strerror (err));
4027   GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4028 	     aq->id);
4029 
4030   pthread_mutex_unlock (&agent->async_queues_mutex);
4031 
4032   return aq;
4033 }
4034 
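/* Queues such as the one above are created lazily, on first use: e.g. the
   first OpenACC construct with an "async(n)" clause on a given device
   typically triggers a call to the constructor (a behavioural sketch; the
   policy lives in libgomp proper, not in this plugin).  */
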
4035 /* Destroy an existing asynchronous thread and queue.  Waits for any
4036    currently-running task to complete, but cancels any queued tasks.  */
4037 
4038 bool
4039 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4040 {
4041   struct agent_info *agent = aq->agent;
4042 
4043   finalize_async_thread (aq);
4044 
4045   pthread_mutex_lock (&agent->async_queues_mutex);
4046 
4047   int err;
4048   if ((err = pthread_mutex_destroy (&aq->mutex)))
4049     {
4050       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4051       goto fail;
4052     }
4053   if (pthread_cond_destroy (&aq->queue_cond_in))
4054     {
4055       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4056       goto fail;
4057     }
4058   if (pthread_cond_destroy (&aq->queue_cond_out))
4059     {
4060       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4061       goto fail;
4062     }
4063   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4064   if (status != HSA_STATUS_SUCCESS)
4065     {
4066       hsa_error ("Error destroying command queue", status);
4067       goto fail;
4068     }
4069 
4070   if (aq->prev)
4071     aq->prev->next = aq->next;
4072   if (aq->next)
4073     aq->next->prev = aq->prev;
4074   if (agent->async_queues == aq)
4075     agent->async_queues = aq->next;
4076 
4077   GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4078 
4079   free (aq);
4080   pthread_mutex_unlock (&agent->async_queues_mutex);
4081   return true;
4082 
4083 fail:
4084   pthread_mutex_unlock (&agent->async_queues_mutex);
4085   return false;
4086 }
4087 
4088 /* Return true if the specified async queue is currently empty.  */
4089 
4090 int
4091 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4092 {
4093   return queue_empty (aq);
4094 }
4095 
4096 /* Block until the specified queue has executed all its tasks and the
4097    queue is empty.  */
4098 
4099 bool
4100 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4101 {
4102   wait_queue (aq);
4103   return true;
4104 }
4105 
4106 /* Add a serialization point across two async queues.  Any new tasks added
4107    to AQ2 after this call will not run until all tasks that were on AQ1 at
4108    the time of this call have completed.  */
4109 
4110 bool
4111 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4112 				      struct goacc_asyncqueue *aq2)
4113 {
4114   /* For serialize, stream aq2 waits for aq1 to complete work that has been
4115      scheduled to run on it up to this point.  */
4116   if (aq1 != aq2)
4117     {
4118       struct placeholder *placeholderp = queue_push_placeholder (aq1);
4119       queue_push_asyncwait (aq2, placeholderp);
4120     }
4121   return true;
4122 }
4123 
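/* For example (illustrative only): a directive such as

     #pragma acc wait(1) async(2)

   can be implemented via the hook above: the placeholder is pushed onto
   queue 1, and the asyncwait entry on queue 2 blocks that queue's drain
   thread until the placeholder has completed.  */
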
4124 /* Add an opaque callback to the given async queue.  */
4125 
4126 void
4127 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4128 					   void (*fn) (void *), void *data)
4129 {
4130   queue_push_callback (aq, fn, data);
4131 }
4132 
4133 /* Queue up an asynchronous data copy from host to DEVICE.  */
4134 
4135 bool
4136 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4137 				     size_t n, struct goacc_asyncqueue *aq)
4138 {
4139   struct agent_info *agent = get_agent_info (device);
4140   assert (agent == aq->agent);
4141   /* The source data does not necessarily remain live until the deferred
4142      copy happens.  Taking a snapshot of the data here avoids reading
4143      uninitialised data later, but means that (a) data is copied twice and
4144      (b) modifications to the copied data between the "spawning" point of
4145      the asynchronous kernel and when it is executed will not be seen.
4146      That is, however, the expected semantics of an asynchronous copy.  */
4147   void *src_copy = GOMP_PLUGIN_malloc (n);
4148   memcpy (src_copy, src, n);
4149   queue_push_copy (aq, dst, src_copy, n, true);
4150   return true;
4151 }
4152 
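/* For example (illustrative only), the snapshot makes this pattern safe:

     int buf[4] = { 1, 2, 3, 4 };
     #pragma acc update device(buf) async(1)
     buf[0] = 99;   // must not affect the in-flight update
     #pragma acc wait(1)

   Without the copy, the queued transfer could read the modified (or even
   deallocated) host buffer.  */
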
4153 /* Queue up an asynchronous data copy from DEVICE to host.  */
4154 
4155 bool
4156 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4157 				     size_t n, struct goacc_asyncqueue *aq)
4158 {
4159   struct agent_info *agent = get_agent_info (device);
4160   assert (agent == aq->agent);
4161   queue_push_copy (aq, dst, src, n, false);
4162   return true;
4163 }
4164 
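/* Return the value of property PROP for device number DEVICE, as used by
   the OpenACC acc_get_property API.  */
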
4165 union goacc_property_value
4166 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4167 {
4168   struct agent_info *agent = get_agent_info (device);
4169 
4170   union goacc_property_value propval = { .val = 0 };
4171 
4172   switch (prop)
4173     {
4174     case GOACC_PROPERTY_FREE_MEMORY:
4175       /* Not supported. */
4176       break;
4177     case GOACC_PROPERTY_MEMORY:
4178       {
4179 	size_t size;
4180 	hsa_region_t region = agent->data_region;
4181 	hsa_status_t status =
4182 	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4183 	if (status == HSA_STATUS_SUCCESS)
4184 	  propval.val = size;
4185 	break;
4186       }
4187     case GOACC_PROPERTY_NAME:
4188       propval.ptr = agent->name;
4189       break;
4190     case GOACC_PROPERTY_VENDOR:
4191       propval.ptr = agent->vendor_name;
4192       break;
4193     case GOACC_PROPERTY_DRIVER:
4194       propval.ptr = hsa_context.driver_version_s;
4195       break;
4196     }
4197 
4198   return propval;
4199 }
4200 
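/* For example (illustrative only; assuming the acc_device_radeon device
   type selects this plugin), user code reaches the hook above via the
   OpenACC runtime API:

     #include <openacc.h>
     size_t bytes = acc_get_property (0, acc_device_radeon,
                                      acc_property_memory);

   which reports the size of the data region selected during device
   initialization.  */
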
4201 /* Set up plugin-specific thread-local data (host side).  */
4202 
4203 void *
4204 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4205 {
4206   struct gcn_thread *thread_data
4207     = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4208 
4209   thread_data->async = GOMP_ASYNC_SYNC;
4210 
4211   return (void *) thread_data;
4212 }
4213 
4214 /* Clean up plugin-specific thread-local data.  */
4215 
4216 void
4217 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4218 {
4219   free (data);
4220 }
4221 
4222 /* }}} */
4223