xref: /netbsd-src/external/gpl3/gcc/dist/libgomp/plugin/plugin-gcn.c (revision d16b7486a53dcb8072b60ec6fcb4373a2d0c27b7)
1 /* Plugin for AMD GCN execution.
2 
3    Copyright (C) 2013-2022 Free Software Foundation, Inc.
4 
5    Contributed by Mentor Embedded
6 
7    This file is part of the GNU Offloading and Multi Processing Library
8    (libgomp).
9 
10    Libgomp is free software; you can redistribute it and/or modify it
11    under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 3, or (at your option)
13    any later version.
14 
15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
18    more details.
19 
20    Under Section 7 of GPL version 3, you are granted additional
21    permissions described in the GCC Runtime Library Exception, version
22    3.1, as published by the Free Software Foundation.
23 
24    You should have received a copy of the GNU General Public License and
25    a copy of the GCC Runtime Library Exception along with this program;
26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
27    <http://www.gnu.org/licenses/>.  */
28 
29 /* {{{ Includes and defines  */
30 
31 #include "config.h"
32 #include "symcat.h"
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <pthread.h>
37 #include <inttypes.h>
38 #include <stdbool.h>
39 #include <limits.h>
40 #include <hsa.h>
41 #include <hsa_ext_amd.h>
42 #include <dlfcn.h>
43 #include <signal.h>
44 #include "libgomp-plugin.h"
45 #include "gomp-constants.h"
46 #include <elf.h>
47 #include "oacc-plugin.h"
48 #include "oacc-int.h"
49 #include <assert.h>
50 
51 /* These probably won't be in elf.h for a while.  */
52 #ifndef R_AMDGPU_NONE
53 #define R_AMDGPU_NONE		0
54 #define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
55 #define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
56 #define R_AMDGPU_ABS64		3	/* S + A  */
57 #define R_AMDGPU_REL32		4	/* S + A - P  */
58 #define R_AMDGPU_REL64		5	/* S + A - P  */
59 #define R_AMDGPU_ABS32		6	/* S + A  */
60 #define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
61 #define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
62 #define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
63 #define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
64 #define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
65 #define R_AMDGPU_RELATIVE64	13	/* B + A  */
66 #endif
67 
68 /* GCN specific definitions for asynchronous queues.  */
69 
70 #define ASYNC_QUEUE_SIZE 64
71 #define DRAIN_QUEUE_SYNCHRONOUS_P false
72 #define DEBUG_QUEUES 0
73 #define DEBUG_THREAD_SLEEP 0
74 #define DEBUG_THREAD_SIGNAL 0
75 
76 /* Defaults.  */
77 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
78 
79 /* Secure getenv() which returns NULL if running as SUID/SGID.  */
80 #ifndef HAVE_SECURE_GETENV
81 #ifdef HAVE___SECURE_GETENV
82 #define secure_getenv __secure_getenv
83 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
84   && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
85 
86 #include <unistd.h>
87 
88 /* Implementation of secure_getenv() for targets where it is not provided but
89    we have at least means to test real and effective IDs. */
90 
91 static char *
92 secure_getenv (const char *name)
93 {
94   if ((getuid () == geteuid ()) && (getgid () == getegid ()))
95     return getenv (name);
96   else
97     return NULL;
98 }
99 
100 #else
101 #define secure_getenv getenv
102 #endif
103 #endif
104 
105 /* }}}  */
106 /* {{{ Types  */
107 
108 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
109 
110 struct gcn_thread
111 {
112   /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
113   int async;
114 };
115 
116 /* As an HSA runtime is dlopened, following structure defines function
117    pointers utilized by the HSA plug-in.  */
118 
119 struct hsa_runtime_fn_info
120 {
121   /* HSA runtime.  */
122   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
123 					const char **status_string);
124   hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
125 					  void *value);
126   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
127 					 hsa_agent_info_t attribute,
128 					 void *value);
129   hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
130 				      hsa_isa_info_t attribute,
131 				      uint32_t index,
132 				      void *value);
133   hsa_status_t (*hsa_init_fn) (void);
134   hsa_status_t (*hsa_iterate_agents_fn)
135     (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
136   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
137 					  hsa_region_info_t attribute,
138 					  void *value);
139   hsa_status_t (*hsa_queue_create_fn)
140     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
141      void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
142      void *data, uint32_t private_segment_size,
143      uint32_t group_segment_size, hsa_queue_t **queue);
144   hsa_status_t (*hsa_agent_iterate_regions_fn)
145     (hsa_agent_t agent,
146      hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
147   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
148   hsa_status_t (*hsa_executable_create_fn)
149     (hsa_profile_t profile, hsa_executable_state_t executable_state,
150      const char *options, hsa_executable_t *executable);
151   hsa_status_t (*hsa_executable_global_variable_define_fn)
152     (hsa_executable_t executable, const char *variable_name, void *address);
153   hsa_status_t (*hsa_executable_load_code_object_fn)
154     (hsa_executable_t executable, hsa_agent_t agent,
155      hsa_code_object_t code_object, const char *options);
156   hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
157 					   const char *options);
158   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
159 					uint32_t num_consumers,
160 					const hsa_agent_t *consumers,
161 					hsa_signal_t *signal);
162   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
163 					  void **ptr);
164   hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
165 					      hsa_access_permission_t access);
166   hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
167   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
168   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
169   hsa_status_t (*hsa_executable_get_symbol_fn)
170     (hsa_executable_t executable, const char *module_name,
171      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
172      hsa_executable_symbol_t *symbol);
173   hsa_status_t (*hsa_executable_symbol_get_info_fn)
174     (hsa_executable_symbol_t executable_symbol,
175      hsa_executable_symbol_info_t attribute, void *value);
176   hsa_status_t (*hsa_executable_iterate_symbols_fn)
177     (hsa_executable_t executable,
178      hsa_status_t (*callback)(hsa_executable_t executable,
179 			      hsa_executable_symbol_t symbol, void *data),
180      void *data);
181   uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
182 						    uint64_t value);
183   uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
184   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
185 				       hsa_signal_value_t value);
186   void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
187 				       hsa_signal_value_t value);
188   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
189     (hsa_signal_t signal, hsa_signal_condition_t condition,
190      hsa_signal_value_t compare_value, uint64_t timeout_hint,
191      hsa_wait_state_t wait_state_hint);
192   hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
193   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
194 
195   hsa_status_t (*hsa_code_object_deserialize_fn)
196     (void *serialized_code_object, size_t serialized_code_object_size,
197      const char *options, hsa_code_object_t *code_object);
198 };
199 
200 /* Structure describing the run-time and grid properties of an HSA kernel
201    lauch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
202 
203 struct GOMP_kernel_launch_attributes
204 {
205   /* Number of dimensions the workload has.  Maximum number is 3.  */
206   uint32_t ndim;
207   /* Size of the grid in the three respective dimensions.  */
208   uint32_t gdims[3];
209   /* Size of work-groups in the respective dimensions.  */
210   uint32_t wdims[3];
211 };
212 
213 /* Collection of information needed for a dispatch of a kernel from a
214    kernel.  */
215 
216 struct kernel_dispatch
217 {
218   struct agent_info *agent;
219   /* Pointer to a command queue associated with a kernel dispatch agent.  */
220   void *queue;
221   /* Pointer to a memory space used for kernel arguments passing.  */
222   void *kernarg_address;
223   /* Kernel object.  */
224   uint64_t object;
225   /* Synchronization signal used for dispatch synchronization.  */
226   uint64_t signal;
227   /* Private segment size.  */
228   uint32_t private_segment_size;
229   /* Group segment size.  */
230   uint32_t group_segment_size;
231 };
232 
233 /* Structure of the kernargs segment, supporting console output.
234 
235    This needs to match the definitions in Newlib, and the expectations
236    in libgomp target code.  */
237 
238 struct kernargs {
239   /* Leave space for the real kernel arguments.
240      OpenACC and OpenMP only use one pointer.  */
241   int64_t dummy1;
242   int64_t dummy2;
243 
244   /* A pointer to struct output, below, for console output data.  */
245   int64_t out_ptr;
246 
247   /* A pointer to struct heap, below.  */
248   int64_t heap_ptr;
249 
250   /* A pointer to an ephemeral memory arena.
251     Only needed for OpenMP.  */
252   int64_t arena_ptr;
253 
254   /* Output data.  */
255   struct output {
256     int return_value;
257     unsigned int next_output;
258     struct printf_data {
259       int written;
260       char msg[128];
261       int type;
262       union {
263 	int64_t ivalue;
264 	double dvalue;
265 	char text[128];
266       };
267     } queue[1024];
268     unsigned int consumed;
269   } output_data;
270 };
271 
272 /* A queue entry for a future asynchronous launch.  */
273 
274 struct kernel_launch
275 {
276   struct kernel_info *kernel;
277   void *vars;
278   struct GOMP_kernel_launch_attributes kla;
279 };
280 
281 /* A queue entry for a future callback.  */
282 
283 struct callback
284 {
285   void (*fn)(void *);
286   void *data;
287 };
288 
289 /* A data struct for the copy_data callback.  */
290 
291 struct copy_data
292 {
293   void *dst;
294   const void *src;
295   size_t len;
296   struct goacc_asyncqueue *aq;
297 };
298 
299 /* A queue entry for a placeholder.  These correspond to a wait event.  */
300 
301 struct placeholder
302 {
303   int executed;
304   pthread_cond_t cond;
305   pthread_mutex_t mutex;
306 };
307 
308 /* A queue entry for a wait directive.  */
309 
310 struct asyncwait_info
311 {
312   struct placeholder *placeholderp;
313 };
314 
315 /* Encode the type of an entry in an async queue.  */
316 
317 enum entry_type
318 {
319   KERNEL_LAUNCH,
320   CALLBACK,
321   ASYNC_WAIT,
322   ASYNC_PLACEHOLDER
323 };
324 
325 /* An entry in an async queue.  */
326 
327 struct queue_entry
328 {
329   enum entry_type type;
330   union {
331     struct kernel_launch launch;
332     struct callback callback;
333     struct asyncwait_info asyncwait;
334     struct placeholder placeholder;
335   } u;
336 };
337 
338 /* An async queue header.
339 
340    OpenMP may create one of these.
341    OpenACC may create many.  */
342 
343 struct goacc_asyncqueue
344 {
345   struct agent_info *agent;
346   hsa_queue_t *hsa_queue;
347 
348   pthread_t thread_drain_queue;
349   pthread_mutex_t mutex;
350   pthread_cond_t queue_cond_in;
351   pthread_cond_t queue_cond_out;
352   struct queue_entry queue[ASYNC_QUEUE_SIZE];
353   int queue_first;
354   int queue_n;
355   int drain_queue_stop;
356 
357   int id;
358   struct goacc_asyncqueue *prev;
359   struct goacc_asyncqueue *next;
360 };
361 
362 /* Mkoffload uses this structure to describe a kernel.
363 
364    OpenMP kernel dimensions are passed at runtime.
365    OpenACC kernel dimensions are passed at compile time, here.  */
366 
367 struct hsa_kernel_description
368 {
369   const char *name;
370   int oacc_dims[3];  /* Only present for GCN kernels.  */
371   int sgpr_count;
372   int vpgr_count;
373 };
374 
375 /* Mkoffload uses this structure to describe an offload variable.  */
376 
377 struct global_var_info
378 {
379   const char *name;
380   void *address;
381 };
382 
383 /* Mkoffload uses this structure to describe all the kernels in a
384    loadable module.  These are passed the libgomp via static constructors.  */
385 
386 struct gcn_image_desc
387 {
388   struct gcn_image {
389     size_t size;
390     void *image;
391   } *gcn_image;
392   const unsigned kernel_count;
393   struct hsa_kernel_description *kernel_infos;
394   const unsigned global_variable_count;
395 };
396 
397 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
398    support.
399    See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
400 
401 typedef enum {
402   EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
403   EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
404   EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
405   EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030
406 } EF_AMDGPU_MACH;
407 
408 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
409 typedef EF_AMDGPU_MACH gcn_isa;
410 
411 /* Description of an HSA GPU agent (device) and the program associated with
412    it.  */
413 
414 struct agent_info
415 {
416   /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
417   hsa_agent_t id;
418   /* The user-visible device number.  */
419   int device_id;
420   /* Whether the agent has been initialized.  The fields below are usable only
421      if it has been.  */
422   bool initialized;
423 
424   /* The instruction set architecture of the device. */
425   gcn_isa device_isa;
426   /* Name of the agent. */
427   char name[64];
428   /* Name of the vendor of the agent. */
429   char vendor_name[64];
430   /* Command queues of the agent.  */
431   hsa_queue_t *sync_queue;
432   struct goacc_asyncqueue *async_queues, *omp_async_queue;
433   pthread_mutex_t async_queues_mutex;
434 
435   /* The HSA memory region from which to allocate kernel arguments.  */
436   hsa_region_t kernarg_region;
437 
438   /* The HSA memory region from which to allocate device data.  */
439   hsa_region_t data_region;
440 
441   /* Allocated team arenas.  */
442   struct team_arena_list *team_arena_list;
443   pthread_mutex_t team_arena_write_lock;
444 
445   /* Read-write lock that protects kernels which are running or about to be run
446      from interference with loading and unloading of images.  Needs to be
447      locked for reading while a kernel is being run, and for writing if the
448      list of modules is manipulated (and thus the HSA program invalidated).  */
449   pthread_rwlock_t module_rwlock;
450 
451   /* The module associated with this kernel.  */
452   struct module_info *module;
453 
454   /* Mutex enforcing that only one thread will finalize the HSA program.  A
455      thread should have locked agent->module_rwlock for reading before
456      acquiring it.  */
457   pthread_mutex_t prog_mutex;
458   /* Flag whether the HSA program that consists of all the modules has been
459      finalized.  */
460   bool prog_finalized;
461   /* HSA executable - the finalized program that is used to locate kernels.  */
462   hsa_executable_t executable;
463 };
464 
465 /* Information required to identify, finalize and run any given kernel.  */
466 
467 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
468 
469 struct kernel_info
470 {
471   /* Name of the kernel, required to locate it within the GCN object-code
472      module.  */
473   const char *name;
474   /* The specific agent the kernel has been or will be finalized for and run
475      on.  */
476   struct agent_info *agent;
477   /* The specific module where the kernel takes place.  */
478   struct module_info *module;
479   /* Information provided by mkoffload associated with the kernel.  */
480   struct hsa_kernel_description *description;
481   /* Mutex enforcing that at most once thread ever initializes a kernel for
482      use.  A thread should have locked agent->module_rwlock for reading before
483      acquiring it.  */
484   pthread_mutex_t init_mutex;
485   /* Flag indicating whether the kernel has been initialized and all fields
486      below it contain valid data.  */
487   bool initialized;
488   /* Flag indicating that the kernel has a problem that blocks an execution.  */
489   bool initialization_failed;
490   /* The object to be put into the dispatch queue.  */
491   uint64_t object;
492   /* Required size of kernel arguments.  */
493   uint32_t kernarg_segment_size;
494   /* Required size of group segment.  */
495   uint32_t group_segment_size;
496   /* Required size of private segment.  */
497   uint32_t private_segment_size;
498   /* Set up for OpenMP or OpenACC?  */
499   enum offload_kind kind;
500 };
501 
502 /* Information about a particular GCN module, its image and kernels.  */
503 
504 struct module_info
505 {
506   /* The description with which the program has registered the image.  */
507   struct gcn_image_desc *image_desc;
508   /* GCN heap allocation.  */
509   struct heap *heap;
510   /* Physical boundaries of the loaded module.  */
511   Elf64_Addr phys_address_start;
512   Elf64_Addr phys_address_end;
513 
514   bool constructors_run_p;
515   struct kernel_info *init_array_func, *fini_array_func;
516 
517   /* Number of kernels in this module.  */
518   int kernel_count;
519   /* An array of kernel_info structures describing each kernel in this
520      module.  */
521   struct kernel_info kernels[];
522 };
523 
524 /* A linked list of memory arenas allocated on the device.
525    These are only used by OpenMP, as a means to optimize per-team malloc.  */
526 
527 struct team_arena_list
528 {
529   struct team_arena_list *next;
530 
531   /* The number of teams determines the size of the allocation.  */
532   int num_teams;
533   /* The device address of the arena itself.  */
534   void *arena;
535   /* A flag to prevent two asynchronous kernels trying to use the same arena.
536      The mutex is locked until the kernel exits.  */
537   pthread_mutex_t in_use;
538 };
539 
540 /* Information about the whole HSA environment and all of its agents.  */
541 
542 struct hsa_context_info
543 {
544   /* Whether the structure has been initialized.  */
545   bool initialized;
546   /* Number of usable GPU HSA agents in the system.  */
547   int agent_count;
548   /* Array of agent_info structures describing the individual HSA agents.  */
549   struct agent_info *agents;
550   /* Driver version string. */
551   char driver_version_s[30];
552 };
553 
554 /* Format of the on-device heap.
555 
556    This must match the definition in Newlib and gcn-run.  */
557 
558 struct heap {
559   int64_t size;
560   char data[0];
561 };
562 
563 /* }}}  */
564 /* {{{ Global variables  */
565 
566 /* Information about the whole HSA environment and all of its agents.  */
567 
568 static struct hsa_context_info hsa_context;
569 
570 /* HSA runtime functions that are initialized in init_hsa_context.  */
571 
572 static struct hsa_runtime_fn_info hsa_fns;
573 
574 /* Heap space, allocated target-side, provided for use of newlib malloc.
575    Each module should have it's own heap allocated.
576    Beware that heap usage increases with OpenMP teams.  See also arenas.  */
577 
578 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
579 
580 /* Flag to decide whether print to stderr information about what is going on.
581    Set in init_debug depending on environment variables.  */
582 
583 static bool debug;
584 
585 /* Flag to decide if the runtime should suppress a possible fallback to host
586    execution.  */
587 
588 static bool suppress_host_fallback;
589 
590 /* Flag to locate HSA runtime shared library that is dlopened
591    by this plug-in.  */
592 
593 static const char *hsa_runtime_lib;
594 
595 /* Flag to decide if the runtime should support also CPU devices (can be
596    a simulator).  */
597 
598 static bool support_cpu_devices;
599 
600 /* Runtime dimension overrides.  Zero indicates default.  */
601 
602 static int override_x_dim = 0;
603 static int override_z_dim = 0;
604 
605 /* }}}  */
606 /* {{{ Debug & Diagnostic  */
607 
608 /* Print a message to stderr if GCN_DEBUG value is set to true.  */
609 
610 #define DEBUG_PRINT(...) \
611   do \
612   { \
613     if (debug) \
614       { \
615 	fprintf (stderr, __VA_ARGS__); \
616       } \
617   } \
618   while (false);
619 
620 /* Flush stderr if GCN_DEBUG value is set to true.  */
621 
622 #define DEBUG_FLUSH()				\
623   do {						\
624     if (debug)					\
625       fflush (stderr);				\
626   } while (false)
627 
628 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
629    is set to true.  */
630 
631 #define DEBUG_LOG(prefix, ...)			\
632   do						\
633     {						\
634       DEBUG_PRINT (prefix);			\
635       DEBUG_PRINT (__VA_ARGS__);			\
636       DEBUG_FLUSH ();				\
637     } while (false)
638 
639 /* Print a debugging message to stderr.  */
640 
641 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
642 
643 /* Print a warning message to stderr.  */
644 
645 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
646 
647 /* Print HSA warning STR with an HSA STATUS code.  */
648 
649 static void
650 hsa_warn (const char *str, hsa_status_t status)
651 {
652   if (!debug)
653     return;
654 
655   const char *hsa_error_msg = "[unknown]";
656   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
657 
658   fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
659 	   hsa_error_msg);
660 }
661 
662 /* Report a fatal error STR together with the HSA error corresponding to STATUS
663    and terminate execution of the current process.  */
664 
665 static void
666 hsa_fatal (const char *str, hsa_status_t status)
667 {
668   const char *hsa_error_msg = "[unknown]";
669   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
670   GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
671 		     hsa_error_msg);
672 }
673 
674 /* Like hsa_fatal, except only report error message, and return FALSE
675    for propagating error processing to outside of plugin.  */
676 
677 static bool
678 hsa_error (const char *str, hsa_status_t status)
679 {
680   const char *hsa_error_msg = "[unknown]";
681   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
682   GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
683 		     hsa_error_msg);
684   return false;
685 }
686 
687 /* Dump information about the available hardware.  */
688 
689 static void
690 dump_hsa_system_info (void)
691 {
692   hsa_status_t status;
693 
694   hsa_endianness_t endianness;
695   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
696 					   &endianness);
697   if (status == HSA_STATUS_SUCCESS)
698     switch (endianness)
699       {
700       case HSA_ENDIANNESS_LITTLE:
701 	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
702 	break;
703       case HSA_ENDIANNESS_BIG:
704 	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
705 	break;
706       default:
707 	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
708       }
709   else
710     GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
711 
712   uint8_t extensions[128];
713   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
714 					   &extensions);
715   if (status == HSA_STATUS_SUCCESS)
716     {
717       if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
718 	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
719     }
720   else
721     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
722 }
723 
724 /* Dump information about the available hardware.  */
725 
726 static void
727 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
728 {
729   switch (machine_model)
730     {
731     case HSA_MACHINE_MODEL_SMALL:
732       GCN_DEBUG ("%s: SMALL\n", s);
733       break;
734     case HSA_MACHINE_MODEL_LARGE:
735       GCN_DEBUG ("%s: LARGE\n", s);
736       break;
737     default:
738       GCN_WARNING ("%s: UNKNOWN\n", s);
739       break;
740     }
741 }
742 
743 /* Dump information about the available hardware.  */
744 
745 static void
746 dump_profile (hsa_profile_t profile, const char *s)
747 {
748   switch (profile)
749     {
750     case HSA_PROFILE_FULL:
751       GCN_DEBUG ("%s: FULL\n", s);
752       break;
753     case HSA_PROFILE_BASE:
754       GCN_DEBUG ("%s: BASE\n", s);
755       break;
756     default:
757       GCN_WARNING ("%s: UNKNOWN\n", s);
758       break;
759     }
760 }
761 
762 /* Dump information about a device memory region.  */
763 
764 static hsa_status_t
765 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
766 {
767   hsa_status_t status;
768 
769   hsa_region_segment_t segment;
770   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
771 					   &segment);
772   if (status == HSA_STATUS_SUCCESS)
773     {
774       if (segment == HSA_REGION_SEGMENT_GLOBAL)
775 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
776       else if (segment == HSA_REGION_SEGMENT_READONLY)
777 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
778       else if (segment == HSA_REGION_SEGMENT_PRIVATE)
779 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
780       else if (segment == HSA_REGION_SEGMENT_GROUP)
781 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
782       else
783 	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
784     }
785   else
786     GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
787 
788   if (segment == HSA_REGION_SEGMENT_GLOBAL)
789     {
790       uint32_t flags;
791       status
792 	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
793 					  &flags);
794       if (status == HSA_STATUS_SUCCESS)
795 	{
796 	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
797 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
798 	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
799 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
800 	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
801 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
802 	}
803       else
804 	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
805     }
806 
807   size_t size;
808   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
809   if (status == HSA_STATUS_SUCCESS)
810     GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
811   else
812     GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
813 
814   status
815     = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
816 				      &size);
817   if (status == HSA_STATUS_SUCCESS)
818     GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
819   else
820     GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
821 
822   bool alloc_allowed;
823   status
824     = hsa_fns.hsa_region_get_info_fn (region,
825 				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
826 				      &alloc_allowed);
827   if (status == HSA_STATUS_SUCCESS)
828     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
829   else
830     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
831 
832   if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
833     return HSA_STATUS_SUCCESS;
834 
835   status
836     = hsa_fns.hsa_region_get_info_fn (region,
837 				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
838 				      &size);
839   if (status == HSA_STATUS_SUCCESS)
840     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
841   else
842     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
843 
844   size_t align;
845   status
846     = hsa_fns.hsa_region_get_info_fn (region,
847 				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
848 				      &align);
849   if (status == HSA_STATUS_SUCCESS)
850     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
851   else
852     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
853 
854   return HSA_STATUS_SUCCESS;
855 }
856 
857 /* Dump information about all the device memory regions.  */
858 
859 static void
860 dump_hsa_regions (hsa_agent_t agent)
861 {
862   hsa_status_t status;
863   status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
864 						 dump_hsa_region,
865 						 NULL);
866   if (status != HSA_STATUS_SUCCESS)
867     hsa_error ("Dumping hsa regions failed", status);
868 }
869 
870 /* Dump information about the available devices.  */
871 
872 static hsa_status_t
873 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
874 {
875   hsa_status_t status;
876 
877   char buf[64];
878   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
879 					  &buf);
880   if (status == HSA_STATUS_SUCCESS)
881     GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
882   else
883     GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
884 
885   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
886 					  &buf);
887   if (status == HSA_STATUS_SUCCESS)
888     GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
889   else
890     GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
891 
892   hsa_machine_model_t machine_model;
893   status
894     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
895 				     &machine_model);
896   if (status == HSA_STATUS_SUCCESS)
897     dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
898   else
899     GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
900 
901   hsa_profile_t profile;
902   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
903 					  &profile);
904   if (status == HSA_STATUS_SUCCESS)
905     dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
906   else
907     GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
908 
909   hsa_device_type_t device_type;
910   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
911 					  &device_type);
912   if (status == HSA_STATUS_SUCCESS)
913     {
914       switch (device_type)
915 	{
916 	case HSA_DEVICE_TYPE_CPU:
917 	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
918 	  break;
919 	case HSA_DEVICE_TYPE_GPU:
920 	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
921 	  break;
922 	case HSA_DEVICE_TYPE_DSP:
923 	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
924 	  break;
925 	default:
926 	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
927 	  break;
928 	}
929     }
930   else
931     GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
932 
933   uint32_t cu_count;
934   status = hsa_fns.hsa_agent_get_info_fn
935     (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
936   if (status == HSA_STATUS_SUCCESS)
937     GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
938   else
939     GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
940 
941   uint32_t size;
942   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
943 					  &size);
944   if (status == HSA_STATUS_SUCCESS)
945     GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
946   else
947     GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
948 
949   uint32_t max_dim;
950   status = hsa_fns.hsa_agent_get_info_fn (agent,
951 					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
952 					  &max_dim);
953   if (status == HSA_STATUS_SUCCESS)
954     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
955   else
956     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
957 
958   uint32_t max_size;
959   status = hsa_fns.hsa_agent_get_info_fn (agent,
960 					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
961 					  &max_size);
962   if (status == HSA_STATUS_SUCCESS)
963     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
964   else
965     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
966 
967   uint32_t grid_max_dim;
968   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
969 					  &grid_max_dim);
970   if (status == HSA_STATUS_SUCCESS)
971     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
972   else
973     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
974 
975   uint32_t grid_max_size;
976   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
977 					  &grid_max_size);
978   if (status == HSA_STATUS_SUCCESS)
979     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
980   else
981     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
982 
983   dump_hsa_regions (agent);
984 
985   return HSA_STATUS_SUCCESS;
986 }
987 
988 /* Forward reference.  */
989 
990 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
991 
992 /* Helper function for dump_executable_symbols.  */
993 
994 static hsa_status_t
995 dump_executable_symbol (hsa_executable_t executable,
996 			hsa_executable_symbol_t symbol,
997 			void *data __attribute__((unused)))
998 {
999   char *name = get_executable_symbol_name (symbol);
1000 
1001   if (name)
1002     {
1003       GCN_DEBUG ("executable symbol: %s\n", name);
1004       free (name);
1005     }
1006 
1007   return HSA_STATUS_SUCCESS;
1008 }
1009 
1010 /* Dump all global symbol in an executable.  */
1011 
1012 static void
1013 dump_executable_symbols (hsa_executable_t executable)
1014 {
1015   hsa_status_t status;
1016   status
1017     = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1018 						 dump_executable_symbol,
1019 						 NULL);
1020   if (status != HSA_STATUS_SUCCESS)
1021     hsa_fatal ("Could not dump HSA executable symbols", status);
1022 }
1023 
1024 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
1025 
1026 static void
1027 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1028 {
1029   struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1030 
1031   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1032   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1033   fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1034   fprintf (stderr, "%*sheap address: %p\n", indent, "",
1035 	   (void*)kernargs->heap_ptr);
1036   fprintf (stderr, "%*sarena address: %p\n", indent, "",
1037 	   (void*)kernargs->arena_ptr);
1038   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1039   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1040 	   dispatch->private_segment_size);
1041   fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
1042 	   dispatch->group_segment_size);
1043   fprintf (stderr, "\n");
1044 }
1045 
1046 /* }}}  */
1047 /* {{{ Utility functions  */
1048 
1049 /* Cast the thread local storage to gcn_thread.  */
1050 
1051 static inline struct gcn_thread *
1052 gcn_thread (void)
1053 {
1054   return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1055 }
1056 
1057 /* Initialize debug and suppress_host_fallback according to the environment.  */
1058 
1059 static void
1060 init_environment_variables (void)
1061 {
1062   if (secure_getenv ("GCN_DEBUG"))
1063     debug = true;
1064   else
1065     debug = false;
1066 
1067   if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1068     suppress_host_fallback = true;
1069   else
1070     suppress_host_fallback = false;
1071 
1072   hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1073   if (hsa_runtime_lib == NULL)
1074     hsa_runtime_lib = "libhsa-runtime64.so.1";
1075 
1076   support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1077 
1078   const char *x = secure_getenv ("GCN_NUM_TEAMS");
1079   if (!x)
1080     x = secure_getenv ("GCN_NUM_GANGS");
1081   if (x)
1082     override_x_dim = atoi (x);
1083 
1084   const char *z = secure_getenv ("GCN_NUM_THREADS");
1085   if (!z)
1086     z = secure_getenv ("GCN_NUM_WORKERS");
1087   if (z)
1088     override_z_dim = atoi (z);
1089 
1090   const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1091   if (heap)
1092     {
1093       size_t tmp = atol (heap);
1094       if (tmp)
1095 	gcn_kernel_heap_size = tmp;
1096     }
1097 }
1098 
1099 /* Return malloc'd string with name of SYMBOL.  */
1100 
1101 static char *
1102 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1103 {
1104   hsa_status_t status;
1105   char *res;
1106   uint32_t len;
1107   const hsa_executable_symbol_info_t info_name_length
1108     = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1109 
1110   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1111 						      &len);
1112   if (status != HSA_STATUS_SUCCESS)
1113     {
1114       hsa_error ("Could not get length of symbol name", status);
1115       return NULL;
1116     }
1117 
1118   res = GOMP_PLUGIN_malloc (len + 1);
1119 
1120   const hsa_executable_symbol_info_t info_name
1121     = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1122 
1123   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1124 
1125   if (status != HSA_STATUS_SUCCESS)
1126     {
1127       hsa_error ("Could not get symbol name", status);
1128       free (res);
1129       return NULL;
1130     }
1131 
1132   res[len] = '\0';
1133 
1134   return res;
1135 }
1136 
1137 /* Get the number of GPU Compute Units.  */
1138 
1139 static int
1140 get_cu_count (struct agent_info *agent)
1141 {
1142   uint32_t cu_count;
1143   hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1144     (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1145   if (status == HSA_STATUS_SUCCESS)
1146     return cu_count;
1147   else
1148     return 64;  /* The usual number for older devices.  */
1149 }
1150 
1151 /* Calculate the maximum grid size for OMP threads / OACC workers.
1152    This depends on the kernel's resource usage levels.  */
1153 
1154 static int
1155 limit_worker_threads (int threads)
1156 {
1157   /* FIXME Do something more inteligent here.
1158      GCN can always run 4 threads within a Compute Unit, but
1159      more than that depends on register usage.  */
1160   if (threads > 16)
1161     threads = 16;
1162   return threads;
1163 }
1164 
1165 /* Parse the target attributes INPUT provided by the compiler and return true
1166    if we should run anything all.  If INPUT is NULL, fill DEF with default
1167    values, then store INPUT or DEF into *RESULT.
1168 
1169    This is used for OpenMP only.  */
1170 
1171 static bool
1172 parse_target_attributes (void **input,
1173 			 struct GOMP_kernel_launch_attributes *def,
1174 			 struct GOMP_kernel_launch_attributes **result,
1175 			 struct agent_info *agent)
1176 {
1177   if (!input)
1178     GOMP_PLUGIN_fatal ("No target arguments provided");
1179 
1180   bool grid_attrs_found = false;
1181   bool gcn_dims_found = false;
1182   int gcn_teams = 0;
1183   int gcn_threads = 0;
1184   while (*input)
1185     {
1186       intptr_t id = (intptr_t) *input++, val;
1187 
1188       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1189 	val = (intptr_t) *input++;
1190       else
1191 	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1192 
1193       val = (val > INT_MAX) ? INT_MAX : val;
1194 
1195       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1196 	  && ((id & GOMP_TARGET_ARG_ID_MASK)
1197 	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1198 	{
1199 	  grid_attrs_found = true;
1200 	  break;
1201 	}
1202       else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1203 	       == GOMP_TARGET_ARG_DEVICE_ALL)
1204 	{
1205 	  gcn_dims_found = true;
1206 	  switch (id & GOMP_TARGET_ARG_ID_MASK)
1207 	    {
1208 	    case GOMP_TARGET_ARG_NUM_TEAMS:
1209 	      gcn_teams = val;
1210 	      break;
1211 	    case GOMP_TARGET_ARG_THREAD_LIMIT:
1212 	      gcn_threads = limit_worker_threads (val);
1213 	      break;
1214 	    default:
1215 	      ;
1216 	    }
1217 	}
1218     }
1219 
1220   if (gcn_dims_found)
1221     {
1222       bool gfx900_workaround_p = false;
1223 
1224       if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1225 	  && gcn_threads == 0 && override_z_dim == 0)
1226 	{
1227 	  gfx900_workaround_p = true;
1228 	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1229 		       "threads to at most 4 per team.\n");
1230 	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
1231 		       "GCN_NUM_THREADS=16\n");
1232 	}
1233 
1234       /* Ideally, when a dimension isn't explicitly specified, we should
1235 	 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1236 	 In practice, we tune for peak performance on BabelStream, which
1237 	 for OpenACC is currently 32 threads per CU.  */
1238       def->ndim = 3;
1239       if (gcn_teams <= 0 && gcn_threads <= 0)
1240 	{
1241 	  /* Set up a reasonable number of teams and threads.  */
1242 	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1243 	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1244 	  def->gdims[2] = gcn_threads;
1245 	}
1246       else if (gcn_teams <= 0 && gcn_threads > 0)
1247 	{
1248 	  /* Auto-scale the number of teams with the number of threads.  */
1249 	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1250 	  def->gdims[2] = gcn_threads;
1251 	}
1252       else if (gcn_teams > 0 && gcn_threads <= 0)
1253 	{
1254 	  int max_threads = gfx900_workaround_p ? 4 : 16;
1255 
1256 	  /* Auto-scale the number of threads with the number of teams.  */
1257 	  def->gdims[0] = gcn_teams;
1258 	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1259 	  if (def->gdims[2] == 0)
1260 	    def->gdims[2] = 1;
1261 	  else if (def->gdims[2] > max_threads)
1262 	    def->gdims[2] = max_threads;
1263 	}
1264       else
1265 	{
1266 	  def->gdims[0] = gcn_teams;
1267 	  def->gdims[2] = gcn_threads;
1268 	}
1269       def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
1270       def->wdims[0] = 1;  /* Single team per work-group.  */
1271       def->wdims[1] = 64;
1272       def->wdims[2] = 16;
1273       *result = def;
1274       return true;
1275     }
1276   else if (!grid_attrs_found)
1277     {
1278       def->ndim = 1;
1279       def->gdims[0] = 1;
1280       def->gdims[1] = 1;
1281       def->gdims[2] = 1;
1282       def->wdims[0] = 1;
1283       def->wdims[1] = 1;
1284       def->wdims[2] = 1;
1285       *result = def;
1286       GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1287       return true;
1288     }
1289 
1290   struct GOMP_kernel_launch_attributes *kla;
1291   kla = (struct GOMP_kernel_launch_attributes *) *input;
1292   *result = kla;
1293   if (kla->ndim == 0 || kla->ndim > 3)
1294     GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1295 
1296   GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1297   unsigned i;
1298   for (i = 0; i < kla->ndim; i++)
1299     {
1300       GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
1301 		 kla->gdims[i], kla->wdims[i]);
1302       if (kla->gdims[i] == 0)
1303 	return false;
1304     }
1305   return true;
1306 }
1307 
1308 /* Return the group size given the requested GROUP size, GRID size and number
1309    of grid dimensions NDIM.  */
1310 
1311 static uint32_t
1312 get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1313 {
1314   if (group == 0)
1315     {
1316       /* TODO: Provide a default via environment or device characteristics.  */
1317       if (ndim == 1)
1318 	group = 64;
1319       else if (ndim == 2)
1320 	group = 8;
1321       else
1322 	group = 4;
1323     }
1324 
1325   if (group > grid)
1326     group = grid;
1327   return group;
1328 }
1329 
1330 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
1331 
1332 static void
1333 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1334 {
1335   __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1336 }
1337 
1338 /* A never-called callback for the HSA command queues.  These signal events
1339    that we don't use, so we trigger an error.
1340 
1341    This "queue" is not to be confused with the async queues, below.  */
1342 
1343 static void
1344 hsa_queue_callback (hsa_status_t status,
1345 		hsa_queue_t *queue __attribute__ ((unused)),
1346 		void *data __attribute__ ((unused)))
1347 {
1348   hsa_fatal ("Asynchronous queue error", status);
1349 }
1350 
1351 /* }}}  */
1352 /* {{{ HSA initialization  */
1353 
1354 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
1355 
1356 static bool
1357 init_hsa_runtime_functions (void)
1358 {
1359 #define DLSYM_FN(function) \
1360   hsa_fns.function##_fn = dlsym (handle, #function); \
1361   if (hsa_fns.function##_fn == NULL) \
1362     return false;
1363   void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1364   if (handle == NULL)
1365     return false;
1366 
1367   DLSYM_FN (hsa_status_string)
1368   DLSYM_FN (hsa_system_get_info)
1369   DLSYM_FN (hsa_agent_get_info)
1370   DLSYM_FN (hsa_init)
1371   DLSYM_FN (hsa_iterate_agents)
1372   DLSYM_FN (hsa_region_get_info)
1373   DLSYM_FN (hsa_queue_create)
1374   DLSYM_FN (hsa_agent_iterate_regions)
1375   DLSYM_FN (hsa_executable_destroy)
1376   DLSYM_FN (hsa_executable_create)
1377   DLSYM_FN (hsa_executable_global_variable_define)
1378   DLSYM_FN (hsa_executable_load_code_object)
1379   DLSYM_FN (hsa_executable_freeze)
1380   DLSYM_FN (hsa_signal_create)
1381   DLSYM_FN (hsa_memory_allocate)
1382   DLSYM_FN (hsa_memory_assign_agent)
1383   DLSYM_FN (hsa_memory_copy)
1384   DLSYM_FN (hsa_memory_free)
1385   DLSYM_FN (hsa_signal_destroy)
1386   DLSYM_FN (hsa_executable_get_symbol)
1387   DLSYM_FN (hsa_executable_symbol_get_info)
1388   DLSYM_FN (hsa_executable_iterate_symbols)
1389   DLSYM_FN (hsa_queue_add_write_index_release)
1390   DLSYM_FN (hsa_queue_load_read_index_acquire)
1391   DLSYM_FN (hsa_signal_wait_acquire)
1392   DLSYM_FN (hsa_signal_store_relaxed)
1393   DLSYM_FN (hsa_signal_store_release)
1394   DLSYM_FN (hsa_signal_load_acquire)
1395   DLSYM_FN (hsa_queue_destroy)
1396   DLSYM_FN (hsa_code_object_deserialize)
1397   return true;
1398 #undef DLSYM_FN
1399 }
1400 
1401 /* Return true if the agent is a GPU and can accept of concurrent submissions
1402    from different threads.  */
1403 
1404 static bool
1405 suitable_hsa_agent_p (hsa_agent_t agent)
1406 {
1407   hsa_device_type_t device_type;
1408   hsa_status_t status
1409     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1410 				     &device_type);
1411   if (status != HSA_STATUS_SUCCESS)
1412     return false;
1413 
1414   switch (device_type)
1415     {
1416     case HSA_DEVICE_TYPE_GPU:
1417       break;
1418     case HSA_DEVICE_TYPE_CPU:
1419       if (!support_cpu_devices)
1420 	return false;
1421       break;
1422     default:
1423       return false;
1424     }
1425 
1426   uint32_t features = 0;
1427   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1428 					  &features);
1429   if (status != HSA_STATUS_SUCCESS
1430       || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1431     return false;
1432   hsa_queue_type_t queue_type;
1433   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1434 					  &queue_type);
1435   if (status != HSA_STATUS_SUCCESS
1436       || (queue_type != HSA_QUEUE_TYPE_MULTI))
1437     return false;
1438 
1439   return true;
1440 }
1441 
1442 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1443    agent_count in hsa_context.  */
1444 
1445 static hsa_status_t
1446 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1447 {
1448   if (suitable_hsa_agent_p (agent))
1449     hsa_context.agent_count++;
1450   return HSA_STATUS_SUCCESS;
1451 }
1452 
1453 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1454    id to the describing structure in the hsa context.  The index of the
1455    structure is pointed to by DATA, increment it afterwards.  */
1456 
1457 static hsa_status_t
1458 assign_agent_ids (hsa_agent_t agent, void *data)
1459 {
1460   if (suitable_hsa_agent_p (agent))
1461     {
1462       int *agent_index = (int *) data;
1463       hsa_context.agents[*agent_index].id = agent;
1464       ++*agent_index;
1465     }
1466   return HSA_STATUS_SUCCESS;
1467 }
1468 
1469 /* Initialize hsa_context if it has not already been done.
1470    Return TRUE on success.  */
1471 
1472 static bool
1473 init_hsa_context (void)
1474 {
1475   hsa_status_t status;
1476   int agent_index = 0;
1477 
1478   if (hsa_context.initialized)
1479     return true;
1480   init_environment_variables ();
1481   if (!init_hsa_runtime_functions ())
1482     {
1483       GCN_WARNING ("Run-time could not be dynamically opened\n");
1484       if (suppress_host_fallback)
1485 	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1486       return false;
1487     }
1488   status = hsa_fns.hsa_init_fn ();
1489   if (status != HSA_STATUS_SUCCESS)
1490     return hsa_error ("Run-time could not be initialized", status);
1491   GCN_DEBUG ("HSA run-time initialized for GCN\n");
1492 
1493   if (debug)
1494     dump_hsa_system_info ();
1495 
1496   status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1497   if (status != HSA_STATUS_SUCCESS)
1498     return hsa_error ("GCN GPU devices could not be enumerated", status);
1499   GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1500 
1501   hsa_context.agents
1502     = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1503 				  * sizeof (struct agent_info));
1504   status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1505   if (status != HSA_STATUS_SUCCESS)
1506     return hsa_error ("Scanning compute agents failed", status);
1507   if (agent_index != hsa_context.agent_count)
1508     {
1509       GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1510       return false;
1511     }
1512 
1513   if (debug)
1514     {
1515       status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1516       if (status != HSA_STATUS_SUCCESS)
1517 	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1518     }
1519 
1520   uint16_t minor, major;
1521   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1522 					   &minor);
1523   if (status != HSA_STATUS_SUCCESS)
1524     GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1525   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1526 					   &major);
1527   if (status != HSA_STATUS_SUCCESS)
1528     GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1529 
1530   size_t len = sizeof hsa_context.driver_version_s;
1531   int printed = snprintf (hsa_context.driver_version_s, len,
1532 			  "HSA Runtime %hu.%hu", (unsigned short int)major,
1533 			  (unsigned short int)minor);
1534   if (printed >= len)
1535     GCN_WARNING ("HSA runtime version string was truncated."
1536 		 "Version %hu.%hu is too long.", (unsigned short int)major,
1537 		 (unsigned short int)minor);
1538 
1539   hsa_context.initialized = true;
1540   return true;
1541 }
1542 
1543 /* Verify that hsa_context has already been initialized and return the
1544    agent_info structure describing device number N.  Return NULL on error.  */
1545 
1546 static struct agent_info *
1547 get_agent_info (int n)
1548 {
1549   if (!hsa_context.initialized)
1550     {
1551       GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1552       return NULL;
1553     }
1554   if (n >= hsa_context.agent_count)
1555     {
1556       GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1557       return NULL;
1558     }
1559   if (!hsa_context.agents[n].initialized)
1560     {
1561       GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1562       return NULL;
1563     }
1564   return &hsa_context.agents[n];
1565 }
1566 
1567 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1568 
1569    Selects (breaks at) a suitable region of type KIND.  */
1570 
1571 static hsa_status_t
1572 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1573 		   hsa_region_global_flag_t kind)
1574 {
1575   hsa_status_t status;
1576   hsa_region_segment_t segment;
1577 
1578   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1579 					   &segment);
1580   if (status != HSA_STATUS_SUCCESS)
1581     return status;
1582   if (segment != HSA_REGION_SEGMENT_GLOBAL)
1583     return HSA_STATUS_SUCCESS;
1584 
1585   uint32_t flags;
1586   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1587 					   &flags);
1588   if (status != HSA_STATUS_SUCCESS)
1589     return status;
1590   if (flags & kind)
1591     {
1592       *retval = region;
1593       return HSA_STATUS_INFO_BREAK;
1594     }
1595   return HSA_STATUS_SUCCESS;
1596 }
1597 
1598 /* Callback of hsa_agent_iterate_regions.
1599 
1600    Selects a kernargs memory region.  */
1601 
1602 static hsa_status_t
1603 get_kernarg_memory_region (hsa_region_t region, void *data)
1604 {
1605   return get_memory_region (region, (hsa_region_t *)data,
1606 			    HSA_REGION_GLOBAL_FLAG_KERNARG);
1607 }
1608 
1609 /* Callback of hsa_agent_iterate_regions.
1610 
1611    Selects a coarse-grained memory region suitable for the heap and
1612    offload data.  */
1613 
1614 static hsa_status_t
1615 get_data_memory_region (hsa_region_t region, void *data)
1616 {
1617   return get_memory_region (region, (hsa_region_t *)data,
1618 			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1619 }
1620 
1621 static int
1622 elf_gcn_isa_field (Elf64_Ehdr *image)
1623 {
1624   return image->e_flags & EF_AMDGPU_MACH_MASK;
1625 }
1626 
1627 const static char *gcn_gfx803_s = "gfx803";
1628 const static char *gcn_gfx900_s = "gfx900";
1629 const static char *gcn_gfx906_s = "gfx906";
1630 const static char *gcn_gfx908_s = "gfx908";
1631 const static int gcn_isa_name_len = 6;
1632 
1633 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1634    support the ISA. */
1635 
1636 static const char*
1637 isa_hsa_name (int isa) {
1638   switch(isa)
1639     {
1640     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1641       return gcn_gfx803_s;
1642     case EF_AMDGPU_MACH_AMDGCN_GFX900:
1643       return gcn_gfx900_s;
1644     case EF_AMDGPU_MACH_AMDGCN_GFX906:
1645       return gcn_gfx906_s;
1646     case EF_AMDGPU_MACH_AMDGCN_GFX908:
1647       return gcn_gfx908_s;
1648     }
1649   return NULL;
1650 }
1651 
1652 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1653    with -march) or NULL if we do not support the ISA.
1654    Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */
1655 
1656 static const char*
1657 isa_gcc_name (int isa) {
1658   switch(isa)
1659     {
1660     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1661       return "fiji";
1662     default:
1663       return isa_hsa_name (isa);
1664     }
1665 }
1666 
1667 /* Returns the code which is used in the GCN object code to identify the ISA with
1668    the given name (as used by the HSA runtime).  */
1669 
1670 static gcn_isa
1671 isa_code(const char *isa) {
1672   if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1673     return EF_AMDGPU_MACH_AMDGCN_GFX803;
1674 
1675   if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1676     return EF_AMDGPU_MACH_AMDGCN_GFX900;
1677 
1678   if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1679     return EF_AMDGPU_MACH_AMDGCN_GFX906;
1680 
1681   if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1682     return EF_AMDGPU_MACH_AMDGCN_GFX908;
1683 
1684   return -1;
1685 }
1686 
1687 /* }}}  */
1688 /* {{{ Run  */
1689 
1690 /* Create or reuse a team arena.
1691 
1692    Team arenas are used by OpenMP to avoid calling malloc multiple times
1693    while setting up each team.  This is purely a performance optimization.
1694 
1695    Allocating an arena also costs performance, albeit on the host side, so
1696    this function will reuse an existing arena if a large enough one is idle.
1697    The arena is released, but not deallocated, when the kernel exits.  */
1698 
1699 static void *
1700 get_team_arena (struct agent_info *agent, int num_teams)
1701 {
1702   struct team_arena_list **next_ptr = &agent->team_arena_list;
1703   struct team_arena_list *item;
1704 
1705   for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1706     {
1707       if (item->num_teams < num_teams)
1708 	continue;
1709 
1710       if (pthread_mutex_trylock (&item->in_use))
1711 	continue;
1712 
1713       return item->arena;
1714     }
1715 
1716   GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
1717 
1718   if (pthread_mutex_lock (&agent->team_arena_write_lock))
1719     {
1720       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1721       return false;
1722     }
1723   item = malloc (sizeof (*item));
1724   item->num_teams = num_teams;
1725   item->next = NULL;
1726   *next_ptr = item;
1727 
1728   if (pthread_mutex_init (&item->in_use, NULL))
1729     {
1730       GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1731       return false;
1732     }
1733   if (pthread_mutex_lock (&item->in_use))
1734     {
1735       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1736       return false;
1737     }
1738   if (pthread_mutex_unlock (&agent->team_arena_write_lock))
1739     {
1740       GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1741       return false;
1742     }
1743 
1744   const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
1745   hsa_status_t status;
1746   status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1747 					   TEAM_ARENA_SIZE*num_teams,
1748 					   &item->arena);
1749   if (status != HSA_STATUS_SUCCESS)
1750     hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1751   status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1752 					       HSA_ACCESS_PERMISSION_RW);
1753   if (status != HSA_STATUS_SUCCESS)
1754     hsa_fatal ("Could not assign arena memory to device", status);
1755 
1756   return item->arena;
1757 }
1758 
1759 /* Mark a team arena available for reuse.  */
1760 
1761 static void
1762 release_team_arena (struct agent_info* agent, void *arena)
1763 {
1764   struct team_arena_list *item;
1765 
1766   for (item = agent->team_arena_list; item; item = item->next)
1767     {
1768       if (item->arena == arena)
1769 	{
1770 	  if (pthread_mutex_unlock (&item->in_use))
1771 	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1772 	  return;
1773 	}
1774     }
1775   GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1776 }
1777 
1778 /* Clean up all the allocated team arenas.  */
1779 
1780 static bool
1781 destroy_team_arenas (struct agent_info *agent)
1782 {
1783   struct team_arena_list *item, *next;
1784 
1785   for (item = agent->team_arena_list; item; item = next)
1786     {
1787       next = item->next;
1788       hsa_fns.hsa_memory_free_fn (item->arena);
1789       if (pthread_mutex_destroy (&item->in_use))
1790 	{
1791 	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1792 	  return false;
1793 	}
1794       free (item);
1795     }
1796   agent->team_arena_list = NULL;
1797 
1798   return true;
1799 }
1800 
1801 /* Allocate memory on a specified device.  */
1802 
1803 static void *
1804 alloc_by_agent (struct agent_info *agent, size_t size)
1805 {
1806   GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1807 
1808   /* Zero-size allocations are invalid, so in order to return a valid pointer
1809      we need to pass a valid size.  One source of zero-size allocations is
1810      kernargs for kernels that have no inputs or outputs (the kernel may
1811      only use console output, for example).  */
1812   if (size == 0)
1813     size = 4;
1814 
1815   void *ptr;
1816   hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1817 							size, &ptr);
1818   if (status != HSA_STATUS_SUCCESS)
1819     {
1820       hsa_error ("Could not allocate device memory", status);
1821       return NULL;
1822     }
1823 
1824   status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1825 					       HSA_ACCESS_PERMISSION_RW);
1826   if (status != HSA_STATUS_SUCCESS)
1827     {
1828       hsa_error ("Could not assign data memory to device", status);
1829       return NULL;
1830     }
1831 
1832   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1833   bool profiling_dispatch_p
1834     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1835   if (profiling_dispatch_p)
1836     {
1837       acc_prof_info *prof_info = thr->prof_info;
1838       acc_event_info data_event_info;
1839       acc_api_info *api_info = thr->api_info;
1840 
1841       prof_info->event_type = acc_ev_alloc;
1842 
1843       data_event_info.data_event.event_type = prof_info->event_type;
1844       data_event_info.data_event.valid_bytes
1845 	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
1846       data_event_info.data_event.parent_construct
1847 	= acc_construct_parallel;
1848       data_event_info.data_event.implicit = 1;
1849       data_event_info.data_event.tool_info = NULL;
1850       data_event_info.data_event.var_name = NULL;
1851       data_event_info.data_event.bytes = size;
1852       data_event_info.data_event.host_ptr = NULL;
1853       data_event_info.data_event.device_ptr = (void *) ptr;
1854 
1855       api_info->device_api = acc_device_api_other;
1856 
1857       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1858 					    api_info);
1859     }
1860 
1861   return ptr;
1862 }
1863 
1864 /* Create kernel dispatch data structure for given KERNEL, along with
1865    the necessary device signals and memory allocations.  */
1866 
1867 static struct kernel_dispatch *
1868 create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1869 {
1870   struct agent_info *agent = kernel->agent;
1871   struct kernel_dispatch *shadow
1872     = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1873 
1874   shadow->agent = kernel->agent;
1875   shadow->object = kernel->object;
1876 
1877   hsa_signal_t sync_signal;
1878   hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1879   if (status != HSA_STATUS_SUCCESS)
1880     hsa_fatal ("Error creating the GCN sync signal", status);
1881 
1882   shadow->signal = sync_signal.handle;
1883   shadow->private_segment_size = kernel->private_segment_size;
1884   shadow->group_segment_size = kernel->group_segment_size;
1885 
1886   /* We expect kernels to request a single pointer, explicitly, and the
1887      rest of struct kernargs, implicitly.  If they request anything else
1888      then something is wrong.  */
1889   if (kernel->kernarg_segment_size > 8)
1890     {
1891       GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1892       return NULL;
1893     }
1894 
1895   status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1896 					   sizeof (struct kernargs),
1897 					   &shadow->kernarg_address);
1898   if (status != HSA_STATUS_SUCCESS)
1899     hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1900   struct kernargs *kernargs = shadow->kernarg_address;
1901 
1902   /* Zero-initialize the output_data (minimum needed).  */
1903   kernargs->out_ptr = (int64_t)&kernargs->output_data;
1904   kernargs->output_data.next_output = 0;
1905   for (unsigned i = 0;
1906        i < (sizeof (kernargs->output_data.queue)
1907 	    / sizeof (kernargs->output_data.queue[0]));
1908        i++)
1909     kernargs->output_data.queue[i].written = 0;
1910   kernargs->output_data.consumed = 0;
1911 
1912   /* Pass in the heap location.  */
1913   kernargs->heap_ptr = (int64_t)kernel->module->heap;
1914 
1915   /* Create an arena.  */
1916   if (kernel->kind == KIND_OPENMP)
1917     kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1918   else
1919     kernargs->arena_ptr = 0;
1920 
1921   /* Ensure we can recognize unset return values.  */
1922   kernargs->output_data.return_value = 0xcafe0000;
1923 
1924   return shadow;
1925 }
1926 
1927 /* Output any data written to console output from the kernel.  It is expected
1928    that this function is polled during kernel execution.
1929 
1930    We print all entries from the last item printed to the next entry without
1931    a "written" flag.  If the "final" flag is set then it'll continue right to
1932    the end.
1933 
1934    The print buffer is circular, but the from and to locations don't wrap when
1935    the buffer does, so the output limit is UINT_MAX.  The target blocks on
1936    output when the buffer is full.  */
1937 
1938 static void
1939 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1940 		bool final)
1941 {
1942   unsigned int limit = (sizeof (kernargs->output_data.queue)
1943 			/ sizeof (kernargs->output_data.queue[0]));
1944 
1945   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1946 				       __ATOMIC_ACQUIRE);
1947   unsigned int to = kernargs->output_data.next_output;
1948 
1949   if (from > to)
1950     {
1951       /* Overflow.  */
1952       if (final)
1953 	printf ("GCN print buffer overflowed.\n");
1954       return;
1955     }
1956 
1957   unsigned int i;
1958   for (i = from; i < to; i++)
1959     {
1960       struct printf_data *data = &kernargs->output_data.queue[i%limit];
1961 
1962       if (!data->written && !final)
1963 	break;
1964 
1965       switch (data->type)
1966 	{
1967 	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1968 	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1969 	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1970 	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1971 	default: printf ("GCN print buffer error!\n"); break;
1972 	}
1973       data->written = 0;
1974       __atomic_store_n (&kernargs->output_data.consumed, i+1,
1975 			__ATOMIC_RELEASE);
1976     }
1977   fflush (stdout);
1978 }
1979 
1980 /* Release data structure created for a kernel dispatch in SHADOW argument,
1981    and clean up the signal and memory allocations.  */
1982 
1983 static void
1984 release_kernel_dispatch (struct kernel_dispatch *shadow)
1985 {
1986   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1987 
1988   struct kernargs *kernargs = shadow->kernarg_address;
1989   void *arena = (void *)kernargs->arena_ptr;
1990   if (arena)
1991     release_team_arena (shadow->agent, arena);
1992 
1993   hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1994 
1995   hsa_signal_t s;
1996   s.handle = shadow->signal;
1997   hsa_fns.hsa_signal_destroy_fn (s);
1998 
1999   free (shadow);
2000 }
2001 
2002 /* Extract the properties from a kernel binary.  */
2003 
2004 static void
2005 init_kernel_properties (struct kernel_info *kernel)
2006 {
2007   hsa_status_t status;
2008   struct agent_info *agent = kernel->agent;
2009   hsa_executable_symbol_t kernel_symbol;
2010   char *buf = alloca (strlen (kernel->name) + 4);
2011   sprintf (buf, "%s.kd", kernel->name);
2012   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2013 						 buf, agent->id,
2014 						 0, &kernel_symbol);
2015   if (status != HSA_STATUS_SUCCESS)
2016     {
2017       hsa_warn ("Could not find symbol for kernel in the code object", status);
2018       fprintf (stderr, "not found name: '%s'\n", buf);
2019       dump_executable_symbols (agent->executable);
2020       goto failure;
2021     }
2022   GCN_DEBUG ("Located kernel %s\n", kernel->name);
2023   status = hsa_fns.hsa_executable_symbol_get_info_fn
2024     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2025   if (status != HSA_STATUS_SUCCESS)
2026     hsa_fatal ("Could not extract a kernel object from its symbol", status);
2027   status = hsa_fns.hsa_executable_symbol_get_info_fn
2028     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2029      &kernel->kernarg_segment_size);
2030   if (status != HSA_STATUS_SUCCESS)
2031     hsa_fatal ("Could not get info about kernel argument size", status);
2032   status = hsa_fns.hsa_executable_symbol_get_info_fn
2033     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2034      &kernel->group_segment_size);
2035   if (status != HSA_STATUS_SUCCESS)
2036     hsa_fatal ("Could not get info about kernel group segment size", status);
2037   status = hsa_fns.hsa_executable_symbol_get_info_fn
2038     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2039      &kernel->private_segment_size);
2040   if (status != HSA_STATUS_SUCCESS)
2041     hsa_fatal ("Could not get info about kernel private segment size",
2042 	       status);
2043 
2044   /* The kernel type is not known until something tries to launch it.  */
2045   kernel->kind = KIND_UNKNOWN;
2046 
2047   GCN_DEBUG ("Kernel structure for %s fully initialized with "
2048 	     "following segment sizes: \n", kernel->name);
2049   GCN_DEBUG ("  group_segment_size: %u\n",
2050 	     (unsigned) kernel->group_segment_size);
2051   GCN_DEBUG ("  private_segment_size: %u\n",
2052 	     (unsigned) kernel->private_segment_size);
2053   GCN_DEBUG ("  kernarg_segment_size: %u\n",
2054 	     (unsigned) kernel->kernarg_segment_size);
2055   return;
2056 
2057 failure:
2058   kernel->initialization_failed = true;
2059 }
2060 
2061 /* Do all the work that is necessary before running KERNEL for the first time.
2062    The function assumes the program has been created, finalized and frozen by
2063    create_and_finalize_hsa_program.  */
2064 
2065 static void
2066 init_kernel (struct kernel_info *kernel)
2067 {
2068   if (pthread_mutex_lock (&kernel->init_mutex))
2069     GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2070   if (kernel->initialized)
2071     {
2072       if (pthread_mutex_unlock (&kernel->init_mutex))
2073 	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2074 			   "mutex");
2075 
2076       return;
2077     }
2078 
2079   init_kernel_properties (kernel);
2080 
2081   if (!kernel->initialization_failed)
2082     {
2083       GCN_DEBUG ("\n");
2084 
2085       kernel->initialized = true;
2086     }
2087   if (pthread_mutex_unlock (&kernel->init_mutex))
2088     GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2089 		       "mutex");
2090 }
2091 
2092 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2093    launch attributes from KLA.
2094 
2095    MODULE_LOCKED indicates that the caller already holds the lock and
2096    run_kernel need not lock it again.
2097    If AQ is NULL then agent->sync_queue will be used.  */
2098 
2099 static void
2100 run_kernel (struct kernel_info *kernel, void *vars,
2101 	    struct GOMP_kernel_launch_attributes *kla,
2102 	    struct goacc_asyncqueue *aq, bool module_locked)
2103 {
2104   GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2105 	     kernel->description->vpgr_count);
2106 
2107   /* Reduce the number of threads/workers if there are insufficient
2108      VGPRs available to run the kernels together.  */
2109   if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2110     {
2111       int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2112       int max_threads = (256 / granulated_vgprs) * 4;
2113       if (kla->gdims[2] > max_threads)
2114 	{
2115 	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2116 		       " per team/gang - reducing to %d threads/workers.\n",
2117 		       kla->gdims[2], max_threads);
2118 	  kla->gdims[2] = max_threads;
2119 	}
2120     }
2121 
2122   GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2123 	     (aq ? aq->id : 0));
2124   GCN_DEBUG ("GCN launch attribs: gdims:[");
2125   int i;
2126   for (i = 0; i < kla->ndim; ++i)
2127     {
2128       if (i)
2129 	DEBUG_PRINT (", ");
2130       DEBUG_PRINT ("%u", kla->gdims[i]);
2131     }
2132   DEBUG_PRINT ("], normalized gdims:[");
2133   for (i = 0; i < kla->ndim; ++i)
2134     {
2135       if (i)
2136 	DEBUG_PRINT (", ");
2137       DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2138     }
2139   DEBUG_PRINT ("], wdims:[");
2140   for (i = 0; i < kla->ndim; ++i)
2141     {
2142       if (i)
2143 	DEBUG_PRINT (", ");
2144       DEBUG_PRINT ("%u", kla->wdims[i]);
2145     }
2146   DEBUG_PRINT ("]\n");
2147   DEBUG_FLUSH ();
2148 
2149   struct agent_info *agent = kernel->agent;
2150   if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2151     GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2152 
2153   if (!agent->initialized)
2154     GOMP_PLUGIN_fatal ("Agent must be initialized");
2155 
2156   if (!kernel->initialized)
2157     GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2158 
2159   hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2160 
2161   uint64_t index
2162     = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2163   GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2164 
2165   /* Wait until the queue is not full before writing the packet.   */
2166   while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2167 	 >= command_q->size)
2168     ;
2169 
2170   /* Do not allow the dimensions to be overridden when running
2171      constructors or destructors.  */
2172   int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2173   int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2174 
2175   hsa_kernel_dispatch_packet_t *packet;
2176   packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2177 	   + index % command_q->size;
2178 
2179   memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2180   packet->grid_size_x = override_x ? : kla->gdims[0];
2181   packet->workgroup_size_x = get_group_size (kla->ndim,
2182 					     packet->grid_size_x,
2183 					     kla->wdims[0]);
2184 
2185   if (kla->ndim >= 2)
2186     {
2187       packet->grid_size_y = kla->gdims[1];
2188       packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2189 						 kla->wdims[1]);
2190     }
2191   else
2192     {
2193       packet->grid_size_y = 1;
2194       packet->workgroup_size_y = 1;
2195     }
2196 
2197   if (kla->ndim == 3)
2198     {
2199       packet->grid_size_z = limit_worker_threads (override_z
2200 						  ? : kla->gdims[2]);
2201       packet->workgroup_size_z = get_group_size (kla->ndim,
2202 						 packet->grid_size_z,
2203 						 kla->wdims[2]);
2204     }
2205   else
2206     {
2207       packet->grid_size_z = 1;
2208       packet->workgroup_size_z = 1;
2209     }
2210 
2211   GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2212 	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2213 	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2214 	     packet->grid_size_x / packet->workgroup_size_x,
2215 	     packet->grid_size_y / packet->workgroup_size_y,
2216 	     packet->grid_size_z / packet->workgroup_size_z,
2217 	     packet->workgroup_size_x, packet->workgroup_size_y,
2218 	     packet->workgroup_size_z);
2219 
2220   struct kernel_dispatch *shadow
2221     = create_kernel_dispatch (kernel, packet->grid_size_x);
2222   shadow->queue = command_q;
2223 
2224   if (debug)
2225     {
2226       fprintf (stderr, "\nKernel has following dependencies:\n");
2227       print_kernel_dispatch (shadow, 2);
2228     }
2229 
2230   packet->private_segment_size = kernel->private_segment_size;
2231   packet->group_segment_size = kernel->group_segment_size;
2232   packet->kernel_object = kernel->object;
2233   packet->kernarg_address = shadow->kernarg_address;
2234   hsa_signal_t s;
2235   s.handle = shadow->signal;
2236   packet->completion_signal = s;
2237   hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2238   memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2239 
2240   GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2241 
2242   uint16_t header;
2243   header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2244   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2245   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2246 
2247   GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2248 	     agent->device_id);
2249 
2250   packet_store_release ((uint32_t *) packet, header,
2251 			(uint16_t) kla->ndim
2252 			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2253 
2254   hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2255 				       index);
2256 
2257   GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2258 
2259   /* Root signal waits with 1ms timeout.  */
2260   while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2261 					     1000 * 1000,
2262 					     HSA_WAIT_STATE_BLOCKED) != 0)
2263     {
2264       console_output (kernel, shadow->kernarg_address, false);
2265     }
2266   console_output (kernel, shadow->kernarg_address, true);
2267 
2268   struct kernargs *kernargs = shadow->kernarg_address;
2269   unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2270 
2271   release_kernel_dispatch (shadow);
2272 
2273   if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2274     GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2275 
2276   unsigned int upper = (return_value & ~0xffff) >> 16;
2277   if (upper == 0xcafe)
2278     ; // exit not called, normal termination.
2279   else if (upper == 0xffff)
2280     ; // exit called.
2281   else
2282     {
2283       GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2284 			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2285 			 return_value);
2286       abort ();
2287     }
2288 
2289   if (upper == 0xffff)
2290     {
2291       unsigned int signal = (return_value >> 8) & 0xff;
2292 
2293       if (signal == SIGABRT)
2294 	{
2295 	  GCN_WARNING ("GCN Kernel aborted\n");
2296 	  abort ();
2297 	}
2298       else if (signal != 0)
2299 	{
2300 	  GCN_WARNING ("GCN Kernel received unknown signal\n");
2301 	  abort ();
2302 	}
2303 
2304       GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2305       exit (return_value & 0xff);
2306     }
2307 }
2308 
2309 /* }}}  */
2310 /* {{{ Load/Unload  */
2311 
2312 /* Initialize KERNEL from D and other parameters.  Return true on success. */
2313 
2314 static bool
2315 init_basic_kernel_info (struct kernel_info *kernel,
2316 			struct hsa_kernel_description *d,
2317 			struct agent_info *agent,
2318 			struct module_info *module)
2319 {
2320   kernel->agent = agent;
2321   kernel->module = module;
2322   kernel->name = d->name;
2323   kernel->description = d;
2324   if (pthread_mutex_init (&kernel->init_mutex, NULL))
2325     {
2326       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2327       return false;
2328     }
2329   return true;
2330 }
2331 
2332 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2333 
2334 static bool
2335 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2336 {
2337   int isa_field = elf_gcn_isa_field (image);
2338   const char* isa_s = isa_hsa_name (isa_field);
2339   if (!isa_s)
2340     {
2341       hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2342       return false;
2343     }
2344 
2345   if (isa_field != agent->device_isa)
2346     {
2347       char msg[120];
2348       const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2349       const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2350       assert (agent_isa_s);
2351       assert (agent_isa_gcc_s);
2352 
2353       snprintf (msg, sizeof msg,
2354 		"GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2355 		"Try to recompile with '-foffload=-march=%s'.\n",
2356 		isa_s, agent_isa_s, agent_isa_gcc_s);
2357 
2358       hsa_error (msg, HSA_STATUS_ERROR);
2359       return false;
2360     }
2361 
2362   return true;
2363 }
2364 
2365 /* Create and finalize the program consisting of all loaded modules.  */
2366 
2367 static bool
2368 create_and_finalize_hsa_program (struct agent_info *agent)
2369 {
2370   hsa_status_t status;
2371   bool res = true;
2372   if (pthread_mutex_lock (&agent->prog_mutex))
2373     {
2374       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2375       return false;
2376     }
2377   if (agent->prog_finalized)
2378     goto final;
2379 
2380   status
2381     = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2382 					HSA_EXECUTABLE_STATE_UNFROZEN,
2383 					"", &agent->executable);
2384   if (status != HSA_STATUS_SUCCESS)
2385     {
2386       hsa_error ("Could not create GCN executable", status);
2387       goto fail;
2388     }
2389 
2390   /* Load any GCN modules.  */
2391   struct module_info *module = agent->module;
2392   if (module)
2393     {
2394       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2395 
2396       if (!isa_matches_agent (agent, image))
2397 	goto fail;
2398 
2399       hsa_code_object_t co = { 0 };
2400       status = hsa_fns.hsa_code_object_deserialize_fn
2401 	(module->image_desc->gcn_image->image,
2402 	 module->image_desc->gcn_image->size,
2403 	 NULL, &co);
2404       if (status != HSA_STATUS_SUCCESS)
2405 	{
2406 	  hsa_error ("Could not deserialize GCN code object", status);
2407 	  goto fail;
2408 	}
2409 
2410       status = hsa_fns.hsa_executable_load_code_object_fn
2411 	(agent->executable, agent->id, co, "");
2412       if (status != HSA_STATUS_SUCCESS)
2413 	{
2414 	  hsa_error ("Could not load GCN code object", status);
2415 	  goto fail;
2416 	}
2417 
2418       if (!module->heap)
2419 	{
2420 	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2421 						   gcn_kernel_heap_size,
2422 						   (void**)&module->heap);
2423 	  if (status != HSA_STATUS_SUCCESS)
2424 	    {
2425 	      hsa_error ("Could not allocate memory for GCN heap", status);
2426 	      goto fail;
2427 	    }
2428 
2429 	  status = hsa_fns.hsa_memory_assign_agent_fn
2430 			(module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2431 	  if (status != HSA_STATUS_SUCCESS)
2432 	    {
2433 	      hsa_error ("Could not assign GCN heap memory to device", status);
2434 	      goto fail;
2435 	    }
2436 
2437 	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2438 				      &gcn_kernel_heap_size,
2439 				      sizeof (gcn_kernel_heap_size));
2440 	}
2441 
2442     }
2443 
2444   if (debug)
2445     dump_executable_symbols (agent->executable);
2446 
2447   status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2448   if (status != HSA_STATUS_SUCCESS)
2449     {
2450       hsa_error ("Could not freeze the GCN executable", status);
2451       goto fail;
2452     }
2453 
2454 final:
2455   agent->prog_finalized = true;
2456 
2457   if (pthread_mutex_unlock (&agent->prog_mutex))
2458     {
2459       GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2460       res = false;
2461     }
2462 
2463   return res;
2464 
2465 fail:
2466   res = false;
2467   goto final;
2468 }
2469 
2470 /* Free the HSA program in agent and everything associated with it and set
2471    agent->prog_finalized and the initialized flags of all kernels to false.
2472    Return TRUE on success.  */
2473 
2474 static bool
2475 destroy_hsa_program (struct agent_info *agent)
2476 {
2477   if (!agent->prog_finalized)
2478     return true;
2479 
2480   hsa_status_t status;
2481 
2482   GCN_DEBUG ("Destroying the current GCN program.\n");
2483 
2484   status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2485   if (status != HSA_STATUS_SUCCESS)
2486     return hsa_error ("Could not destroy GCN executable", status);
2487 
2488   if (agent->module)
2489     {
2490       int i;
2491       for (i = 0; i < agent->module->kernel_count; i++)
2492 	agent->module->kernels[i].initialized = false;
2493 
2494       if (agent->module->heap)
2495 	{
2496 	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
2497 	  agent->module->heap = NULL;
2498 	}
2499     }
2500   agent->prog_finalized = false;
2501   return true;
2502 }
2503 
2504 /* Deinitialize all information associated with MODULE and kernels within
2505    it.  Return TRUE on success.  */
2506 
2507 static bool
2508 destroy_module (struct module_info *module, bool locked)
2509 {
2510   /* Run destructors before destroying module.  */
2511   struct GOMP_kernel_launch_attributes kla =
2512     { 3,
2513       /* Grid size.  */
2514       { 1, 64, 1 },
2515       /* Work-group size.  */
2516       { 1, 64, 1 }
2517     };
2518 
2519   if (module->fini_array_func)
2520     {
2521       init_kernel (module->fini_array_func);
2522       run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2523     }
2524   module->constructors_run_p = false;
2525 
2526   int i;
2527   for (i = 0; i < module->kernel_count; i++)
2528     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2529       {
2530 	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2531 			   "mutex");
2532 	return false;
2533       }
2534 
2535   return true;
2536 }
2537 
2538 /* }}}  */
2539 /* {{{ Async  */
2540 
2541 /* Callback of dispatch queues to report errors.  */
2542 
2543 static void
2544 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2545 {
2546   struct queue_entry *entry = &aq->queue[index];
2547 
2548   switch (entry->type)
2549     {
2550     case KERNEL_LAUNCH:
2551       if (DEBUG_QUEUES)
2552 	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2553 		   aq->agent->device_id, aq->id, index);
2554       run_kernel (entry->u.launch.kernel,
2555 		  entry->u.launch.vars,
2556 		  &entry->u.launch.kla, aq, false);
2557       if (DEBUG_QUEUES)
2558 	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2559 		   aq->agent->device_id, aq->id, index);
2560       break;
2561 
2562     case CALLBACK:
2563       if (DEBUG_QUEUES)
2564 	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2565 		   aq->agent->device_id, aq->id, index);
2566       entry->u.callback.fn (entry->u.callback.data);
2567       if (DEBUG_QUEUES)
2568 	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2569 		   aq->agent->device_id, aq->id, index);
2570       break;
2571 
2572     case ASYNC_WAIT:
2573       {
2574 	/* FIXME: is it safe to access a placeholder that may already have
2575 	   been executed?  */
2576         struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2577 
2578 	if (DEBUG_QUEUES)
2579           GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2580 		     aq->agent->device_id, aq->id, index);
2581 
2582 	pthread_mutex_lock (&placeholderp->mutex);
2583 
2584 	while (!placeholderp->executed)
2585           pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2586 
2587 	pthread_mutex_unlock (&placeholderp->mutex);
2588 
2589 	if (pthread_cond_destroy (&placeholderp->cond))
2590 	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2591 
2592 	if (pthread_mutex_destroy (&placeholderp->mutex))
2593 	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2594 
2595 	if (DEBUG_QUEUES)
2596           GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2597 		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2598       }
2599       break;
2600 
2601     case ASYNC_PLACEHOLDER:
2602       pthread_mutex_lock (&entry->u.placeholder.mutex);
2603       entry->u.placeholder.executed = 1;
2604       pthread_cond_signal (&entry->u.placeholder.cond);
2605       pthread_mutex_unlock (&entry->u.placeholder.mutex);
2606       break;
2607 
2608     default:
2609       GOMP_PLUGIN_fatal ("Unknown queue element");
2610     }
2611 }
2612 
2613 /* This function is run as a thread to service an async queue in the
2614    background.  It runs continuously until the stop flag is set.  */
2615 
2616 static void *
2617 drain_queue (void *thread_arg)
2618 {
2619   struct goacc_asyncqueue *aq = thread_arg;
2620 
2621   if (DRAIN_QUEUE_SYNCHRONOUS_P)
2622     {
2623       aq->drain_queue_stop = 2;
2624       return NULL;
2625     }
2626 
2627   pthread_mutex_lock (&aq->mutex);
2628 
2629   while (true)
2630     {
2631       if (aq->drain_queue_stop)
2632 	break;
2633 
2634       if (aq->queue_n > 0)
2635 	{
2636 	  pthread_mutex_unlock (&aq->mutex);
2637 	  execute_queue_entry (aq, aq->queue_first);
2638 
2639 	  pthread_mutex_lock (&aq->mutex);
2640 	  aq->queue_first = ((aq->queue_first + 1)
2641 			     % ASYNC_QUEUE_SIZE);
2642 	  aq->queue_n--;
2643 
2644 	  if (DEBUG_THREAD_SIGNAL)
2645 	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2646 		       aq->agent->device_id, aq->id);
2647 	  pthread_cond_broadcast (&aq->queue_cond_out);
2648 	  pthread_mutex_unlock (&aq->mutex);
2649 
2650 	  if (DEBUG_QUEUES)
2651 	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2652 		       aq->id);
2653 	  pthread_mutex_lock (&aq->mutex);
2654 	}
2655       else
2656 	{
2657 	  if (DEBUG_THREAD_SLEEP)
2658 	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2659 		       aq->agent->device_id, aq->id);
2660 	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2661 	  if (DEBUG_THREAD_SLEEP)
2662 	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2663 		       aq->agent->device_id, aq->id);
2664 	}
2665     }
2666 
2667   aq->drain_queue_stop = 2;
2668   if (DEBUG_THREAD_SIGNAL)
2669     GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2670 	       aq->agent->device_id, aq->id);
2671   pthread_cond_broadcast (&aq->queue_cond_out);
2672   pthread_mutex_unlock (&aq->mutex);
2673 
2674   GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2675   return NULL;
2676 }
2677 
2678 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2679    is not usually the case.  This is just a debug tool.  */
2680 
2681 static void
2682 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2683 {
2684   pthread_mutex_lock (&aq->mutex);
2685 
2686   while (aq->queue_n > 0)
2687     {
2688       execute_queue_entry (aq, aq->queue_first);
2689 
2690       aq->queue_first = ((aq->queue_first + 1)
2691 			 % ASYNC_QUEUE_SIZE);
2692       aq->queue_n--;
2693     }
2694 
2695   pthread_mutex_unlock (&aq->mutex);
2696 }
2697 
2698 /* Block the current thread until an async queue is writable.  The aq->mutex
2699    lock should be held on entry, and remains locked on exit.  */
2700 
2701 static void
2702 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2703 {
2704   if (aq->queue_n == ASYNC_QUEUE_SIZE)
2705     {
2706       /* Queue is full.  Wait for it to not be full.  */
2707       while (aq->queue_n == ASYNC_QUEUE_SIZE)
2708 	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2709     }
2710 }
2711 
2712 /* Request an asynchronous kernel launch on the specified queue.  This
2713    may block if the queue is full, but returns without waiting for the
2714    kernel to run.  */
2715 
2716 static void
2717 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2718 		   void *vars, struct GOMP_kernel_launch_attributes *kla)
2719 {
2720   assert (aq->agent == kernel->agent);
2721 
2722   pthread_mutex_lock (&aq->mutex);
2723 
2724   wait_for_queue_nonfull (aq);
2725 
2726   int queue_last = ((aq->queue_first + aq->queue_n)
2727 		    % ASYNC_QUEUE_SIZE);
2728   if (DEBUG_QUEUES)
2729     GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2730 	       aq->id, queue_last);
2731 
2732   aq->queue[queue_last].type = KERNEL_LAUNCH;
2733   aq->queue[queue_last].u.launch.kernel = kernel;
2734   aq->queue[queue_last].u.launch.vars = vars;
2735   aq->queue[queue_last].u.launch.kla = *kla;
2736 
2737   aq->queue_n++;
2738 
2739   if (DEBUG_THREAD_SIGNAL)
2740     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2741 	       aq->agent->device_id, aq->id);
2742   pthread_cond_signal (&aq->queue_cond_in);
2743 
2744   pthread_mutex_unlock (&aq->mutex);
2745 }
2746 
2747 /* Request an asynchronous callback on the specified queue.  The callback
2748    function will be called, with the given opaque data, from the appropriate
2749    async thread, when all previous items on that queue are complete.  */
2750 
2751 static void
2752 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2753 		     void *data)
2754 {
2755   pthread_mutex_lock (&aq->mutex);
2756 
2757   wait_for_queue_nonfull (aq);
2758 
2759   int queue_last = ((aq->queue_first + aq->queue_n)
2760 		    % ASYNC_QUEUE_SIZE);
2761   if (DEBUG_QUEUES)
2762     GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2763 	       aq->id, queue_last);
2764 
2765   aq->queue[queue_last].type = CALLBACK;
2766   aq->queue[queue_last].u.callback.fn = fn;
2767   aq->queue[queue_last].u.callback.data = data;
2768 
2769   aq->queue_n++;
2770 
2771   if (DEBUG_THREAD_SIGNAL)
2772     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2773 	       aq->agent->device_id, aq->id);
2774   pthread_cond_signal (&aq->queue_cond_in);
2775 
2776   pthread_mutex_unlock (&aq->mutex);
2777 }
2778 
2779 /* Request that a given async thread wait for another thread (unspecified) to
2780    reach the given placeholder.  The wait will occur when all previous entries
2781    on the queue are complete.  A placeholder is effectively a kind of signal
2782    which simply sets a flag when encountered in a queue.  */
2783 
2784 static void
2785 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2786 		      struct placeholder *placeholderp)
2787 {
2788   pthread_mutex_lock (&aq->mutex);
2789 
2790   wait_for_queue_nonfull (aq);
2791 
2792   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2793   if (DEBUG_QUEUES)
2794     GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2795 	       aq->id, queue_last);
2796 
2797   aq->queue[queue_last].type = ASYNC_WAIT;
2798   aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2799 
2800   aq->queue_n++;
2801 
2802   if (DEBUG_THREAD_SIGNAL)
2803     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2804 	       aq->agent->device_id, aq->id);
2805   pthread_cond_signal (&aq->queue_cond_in);
2806 
2807   pthread_mutex_unlock (&aq->mutex);
2808 }
2809 
2810 /* Add a placeholder into an async queue.  When the async thread reaches the
2811    placeholder it will set the "executed" flag to true and continue.
2812    Another thread may be waiting on this thread reaching the placeholder.  */
2813 
2814 static struct placeholder *
2815 queue_push_placeholder (struct goacc_asyncqueue *aq)
2816 {
2817   struct placeholder *placeholderp;
2818 
2819   pthread_mutex_lock (&aq->mutex);
2820 
2821   wait_for_queue_nonfull (aq);
2822 
2823   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2824   if (DEBUG_QUEUES)
2825     GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2826 	       aq->id, queue_last);
2827 
2828   aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2829   placeholderp = &aq->queue[queue_last].u.placeholder;
2830 
2831   if (pthread_mutex_init (&placeholderp->mutex, NULL))
2832     {
2833       pthread_mutex_unlock (&aq->mutex);
2834       GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2835     }
2836 
2837   if (pthread_cond_init (&placeholderp->cond, NULL))
2838     {
2839       pthread_mutex_unlock (&aq->mutex);
2840       GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2841     }
2842 
2843   placeholderp->executed = 0;
2844 
2845   aq->queue_n++;
2846 
2847   if (DEBUG_THREAD_SIGNAL)
2848     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2849 	       aq->agent->device_id, aq->id);
2850   pthread_cond_signal (&aq->queue_cond_in);
2851 
2852   pthread_mutex_unlock (&aq->mutex);
2853 
2854   return placeholderp;
2855 }
2856 
2857 /* Signal an asynchronous thread to terminate, and wait for it to do so.  */
2858 
2859 static void
2860 finalize_async_thread (struct goacc_asyncqueue *aq)
2861 {
2862   pthread_mutex_lock (&aq->mutex);
2863   if (aq->drain_queue_stop == 2)
2864     {
2865       pthread_mutex_unlock (&aq->mutex);
2866       return;
2867     }
2868 
2869   aq->drain_queue_stop = 1;
2870 
2871   if (DEBUG_THREAD_SIGNAL)
2872     GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2873 	       aq->agent->device_id, aq->id);
2874   pthread_cond_signal (&aq->queue_cond_in);
2875 
2876   while (aq->drain_queue_stop != 2)
2877     {
2878       if (DEBUG_THREAD_SLEEP)
2879 	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2880 		   " to sleep\n", aq->agent->device_id, aq->id);
2881       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2882       if (DEBUG_THREAD_SLEEP)
2883 	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
2884 		   aq->agent->device_id, aq->id);
2885     }
2886 
2887   GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2888 	     aq->id);
2889   pthread_mutex_unlock (&aq->mutex);
2890 
2891   int err = pthread_join (aq->thread_drain_queue, NULL);
2892   if (err != 0)
2893     GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2894 		       aq->agent->device_id, aq->id, strerror (err));
2895   GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2896 }
2897 
2898 /* Set up an async queue for OpenMP.  There will be only one.  The
2899    implementation simply uses an OpenACC async queue.
2900    FIXME: is this thread-safe if two threads call this function?  */
2901 
2902 static void
2903 maybe_init_omp_async (struct agent_info *agent)
2904 {
2905   if (!agent->omp_async_queue)
2906     agent->omp_async_queue
2907       = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2908 }
2909 
2910 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2911    copies from read-only pages.  */
2912 
2913 static void
2914 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2915 {
2916   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2917 
2918   if (status == HSA_STATUS_SUCCESS)
2919     return;
2920 
2921   /* It appears that the copy fails if the source data is in a read-only page.
2922      We can't detect that easily, so try copying the data to a temporary buffer
2923      and doing the copy again if we got an error above.  */
2924 
2925   GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2926 	       "[%p:+%d]\n", (void *) src, (int) len);
2927 
2928   void *src_copy = malloc (len);
2929   memcpy (src_copy, src, len);
2930   status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2931   free (src_copy);
2932   if (status != HSA_STATUS_SUCCESS)
2933     GOMP_PLUGIN_error ("memory copy failed");
2934 }
2935 
2936 /* Copy data to or from a device.  This is intended for use as an async
2937    callback event.  */
2938 
2939 static void
2940 copy_data (void *data_)
2941 {
2942   struct copy_data *data = (struct copy_data *)data_;
2943   GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2944 	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
2945 	     data->dst);
2946   hsa_memory_copy_wrapper (data->dst, data->src, data->len);
2947   free (data);
2948 }
2949 
2950 /* Free device data.  This is intended for use as an async callback event.  */
2951 
2952 static void
2953 gomp_offload_free (void *ptr)
2954 {
2955   GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2956   GOMP_OFFLOAD_free (0, ptr);
2957 }
2958 
2959 /* Request an asynchronous data copy, to or from a device, on a given queue.
2960    The event will be registered as a callback.  */
2961 
2962 static void
2963 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2964 		 size_t len)
2965 {
2966   if (DEBUG_QUEUES)
2967     GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2968 	       aq->agent->device_id, aq->id, len, src, dst);
2969   struct copy_data *data
2970     = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2971   data->dst = dst;
2972   data->src = src;
2973   data->len = len;
2974   data->aq = aq;
2975   queue_push_callback (aq, copy_data, data);
2976 }
2977 
2978 /* Return true if the given queue is currently empty.  */
2979 
2980 static int
2981 queue_empty (struct goacc_asyncqueue *aq)
2982 {
2983   pthread_mutex_lock (&aq->mutex);
2984   int res = aq->queue_n == 0 ? 1 : 0;
2985   pthread_mutex_unlock (&aq->mutex);
2986 
2987   return res;
2988 }
2989 
2990 /* Wait for a given queue to become empty.  This implements an OpenACC wait
2991    directive.  */
2992 
2993 static void
2994 wait_queue (struct goacc_asyncqueue *aq)
2995 {
2996   if (DRAIN_QUEUE_SYNCHRONOUS_P)
2997     {
2998       drain_queue_synchronous (aq);
2999       return;
3000     }
3001 
3002   pthread_mutex_lock (&aq->mutex);
3003 
3004   while (aq->queue_n > 0)
3005     {
3006       if (DEBUG_THREAD_SLEEP)
3007 	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3008 		   aq->agent->device_id, aq->id);
3009       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3010       if (DEBUG_THREAD_SLEEP)
3011 	GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
3012 		   aq->id);
3013     }
3014 
3015   pthread_mutex_unlock (&aq->mutex);
3016   GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3017 }
3018 
3019 /* }}}  */
3020 /* {{{ OpenACC support  */
3021 
3022 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
3023 
3024 static void
3025 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3026 	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3027 	  struct goacc_asyncqueue *aq)
3028 {
3029   if (!GOMP_OFFLOAD_can_run (kernel))
3030     GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3031 
3032   /* If we get here then this must be an OpenACC kernel.  */
3033   kernel->kind = KIND_OPENACC;
3034 
3035   /* devaddrs must be double-indirect on the target.  */
3036   void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3037   for (size_t i = 0; i < mapnum; i++)
3038     hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3039 				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3040 				sizeof (void *));
3041 
3042   struct hsa_kernel_description *hsa_kernel_desc = NULL;
3043   for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3044     {
3045       struct hsa_kernel_description *d
3046 	= &kernel->module->image_desc->kernel_infos[i];
3047       if (d->name == kernel->name)
3048 	{
3049 	  hsa_kernel_desc = d;
3050 	  break;
3051 	}
3052     }
3053 
3054   /* We may have statically-determined dimensions in
3055      hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3056      invocation at runtime in dims[].  We allow static dimensions to take
3057      priority over dynamic dimensions when present (non-zero).  */
3058   if (hsa_kernel_desc->oacc_dims[0] > 0)
3059     dims[0] = hsa_kernel_desc->oacc_dims[0];
3060   if (hsa_kernel_desc->oacc_dims[1] > 0)
3061     dims[1] = hsa_kernel_desc->oacc_dims[1];
3062   if (hsa_kernel_desc->oacc_dims[2] > 0)
3063     dims[2] = hsa_kernel_desc->oacc_dims[2];
3064 
3065   /* Ideally, when a dimension isn't explicitly specified, we should
3066      tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3067      In practice, we tune for peak performance on BabelStream, which
3068      for OpenACC is currently 32 threads per CU.  */
3069   if (dims[0] == 0 && dims[1] == 0)
3070     {
3071       /* If any of the OpenACC dimensions remain 0 then we get to pick a
3072 	 number.  There isn't really a correct answer for this without a clue
3073 	 about the problem size, so let's do a reasonable number of workers
3074 	 and gangs.  */
3075 
3076       dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
3077       dims[1] = 8; /* Workers.  */
3078     }
3079   else if (dims[0] == 0 && dims[1] > 0)
3080     {
3081       /* Auto-scale the number of gangs with the requested number of workers.  */
3082       dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3083     }
3084   else if (dims[0] > 0 && dims[1] == 0)
3085     {
3086       /* Auto-scale the number of workers with the requested number of gangs.  */
3087       dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3088       if (dims[1] == 0)
3089 	dims[1] = 1;
3090       if (dims[1] > 16)
3091 	dims[1] = 16;
3092     }
3093 
3094   /* The incoming dimensions are expressed in terms of gangs, workers, and
3095      vectors.  The HSA dimensions are expressed in terms of "work-items",
3096      which means multiples of vector lanes.
3097 
3098      The "grid size" specifies the size of the problem space, and the
3099      "work-group size" specifies how much of that we want a single compute
3100      unit to chew on at once.
3101 
3102      The three dimensions do not really correspond to hardware, but the
3103      important thing is that the HSA runtime will launch as many
3104      work-groups as it takes to process the entire grid, and each
3105      work-group will contain as many wave-fronts as it takes to process
3106      the work-items in that group.
3107 
3108      Essentially, as long as we set the Y dimension to 64 (the number of
3109      vector lanes in hardware), and the Z group size to the maximum (16),
3110      then we will get the gangs (X) and workers (Z) launched as we expect.
3111 
3112      The reason for the apparent reversal of vector and worker dimension
3113      order is to do with the way the run-time distributes work-items across
3114      v1 and v2.  */
3115   struct GOMP_kernel_launch_attributes kla =
3116     {3,
3117      /* Grid size.  */
3118      {dims[0], 64, dims[1]},
3119      /* Work-group size.  */
3120      {1,       64, 16}
3121     };
3122 
3123   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3124   acc_prof_info *prof_info = thr->prof_info;
3125   acc_event_info enqueue_launch_event_info;
3126   acc_api_info *api_info = thr->api_info;
3127   bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3128   if (profiling_dispatch_p)
3129     {
3130       prof_info->event_type = acc_ev_enqueue_launch_start;
3131 
3132       enqueue_launch_event_info.launch_event.event_type
3133 	= prof_info->event_type;
3134       enqueue_launch_event_info.launch_event.valid_bytes
3135 	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3136       enqueue_launch_event_info.launch_event.parent_construct
3137 	= acc_construct_parallel;
3138       enqueue_launch_event_info.launch_event.implicit = 1;
3139       enqueue_launch_event_info.launch_event.tool_info = NULL;
3140       enqueue_launch_event_info.launch_event.kernel_name
3141 	= (char *) kernel->name;
3142       enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3143       enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3144       enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3145 
3146       api_info->device_api = acc_device_api_other;
3147 
3148       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3149 	&enqueue_launch_event_info, api_info);
3150     }
3151 
3152   if (!async)
3153     {
3154       run_kernel (kernel, ind_da, &kla, NULL, false);
3155       gomp_offload_free (ind_da);
3156     }
3157   else
3158     {
3159       queue_push_launch (aq, kernel, ind_da, &kla);
3160       if (DEBUG_QUEUES)
3161 	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3162 		   aq->agent->device_id, aq->id, ind_da);
3163       queue_push_callback (aq, gomp_offload_free, ind_da);
3164     }
3165 
3166   if (profiling_dispatch_p)
3167     {
3168       prof_info->event_type = acc_ev_enqueue_launch_end;
3169       enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3170       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3171 					    &enqueue_launch_event_info,
3172 					    api_info);
3173     }
3174 }
3175 
3176 /* }}}  */
3177 /* {{{ Generic Plugin API  */
3178 
3179 /* Return the name of the accelerator, which is "gcn".  */
3180 
3181 const char *
3182 GOMP_OFFLOAD_get_name (void)
3183 {
3184   return "gcn";
3185 }
3186 
3187 /* Return the specific capabilities the HSA accelerator have.  */
3188 
3189 unsigned int
3190 GOMP_OFFLOAD_get_caps (void)
3191 {
3192   /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
3193   return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3194 	    | GOMP_OFFLOAD_CAP_OPENACC_200;
3195 }
3196 
3197 /* Identify as GCN accelerator.  */
3198 
3199 int
3200 GOMP_OFFLOAD_get_type (void)
3201 {
3202   return OFFLOAD_TARGET_TYPE_GCN;
3203 }
3204 
3205 /* Return the libgomp version number we're compatible with.  There is
3206    no requirement for cross-version compatibility.  */
3207 
3208 unsigned
3209 GOMP_OFFLOAD_version (void)
3210 {
3211   return GOMP_VERSION;
3212 }
3213 
3214 /* Return the number of GCN devices on the system.  */
3215 
3216 int
3217 GOMP_OFFLOAD_get_num_devices (void)
3218 {
3219   if (!init_hsa_context ())
3220     return 0;
3221   return hsa_context.agent_count;
3222 }
3223 
3224 /* Initialize device (agent) number N so that it can be used for computation.
3225    Return TRUE on success.  */
3226 
3227 bool
3228 GOMP_OFFLOAD_init_device (int n)
3229 {
3230   if (!init_hsa_context ())
3231     return false;
3232   if (n >= hsa_context.agent_count)
3233     {
3234       GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3235       return false;
3236     }
3237   struct agent_info *agent = &hsa_context.agents[n];
3238 
3239   if (agent->initialized)
3240     return true;
3241 
3242   agent->device_id = n;
3243 
3244   if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3245     {
3246       GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3247       return false;
3248     }
3249   if (pthread_mutex_init (&agent->prog_mutex, NULL))
3250     {
3251       GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3252       return false;
3253     }
3254   if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3255     {
3256       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3257       return false;
3258     }
3259   if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3260     {
3261       GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3262       return false;
3263     }
3264   agent->async_queues = NULL;
3265   agent->omp_async_queue = NULL;
3266   agent->team_arena_list = NULL;
3267 
3268   uint32_t queue_size;
3269   hsa_status_t status;
3270   status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3271 					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3272 					  &queue_size);
3273   if (status != HSA_STATUS_SUCCESS)
3274     return hsa_error ("Error requesting maximum queue size of the GCN agent",
3275 		      status);
3276 
3277   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3278 					  &agent->name);
3279   if (status != HSA_STATUS_SUCCESS)
3280     return hsa_error ("Error querying the name of the agent", status);
3281 
3282   agent->device_isa = isa_code (agent->name);
3283   if (agent->device_isa < 0)
3284     return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3285 
3286   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3287 					  &agent->vendor_name);
3288   if (status != HSA_STATUS_SUCCESS)
3289     return hsa_error ("Error querying the vendor name of the agent", status);
3290 
3291   status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3292 					HSA_QUEUE_TYPE_MULTI,
3293 					hsa_queue_callback, NULL, UINT32_MAX,
3294 					UINT32_MAX, &agent->sync_queue);
3295   if (status != HSA_STATUS_SUCCESS)
3296     return hsa_error ("Error creating command queue", status);
3297 
3298   agent->kernarg_region.handle = (uint64_t) -1;
3299   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3300 						 get_kernarg_memory_region,
3301 						 &agent->kernarg_region);
3302   if (status != HSA_STATUS_SUCCESS
3303       && status != HSA_STATUS_INFO_BREAK)
3304     hsa_error ("Scanning memory regions failed", status);
3305   if (agent->kernarg_region.handle == (uint64_t) -1)
3306     {
3307       GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3308 			 "arguments");
3309       return false;
3310     }
3311   GCN_DEBUG ("Selected kernel arguments memory region:\n");
3312   dump_hsa_region (agent->kernarg_region, NULL);
3313 
3314   agent->data_region.handle = (uint64_t) -1;
3315   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3316 						 get_data_memory_region,
3317 						 &agent->data_region);
3318   if (status != HSA_STATUS_SUCCESS
3319       && status != HSA_STATUS_INFO_BREAK)
3320     hsa_error ("Scanning memory regions failed", status);
3321   if (agent->data_region.handle == (uint64_t) -1)
3322     {
3323       GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3324 			 "data");
3325       return false;
3326     }
3327   GCN_DEBUG ("Selected device data memory region:\n");
3328   dump_hsa_region (agent->data_region, NULL);
3329 
3330   GCN_DEBUG ("GCN agent %d initialized\n", n);
3331 
3332   agent->initialized = true;
3333   return true;
3334 }
3335 
3336 /* Load GCN object-code module described by struct gcn_image_desc in
3337    TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3338    If there are any constructors then run them.  */
3339 
3340 int
3341 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3342 			 struct addr_pair **target_table)
3343 {
3344   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3345     {
3346       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3347 			 " (expected %u, received %u)",
3348 			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3349       return -1;
3350     }
3351 
3352   struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3353   struct agent_info *agent;
3354   struct addr_pair *pair;
3355   struct module_info *module;
3356   struct kernel_info *kernel;
3357   int kernel_count = image_desc->kernel_count;
3358   unsigned var_count = image_desc->global_variable_count;
3359   int other_count = 1;
3360 
3361   agent = get_agent_info (ord);
3362   if (!agent)
3363     return -1;
3364 
3365   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3366     {
3367       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3368       return -1;
3369     }
3370   if (agent->prog_finalized
3371       && !destroy_hsa_program (agent))
3372     return -1;
3373 
3374   GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3375   GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3376   GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3377   pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3378 			     * sizeof (struct addr_pair));
3379   *target_table = pair;
3380   module = (struct module_info *)
3381     GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3382 				+ kernel_count * sizeof (struct kernel_info));
3383   module->image_desc = image_desc;
3384   module->kernel_count = kernel_count;
3385   module->heap = NULL;
3386   module->constructors_run_p = false;
3387 
3388   kernel = &module->kernels[0];
3389 
3390   /* Allocate memory for kernel dependencies.  */
3391   for (unsigned i = 0; i < kernel_count; i++)
3392     {
3393       struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3394       if (!init_basic_kernel_info (kernel, d, agent, module))
3395 	return -1;
3396       if (strcmp (d->name, "_init_array") == 0)
3397 	module->init_array_func = kernel;
3398       else if (strcmp (d->name, "_fini_array") == 0)
3399         module->fini_array_func = kernel;
3400       else
3401 	{
3402 	  pair->start = (uintptr_t) kernel;
3403 	  pair->end = (uintptr_t) (kernel + 1);
3404 	  pair++;
3405 	}
3406       kernel++;
3407     }
3408 
3409   agent->module = module;
3410   if (pthread_rwlock_unlock (&agent->module_rwlock))
3411     {
3412       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3413       return -1;
3414     }
3415 
3416   if (!create_and_finalize_hsa_program (agent))
3417     return -1;
3418 
3419   if (var_count > 0)
3420     {
3421       hsa_status_t status;
3422       hsa_executable_symbol_t var_symbol;
3423       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3424 						     ".offload_var_table",
3425 						     agent->id,
3426 						     0, &var_symbol);
3427 
3428       if (status != HSA_STATUS_SUCCESS)
3429 	hsa_fatal ("Could not find symbol for variable in the code object",
3430 		   status);
3431 
3432       uint64_t var_table_addr;
3433       status = hsa_fns.hsa_executable_symbol_get_info_fn
3434 	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3435 	 &var_table_addr);
3436       if (status != HSA_STATUS_SUCCESS)
3437 	hsa_fatal ("Could not extract a variable from its symbol", status);
3438 
3439       struct {
3440 	uint64_t addr;
3441 	uint64_t size;
3442       } var_table[var_count];
3443       GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3444 			     (void*)var_table_addr, sizeof (var_table));
3445 
3446       for (unsigned i = 0; i < var_count; i++)
3447 	{
3448 	  pair->start = var_table[i].addr;
3449 	  pair->end = var_table[i].addr + var_table[i].size;
3450 	  GCN_DEBUG ("Found variable at %p with size %lu\n",
3451 		     (void *)var_table[i].addr, var_table[i].size);
3452 	  pair++;
3453 	}
3454     }
3455 
3456   GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_DEVICE_NUM_VAR));
3457 
3458   hsa_status_t status;
3459   hsa_executable_symbol_t var_symbol;
3460   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3461 						 XSTRING (GOMP_DEVICE_NUM_VAR),
3462 						 agent->id, 0, &var_symbol);
3463   if (status == HSA_STATUS_SUCCESS)
3464     {
3465       uint64_t device_num_varptr;
3466       uint32_t device_num_varsize;
3467 
3468       status = hsa_fns.hsa_executable_symbol_get_info_fn
3469 	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3470 	 &device_num_varptr);
3471       if (status != HSA_STATUS_SUCCESS)
3472 	hsa_fatal ("Could not extract a variable from its symbol", status);
3473       status = hsa_fns.hsa_executable_symbol_get_info_fn
3474 	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3475 	 &device_num_varsize);
3476       if (status != HSA_STATUS_SUCCESS)
3477 	hsa_fatal ("Could not extract a variable size from its symbol", status);
3478 
3479       pair->start = device_num_varptr;
3480       pair->end = device_num_varptr + device_num_varsize;
3481     }
3482   else
3483     /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
3484     pair->start = pair->end = 0;
3485   pair++;
3486 
3487   /* Ensure that constructors are run first.  */
3488   struct GOMP_kernel_launch_attributes kla =
3489     { 3,
3490       /* Grid size.  */
3491       { 1, 64, 1 },
3492       /* Work-group size.  */
3493       { 1, 64, 1 }
3494     };
3495 
3496   if (module->init_array_func)
3497     {
3498       init_kernel (module->init_array_func);
3499       run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3500     }
3501   module->constructors_run_p = true;
3502 
3503   /* Don't report kernels that libgomp need not know about.  */
3504   if (module->init_array_func)
3505     kernel_count--;
3506   if (module->fini_array_func)
3507     kernel_count--;
3508 
3509   return kernel_count + var_count + other_count;
3510 }
3511 
3512 /* Unload GCN object-code module described by struct gcn_image_desc in
3513    TARGET_DATA from agent number N.  Return TRUE on success.  */
3514 
3515 bool
3516 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3517 {
3518   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3519     {
3520       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3521 			 " (expected %u, received %u)",
3522 			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3523       return false;
3524     }
3525 
3526   struct agent_info *agent;
3527   agent = get_agent_info (n);
3528   if (!agent)
3529     return false;
3530 
3531   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3532     {
3533       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3534       return false;
3535     }
3536 
3537   if (!agent->module || agent->module->image_desc != target_data)
3538     {
3539       GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3540 			 "loaded before");
3541       return false;
3542     }
3543 
3544   if (!destroy_module (agent->module, true))
3545     return false;
3546   free (agent->module);
3547   agent->module = NULL;
3548   if (!destroy_hsa_program (agent))
3549     return false;
3550   if (pthread_rwlock_unlock (&agent->module_rwlock))
3551     {
3552       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3553       return false;
3554     }
3555   return true;
3556 }
3557 
3558 /* Deinitialize all information and status associated with agent number N.  We
3559    do not attempt any synchronization, assuming the user and libgomp will not
3560    attempt deinitialization of a device that is in any way being used at the
3561    same time.  Return TRUE on success.  */
3562 
3563 bool
3564 GOMP_OFFLOAD_fini_device (int n)
3565 {
3566   struct agent_info *agent = get_agent_info (n);
3567   if (!agent)
3568     return false;
3569 
3570   if (!agent->initialized)
3571     return true;
3572 
3573   if (agent->omp_async_queue)
3574     {
3575       GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3576       agent->omp_async_queue = NULL;
3577     }
3578 
3579   if (agent->module)
3580     {
3581       if (!destroy_module (agent->module, false))
3582 	return false;
3583       free (agent->module);
3584       agent->module = NULL;
3585     }
3586 
3587   if (!destroy_team_arenas (agent))
3588     return false;
3589 
3590   if (!destroy_hsa_program (agent))
3591     return false;
3592 
3593   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3594   if (status != HSA_STATUS_SUCCESS)
3595     return hsa_error ("Error destroying command queue", status);
3596 
3597   if (pthread_mutex_destroy (&agent->prog_mutex))
3598     {
3599       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3600       return false;
3601     }
3602   if (pthread_rwlock_destroy (&agent->module_rwlock))
3603     {
3604       GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3605       return false;
3606     }
3607 
3608   if (pthread_mutex_destroy (&agent->async_queues_mutex))
3609     {
3610       GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3611       return false;
3612     }
3613   if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3614     {
3615       GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3616       return false;
3617     }
3618   agent->initialized = false;
3619   return true;
3620 }
3621 
3622 /* Return true if the HSA runtime can run function FN_PTR.  */
3623 
3624 bool
3625 GOMP_OFFLOAD_can_run (void *fn_ptr)
3626 {
3627   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3628 
3629   init_kernel (kernel);
3630   if (kernel->initialization_failed)
3631     goto failure;
3632 
3633   return true;
3634 
3635 failure:
3636   if (suppress_host_fallback)
3637     GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3638   GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3639   return false;
3640 }
3641 
3642 /* Allocate memory on device N.  */
3643 
3644 void *
3645 GOMP_OFFLOAD_alloc (int n, size_t size)
3646 {
3647   struct agent_info *agent = get_agent_info (n);
3648   return alloc_by_agent (agent, size);
3649 }
3650 
3651 /* Free memory from device N.  */
3652 
3653 bool
3654 GOMP_OFFLOAD_free (int device, void *ptr)
3655 {
3656   GCN_DEBUG ("Freeing memory on device %d\n", device);
3657 
3658   hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3659   if (status != HSA_STATUS_SUCCESS)
3660     {
3661       hsa_error ("Could not free device memory", status);
3662       return false;
3663     }
3664 
3665   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3666   bool profiling_dispatch_p
3667     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3668   if (profiling_dispatch_p)
3669     {
3670       acc_prof_info *prof_info = thr->prof_info;
3671       acc_event_info data_event_info;
3672       acc_api_info *api_info = thr->api_info;
3673 
3674       prof_info->event_type = acc_ev_free;
3675 
3676       data_event_info.data_event.event_type = prof_info->event_type;
3677       data_event_info.data_event.valid_bytes
3678 	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
3679       data_event_info.data_event.parent_construct
3680 	= acc_construct_parallel;
3681       data_event_info.data_event.implicit = 1;
3682       data_event_info.data_event.tool_info = NULL;
3683       data_event_info.data_event.var_name = NULL;
3684       data_event_info.data_event.bytes = 0;
3685       data_event_info.data_event.host_ptr = NULL;
3686       data_event_info.data_event.device_ptr = (void *) ptr;
3687 
3688       api_info->device_api = acc_device_api_other;
3689 
3690       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3691 					    api_info);
3692     }
3693 
3694   return true;
3695 }
3696 
3697 /* Copy data from DEVICE to host.  */
3698 
3699 bool
3700 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3701 {
3702   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3703 	     src, dst);
3704   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3705   if (status != HSA_STATUS_SUCCESS)
3706     GOMP_PLUGIN_error ("memory copy failed");
3707   return true;
3708 }
3709 
3710 /* Copy data from host to DEVICE.  */
3711 
3712 bool
3713 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3714 {
3715   GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3716 	     device, dst);
3717   hsa_memory_copy_wrapper (dst, src, n);
3718   return true;
3719 }
3720 
3721 /* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
3722 
3723 bool
3724 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3725 {
3726   struct gcn_thread *thread_data = gcn_thread ();
3727 
3728   if (thread_data && !async_synchronous_p (thread_data->async))
3729     {
3730       struct agent_info *agent = get_agent_info (device);
3731       maybe_init_omp_async (agent);
3732       queue_push_copy (agent->omp_async_queue, dst, src, n);
3733       return true;
3734     }
3735 
3736   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3737 	     device, src, device, dst);
3738   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3739   if (status != HSA_STATUS_SUCCESS)
3740     GOMP_PLUGIN_error ("memory copy failed");
3741   return true;
3742 }
3743 
3744 /* }}}  */
3745 /* {{{ OpenMP Plugin API  */
3746 
3747 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3748    in VARS as a parameter.  The kernel is identified by FN_PTR which must point
3749    to a kernel_info structure, and must have previously been loaded to the
3750    specified device.  */
3751 
3752 void
3753 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3754 {
3755   struct agent_info *agent = get_agent_info (device);
3756   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3757   struct GOMP_kernel_launch_attributes def;
3758   struct GOMP_kernel_launch_attributes *kla;
3759   assert (agent == kernel->agent);
3760 
3761   /* If we get here then the kernel must be OpenMP.  */
3762   kernel->kind = KIND_OPENMP;
3763 
3764   if (!parse_target_attributes (args, &def, &kla, agent))
3765     {
3766       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3767       return;
3768     }
3769   run_kernel (kernel, vars, kla, NULL, false);
3770 }
3771 
3772 /* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
3773    GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3774    GOMP_PLUGIN_target_task_completion when it has finished.  */
3775 
3776 void
3777 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3778 			void **args, void *async_data)
3779 {
3780   GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3781   struct agent_info *agent = get_agent_info (device);
3782   struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3783   struct GOMP_kernel_launch_attributes def;
3784   struct GOMP_kernel_launch_attributes *kla;
3785   assert (agent == kernel->agent);
3786 
3787   /* If we get here then the kernel must be OpenMP.  */
3788   kernel->kind = KIND_OPENMP;
3789 
3790   if (!parse_target_attributes (args, &def, &kla, agent))
3791     {
3792       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3793       return;
3794     }
3795 
3796   maybe_init_omp_async (agent);
3797   queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3798   queue_push_callback (agent->omp_async_queue,
3799 		       GOMP_PLUGIN_target_task_completion, async_data);
3800 }
3801 
3802 /* }}} */
3803 /* {{{ OpenACC Plugin API  */
3804 
3805 /* Run a synchronous OpenACC kernel.  The device number is inferred from the
3806    already-loaded KERNEL.  */
3807 
3808 void
3809 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3810 			   void **hostaddrs, void **devaddrs, unsigned *dims,
3811 			   void *targ_mem_desc)
3812 {
3813   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3814 
3815   gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3816 	    NULL);
3817 }
3818 
3819 /* Run an asynchronous OpenACC kernel on the specified queue.  */
3820 
3821 void
3822 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3823 				 void **hostaddrs, void **devaddrs,
3824 				 unsigned *dims, void *targ_mem_desc,
3825 				 struct goacc_asyncqueue *aq)
3826 {
3827   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3828 
3829   gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3830 	    aq);
3831 }
3832 
3833 /* Create a new asynchronous thread and queue for running future kernels.  */
3834 
3835 struct goacc_asyncqueue *
3836 GOMP_OFFLOAD_openacc_async_construct (int device)
3837 {
3838   struct agent_info *agent = get_agent_info (device);
3839 
3840   pthread_mutex_lock (&agent->async_queues_mutex);
3841 
3842   struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3843   aq->agent = get_agent_info (device);
3844   aq->prev = NULL;
3845   aq->next = agent->async_queues;
3846   if (aq->next)
3847     {
3848       aq->next->prev = aq;
3849       aq->id = aq->next->id + 1;
3850     }
3851   else
3852     aq->id = 1;
3853   agent->async_queues = aq;
3854 
3855   aq->queue_first = 0;
3856   aq->queue_n = 0;
3857   aq->drain_queue_stop = 0;
3858 
3859   if (pthread_mutex_init (&aq->mutex, NULL))
3860     {
3861       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3862       return false;
3863     }
3864   if (pthread_cond_init (&aq->queue_cond_in, NULL))
3865     {
3866       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3867       return false;
3868     }
3869   if (pthread_cond_init (&aq->queue_cond_out, NULL))
3870     {
3871       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3872       return false;
3873     }
3874 
3875   hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
3876 						     ASYNC_QUEUE_SIZE,
3877 						     HSA_QUEUE_TYPE_MULTI,
3878 						     hsa_queue_callback, NULL,
3879 						     UINT32_MAX, UINT32_MAX,
3880 						     &aq->hsa_queue);
3881   if (status != HSA_STATUS_SUCCESS)
3882     hsa_fatal ("Error creating command queue", status);
3883 
3884   int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
3885   if (err != 0)
3886     GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3887 		       strerror (err));
3888   GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
3889 	     aq->id);
3890 
3891   pthread_mutex_unlock (&agent->async_queues_mutex);
3892 
3893   return aq;
3894 }
3895 
3896 /* Destroy an existing asynchronous thread and queue.  Waits for any
3897    currently-running task to complete, but cancels any queued tasks.  */
3898 
3899 bool
3900 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
3901 {
3902   struct agent_info *agent = aq->agent;
3903 
3904   finalize_async_thread (aq);
3905 
3906   pthread_mutex_lock (&agent->async_queues_mutex);
3907 
3908   int err;
3909   if ((err = pthread_mutex_destroy (&aq->mutex)))
3910     {
3911       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
3912       goto fail;
3913     }
3914   if (pthread_cond_destroy (&aq->queue_cond_in))
3915     {
3916       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3917       goto fail;
3918     }
3919   if (pthread_cond_destroy (&aq->queue_cond_out))
3920     {
3921       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3922       goto fail;
3923     }
3924   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
3925   if (status != HSA_STATUS_SUCCESS)
3926     {
3927       hsa_error ("Error destroying command queue", status);
3928       goto fail;
3929     }
3930 
3931   if (aq->prev)
3932     aq->prev->next = aq->next;
3933   if (aq->next)
3934     aq->next->prev = aq->prev;
3935   if (agent->async_queues == aq)
3936     agent->async_queues = aq->next;
3937 
3938   GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
3939 
3940   free (aq);
3941   pthread_mutex_unlock (&agent->async_queues_mutex);
3942   return true;
3943 
3944 fail:
3945   pthread_mutex_unlock (&agent->async_queues_mutex);
3946   return false;
3947 }
3948 
3949 /* Return true if the specified async queue is currently empty.  */
3950 
3951 int
3952 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
3953 {
3954   return queue_empty (aq);
3955 }
3956 
3957 /* Block until the specified queue has executed all its tasks and the
3958    queue is empty.  */
3959 
3960 bool
3961 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
3962 {
3963   wait_queue (aq);
3964   return true;
3965 }
3966 
3967 /* Add a serialization point across two async queues. Any new tasks added to
3968    AQ2, after this call, will not run until all tasks on AQ1, at the time
3969    of this call, have completed.  */
3970 
3971 bool
3972 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
3973 				      struct goacc_asyncqueue *aq2)
3974 {
3975   /* For serialize, stream aq2 waits for aq1 to complete work that has been
3976      scheduled to run on it up to this point.  */
3977   if (aq1 != aq2)
3978     {
3979       struct placeholder *placeholderp = queue_push_placeholder (aq1);
3980       queue_push_asyncwait (aq2, placeholderp);
3981     }
3982   return true;
3983 }
3984 
3985 /* Add an opaque callback to the given async queue.  */
3986 
3987 void
3988 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
3989 					   void (*fn) (void *), void *data)
3990 {
3991   queue_push_callback (aq, fn, data);
3992 }
3993 
3994 /* Queue up an asynchronous data copy from host to DEVICE.  */
3995 
3996 bool
3997 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
3998 				     size_t n, struct goacc_asyncqueue *aq)
3999 {
4000   struct agent_info *agent = get_agent_info (device);
4001   assert (agent == aq->agent);
4002   queue_push_copy (aq, dst, src, n);
4003   return true;
4004 }
4005 
4006 /* Queue up an asynchronous data copy from DEVICE to host.  */
4007 
4008 bool
4009 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4010 				     size_t n, struct goacc_asyncqueue *aq)
4011 {
4012   struct agent_info *agent = get_agent_info (device);
4013   assert (agent == aq->agent);
4014   queue_push_copy (aq, dst, src, n);
4015   return true;
4016 }
4017 
4018 union goacc_property_value
4019 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4020 {
4021   struct agent_info *agent = get_agent_info (device);
4022 
4023   union goacc_property_value propval = { .val = 0 };
4024 
4025   switch (prop)
4026     {
4027     case GOACC_PROPERTY_FREE_MEMORY:
4028       /* Not supported. */
4029       break;
4030     case GOACC_PROPERTY_MEMORY:
4031       {
4032 	size_t size;
4033 	hsa_region_t region = agent->data_region;
4034 	hsa_status_t status =
4035 	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4036 	if (status == HSA_STATUS_SUCCESS)
4037 	  propval.val = size;
4038 	break;
4039       }
4040     case GOACC_PROPERTY_NAME:
4041       propval.ptr = agent->name;
4042       break;
4043     case GOACC_PROPERTY_VENDOR:
4044       propval.ptr = agent->vendor_name;
4045       break;
4046     case GOACC_PROPERTY_DRIVER:
4047       propval.ptr = hsa_context.driver_version_s;
4048       break;
4049     }
4050 
4051   return propval;
4052 }
4053 
4054 /* Set up plugin-specific thread-local-data (host-side).  */
4055 
4056 void *
4057 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4058 {
4059   struct gcn_thread *thread_data
4060     = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4061 
4062   thread_data->async = GOMP_ASYNC_SYNC;
4063 
4064   return (void *) thread_data;
4065 }
4066 
4067 /* Clean up plugin-specific thread-local-data.  */
4068 
4069 void
4070 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4071 {
4072   free (data);
4073 }
4074 
4075 /* }}} */
4076