/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* Additional definitions not in HSA 1.1.
   FIXME: this needs to be updated in hsa.h for upstream, but the only source
   right now is the ROCr source which may cause license issues.  */
#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002

/* These probably won't be in elf.h for a while.  */
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define reserved		12
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
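
/* Note: a 64-bit absolute address is typically emitted as an
   ABS32_LO/ABS32_HI pair, i.e. S + A is computed once and its low and high
   32-bit halves are patched into two consecutive instruction words.  */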

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we at least have the means to test the real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers utilized by this HSA plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
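
/* For example, an OpenMP launch of four teams with sixteen wavefronts each
   is described by ndim == 3, gdims == {4, 64, 16} and wdims == {1, 64, 16};
   see the defaults constructed in parse_target_attributes, below.  */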

/* Collection of information needed for a dispatch of a kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};
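
/* A minimal sketch of how the host can drain the output ring above (the
   plugin's actual console-output loop appears later in this file; the field
   names are the ones defined here, the loop itself is illustrative only):

     struct output *data = &kernargs->output_data;
     while (data->consumed < data->next_output)
       {
	 struct printf_data *entry = &data->queue[data->consumed % 1024];
	 if (!entry->written)
	   break;		// Device has not finished this slot yet.
	 ... print entry->msg plus ivalue/dvalue/text according to type ...
	 entry->written = 0;
	 data->consumed++;
       }  */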

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn) (void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  bool free_src;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
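
/* The QUEUE array above is used as a ring: producers append entries at
   index (queue_first + queue_n) % ASYNC_QUEUE_SIZE while holding MUTEX,
   with QUEUE_COND_IN waking the drain thread and QUEUE_COND_OUT waking
   producers blocked on a full ring.  */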

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
} EF_AMDGPU_MACH;

static const int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The module the kernel belongs to.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};
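
/* SIZE is the number of bytes usable in DATA; the plugin allocates
   gcn_kernel_heap_size bytes (see below) per module, and Newlib's malloc
   carves device-side allocations out of DATA.  */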

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag to decide whether to print to stderr information about what is going
   on.  Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the HSA runtime shared library that is dlopened by this
   plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	{ \
	  fprintf (stderr, __VA_ARGS__); \
	} \
    } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message and return FALSE,
   propagating error handling to outside the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump information about the available hardware.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about the available hardware.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  /* SEGMENT is only valid if the query above succeeded.  */
  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *) dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*) kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void*) kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and suppress_host_fallback according to the
   environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }
}

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Helper function for find_executable_symbol.  */

static hsa_status_t
find_executable_symbol_1 (hsa_executable_t executable,
			  hsa_executable_symbol_t symbol,
			  void *data)
{
  hsa_executable_symbol_t *res = (hsa_executable_symbol_t *) data;
  *res = symbol;
  return HSA_STATUS_INFO_BREAK;
}

/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  If not
   found, return false.  */

static bool
find_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t *symbol)
{
  hsa_status_t status;

  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 find_executable_symbol_1,
						 symbol);
  if (status != HSA_STATUS_INFO_BREAK)
    {
      hsa_error ("Could not find executable symbol", status);
      return false;
    }

  return true;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If no launch attributes are found, fill
   DEF with default values; then store the attributes found, or DEF, into
   *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
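
/* For example, get_group_size (1, 1000, 0) yields the default group size of
   64, whereas a requested group size larger than the grid is clamped, so
   get_group_size (1, 32, 64) yields 32.  */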

/* Atomically store a pair of uint16_t values (HEADER and REST) to a
   PACKET.  */

static void
packet_store_release (uint32_t *packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
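
/* An AQL packet's first 32-bit word holds the 16-bit header and the 16-bit
   setup field (the dimension count, for dispatch packets), so a launch is
   published with something like

     packet_store_release ((uint32_t *) packet, header, setup);

   where the release ordering guarantees the rest of the packet is visible
   to the device before the header marks it as valid.  */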

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}
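
/* For instance, DLSYM_FN (hsa_init) above expands to

     hsa_fns.hsa_init_fn = dlsym (handle, "hsa_init");
     if (hsa_fns.hsa_init_fn == NULL)
       return false;

   so one missing entry point makes initialization fail cleanly and, unless
   GCN_SUPPRESS_HOST_FALLBACK is set, execution falls back to the host.  */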

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int) major,
			  (unsigned short int) minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated.  "
		 "Version %hu.%hu is too long.", (unsigned short int) major,
		 (unsigned short int) minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *) data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *) data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

static const char *gcn_gfx803_s = "gfx803";
static const char *gcn_gfx900_s = "gfx900";
static const char *gcn_gfx906_s = "gfx906";
static const int gcn_isa_name_len = 6;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char *
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char *
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa)
{
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  return -1;
}
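
/* Thus isa_code ("gfx900") == EF_AMDGPU_MACH_AMDGCN_GFX900, while an
   unrecognized name yields -1, which matches no gcn_isa value.  */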

/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   Allocating an arena also costs performance, albeit on the host side, so
   this function will reuse an existing arena if a large enough one is idle.
   The arena is released, but not deallocated, when the kernel exits.  */

static void *
get_team_arena (struct agent_info *agent, int num_teams)
{
  struct team_arena_list **next_ptr = &agent->team_arena_list;
  struct team_arena_list *item;

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->num_teams < num_teams)
	continue;

      if (pthread_mutex_trylock (&item->in_use))
	continue;

      return item->arena;
    }

  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);

  if (pthread_mutex_lock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  item = malloc (sizeof (*item));
  item->num_teams = num_teams;
  item->next = NULL;
  *next_ptr = item;

  if (pthread_mutex_init (&item->in_use, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return NULL;
    }
  if (pthread_mutex_lock (&item->in_use))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      return NULL;
    }

  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
  hsa_status_t status;
  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
					   TEAM_ARENA_SIZE*num_teams,
					   &item->arena);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not assign arena memory to device", status);

  return item->arena;
}
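
/* An arena obtained from get_team_arena must be handed back via
   release_team_arena (below) once the kernel using it has finished; the
   IN_USE mutex stays locked for exactly that window, which is what lets
   the trylock above skip busy arenas.  */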
1758
1759 /* Mark a team arena available for reuse. */
1760
1761 static void
release_team_arena(struct agent_info * agent,void * arena)1762 release_team_arena (struct agent_info* agent, void *arena)
1763 {
1764 struct team_arena_list *item;
1765
1766 for (item = agent->team_arena_list; item; item = item->next)
1767 {
1768 if (item->arena == arena)
1769 {
1770 if (pthread_mutex_unlock (&item->in_use))
1771 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1772 return;
1773 }
1774 }
1775 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1776 }
1777
1778 /* Clean up all the allocated team arenas. */
1779
1780 static bool
1781 destroy_team_arenas (struct agent_info *agent)
1782 {
1783 struct team_arena_list *item, *next;
1784
1785 for (item = agent->team_arena_list; item; item = next)
1786 {
1787 next = item->next;
1788 hsa_fns.hsa_memory_free_fn (item->arena);
1789 if (pthread_mutex_destroy (&item->in_use))
1790 {
1791 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1792 return false;
1793 }
1794 free (item);
1795 }
1796 agent->team_arena_list = NULL;
1797
1798 return true;
1799 }
1800
1801 /* Allocate memory on a specified device. */
1802
1803 static void *
1804 alloc_by_agent (struct agent_info *agent, size_t size)
1805 {
1806 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1807
1808 /* Zero-size allocations are invalid, so in order to return a valid pointer
1809 we need to pass a valid size. One source of zero-size allocations is
1810 kernargs for kernels that have no inputs or outputs (the kernel may
1811 only use console output, for example). */
1812 if (size == 0)
1813 size = 4;
1814
1815 void *ptr;
1816 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1817 size, &ptr);
1818 if (status != HSA_STATUS_SUCCESS)
1819 {
1820 hsa_error ("Could not allocate device memory", status);
1821 return NULL;
1822 }
1823
1824 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1825 HSA_ACCESS_PERMISSION_RW);
1826 if (status != HSA_STATUS_SUCCESS)
1827 {
1828 hsa_error ("Could not assign data memory to device", status);
1829 return NULL;
1830 }
1831
1832 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1833 bool profiling_dispatch_p
1834 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1835 if (profiling_dispatch_p)
1836 {
1837 acc_prof_info *prof_info = thr->prof_info;
1838 acc_event_info data_event_info;
1839 acc_api_info *api_info = thr->api_info;
1840
1841 prof_info->event_type = acc_ev_alloc;
1842
1843 data_event_info.data_event.event_type = prof_info->event_type;
1844 data_event_info.data_event.valid_bytes
1845 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1846 data_event_info.data_event.parent_construct
1847 = acc_construct_parallel;
1848 data_event_info.data_event.implicit = 1;
1849 data_event_info.data_event.tool_info = NULL;
1850 data_event_info.data_event.var_name = NULL;
1851 data_event_info.data_event.bytes = size;
1852 data_event_info.data_event.host_ptr = NULL;
1853 data_event_info.data_event.device_ptr = (void *) ptr;
1854
1855 api_info->device_api = acc_device_api_other;
1856
1857 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1858 api_info);
1859 }
1860
1861 return ptr;
1862 }
1863
1864 /* Create kernel dispatch data structure for given KERNEL, along with
1865 the necessary device signals and memory allocations. */
1866
1867 static struct kernel_dispatch *
1868 create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1869 {
1870 struct agent_info *agent = kernel->agent;
1871 struct kernel_dispatch *shadow
1872 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1873
1874 shadow->agent = kernel->agent;
1875 shadow->object = kernel->object;
1876
1877 hsa_signal_t sync_signal;
1878 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1879 if (status != HSA_STATUS_SUCCESS)
1880 hsa_fatal ("Error creating the GCN sync signal", status);
1881
1882 shadow->signal = sync_signal.handle;
1883 shadow->private_segment_size = kernel->private_segment_size;
1884 shadow->group_segment_size = kernel->group_segment_size;
1885
1886 /* We expect kernels to request a single pointer, explicitly, and the
1887 rest of struct kernargs, implicitly. If they request anything else
1888 then something is wrong. */
1889 if (kernel->kernarg_segment_size > 8)
1890 {
1891 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1892 return NULL;
1893 }
1894
1895 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1896 sizeof (struct kernargs),
1897 &shadow->kernarg_address);
1898 if (status != HSA_STATUS_SUCCESS)
1899 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1900 struct kernargs *kernargs = shadow->kernarg_address;
1901
1902 /* Zero-initialize the output_data (minimum needed). */
1903 kernargs->out_ptr = (int64_t)&kernargs->output_data;
1904 kernargs->output_data.next_output = 0;
1905 for (unsigned i = 0;
1906 i < (sizeof (kernargs->output_data.queue)
1907 / sizeof (kernargs->output_data.queue[0]));
1908 i++)
1909 kernargs->output_data.queue[i].written = 0;
1910 kernargs->output_data.consumed = 0;
1911
1912 /* Pass in the heap location. */
1913 kernargs->heap_ptr = (int64_t)kernel->module->heap;
1914
1915 /* Create an arena. */
1916 if (kernel->kind == KIND_OPENMP)
1917 kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1918 else
1919 kernargs->arena_ptr = 0;
1920
1921 /* Ensure we can recognize unset return values. */
1922 kernargs->output_data.return_value = 0xcafe0000;
1923
1924 return shadow;
1925 }
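/* Illustrative layout of the kernarg region initialized above. The kernel
   requests only the leading pointer explicitly (hence the check that
   kernarg_segment_size <= 8); the remaining fields are defined by struct
   kernargs earlier in this file:

     offset 0 : vars pointer (filled in later by run_kernel)
     ...      : out_ptr, heap_ptr, arena_ptr and the output_data
                buffer polled by console_output. */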
1926
1927 /* Output any data written to console output from the kernel. It is expected
1928 that this function is polled during kernel execution.
1929
1930 We print all entries from the last item printed to the next entry without
1931 a "written" flag. If the "final" flag is set then it'll continue right to
1932 the end.
1933
1934 The print buffer is circular, but the from and to locations don't wrap when
1935 the buffer does, so the output limit is UINT_MAX. The target blocks on
1936 output when the buffer is full. */
1937
1938 static void
1939 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1940 bool final)
1941 {
1942 unsigned int limit = (sizeof (kernargs->output_data.queue)
1943 / sizeof (kernargs->output_data.queue[0]));
1944
1945 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1946 __ATOMIC_ACQUIRE);
1947 unsigned int to = kernargs->output_data.next_output;
1948
1949 if (from > to)
1950 {
1951 /* Overflow. */
1952 if (final)
1953 printf ("GCN print buffer overflowed.\n");
1954 return;
1955 }
1956
1957 unsigned int i;
1958 for (i = from; i < to; i++)
1959 {
1960 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1961
1962 if (!data->written && !final)
1963 break;
1964
1965 switch (data->type)
1966 {
1967 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1968 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1969 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1970 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1971 default: printf ("GCN print buffer error!\n"); break;
1972 }
1973 data->written = 0;
1974 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1975 __ATOMIC_RELEASE);
1976 }
1977 fflush (stdout);
1978 }
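/* Worked example of the index arithmetic above: with, say, a 64-entry
   queue, consumed == 70 and next_output == 75 mean entries 70..74 are
   pending and live in slots 70%64..74%64, i.e. 6..10. The indices grow
   monotonically; only the slot lookup wraps. */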
1979
1980 /* Release data structure created for a kernel dispatch in SHADOW argument,
1981 and clean up the signal and memory allocations. */
1982
1983 static void
1984 release_kernel_dispatch (struct kernel_dispatch *shadow)
1985 {
1986 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1987
1988 struct kernargs *kernargs = shadow->kernarg_address;
1989 void *arena = (void *)kernargs->arena_ptr;
1990 if (arena)
1991 release_team_arena (shadow->agent, arena);
1992
1993 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1994
1995 hsa_signal_t s;
1996 s.handle = shadow->signal;
1997 hsa_fns.hsa_signal_destroy_fn (s);
1998
1999 free (shadow);
2000 }
2001
2002 /* Extract the properties from a kernel binary. */
2003
2004 static void
2005 init_kernel_properties (struct kernel_info *kernel)
2006 {
2007 hsa_status_t status;
2008 struct agent_info *agent = kernel->agent;
2009 hsa_executable_symbol_t kernel_symbol;
2010 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2011 kernel->name, agent->id,
2012 0, &kernel_symbol);
2013 if (status != HSA_STATUS_SUCCESS)
2014 {
2015 hsa_warn ("Could not find symbol for kernel in the code object", status);
2016 fprintf (stderr, "kernel name not found: '%s'\n", kernel->name);
2017 dump_executable_symbols (agent->executable);
2018 goto failure;
2019 }
2020 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2021 status = hsa_fns.hsa_executable_symbol_get_info_fn
2022 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2023 if (status != HSA_STATUS_SUCCESS)
2024 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2025 status = hsa_fns.hsa_executable_symbol_get_info_fn
2026 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2027 &kernel->kernarg_segment_size);
2028 if (status != HSA_STATUS_SUCCESS)
2029 hsa_fatal ("Could not get info about kernel argument size", status);
2030 status = hsa_fns.hsa_executable_symbol_get_info_fn
2031 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2032 &kernel->group_segment_size);
2033 if (status != HSA_STATUS_SUCCESS)
2034 hsa_fatal ("Could not get info about kernel group segment size", status);
2035 status = hsa_fns.hsa_executable_symbol_get_info_fn
2036 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2037 &kernel->private_segment_size);
2038 if (status != HSA_STATUS_SUCCESS)
2039 hsa_fatal ("Could not get info about kernel private segment size",
2040 status);
2041
2042 /* The kernel type is not known until something tries to launch it. */
2043 kernel->kind = KIND_UNKNOWN;
2044
2045 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2046 "following segment sizes: \n", kernel->name);
2047 GCN_DEBUG (" group_segment_size: %u\n",
2048 (unsigned) kernel->group_segment_size);
2049 GCN_DEBUG (" private_segment_size: %u\n",
2050 (unsigned) kernel->private_segment_size);
2051 GCN_DEBUG (" kernarg_segment_size: %u\n",
2052 (unsigned) kernel->kernarg_segment_size);
2053 return;
2054
2055 failure:
2056 kernel->initialization_failed = true;
2057 }
2058
2059 /* Do all the work that is necessary before running KERNEL for the first time.
2060 The function assumes the program has been created, finalized and frozen by
2061 create_and_finalize_hsa_program. */
2062
2063 static void
2064 init_kernel (struct kernel_info *kernel)
2065 {
2066 if (pthread_mutex_lock (&kernel->init_mutex))
2067 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2068 if (kernel->initialized)
2069 {
2070 if (pthread_mutex_unlock (&kernel->init_mutex))
2071 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2072 "mutex");
2073
2074 return;
2075 }
2076
2077 init_kernel_properties (kernel);
2078
2079 if (!kernel->initialization_failed)
2080 {
2081 GCN_DEBUG ("\n");
2082
2083 kernel->initialized = true;
2084 }
2085 if (pthread_mutex_unlock (&kernel->init_mutex))
2086 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2087 "mutex");
2088 }
2089
2090 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2091 launch attributes from KLA.
2092
2093 MODULE_LOCKED indicates that the caller already holds the lock and
2094 run_kernel need not lock it again.
2095 If AQ is NULL then agent->sync_queue will be used. */
2096
2097 static void
2098 run_kernel (struct kernel_info *kernel, void *vars,
2099 struct GOMP_kernel_launch_attributes *kla,
2100 struct goacc_asyncqueue *aq, bool module_locked)
2101 {
2102 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2103 kernel->description->vpgr_count);
2104
2105 /* Reduce the number of threads/workers if there are insufficient
2106 VGPRs available to run the kernels together. */
2107 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2108 {
2109 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2110 int max_threads = (256 / granulated_vgprs) * 4;
2111 if (kla->gdims[2] > max_threads)
2112 {
2113 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2114 " per team/gang - reducing to %d threads/workers.\n",
2115 kla->gdims[2], max_threads);
2116 kla->gdims[2] = max_threads;
2117 }
2118 }
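  /* Illustrative numbers: a kernel using 42 VGPRs is granulated up to 44
     (the next multiple of 4), giving max_threads = (256 / 44) * 4 == 20,
     so a request for 64 threads/workers would be reduced to 20. */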
2119
2120 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2121 (aq ? aq->id : 0));
2122 GCN_DEBUG ("GCN launch attribs: gdims:[");
2123 int i;
2124 for (i = 0; i < kla->ndim; ++i)
2125 {
2126 if (i)
2127 DEBUG_PRINT (", ");
2128 DEBUG_PRINT ("%u", kla->gdims[i]);
2129 }
2130 DEBUG_PRINT ("], normalized gdims:[");
2131 for (i = 0; i < kla->ndim; ++i)
2132 {
2133 if (i)
2134 DEBUG_PRINT (", ");
2135 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2136 }
2137 DEBUG_PRINT ("], wdims:[");
2138 for (i = 0; i < kla->ndim; ++i)
2139 {
2140 if (i)
2141 DEBUG_PRINT (", ");
2142 DEBUG_PRINT ("%u", kla->wdims[i]);
2143 }
2144 DEBUG_PRINT ("]\n");
2145 DEBUG_FLUSH ();
2146
2147 struct agent_info *agent = kernel->agent;
2148 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2149 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2150
2151 if (!agent->initialized)
2152 GOMP_PLUGIN_fatal ("Agent must be initialized");
2153
2154 if (!kernel->initialized)
2155 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2156
2157 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2158
2159 uint64_t index
2160 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2161 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2162
2163 /* Wait until the queue is not full before writing the packet. */
2164 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2165 >= command_q->size)
2166 ;
2167
2168 /* Do not allow the dimensions to be overridden when running
2169 constructors or destructors. */
2170 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2171 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2172
2173 hsa_kernel_dispatch_packet_t *packet;
2174 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2175 + index % command_q->size;
2176
2177 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2178 packet->grid_size_x = override_x ? : kla->gdims[0];
2179 packet->workgroup_size_x = get_group_size (kla->ndim,
2180 packet->grid_size_x,
2181 kla->wdims[0]);
2182
2183 if (kla->ndim >= 2)
2184 {
2185 packet->grid_size_y = kla->gdims[1];
2186 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2187 kla->wdims[1]);
2188 }
2189 else
2190 {
2191 packet->grid_size_y = 1;
2192 packet->workgroup_size_y = 1;
2193 }
2194
2195 if (kla->ndim == 3)
2196 {
2197 packet->grid_size_z = limit_worker_threads (override_z
2198 ? : kla->gdims[2]);
2199 packet->workgroup_size_z = get_group_size (kla->ndim,
2200 packet->grid_size_z,
2201 kla->wdims[2]);
2202 }
2203 else
2204 {
2205 packet->grid_size_z = 1;
2206 packet->workgroup_size_z = 1;
2207 }
2208
2209 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2210 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2211 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2212 packet->grid_size_x / packet->workgroup_size_x,
2213 packet->grid_size_y / packet->workgroup_size_y,
2214 packet->grid_size_z / packet->workgroup_size_z,
2215 packet->workgroup_size_x, packet->workgroup_size_y,
2216 packet->workgroup_size_z);
2217
2218 struct kernel_dispatch *shadow
2219 = create_kernel_dispatch (kernel, packet->grid_size_x);
2220 shadow->queue = command_q;
2221
2222 if (debug)
2223 {
2224 fprintf (stderr, "\nKernel has following dependencies:\n");
2225 print_kernel_dispatch (shadow, 2);
2226 }
2227
2228 packet->private_segment_size = kernel->private_segment_size;
2229 packet->group_segment_size = kernel->group_segment_size;
2230 packet->kernel_object = kernel->object;
2231 packet->kernarg_address = shadow->kernarg_address;
2232 hsa_signal_t s;
2233 s.handle = shadow->signal;
2234 packet->completion_signal = s;
2235 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2236 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2237
2238 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2239
2240 uint16_t header;
2241 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2242 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2243 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2244
2245 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2246 agent->device_id);
2247
2248 packet_store_release ((uint32_t *) packet, header,
2249 (uint16_t) kla->ndim
2250 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2251
2252 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2253 index);
2254
2255 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2256
2257 /* Root signal waits with 1ms timeout. */
2258 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2259 1000 * 1000,
2260 HSA_WAIT_STATE_BLOCKED) != 0)
2261 {
2262 console_output (kernel, shadow->kernarg_address, false);
2263 }
2264 console_output (kernel, shadow->kernarg_address, true);
2265
2266 struct kernargs *kernargs = shadow->kernarg_address;
2267 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2268
2269 release_kernel_dispatch (shadow);
2270
2271 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2272 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2273
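  /* The return value decodes as follows: the top 16 bits are a sentinel,
     0xcafe if the kernel never called exit (see create_kernel_dispatch)
     and 0xffff if it did. In the latter case byte 1 carries a signal
     number and byte 0 the exit code, so exit (3) yields 0xffff0003 and
     abort () something like 0xffff0600 (SIGABRT being 6 on Linux). */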
2274 unsigned int upper = (return_value & ~0xffff) >> 16;
2275 if (upper == 0xcafe)
2276 ; // exit not called, normal termination.
2277 else if (upper == 0xffff)
2278 ; // exit called.
2279 else
2280 {
2281 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2282 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2283 return_value);
2284 abort ();
2285 }
2286
2287 if (upper == 0xffff)
2288 {
2289 unsigned int signal = (return_value >> 8) & 0xff;
2290
2291 if (signal == SIGABRT)
2292 {
2293 GCN_WARNING ("GCN Kernel aborted\n");
2294 abort ();
2295 }
2296 else if (signal != 0)
2297 {
2298 GCN_WARNING ("GCN Kernel received unknown signal\n");
2299 abort ();
2300 }
2301
2302 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2303 exit (return_value & 0xff);
2304 }
2305 }
2306
2307 /* }}} */
2308 /* {{{ Load/Unload */
2309
2310 /* Initialize KERNEL from D and other parameters. Return true on success. */
2311
2312 static bool
2313 init_basic_kernel_info (struct kernel_info *kernel,
2314 struct hsa_kernel_description *d,
2315 struct agent_info *agent,
2316 struct module_info *module)
2317 {
2318 kernel->agent = agent;
2319 kernel->module = module;
2320 kernel->name = d->name;
2321 kernel->description = d;
2322 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2323 {
2324 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2325 return false;
2326 }
2327 return true;
2328 }
2329
2330 /* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If
2331 not found, return false. */
2332
2333 static bool
2334 find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
2335 struct module_info *module, Elf64_Ehdr *image,
2336 Elf64_Shdr *sections)
2337 {
2338 bool res = false;
2339
2340 hsa_status_t status;
2341
2342 hsa_executable_symbol_t symbol;
2343 if (!find_executable_symbol (agent->executable, &symbol))
2344 return false;
2345
2346 status = hsa_fns.hsa_executable_symbol_get_info_fn
2347 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
2348 if (status != HSA_STATUS_SUCCESS)
2349 {
2350 hsa_error ("Could not extract symbol address", status);
2351 return false;
2352 }
2353
2354 char *symbol_name = get_executable_symbol_name (symbol);
2355 if (symbol_name == NULL)
2356 return false;
2357
2358 /* Find the kernel function in ELF, and calculate actual load offset. */
2359 for (int i = 0; i < image->e_shnum; i++)
2360 if (sections[i].sh_type == SHT_SYMTAB)
2361 {
2362 Elf64_Shdr *strtab = &sections[sections[i].sh_link];
2363 char *strings = (char *)image + strtab->sh_offset;
2364
2365 for (size_t offset = 0;
2366 offset < sections[i].sh_size;
2367 offset += sections[i].sh_entsize)
2368 {
2369 Elf64_Sym *sym = (Elf64_Sym*)((char*)image
2370 + sections[i].sh_offset
2371 + offset);
2372 if (strcmp (symbol_name, strings + sym->st_name) == 0)
2373 {
2374 *load_offset -= sym->st_value;
2375 res = true;
2376 break;
2377 }
2378 }
2379 }
2380
2381 free (symbol_name);
2382 return res;
2383 }
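/* Worked example (illustrative addresses): if the HSA loader placed the
   chosen symbol at 0x7f0000001000 and the ELF symbol table records
   st_value == 0x1000 for it, *load_offset ends up as 0x7f0000000000,
   which the relocation code below uses to translate link-time addresses
   into loaded addresses. */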
2384
2385 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2386
2387 static bool
2388 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2389 {
2390 int isa_field = elf_gcn_isa_field (image);
2391 const char* isa_s = isa_hsa_name (isa_field);
2392 if (!isa_s)
2393 {
2394 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2395 return false;
2396 }
2397
2398 if (isa_field != agent->device_isa)
2399 {
2400 char msg[120];
2401 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2402 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2403 assert (agent_isa_s);
2404 assert (agent_isa_gcc_s);
2405
2406 snprintf (msg, sizeof msg,
2407 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2408 "Try to recompile with '-foffload=-march=%s'.\n",
2409 isa_s, agent_isa_s, agent_isa_gcc_s);
2410
2411 hsa_error (msg, HSA_STATUS_ERROR);
2412 return false;
2413 }
2414
2415 return true;
2416 }
2417
2418 /* Create and finalize the program consisting of all loaded modules. */
2419
2420 static bool
2421 create_and_finalize_hsa_program (struct agent_info *agent)
2422 {
2423 hsa_status_t status;
2424 int reloc_count = 0;
2425 bool res = true;
2426 if (pthread_mutex_lock (&agent->prog_mutex))
2427 {
2428 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2429 return false;
2430 }
2431 if (agent->prog_finalized)
2432 goto final;
2433
2434 status
2435 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2436 HSA_EXECUTABLE_STATE_UNFROZEN,
2437 "", &agent->executable);
2438 if (status != HSA_STATUS_SUCCESS)
2439 {
2440 hsa_error ("Could not create GCN executable", status);
2441 goto fail;
2442 }
2443
2444 /* Load any GCN modules. */
2445 struct module_info *module = agent->module;
2446 if (module)
2447 {
2448 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2449
2450 if (!isa_matches_agent (agent, image))
2451 goto fail;
2452
2453 /* Hide relocations from the HSA runtime loader.
2454 Keep a copy of the unmodified section headers to use later. */
2455 Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
2456 + image->e_shoff);
2457 for (int i = image->e_shnum - 1; i >= 0; i--)
2458 {
2459 if (image_sections[i].sh_type == SHT_RELA
2460 || image_sections[i].sh_type == SHT_REL)
2461 /* Change section type to something harmless. */
2462 image_sections[i].sh_type |= 0x80;
2463 }
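      /* E.g. SHT_RELA (4) becomes 0x84, which the loader does not
	 recognize and so ignores; the fixup loop below finds these
	 sections again by matching on (SHT_RELA | 0x80). */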
2464
2465 hsa_code_object_t co = { 0 };
2466 status = hsa_fns.hsa_code_object_deserialize_fn
2467 (module->image_desc->gcn_image->image,
2468 module->image_desc->gcn_image->size,
2469 NULL, &co);
2470 if (status != HSA_STATUS_SUCCESS)
2471 {
2472 hsa_error ("Could not deserialize GCN code object", status);
2473 goto fail;
2474 }
2475
2476 status = hsa_fns.hsa_executable_load_code_object_fn
2477 (agent->executable, agent->id, co, "");
2478 if (status != HSA_STATUS_SUCCESS)
2479 {
2480 hsa_error ("Could not load GCN code object", status);
2481 goto fail;
2482 }
2483
2484 if (!module->heap)
2485 {
2486 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2487 gcn_kernel_heap_size,
2488 (void**)&module->heap);
2489 if (status != HSA_STATUS_SUCCESS)
2490 {
2491 hsa_error ("Could not allocate memory for GCN heap", status);
2492 goto fail;
2493 }
2494
2495 status = hsa_fns.hsa_memory_assign_agent_fn
2496 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2497 if (status != HSA_STATUS_SUCCESS)
2498 {
2499 hsa_error ("Could not assign GCN heap memory to device", status);
2500 goto fail;
2501 }
2502
2503 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2504 &gcn_kernel_heap_size,
2505 sizeof (gcn_kernel_heap_size));
2506 }
2507
2508 }
2509
2510 if (debug)
2511 dump_executable_symbols (agent->executable);
2512
2513 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2514 if (status != HSA_STATUS_SUCCESS)
2515 {
2516 hsa_error ("Could not freeze the GCN executable", status);
2517 goto fail;
2518 }
2519
2520 if (agent->module)
2521 {
2522 struct module_info *module = agent->module;
2523 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2524 Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
2525
2526 Elf64_Addr load_offset;
2527 if (!find_load_offset (&load_offset, agent, module, image, sections))
2528 goto fail;
2529
2530 /* Record the physical load address range.
2531 We need this for data copies later. */
2532 Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
2533 Elf64_Addr low = ~0, high = 0;
2534 for (int i = 0; i < image->e_phnum; i++)
2535 if (segments[i].p_memsz > 0)
2536 {
2537 if (segments[i].p_paddr < low)
2538 low = segments[i].p_paddr;
2539 if (segments[i].p_paddr > high)
2540 high = segments[i].p_paddr + segments[i].p_memsz - 1;
2541 }
2542 module->phys_address_start = low + load_offset;
2543 module->phys_address_end = high + load_offset;
2544
2545 // Find dynamic symbol table
2546 Elf64_Shdr *dynsym = NULL;
2547 for (int i = 0; i < image->e_shnum; i++)
2548 if (sections[i].sh_type == SHT_DYNSYM)
2549 {
2550 dynsym = &sections[i];
2551 break;
2552 }
2553
2554 /* Fix up relocations. */
2555 for (int i = 0; i < image->e_shnum; i++)
2556 {
2557 if (sections[i].sh_type == (SHT_RELA | 0x80))
2558 for (size_t offset = 0;
2559 offset < sections[i].sh_size;
2560 offset += sections[i].sh_entsize)
2561 {
2562 Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
2563 + sections[i].sh_offset
2564 + offset);
2565 Elf64_Sym *sym =
2566 (dynsym
2567 ? (Elf64_Sym*)((char*)image
2568 + dynsym->sh_offset
2569 + (dynsym->sh_entsize
2570 * ELF64_R_SYM (reloc->r_info)))
2571 : NULL);
2572
2573 int64_t S = (sym ? sym->st_value : 0);
2574 int64_t P = reloc->r_offset + load_offset;
2575 int64_t A = reloc->r_addend;
2576 int64_t B = load_offset;
2577 int64_t V, size;
2578 switch (ELF64_R_TYPE (reloc->r_info))
2579 {
2580 case R_AMDGPU_ABS32_LO:
2581 V = (S + A) & 0xFFFFFFFF;
2582 size = 4;
2583 break;
2584 case R_AMDGPU_ABS32_HI:
2585 V = (S + A) >> 32;
2586 size = 4;
2587 break;
2588 case R_AMDGPU_ABS64:
2589 V = S + A;
2590 size = 8;
2591 break;
2592 case R_AMDGPU_REL32:
2593 V = S + A - P;
2594 size = 4;
2595 break;
2596 case R_AMDGPU_REL64:
2597 /* FIXME
2598 LLD seems to emit REL64 where the assembler has
2599 ABS64. This is clearly wrong because it's not what the
2600 compiler is expecting. Let's assume, for now, that
2601 it's a bug. In any case, GCN kernels are always
2602 self-contained and therefore relative relocations will have
2603 been resolved already, so this should be a safe
2604 workaround. */
2605 V = S + A/* - P*/;
2606 size = 8;
2607 break;
2608 case R_AMDGPU_ABS32:
2609 V = S + A;
2610 size = 4;
2611 break;
2612 /* TODO R_AMDGPU_GOTPCREL */
2613 /* TODO R_AMDGPU_GOTPCREL32_LO */
2614 /* TODO R_AMDGPU_GOTPCREL32_HI */
2615 case R_AMDGPU_REL32_LO:
2616 V = (S + A - P) & 0xFFFFFFFF;
2617 size = 4;
2618 break;
2619 case R_AMDGPU_REL32_HI:
2620 V = (S + A - P) >> 32;
2621 size = 4;
2622 break;
2623 case R_AMDGPU_RELATIVE64:
2624 V = B + A;
2625 size = 8;
2626 break;
2627 default:
2628 fprintf (stderr, "Error: unsupported relocation type.\n");
2629 exit (1);
2630 }
2631 status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
2632 if (status != HSA_STATUS_SUCCESS)
2633 {
2634 hsa_error ("Failed to fix up relocation", status);
2635 goto fail;
2636 }
2637 reloc_count++;
2638 }
2639 }
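      /* Worked example (illustrative values): an R_AMDGPU_ABS32_LO entry
	 with S == 0x7f0000002000 and A == 4 writes V == 0x00002004 into
	 the 4 bytes at P == r_offset + load_offset. */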
2640 }
2641
2642 GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
2643 agent->device_id, reloc_count);
2644
2645 final:
2646 agent->prog_finalized = true;
2647
2648 if (pthread_mutex_unlock (&agent->prog_mutex))
2649 {
2650 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2651 res = false;
2652 }
2653
2654 return res;
2655
2656 fail:
2657 res = false;
2658 goto final;
2659 }
2660
2661 /* Free the HSA program in agent and everything associated with it and set
2662 agent->prog_finalized and the initialized flags of all kernels to false.
2663 Return TRUE on success. */
2664
2665 static bool
2666 destroy_hsa_program (struct agent_info *agent)
2667 {
2668 if (!agent->prog_finalized)
2669 return true;
2670
2671 hsa_status_t status;
2672
2673 GCN_DEBUG ("Destroying the current GCN program.\n");
2674
2675 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2676 if (status != HSA_STATUS_SUCCESS)
2677 return hsa_error ("Could not destroy GCN executable", status);
2678
2679 if (agent->module)
2680 {
2681 int i;
2682 for (i = 0; i < agent->module->kernel_count; i++)
2683 agent->module->kernels[i].initialized = false;
2684
2685 if (agent->module->heap)
2686 {
2687 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2688 agent->module->heap = NULL;
2689 }
2690 }
2691 agent->prog_finalized = false;
2692 return true;
2693 }
2694
2695 /* Deinitialize all information associated with MODULE and kernels within
2696 it. Return TRUE on success. */
2697
2698 static bool
2699 destroy_module (struct module_info *module, bool locked)
2700 {
2701 /* Run destructors before destroying module. */
2702 struct GOMP_kernel_launch_attributes kla =
2703 { 3,
2704 /* Grid size. */
2705 { 1, 64, 1 },
2706 /* Work-group size. */
2707 { 1, 64, 1 }
2708 };
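  /* I.e. one work-group of 64 work-items, a single wavefront, which is
     all that is needed to run the destructors once. */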
2709
2710 if (module->fini_array_func)
2711 {
2712 init_kernel (module->fini_array_func);
2713 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2714 }
2715 module->constructors_run_p = false;
2716
2717 int i;
2718 for (i = 0; i < module->kernel_count; i++)
2719 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2720 {
2721 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2722 "mutex");
2723 return false;
2724 }
2725
2726 return true;
2727 }
2728
2729 /* }}} */
2730 /* {{{ Async */
2731
2732 /* Execute one async queue entry: a kernel launch, callback, wait or placeholder. */
2733
2734 static void
2735 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2736 {
2737 struct queue_entry *entry = &aq->queue[index];
2738
2739 switch (entry->type)
2740 {
2741 case KERNEL_LAUNCH:
2742 if (DEBUG_QUEUES)
2743 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2744 aq->agent->device_id, aq->id, index);
2745 run_kernel (entry->u.launch.kernel,
2746 entry->u.launch.vars,
2747 &entry->u.launch.kla, aq, false);
2748 if (DEBUG_QUEUES)
2749 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2750 aq->agent->device_id, aq->id, index);
2751 break;
2752
2753 case CALLBACK:
2754 if (DEBUG_QUEUES)
2755 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2756 aq->agent->device_id, aq->id, index);
2757 entry->u.callback.fn (entry->u.callback.data);
2758 if (DEBUG_QUEUES)
2759 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2760 aq->agent->device_id, aq->id, index);
2761 break;
2762
2763 case ASYNC_WAIT:
2764 {
2765 /* FIXME: is it safe to access a placeholder that may already have
2766 been executed? */
2767 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2768
2769 if (DEBUG_QUEUES)
2770 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2771 aq->agent->device_id, aq->id, index);
2772
2773 pthread_mutex_lock (&placeholderp->mutex);
2774
2775 while (!placeholderp->executed)
2776 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2777
2778 pthread_mutex_unlock (&placeholderp->mutex);
2779
2780 if (pthread_cond_destroy (&placeholderp->cond))
2781 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2782
2783 if (pthread_mutex_destroy (&placeholderp->mutex))
2784 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2785
2786 if (DEBUG_QUEUES)
2787 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2788 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2789 }
2790 break;
2791
2792 case ASYNC_PLACEHOLDER:
2793 pthread_mutex_lock (&entry->u.placeholder.mutex);
2794 entry->u.placeholder.executed = 1;
2795 pthread_cond_signal (&entry->u.placeholder.cond);
2796 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2797 break;
2798
2799 default:
2800 GOMP_PLUGIN_fatal ("Unknown queue element");
2801 }
2802 }
2803
2804 /* This function is run as a thread to service an async queue in the
2805 background. It runs continuously until the stop flag is set. */
2806
2807 static void *
2808 drain_queue (void *thread_arg)
2809 {
2810 struct goacc_asyncqueue *aq = thread_arg;
2811
2812 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2813 {
2814 aq->drain_queue_stop = 2;
2815 return NULL;
2816 }
2817
2818 pthread_mutex_lock (&aq->mutex);
2819
2820 while (true)
2821 {
2822 if (aq->drain_queue_stop)
2823 break;
2824
2825 if (aq->queue_n > 0)
2826 {
2827 pthread_mutex_unlock (&aq->mutex);
2828 execute_queue_entry (aq, aq->queue_first);
2829
2830 pthread_mutex_lock (&aq->mutex);
2831 aq->queue_first = ((aq->queue_first + 1)
2832 % ASYNC_QUEUE_SIZE);
2833 aq->queue_n--;
2834
2835 if (DEBUG_THREAD_SIGNAL)
2836 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2837 aq->agent->device_id, aq->id);
2838 pthread_cond_broadcast (&aq->queue_cond_out);
2839 pthread_mutex_unlock (&aq->mutex);
2840
2841 if (DEBUG_QUEUES)
2842 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2843 aq->id);
2844 pthread_mutex_lock (&aq->mutex);
2845 }
2846 else
2847 {
2848 if (DEBUG_THREAD_SLEEP)
2849 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2850 aq->agent->device_id, aq->id);
2851 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2852 if (DEBUG_THREAD_SLEEP)
2853 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2854 aq->agent->device_id, aq->id);
2855 }
2856 }
2857
2858 aq->drain_queue_stop = 2;
2859 if (DEBUG_THREAD_SIGNAL)
2860 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2861 aq->agent->device_id, aq->id);
2862 pthread_cond_broadcast (&aq->queue_cond_out);
2863 pthread_mutex_unlock (&aq->mutex);
2864
2865 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2866 return NULL;
2867 }
2868
2869 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2870 is not usually the case. This is just a debug tool. */
2871
2872 static void
2873 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2874 {
2875 pthread_mutex_lock (&aq->mutex);
2876
2877 while (aq->queue_n > 0)
2878 {
2879 execute_queue_entry (aq, aq->queue_first);
2880
2881 aq->queue_first = ((aq->queue_first + 1)
2882 % ASYNC_QUEUE_SIZE);
2883 aq->queue_n--;
2884 }
2885
2886 pthread_mutex_unlock (&aq->mutex);
2887 }
2888
2889 /* Block the current thread until an async queue is writable. The aq->mutex
2890 lock should be held on entry, and remains locked on exit. */
2891
2892 static void
2893 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2894 {
2895 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2896 {
2897 /* Queue is full. Wait for it to not be full. */
2898 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2899 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2900 }
2901 }
2902
2903 /* Request an asynchronous kernel launch on the specified queue. This
2904 may block if the queue is full, but returns without waiting for the
2905 kernel to run. */
2906
2907 static void
2908 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2909 void *vars, struct GOMP_kernel_launch_attributes *kla)
2910 {
2911 assert (aq->agent == kernel->agent);
2912
2913 pthread_mutex_lock (&aq->mutex);
2914
2915 wait_for_queue_nonfull (aq);
2916
2917 int queue_last = ((aq->queue_first + aq->queue_n)
2918 % ASYNC_QUEUE_SIZE);
2919 if (DEBUG_QUEUES)
2920 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2921 aq->id, queue_last);
2922
2923 aq->queue[queue_last].type = KERNEL_LAUNCH;
2924 aq->queue[queue_last].u.launch.kernel = kernel;
2925 aq->queue[queue_last].u.launch.vars = vars;
2926 aq->queue[queue_last].u.launch.kla = *kla;
2927
2928 aq->queue_n++;
2929
2930 if (DEBUG_THREAD_SIGNAL)
2931 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2932 aq->agent->device_id, aq->id);
2933 pthread_cond_signal (&aq->queue_cond_in);
2934
2935 pthread_mutex_unlock (&aq->mutex);
2936 }
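/* The same circular arithmetic is used here and in all the queue_push_*
   functions below: with ASYNC_QUEUE_SIZE == 64, queue_first == 62 and
   queue_n == 3, the next free slot is (62 + 3) % 64 == 1. */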
2937
2938 /* Request an asynchronous callback on the specified queue. The callback
2939 function will be called, with the given opaque data, from the appropriate
2940 async thread, when all previous items on that queue are complete. */
2941
2942 static void
2943 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2944 void *data)
2945 {
2946 pthread_mutex_lock (&aq->mutex);
2947
2948 wait_for_queue_nonfull (aq);
2949
2950 int queue_last = ((aq->queue_first + aq->queue_n)
2951 % ASYNC_QUEUE_SIZE);
2952 if (DEBUG_QUEUES)
2953 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2954 aq->id, queue_last);
2955
2956 aq->queue[queue_last].type = CALLBACK;
2957 aq->queue[queue_last].u.callback.fn = fn;
2958 aq->queue[queue_last].u.callback.data = data;
2959
2960 aq->queue_n++;
2961
2962 if (DEBUG_THREAD_SIGNAL)
2963 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2964 aq->agent->device_id, aq->id);
2965 pthread_cond_signal (&aq->queue_cond_in);
2966
2967 pthread_mutex_unlock (&aq->mutex);
2968 }
2969
2970 /* Request that a given async thread wait for another thread (unspecified) to
2971 reach the given placeholder. The wait will occur when all previous entries
2972 on the queue are complete. A placeholder is effectively a kind of signal
2973 which simply sets a flag when encountered in a queue. */
2974
2975 static void
2976 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2977 struct placeholder *placeholderp)
2978 {
2979 pthread_mutex_lock (&aq->mutex);
2980
2981 wait_for_queue_nonfull (aq);
2982
2983 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2984 if (DEBUG_QUEUES)
2985 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2986 aq->id, queue_last);
2987
2988 aq->queue[queue_last].type = ASYNC_WAIT;
2989 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2990
2991 aq->queue_n++;
2992
2993 if (DEBUG_THREAD_SIGNAL)
2994 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2995 aq->agent->device_id, aq->id);
2996 pthread_cond_signal (&aq->queue_cond_in);
2997
2998 pthread_mutex_unlock (&aq->mutex);
2999 }
3000
3001 /* Add a placeholder into an async queue. When the async thread reaches the
3002 placeholder it will set the "executed" flag to true and continue.
3003 Another thread may be waiting on this thread reaching the placeholder. */
3004
3005 static struct placeholder *
3006 queue_push_placeholder (struct goacc_asyncqueue *aq)
3007 {
3008 struct placeholder *placeholderp;
3009
3010 pthread_mutex_lock (&aq->mutex);
3011
3012 wait_for_queue_nonfull (aq);
3013
3014 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
3015 if (DEBUG_QUEUES)
3016 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
3017 aq->id, queue_last);
3018
3019 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
3020 placeholderp = &aq->queue[queue_last].u.placeholder;
3021
3022 if (pthread_mutex_init (&placeholderp->mutex, NULL))
3023 {
3024 pthread_mutex_unlock (&aq->mutex);
3025 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
3026 }
3027
3028 if (pthread_cond_init (&placeholderp->cond, NULL))
3029 {
3030 pthread_mutex_unlock (&aq->mutex);
3031 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
3032 }
3033
3034 placeholderp->executed = 0;
3035
3036 aq->queue_n++;
3037
3038 if (DEBUG_THREAD_SIGNAL)
3039 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
3040 aq->agent->device_id, aq->id);
3041 pthread_cond_signal (&aq->queue_cond_in);
3042
3043 pthread_mutex_unlock (&aq->mutex);
3044
3045 return placeholderp;
3046 }
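/* Sketch of how the two halves fit together: to make queue AQ2 wait for
   queue AQ1,

     struct placeholder *p = queue_push_placeholder (aq1);
     queue_push_asyncwait (aq2, p);

   AQ2's drain thread then blocks in execute_queue_entry's ASYNC_WAIT case
   until AQ1 reaches the placeholder and signals its condition variable. */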
3047
3048 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
3049
3050 static void
3051 finalize_async_thread (struct goacc_asyncqueue *aq)
3052 {
3053 pthread_mutex_lock (&aq->mutex);
3054 if (aq->drain_queue_stop == 2)
3055 {
3056 pthread_mutex_unlock (&aq->mutex);
3057 return;
3058 }
3059
3060 aq->drain_queue_stop = 1;
3061
3062 if (DEBUG_THREAD_SIGNAL)
3063 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3064 aq->agent->device_id, aq->id);
3065 pthread_cond_signal (&aq->queue_cond_in);
3066
3067 while (aq->drain_queue_stop != 2)
3068 {
3069 if (DEBUG_THREAD_SLEEP)
3070 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3071 " to sleep\n", aq->agent->device_id, aq->id);
3072 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3073 if (DEBUG_THREAD_SLEEP)
3074 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
3075 aq->agent->device_id, aq->id);
3076 }
3077
3078 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
3079 aq->id);
3080 pthread_mutex_unlock (&aq->mutex);
3081
3082 int err = pthread_join (aq->thread_drain_queue, NULL);
3083 if (err != 0)
3084 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3085 aq->agent->device_id, aq->id, strerror (err));
3086 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
3087 }
3088
3089 /* Set up an async queue for OpenMP. There will be only one. The
3090 implementation simply uses an OpenACC async queue.
3091 FIXME: is this thread-safe if two threads call this function? */
3092
3093 static void
3094 maybe_init_omp_async (struct agent_info *agent)
3095 {
3096 if (!agent->omp_async_queue)
3097 agent->omp_async_queue
3098 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
3099 }
3100
3101 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3102 copies from read-only pages. */
3103
3104 static void
3105 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3106 {
3107 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3108
3109 if (status == HSA_STATUS_SUCCESS)
3110 return;
3111
3112 /* It appears that the copy fails if the source data is in a read-only page.
3113 We can't detect that easily, so try copying the data to a temporary buffer
3114 and doing the copy again if we got an error above. */
3115
3116 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3117 "[%p:+%d]\n", (void *) src, (int) len);
3118
3119 void *src_copy = malloc (len);
3120 memcpy (src_copy, src, len);
3121 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3122 free (src_copy);
3123 if (status != HSA_STATUS_SUCCESS)
3124 GOMP_PLUGIN_error ("memory copy failed");
3125 }
3126
3127 /* Copy data to or from a device. This is intended for use as an async
3128 callback event. */
3129
3130 static void
3131 copy_data (void *data_)
3132 {
3133 struct copy_data *data = (struct copy_data *)data_;
3134 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3135 data->aq->agent->device_id, data->aq->id, data->len, data->src,
3136 data->dst);
3137 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3138 if (data->free_src)
3139 free ((void *) data->src);
3140 free (data);
3141 }
3142
3143 /* Free device data. This is intended for use as an async callback event. */
3144
3145 static void
3146 gomp_offload_free (void *ptr)
3147 {
3148 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
3149 GOMP_OFFLOAD_free (0, ptr);
3150 }
3151
3152 /* Request an asynchronous data copy, to or from a device, on a given queue.
3153 The event will be registered as a callback. If FREE_SRC is true
3154 then the source data will be freed following the copy. */
3155
3156 static void
3157 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3158 size_t len, bool free_src)
3159 {
3160 if (DEBUG_QUEUES)
3161 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3162 aq->agent->device_id, aq->id, len, src, dst);
3163 struct copy_data *data
3164 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3165 data->dst = dst;
3166 data->src = src;
3167 data->len = len;
3168 data->free_src = free_src;
3169 data->aq = aq;
3170 queue_push_callback (aq, copy_data, data);
3171 }
3172
3173 /* Return true if the given queue is currently empty. */
3174
3175 static int
3176 queue_empty (struct goacc_asyncqueue *aq)
3177 {
3178 pthread_mutex_lock (&aq->mutex);
3179 int res = aq->queue_n == 0 ? 1 : 0;
3180 pthread_mutex_unlock (&aq->mutex);
3181
3182 return res;
3183 }
3184
3185 /* Wait for a given queue to become empty. This implements an OpenACC wait
3186 directive. */
3187
3188 static void
3189 wait_queue (struct goacc_asyncqueue *aq)
3190 {
3191 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3192 {
3193 drain_queue_synchronous (aq);
3194 return;
3195 }
3196
3197 pthread_mutex_lock (&aq->mutex);
3198
3199 while (aq->queue_n > 0)
3200 {
3201 if (DEBUG_THREAD_SLEEP)
3202 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3203 aq->agent->device_id, aq->id);
3204 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3205 if (DEBUG_THREAD_SLEEP)
3206 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3207 aq->id);
3208 }
3209
3210 pthread_mutex_unlock (&aq->mutex);
3211 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3212 }
3213
3214 /* }}} */
3215 /* {{{ OpenACC support */
3216
3217 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3218
3219 static void
3220 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3221 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3222 struct goacc_asyncqueue *aq)
3223 {
3224 if (!GOMP_OFFLOAD_can_run (kernel))
3225 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3226
3227 /* If we get here then this must be an OpenACC kernel. */
3228 kernel->kind = KIND_OPENACC;
3229
3230 /* devaddrs must be double-indirect on the target. */
3231 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3232 for (size_t i = 0; i < mapnum; i++)
3233 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3234 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3235 sizeof (void *));
3236
3237 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3238 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3239 {
3240 struct hsa_kernel_description *d
3241 = &kernel->module->image_desc->kernel_infos[i];
3242 if (d->name == kernel->name)
3243 {
3244 hsa_kernel_desc = d;
3245 break;
3246 }
3247 }
3248
3249 /* We may have statically-determined dimensions in
3250 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3251 invocation at runtime in dims[]. We allow static dimensions to take
3252 priority over dynamic dimensions when present (non-zero). */
3253 if (hsa_kernel_desc->oacc_dims[0] > 0)
3254 dims[0] = hsa_kernel_desc->oacc_dims[0];
3255 if (hsa_kernel_desc->oacc_dims[1] > 0)
3256 dims[1] = hsa_kernel_desc->oacc_dims[1];
3257 if (hsa_kernel_desc->oacc_dims[2] > 0)
3258 dims[2] = hsa_kernel_desc->oacc_dims[2];
3259
3260 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3261 There isn't really a correct answer for this without a clue about the
3262 problem size, so let's do a reasonable number of single-worker gangs.
3263 64 gangs matches a typical Fiji device. */
3264
3265 /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3266 for the default number of workers. */
3267 if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
3268 if (dims[1] == 0) dims[1] = 1; /* Workers. */
3269
3270 /* The incoming dimensions are expressed in terms of gangs, workers, and
3271 vectors. The HSA dimensions are expressed in terms of "work-items",
3272 which means multiples of vector lanes.
3273
3274 The "grid size" specifies the size of the problem space, and the
3275 "work-group size" specifies how much of that we want a single compute
3276 unit to chew on at once.
3277
3278 The three dimensions do not really correspond to hardware, but the
3279 important thing is that the HSA runtime will launch as many
3280 work-groups as it takes to process the entire grid, and each
3281 work-group will contain as many wave-fronts as it takes to process
3282 the work-items in that group.
3283
3284 Essentially, as long as we set the Y dimension to 64 (the number of
3285 vector lanes in hardware), and the Z group size to the maximum (16),
3286 then we will get the gangs (X) and workers (Z) launched as we expect.
3287
3288 The reason for the apparent reversal of vector and worker dimension
3289 order is to do with the way the run-time distributes work-items across
3290 v1 and v2. */
3291 struct GOMP_kernel_launch_attributes kla =
3292 {3,
3293 /* Grid size. */
3294 {dims[0], 64, dims[1]},
3295 /* Work-group size. */
3296 {1, 64, 16}
3297 };
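  /* For example, 32 gangs and 4 workers (dims == {32, 4}) produce a grid
     of {32, 64, 4}; run_kernel then derives the actual work-group sizes
     from the {1, 64, 16} hint via get_group_size. */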
3298
3299 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3300 acc_prof_info *prof_info = thr->prof_info;
3301 acc_event_info enqueue_launch_event_info;
3302 acc_api_info *api_info = thr->api_info;
3303 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3304 if (profiling_dispatch_p)
3305 {
3306 prof_info->event_type = acc_ev_enqueue_launch_start;
3307
3308 enqueue_launch_event_info.launch_event.event_type
3309 = prof_info->event_type;
3310 enqueue_launch_event_info.launch_event.valid_bytes
3311 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3312 enqueue_launch_event_info.launch_event.parent_construct
3313 = acc_construct_parallel;
3314 enqueue_launch_event_info.launch_event.implicit = 1;
3315 enqueue_launch_event_info.launch_event.tool_info = NULL;
3316 enqueue_launch_event_info.launch_event.kernel_name
3317 = (char *) kernel->name;
3318 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3319 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3320 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3321
3322 api_info->device_api = acc_device_api_other;
3323
3324 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3325 &enqueue_launch_event_info, api_info);
3326 }
3327
3328 if (!async)
3329 {
3330 run_kernel (kernel, ind_da, &kla, NULL, false);
3331 gomp_offload_free (ind_da);
3332 }
3333 else
3334 {
3335 queue_push_launch (aq, kernel, ind_da, &kla);
3336 if (DEBUG_QUEUES)
3337 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3338 aq->agent->device_id, aq->id, ind_da);
3339 queue_push_callback (aq, gomp_offload_free, ind_da);
3340 }
3341
3342 if (profiling_dispatch_p)
3343 {
3344 prof_info->event_type = acc_ev_enqueue_launch_end;
3345 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3346 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3347 &enqueue_launch_event_info,
3348 api_info);
3349 }
3350 }
3351
3352 /* }}} */
3353 /* {{{ Generic Plugin API */
3354
3355 /* Return the name of the accelerator, which is "gcn". */
3356
3357 const char *
3358 GOMP_OFFLOAD_get_name (void)
3359 {
3360 return "gcn";
3361 }
3362
3363 /* Return the specific capabilities the HSA accelerator has. */
3364
3365 unsigned int
3366 GOMP_OFFLOAD_get_caps (void)
3367 {
3368 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3369 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3370 | GOMP_OFFLOAD_CAP_OPENACC_200;
3371 }
3372
3373 /* Identify as GCN accelerator. */
3374
3375 int
3376 GOMP_OFFLOAD_get_type (void)
3377 {
3378 return OFFLOAD_TARGET_TYPE_GCN;
3379 }
3380
3381 /* Return the libgomp version number we're compatible with. There is
3382 no requirement for cross-version compatibility. */
3383
3384 unsigned
3385 GOMP_OFFLOAD_version (void)
3386 {
3387 return GOMP_VERSION;
3388 }
3389
3390 /* Return the number of GCN devices on the system. */
3391
3392 int
3393 GOMP_OFFLOAD_get_num_devices (void)
3394 {
3395 if (!init_hsa_context ())
3396 return 0;
3397 return hsa_context.agent_count;
3398 }
3399
3400 /* Initialize device (agent) number N so that it can be used for computation.
3401 Return TRUE on success. */
3402
3403 bool
3404 GOMP_OFFLOAD_init_device (int n)
3405 {
3406 if (!init_hsa_context ())
3407 return false;
3408 if (n >= hsa_context.agent_count)
3409 {
3410 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3411 return false;
3412 }
3413 struct agent_info *agent = &hsa_context.agents[n];
3414
3415 if (agent->initialized)
3416 return true;
3417
3418 agent->device_id = n;
3419
3420 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3421 {
3422 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3423 return false;
3424 }
3425 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3426 {
3427 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3428 return false;
3429 }
3430 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3431 {
3432 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3433 return false;
3434 }
3435 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3436 {
3437 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3438 return false;
3439 }
3440 agent->async_queues = NULL;
3441 agent->omp_async_queue = NULL;
3442 agent->team_arena_list = NULL;
3443
3444 uint32_t queue_size;
3445 hsa_status_t status;
3446 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3447 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3448 &queue_size);
3449 if (status != HSA_STATUS_SUCCESS)
3450 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3451 status);
3452
3453 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3454 &agent->name);
3455 if (status != HSA_STATUS_SUCCESS)
3456 return hsa_error ("Error querying the name of the agent", status);
3457
3458 agent->device_isa = isa_code (agent->name);
3459 if (agent->device_isa < 0)
3460 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3461
3462 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3463 &agent->vendor_name);
3464 if (status != HSA_STATUS_SUCCESS)
3465 return hsa_error ("Error querying the vendor name of the agent", status);
3466
3467 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3468 HSA_QUEUE_TYPE_MULTI,
3469 hsa_queue_callback, NULL, UINT32_MAX,
3470 UINT32_MAX, &agent->sync_queue);
3471 if (status != HSA_STATUS_SUCCESS)
3472 return hsa_error ("Error creating command queue", status);
3473
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}

/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  unsigned var_count = image_desc->global_variable_count;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    {
      pthread_rwlock_unlock (&agent->module_rwlock);
      return -1;
    }

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
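  /* Size the table assuming the image provides both an _init_array and a
     _fini_array kernel; those two are handled internally and never reported
     to libgomp, hence the "- 2" below.  */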
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
			     * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
				+ kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
	{
	  pthread_rwlock_unlock (&agent->module_rwlock);
	  return -1;
	}
      if (strcmp (d->name, "_init_array") == 0)
	module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
	module->fini_array_func = kernel;
      else
	{
	  pair->start = (uintptr_t) kernel;
	  pair->end = (uintptr_t) (kernel + 1);
	  pair++;
	}
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  for (unsigned i = 0; i < var_count; i++)
    {
      struct global_var_info *v = &image_desc->global_variables[i];
      GCN_DEBUG ("Looking for variable %s\n", v->name);

      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     v->name, agent->id,
						     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);

      uint64_t var_addr;
      uint32_t var_size;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol", status);

      pair->start = var_addr;
      pair->end = var_addr + var_size;
      GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
		 (void *)var_addr, var_size);
      pair++;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };
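  /* One work-group of 64 work-items (a single GCN wavefront) is enough to
     run the constructor kernel.  */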

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  return kernel_count + var_count;
}
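
/* Worked example (a sketch, not an API contract): for an image containing
   the kernels {_init_array, foo, bar, _fini_array} and one global variable
   baz, GOMP_OFFLOAD_load_image fills TARGET_TABLE with three entries -- foo
   and bar (each addressing its struct kernel_info) followed by baz's device
   address range -- and returns 4 + 1 - 2 = 3.  */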

/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
			 "loaded before");
      pthread_rwlock_unlock (&agent->module_rwlock);
      return false;
    }

  if (!destroy_module (agent->module, true))
    {
      pthread_rwlock_unlock (&agent->module_rwlock);
      return false;
    }
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    {
      pthread_rwlock_unlock (&agent->module_rwlock);
      return false;
    }
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}

/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  if (!agent->initialized)
    return true;

  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_team_arenas (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}

/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    goto failure;

  return true;

failure:
  if (suppress_host_fallback)
    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
  GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
  return false;
}

/* Allocate SIZE bytes of memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}

/* Free memory PTR previously allocated on DEVICE.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

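  /* Notify an attached OpenACC profiling tool, if any, that the memory has
     been freed.  */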
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return true;
}

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
	     src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
	     device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* }}} */
/* {{{ OpenMP Plugin API */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR, which must
   point to a kernel_info structure for a kernel previously loaded onto the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
			void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
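  /* Entries on the async queue are processed in order, so the completion
     callback pushed here will not fire until the kernel launched above has
     finished.  */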
  queue_push_callback (agent->omp_async_queue,
		       GOMP_PLUGIN_target_task_completion, async_data);
}

/* }}} */
/* {{{ OpenACC Plugin API */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs, unsigned *dims,
			   void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
	    NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
	    aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
						     ASYNC_QUEUE_SIZE,
						     HSA_QUEUE_TYPE_MULTI,
						     hsa_queue_callback, NULL,
						     UINT32_MAX, UINT32_MAX,
						     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
		       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
	     aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return non-zero if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream AQ2 waits for AQ1 to complete work that has been
     scheduled to run on it up to this point.  */
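  /* Implementation note: queue_push_placeholder installs a marker entry on
     AQ1, and queue_push_asyncwait makes AQ2's drain thread block until that
     marker has completed.  */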
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  /* The source data does not necessarily remain live until the deferred
     copy happens.  Taking a snapshot of the data here avoids reading
     uninitialised data later, but means that (a) the data is copied twice
     and (b) modifications to the source data between the "spawning" point
     of the asynchronous kernel and when it is executed will not be seen.
     That is probably the correct behaviour.  */
  void *src_copy = GOMP_PLUGIN_malloc (n);
  memcpy (src_copy, src, n);
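  /* The final argument appears to request that the queue free SRC_COPY once
     the copy has completed (an assumption about queue_push_copy's flag);
     the snapshot allocated above would otherwise leak.  */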
  queue_push_copy (aq, dst, src_copy, n, true);
  return true;
}

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n, false);
  return true;
}

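/* Return the value of property PROP of agent number DEVICE.  */
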
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
	size_t size;
	hsa_region_t region = agent->data_region;
	hsa_status_t status =
	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
	if (status == HSA_STATUS_SUCCESS)
	  propval.val = size;
	break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}

/* Set up plugin-specific thread-local-data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}} */