xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/config/gcn/gcn-run.c (revision dd3ee07da436799d8de85f3055253118b76bf345)
1 /* Run a stand-alone AMD GCN kernel.
2 
3    Copyright 2017 Mentor Graphics Corporation
4    Copyright 2018-2019 Free Software Foundation, Inc.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation, either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
18 
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
20 
21    The kernel entry point's signature must use a standard main signature:
22 
23      int main(int argc, char **argv)
24 */
25 
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <stdlib.h>
29 #include <malloc.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <dlfcn.h>
33 #include <unistd.h>
34 #include <elf.h>
35 #include <signal.h>
36 
37 /* These probably won't be in elf.h for a while.  */
38 #ifndef R_AMDGPU_NONE
39 #define R_AMDGPU_NONE		0
40 #define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
41 #define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
42 #define R_AMDGPU_ABS64		3	/* S + A  */
43 #define R_AMDGPU_REL32		4	/* S + A - P  */
44 #define R_AMDGPU_REL64		5	/* S + A - P  */
45 #define R_AMDGPU_ABS32		6	/* S + A  */
46 #define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
47 #define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
48 #define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
49 #define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
50 #define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
51 #define reserved		12
52 #define R_AMDGPU_RELATIVE64	13	/* B + A  */
53 #endif
54 
55 #include "hsa.h"
56 
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
59 #endif
60 
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
63 #endif
64 
65 bool debug = false;
66 
67 hsa_agent_t device = { 0 };
68 hsa_queue_t *queue = NULL;
69 uint64_t kernel = 0;
70 hsa_executable_t executable = { 0 };
71 
72 hsa_region_t kernargs_region = { 0 };
73 uint32_t kernarg_segment_size = 0;
74 uint32_t group_segment_size = 0;
75 uint32_t private_segment_size = 0;
76 
77 static void
78 usage (const char *progname)
79 {
80   printf ("Usage: %s [options] kernel [kernel-args]\n\n"
81 	  "Options:\n"
82 	  "  --help\n"
83 	  "  --version\n"
84 	  "  --debug\n", progname);
85 }
86 
87 static void
88 version (const char *progname)
89 {
90   printf ("%s " VERSION_STRING "\n", progname);
91 }
92 
93 /* As an HSA runtime is dlopened, following structure defines the necessary
94    function pointers.
95    Code adapted from libgomp.  */
96 
97 struct hsa_runtime_fn_info
98 {
99   /* HSA runtime.  */
100   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
101 					const char **status_string);
102   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
103 					 hsa_agent_info_t attribute,
104 					 void *value);
105   hsa_status_t (*hsa_init_fn) (void);
106   hsa_status_t (*hsa_iterate_agents_fn)
107     (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
108   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
109 					  hsa_region_info_t attribute,
110 					  void *value);
111   hsa_status_t (*hsa_queue_create_fn)
112     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
113      void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
114      void *data, uint32_t private_segment_size,
115      uint32_t group_segment_size, hsa_queue_t **queue);
116   hsa_status_t (*hsa_agent_iterate_regions_fn)
117     (hsa_agent_t agent,
118      hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
119   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
120   hsa_status_t (*hsa_executable_create_fn)
121     (hsa_profile_t profile, hsa_executable_state_t executable_state,
122      const char *options, hsa_executable_t *executable);
123   hsa_status_t (*hsa_executable_global_variable_define_fn)
124     (hsa_executable_t executable, const char *variable_name, void *address);
125   hsa_status_t (*hsa_executable_load_code_object_fn)
126     (hsa_executable_t executable, hsa_agent_t agent,
127      hsa_code_object_t code_object, const char *options);
128   hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
129 					    const char *options);
130   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
131 					uint32_t num_consumers,
132 					const hsa_agent_t *consumers,
133 					hsa_signal_t *signal);
134   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
135 					  void **ptr);
136   hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
137 				      size_t size);
138   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
139   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
140   hsa_status_t (*hsa_executable_get_symbol_fn)
141     (hsa_executable_t executable, const char *module_name,
142      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
143      hsa_executable_symbol_t *symbol);
144   hsa_status_t (*hsa_executable_symbol_get_info_fn)
145     (hsa_executable_symbol_t executable_symbol,
146      hsa_executable_symbol_info_t attribute, void *value);
147   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
148 				       hsa_signal_value_t value);
149   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
150     (hsa_signal_t signal, hsa_signal_condition_t condition,
151      hsa_signal_value_t compare_value, uint64_t timeout_hint,
152      hsa_wait_state_t wait_state_hint);
153   hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
154     (hsa_signal_t signal, hsa_signal_condition_t condition,
155      hsa_signal_value_t compare_value, uint64_t timeout_hint,
156      hsa_wait_state_t wait_state_hint);
157   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
158   hsa_status_t (*hsa_code_object_deserialize_fn)
159     (void *serialized_code_object, size_t serialized_code_object_size,
160      const char *options, hsa_code_object_t *code_object);
161   uint64_t (*hsa_queue_load_write_index_relaxed_fn)
162     (const hsa_queue_t *queue);
163   void (*hsa_queue_store_write_index_relaxed_fn)
164     (const hsa_queue_t *queue, uint64_t value);
165   hsa_status_t (*hsa_shut_down_fn) ();
166 };
167 
168 /* HSA runtime functions that are initialized in init_hsa_context.
169    Code adapted from libgomp.  */
170 
171 static struct hsa_runtime_fn_info hsa_fns;
172 
173 #define DLSYM_FN(function)					 \
174   *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
175   if (hsa_fns.function##_fn == NULL)				 \
176     goto fail;
177 
178 static void
179 init_hsa_runtime_functions (void)
180 {
181   void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
182   if (handle == NULL)
183     {
184       fprintf (stderr,
185 	       "The HSA runtime is required to run GCN kernels on hardware.\n"
186 	       "%s: File not found or could not be opened\n",
187 	       HSA_RUNTIME_LIB);
188       exit (1);
189     }
190 
191   DLSYM_FN (hsa_status_string)
192   DLSYM_FN (hsa_agent_get_info)
193   DLSYM_FN (hsa_init)
194   DLSYM_FN (hsa_iterate_agents)
195   DLSYM_FN (hsa_region_get_info)
196   DLSYM_FN (hsa_queue_create)
197   DLSYM_FN (hsa_agent_iterate_regions)
198   DLSYM_FN (hsa_executable_destroy)
199   DLSYM_FN (hsa_executable_create)
200   DLSYM_FN (hsa_executable_global_variable_define)
201   DLSYM_FN (hsa_executable_load_code_object)
202   DLSYM_FN (hsa_executable_freeze)
203   DLSYM_FN (hsa_signal_create)
204   DLSYM_FN (hsa_memory_allocate)
205   DLSYM_FN (hsa_memory_copy)
206   DLSYM_FN (hsa_memory_free)
207   DLSYM_FN (hsa_signal_destroy)
208   DLSYM_FN (hsa_executable_get_symbol)
209   DLSYM_FN (hsa_executable_symbol_get_info)
210   DLSYM_FN (hsa_signal_wait_acquire)
211   DLSYM_FN (hsa_signal_wait_relaxed)
212   DLSYM_FN (hsa_signal_store_relaxed)
213   DLSYM_FN (hsa_queue_destroy)
214   DLSYM_FN (hsa_code_object_deserialize)
215   DLSYM_FN (hsa_queue_load_write_index_relaxed)
216   DLSYM_FN (hsa_queue_store_write_index_relaxed)
217   DLSYM_FN (hsa_shut_down)
218 
219   return;
220 
221 fail:
222   fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
223   exit (1);
224 }
225 
226 #undef DLSYM_FN
227 
228 /* Report a fatal error STR together with the HSA error corresponding to
229    STATUS and terminate execution of the current process.  */
230 
231 static void
232 hsa_fatal (const char *str, hsa_status_t status)
233 {
234   const char *hsa_error_msg;
235   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
236   fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
237 	   hsa_error_msg);
238   exit (1);
239 }
240 
241 /* Helper macros to ensure we check the return values from the HSA Runtime.
242    These just keep the rest of the code a bit cleaner.  */
243 
244 #define XHSA_CMP(FN, CMP, MSG)		   \
245   do {					   \
246     hsa_status_t status = (FN);		   \
247     if (!(CMP))				   \
248       hsa_fatal ((MSG), status);	   \
249     else if (debug)			   \
250       fprintf (stderr, "%s: OK\n", (MSG)); \
251   } while (0)
252 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
253 
254 /* Callback of hsa_iterate_agents.
255    Called once for each available device, and returns "break" when a
256    suitable one has been found.  */
257 
258 static hsa_status_t
259 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
260 {
261   hsa_device_type_t device_type;
262   XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
263 				       &device_type),
264 	"Get agent type");
265 
266   /* Select only GPU devices.  */
267   /* TODO: support selecting from multiple GPUs.  */
268   if (HSA_DEVICE_TYPE_GPU == device_type)
269     {
270       device = agent;
271       return HSA_STATUS_INFO_BREAK;
272     }
273 
274   /* The device was not suitable.  */
275   return HSA_STATUS_SUCCESS;
276 }
277 
278 /* Callback of hsa_iterate_regions.
279    Called once for each available memory region, and returns "break" when a
280    suitable one has been found.  */
281 
282 static hsa_status_t
283 get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused)))
284 {
285   /* Reject non-global regions.  */
286   hsa_region_segment_t segment;
287   hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
288   if (HSA_REGION_SEGMENT_GLOBAL != segment)
289     return HSA_STATUS_SUCCESS;
290 
291   /* Find a region with the KERNARG flag set.  */
292   hsa_region_global_flag_t flags;
293   hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
294 				  &flags);
295   if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
296     {
297       kernargs_region = region;
298       return HSA_STATUS_INFO_BREAK;
299     }
300 
301   /* The region was not suitable.  */
302   return HSA_STATUS_SUCCESS;
303 }
304 
305 /* Initialize the HSA Runtime library and GPU device.  */
306 
307 static void
308 init_device ()
309 {
310   /* Load the shared library and find the API functions.  */
311   init_hsa_runtime_functions ();
312 
313   /* Initialize the HSA Runtime.  */
314   XHSA (hsa_fns.hsa_init_fn (),
315 	"Initialize run-time");
316 
317   /* Select a suitable device.
318      The call-back function, get_gpu_agent, does the selection.  */
319   XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
320 	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
321 	    "Find a device");
322 
323   /* Initialize the queue used for launching kernels.  */
324   uint32_t queue_size = 0;
325   XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
326 				       &queue_size),
327 	"Find max queue size");
328   XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
329 				     HSA_QUEUE_TYPE_SINGLE, NULL,
330 				     NULL, UINT32_MAX, UINT32_MAX, &queue),
331 	"Set up a device queue");
332 
333   /* Select a memory region for the kernel arguments.
334      The call-back function, get_kernarg_region, does the selection.  */
335   XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
336 						  NULL),
337 	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
338 	    "Locate kernargs memory");
339 }
340 
341 
342 /* Read a whole input file.
343    Code copied from mkoffload. */
344 
345 static char *
346 read_file (const char *filename, size_t *plen)
347 {
348   size_t alloc = 16384;
349   size_t base = 0;
350   char *buffer;
351 
352   FILE *stream = fopen (filename, "rb");
353   if (!stream)
354     {
355       perror (filename);
356       exit (1);
357     }
358 
359   if (!fseek (stream, 0, SEEK_END))
360     {
361       /* Get the file size.  */
362       long s = ftell (stream);
363       if (s >= 0)
364 	alloc = s + 100;
365       fseek (stream, 0, SEEK_SET);
366     }
367   buffer = malloc (alloc);
368 
369   for (;;)
370     {
371       size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
372 
373       if (!n)
374 	break;
375       base += n;
376       if (base + 1 == alloc)
377 	{
378 	  alloc *= 2;
379 	  buffer = realloc (buffer, alloc);
380 	}
381     }
382   buffer[base] = 0;
383   *plen = base;
384 
385   fclose (stream);
386 
387   return buffer;
388 }
389 
390 /* Read a HSA Code Object (HSACO) from file, and load it into the device.  */
391 
392 static void
393 load_image (const char *filename)
394 {
395   size_t image_size;
396   Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
397 
398   /* An "executable" consists of one or more code objects.  */
399   XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
400 					  HSA_EXECUTABLE_STATE_UNFROZEN, "",
401 					  &executable),
402 	"Initialize GCN executable");
403 
404   /* Hide relocations from the HSA runtime loader.
405      Keep a copy of the unmodified section headers to use later.  */
406   Elf64_Shdr *image_sections =
407     (Elf64_Shdr *) ((char *) image + image->e_shoff);
408   Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
409   memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
410   for (int i = image->e_shnum - 1; i >= 0; i--)
411     {
412       if (image_sections[i].sh_type == SHT_RELA
413 	  || image_sections[i].sh_type == SHT_REL)
414 	/* Change section type to something harmless.  */
415 	image_sections[i].sh_type = SHT_NOTE;
416     }
417 
418   /* Add the HSACO to the executable.  */
419   hsa_code_object_t co = { 0 };
420   XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
421 	"Deserialize GCN code object");
422   XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
423 						    ""),
424 	"Load GCN code object");
425 
426   /* We're done modifying he executable.  */
427   XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
428 	"Freeze GCN executable");
429 
430   /* Locate the "main" function, and read the kernel's properties.  */
431   hsa_executable_symbol_t symbol;
432   XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
433 					      device, 0, &symbol),
434 	"Find 'main' function");
435   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
436 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel),
437 	"Extract kernel object");
438   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
439 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
440 	     &kernarg_segment_size),
441 	"Extract kernarg segment size");
442   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
443 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
444 	     &group_segment_size),
445 	"Extract group segment size");
446   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
447 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
448 	     &private_segment_size),
449 	"Extract private segment size");
450 
451   /* Find main function in ELF, and calculate actual load offset.  */
452   Elf64_Addr load_offset;
453   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
454 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
455 	     &load_offset),
456 	"Extract 'main' symbol address");
457   for (int i = 0; i < image->e_shnum; i++)
458     if (sections[i].sh_type == SHT_SYMTAB)
459       {
460 	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
461 	char *strings = (char *) image + strtab->sh_offset;
462 
463 	for (size_t offset = 0;
464 	     offset < sections[i].sh_size;
465 	     offset += sections[i].sh_entsize)
466 	  {
467 	    Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
468 					    + sections[i].sh_offset + offset);
469 	    if (strcmp ("main", strings + sym->st_name) == 0)
470 	      {
471 		load_offset -= sym->st_value;
472 		goto found_main;
473 	      }
474 	  }
475       }
476   /* We only get here when main was not found.
477      This should never happen.  */
478   fprintf (stderr, "Error: main function not found.\n");
479   abort ();
480 found_main:;
481 
482   /* Find dynamic symbol table.  */
483   Elf64_Shdr *dynsym = NULL;
484   for (int i = 0; i < image->e_shnum; i++)
485     if (sections[i].sh_type == SHT_DYNSYM)
486       {
487 	dynsym = &sections[i];
488 	break;
489       }
490 
491   /* Fix up relocations.  */
492   for (int i = 0; i < image->e_shnum; i++)
493     {
494       if (sections[i].sh_type == SHT_RELA)
495 	for (size_t offset = 0;
496 	     offset < sections[i].sh_size;
497 	     offset += sections[i].sh_entsize)
498 	  {
499 	    Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
500 						+ sections[i].sh_offset
501 						+ offset);
502 	    Elf64_Sym *sym =
503 	      (dynsym
504 	       ? (Elf64_Sym *) ((char *) image
505 				+ dynsym->sh_offset
506 				+ (dynsym->sh_entsize
507 				   * ELF64_R_SYM (reloc->r_info))) : NULL);
508 
509 	    int64_t S = (sym ? sym->st_value : 0);
510 	    int64_t P = reloc->r_offset + load_offset;
511 	    int64_t A = reloc->r_addend;
512 	    int64_t B = load_offset;
513 	    int64_t V, size;
514 	    switch (ELF64_R_TYPE (reloc->r_info))
515 	      {
516 	      case R_AMDGPU_ABS32_LO:
517 		V = (S + A) & 0xFFFFFFFF;
518 		size = 4;
519 		break;
520 	      case R_AMDGPU_ABS32_HI:
521 		V = (S + A) >> 32;
522 		size = 4;
523 		break;
524 	      case R_AMDGPU_ABS64:
525 		V = S + A;
526 		size = 8;
527 		break;
528 	      case R_AMDGPU_REL32:
529 		V = S + A - P;
530 		size = 4;
531 		break;
532 	      case R_AMDGPU_REL64:
533 		/* FIXME
534 		   LLD seems to emit REL64 where the the assembler has ABS64.
535 		   This is clearly wrong because it's not what the compiler
536 		   is expecting.  Let's assume, for now, that it's a bug.
537 		   In any case, GCN kernels are always self contained and
538 		   therefore relative relocations will have been resolved
539 		   already, so this should be a safe workaround.  */
540 		V = S + A /* - P */ ;
541 		size = 8;
542 		break;
543 	      case R_AMDGPU_ABS32:
544 		V = S + A;
545 		size = 4;
546 		break;
547 	      /* TODO R_AMDGPU_GOTPCREL */
548 	      /* TODO R_AMDGPU_GOTPCREL32_LO */
549 	      /* TODO R_AMDGPU_GOTPCREL32_HI */
550 	      case R_AMDGPU_REL32_LO:
551 		V = (S + A - P) & 0xFFFFFFFF;
552 		size = 4;
553 		break;
554 	      case R_AMDGPU_REL32_HI:
555 		V = (S + A - P) >> 32;
556 		size = 4;
557 		break;
558 	      case R_AMDGPU_RELATIVE64:
559 		V = B + A;
560 		size = 8;
561 		break;
562 	      default:
563 		fprintf (stderr, "Error: unsupported relocation type.\n");
564 		exit (1);
565 	      }
566 	    XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
567 		  "Fix up relocation");
568 	  }
569     }
570 }
571 
572 /* Allocate some device memory from the kernargs region.
573    The returned address will be 32-bit (with excess zeroed on 64-bit host),
574    and accessible via the same address on both host and target (via
575    __flat_scalar GCN address space).  */
576 
577 static void *
578 device_malloc (size_t size)
579 {
580   void *result;
581   XHSA (hsa_fns.hsa_memory_allocate_fn (kernargs_region, size, &result),
582 	"Allocate device memory");
583   return result;
584 }
585 
586 /* These are the device pointers that will be transferred to the target.
587    The HSA Runtime points the kernargs register here.
588    They correspond to function signature:
589        int main (int argc, char *argv[], int *return_value)
590    The compiler expects this, for kernel functions, and will
591    automatically assign the exit value to *return_value.  */
592 struct kernargs
593 {
594   /* Kernargs.  */
595   int32_t argc;
596   int64_t argv;
597   int64_t out_ptr;
598   int64_t heap_ptr;
599 
600   /* Output data.  */
601   struct output
602   {
603     int return_value;
604     unsigned int next_output;
605     struct printf_data
606     {
607       int written;
608       char msg[128];
609       int type;
610       union
611       {
612 	int64_t ivalue;
613 	double dvalue;
614 	char text[128];
615       };
616     } queue[1024];
617     unsigned int consumed;
618   } output_data;
619 
620   struct heap
621   {
622     int64_t size;
623     char data[0];
624   } heap;
625 };
626 
627 /* Print any console output from the kernel.
628    We print all entries from "consumed" to the next entry without a "written"
629    flag, or "next_output" is reached.  The buffer is circular, but the
630    indices are absolute.  It is assumed the kernel will stop writing data
631    if "next_output" wraps (becomes smaller than "consumed").  */
632 void
633 gomp_print_output (struct kernargs *kernargs, bool final)
634 {
635   unsigned int limit = (sizeof (kernargs->output_data.queue)
636 			/ sizeof (kernargs->output_data.queue[0]));
637 
638   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
639 				       __ATOMIC_ACQUIRE);
640   unsigned int to = kernargs->output_data.next_output;
641 
642   if (from > to)
643     {
644       /* Overflow.  */
645       if (final)
646 	printf ("GCN print buffer overflowed.\n");
647       return;
648     }
649 
650   unsigned int i;
651   for (i = from; i < to; i++)
652     {
653       struct printf_data *data = &kernargs->output_data.queue[i%limit];
654 
655       if (!data->written && !final)
656 	break;
657 
658       switch (data->type)
659 	{
660 	case 0:
661 	  printf ("%.128s%ld\n", data->msg, data->ivalue);
662 	  break;
663 	case 1:
664 	  printf ("%.128s%f\n", data->msg, data->dvalue);
665 	  break;
666 	case 2:
667 	  printf ("%.128s%.128s\n", data->msg, data->text);
668 	  break;
669 	case 3:
670 	  printf ("%.128s%.128s", data->msg, data->text);
671 	  break;
672 	default:
673 	  printf ("GCN print buffer error!\n");
674 	  break;
675 	}
676 
677       data->written = 0;
678       __atomic_store_n (&kernargs->output_data.consumed, i+1,
679 			__ATOMIC_RELEASE);
680     }
681   fflush (stdout);
682 }
683 
684 /* Execute an already-loaded kernel on the device.  */
685 
686 static void
687 run (void *kernargs)
688 {
689   /* A "signal" is used to launch and monitor the kernel.  */
690   hsa_signal_t signal;
691   XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
692 	"Create signal");
693 
694   /* Configure for a single-worker kernel.  */
695   uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
696   const uint32_t queueMask = queue->size - 1;
697   hsa_kernel_dispatch_packet_t *dispatch_packet =
698     &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
699 							       queueMask]);
700   dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
701   dispatch_packet->workgroup_size_x = (uint16_t) 1;
702   dispatch_packet->workgroup_size_y = (uint16_t) 64;
703   dispatch_packet->workgroup_size_z = (uint16_t) 1;
704   dispatch_packet->grid_size_x = 1;
705   dispatch_packet->grid_size_y = 64;
706   dispatch_packet->grid_size_z = 1;
707   dispatch_packet->completion_signal = signal;
708   dispatch_packet->kernel_object = kernel;
709   dispatch_packet->kernarg_address = (void *) kernargs;
710   dispatch_packet->private_segment_size = private_segment_size;
711   dispatch_packet->group_segment_size = group_segment_size;
712 
713   uint16_t header = 0;
714   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
715   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
716   header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
717 
718   __atomic_store_n ((uint32_t *) dispatch_packet,
719 		    header | (dispatch_packet->setup << 16),
720 		    __ATOMIC_RELEASE);
721 
722   if (debug)
723     fprintf (stderr, "Launch kernel\n");
724 
725   hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
726   hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
727   /* Kernel running ......  */
728   while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
729 					     1, 1000000,
730 					     HSA_WAIT_STATE_ACTIVE) != 0)
731     {
732       usleep (10000);
733       gomp_print_output (kernargs, false);
734     }
735 
736   gomp_print_output (kernargs, true);
737 
738   if (debug)
739     fprintf (stderr, "Kernel exited\n");
740 
741   XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
742 	"Clean up signal");
743 }
744 
745 int
746 main (int argc, char *argv[])
747 {
748   int kernel_arg = 0;
749   for (int i = 1; i < argc; i++)
750     {
751       if (!strcmp (argv[i], "--help"))
752 	{
753 	  usage (argv[0]);
754 	  return 0;
755 	}
756       else if (!strcmp (argv[i], "--version"))
757 	{
758 	  version (argv[0]);
759 	  return 0;
760 	}
761       else if (!strcmp (argv[i], "--debug"))
762 	debug = true;
763       else if (argv[i][0] == '-')
764 	{
765 	  usage (argv[0]);
766 	  return 1;
767 	}
768       else
769 	{
770 	  kernel_arg = i;
771 	  break;
772 	}
773     }
774 
775   if (!kernel_arg)
776     {
777       /* No kernel arguments were found.  */
778       usage (argv[0]);
779       return 1;
780     }
781 
782   /* The remaining arguments are for the GCN kernel.  */
783   int kernel_argc = argc - kernel_arg;
784   char **kernel_argv = &argv[kernel_arg];
785 
786   init_device ();
787   load_image (kernel_argv[0]);
788 
789   /* Calculate size of function parameters + argv data.  */
790   size_t args_size = 0;
791   for (int i = 0; i < kernel_argc; i++)
792     args_size += strlen (kernel_argv[i]) + 1;
793 
794   /* Allocate device memory for both function parameters and the argv
795      data.  */
796   size_t heap_size = 10 * 1024 * 1024;	/* 10MB.  */
797   struct kernargs *kernargs = device_malloc (sizeof (*kernargs) + heap_size);
798   struct argdata
799   {
800     int64_t argv_data[kernel_argc];
801     char strings[args_size];
802   } *args = device_malloc (sizeof (struct argdata));
803 
804   /* Write the data to the target.  */
805   kernargs->argc = kernel_argc;
806   kernargs->argv = (int64_t) args->argv_data;
807   kernargs->out_ptr = (int64_t) &kernargs->output_data;
808   kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
809   kernargs->output_data.next_output = 0;
810   for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
811 			    / sizeof (kernargs->output_data.queue[0])); i++)
812     kernargs->output_data.queue[i].written = 0;
813   kernargs->output_data.consumed = 0;
814   int offset = 0;
815   for (int i = 0; i < kernel_argc; i++)
816     {
817       size_t arg_len = strlen (kernel_argv[i]) + 1;
818       args->argv_data[i] = (int64_t) &args->strings[offset];
819       memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
820       offset += arg_len;
821     }
822   kernargs->heap_ptr = (int64_t) &kernargs->heap;
823   kernargs->heap.size = heap_size;
824 
825   /* Run the kernel on the GPU.  */
826   run (kernargs);
827   unsigned int return_value =
828     (unsigned int) kernargs->output_data.return_value;
829 
830   unsigned int upper = (return_value & ~0xffff) >> 16;
831   if (upper == 0xcafe)
832     {
833       printf ("Kernel exit value was never set\n");
834       return_value = 0xff;
835     }
836   else if (upper == 0xffff)
837     ; /* Set by exit.  */
838   else if (upper == 0)
839     ; /* Set by return from main.  */
840   else
841     printf ("Possible kernel exit value corruption, 2 most significant bytes "
842 	    "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
843 
844   if (upper == 0xffff)
845     {
846       unsigned int signal = (return_value >> 8) & 0xff;
847       if (signal == SIGABRT)
848 	printf ("Kernel aborted\n");
849       else if (signal != 0)
850 	printf ("Kernel received unkown signal\n");
851     }
852 
853   if (debug)
854     printf ("Kernel exit value: %d\n", return_value & 0xff);
855 
856   /* Clean shut down.  */
857   XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
858 	"Clean up device memory");
859   XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
860 	"Clean up GCN executable");
861   XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
862 	"Clean up device queue");
863   XHSA (hsa_fns.hsa_shut_down_fn (),
864 	"Shut down run-time");
865 
866   return return_value & 0xff;
867 }
868