1 /* Run a stand-alone AMD GCN kernel. 2 3 Copyright 2017 Mentor Graphics Corporation 4 Copyright 2018-2019 Free Software Foundation, Inc. 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 18 19 /* This program will run a compiled stand-alone GCN kernel on a GPU. 20 21 The kernel entry point's signature must use a standard main signature: 22 23 int main(int argc, char **argv) 24 */ 25 26 #include <stdint.h> 27 #include <stdbool.h> 28 #include <stdlib.h> 29 #include <malloc.h> 30 #include <stdio.h> 31 #include <string.h> 32 #include <dlfcn.h> 33 #include <unistd.h> 34 #include <elf.h> 35 #include <signal.h> 36 37 /* These probably won't be in elf.h for a while. */ 38 #ifndef R_AMDGPU_NONE 39 #define R_AMDGPU_NONE 0 40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */ 41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */ 42 #define R_AMDGPU_ABS64 3 /* S + A */ 43 #define R_AMDGPU_REL32 4 /* S + A - P */ 44 #define R_AMDGPU_REL64 5 /* S + A - P */ 45 #define R_AMDGPU_ABS32 6 /* S + A */ 46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */ 47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */ 48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */ 49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */ 50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */ 51 #define reserved 12 52 #define R_AMDGPU_RELATIVE64 13 /* B + A */ 53 #endif 54 55 #include "hsa.h" 56 57 #ifndef HSA_RUNTIME_LIB 58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so" 59 #endif 60 61 #ifndef VERSION_STRING 62 #define VERSION_STRING "(version unknown)" 63 #endif 64 65 bool debug = false; 66 67 hsa_agent_t device = { 0 }; 68 hsa_queue_t *queue = NULL; 69 uint64_t kernel = 0; 70 hsa_executable_t executable = { 0 }; 71 72 hsa_region_t kernargs_region = { 0 }; 73 uint32_t kernarg_segment_size = 0; 74 uint32_t group_segment_size = 0; 75 uint32_t private_segment_size = 0; 76 77 static void 78 usage (const char *progname) 79 { 80 printf ("Usage: %s [options] kernel [kernel-args]\n\n" 81 "Options:\n" 82 " --help\n" 83 " --version\n" 84 " --debug\n", progname); 85 } 86 87 static void 88 version (const char *progname) 89 { 90 printf ("%s " VERSION_STRING "\n", progname); 91 } 92 93 /* As an HSA runtime is dlopened, following structure defines the necessary 94 function pointers. 95 Code adapted from libgomp. */ 96 97 struct hsa_runtime_fn_info 98 { 99 /* HSA runtime. */ 100 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status, 101 const char **status_string); 102 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent, 103 hsa_agent_info_t attribute, 104 void *value); 105 hsa_status_t (*hsa_init_fn) (void); 106 hsa_status_t (*hsa_iterate_agents_fn) 107 (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data); 108 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region, 109 hsa_region_info_t attribute, 110 void *value); 111 hsa_status_t (*hsa_queue_create_fn) 112 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type, 113 void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data), 114 void *data, uint32_t private_segment_size, 115 uint32_t group_segment_size, hsa_queue_t **queue); 116 hsa_status_t (*hsa_agent_iterate_regions_fn) 117 (hsa_agent_t agent, 118 hsa_status_t (*callback) (hsa_region_t region, void *data), void *data); 119 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable); 120 hsa_status_t (*hsa_executable_create_fn) 121 (hsa_profile_t profile, hsa_executable_state_t executable_state, 122 const char *options, hsa_executable_t *executable); 123 hsa_status_t (*hsa_executable_global_variable_define_fn) 124 (hsa_executable_t executable, const char *variable_name, void *address); 125 hsa_status_t (*hsa_executable_load_code_object_fn) 126 (hsa_executable_t executable, hsa_agent_t agent, 127 hsa_code_object_t code_object, const char *options); 128 hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable, 129 const char *options); 130 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value, 131 uint32_t num_consumers, 132 const hsa_agent_t *consumers, 133 hsa_signal_t *signal); 134 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size, 135 void **ptr); 136 hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, 137 size_t size); 138 hsa_status_t (*hsa_memory_free_fn) (void *ptr); 139 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal); 140 hsa_status_t (*hsa_executable_get_symbol_fn) 141 (hsa_executable_t executable, const char *module_name, 142 const char *symbol_name, hsa_agent_t agent, int32_t call_convention, 143 hsa_executable_symbol_t *symbol); 144 hsa_status_t (*hsa_executable_symbol_get_info_fn) 145 (hsa_executable_symbol_t executable_symbol, 146 hsa_executable_symbol_info_t attribute, void *value); 147 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal, 148 hsa_signal_value_t value); 149 hsa_signal_value_t (*hsa_signal_wait_acquire_fn) 150 (hsa_signal_t signal, hsa_signal_condition_t condition, 151 hsa_signal_value_t compare_value, uint64_t timeout_hint, 152 hsa_wait_state_t wait_state_hint); 153 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn) 154 (hsa_signal_t signal, hsa_signal_condition_t condition, 155 hsa_signal_value_t compare_value, uint64_t timeout_hint, 156 hsa_wait_state_t wait_state_hint); 157 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue); 158 hsa_status_t (*hsa_code_object_deserialize_fn) 159 (void *serialized_code_object, size_t serialized_code_object_size, 160 const char *options, hsa_code_object_t *code_object); 161 uint64_t (*hsa_queue_load_write_index_relaxed_fn) 162 (const hsa_queue_t *queue); 163 void (*hsa_queue_store_write_index_relaxed_fn) 164 (const hsa_queue_t *queue, uint64_t value); 165 hsa_status_t (*hsa_shut_down_fn) (); 166 }; 167 168 /* HSA runtime functions that are initialized in init_hsa_context. 169 Code adapted from libgomp. */ 170 171 static struct hsa_runtime_fn_info hsa_fns; 172 173 #define DLSYM_FN(function) \ 174 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \ 175 if (hsa_fns.function##_fn == NULL) \ 176 goto fail; 177 178 static void 179 init_hsa_runtime_functions (void) 180 { 181 void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY); 182 if (handle == NULL) 183 { 184 fprintf (stderr, 185 "The HSA runtime is required to run GCN kernels on hardware.\n" 186 "%s: File not found or could not be opened\n", 187 HSA_RUNTIME_LIB); 188 exit (1); 189 } 190 191 DLSYM_FN (hsa_status_string) 192 DLSYM_FN (hsa_agent_get_info) 193 DLSYM_FN (hsa_init) 194 DLSYM_FN (hsa_iterate_agents) 195 DLSYM_FN (hsa_region_get_info) 196 DLSYM_FN (hsa_queue_create) 197 DLSYM_FN (hsa_agent_iterate_regions) 198 DLSYM_FN (hsa_executable_destroy) 199 DLSYM_FN (hsa_executable_create) 200 DLSYM_FN (hsa_executable_global_variable_define) 201 DLSYM_FN (hsa_executable_load_code_object) 202 DLSYM_FN (hsa_executable_freeze) 203 DLSYM_FN (hsa_signal_create) 204 DLSYM_FN (hsa_memory_allocate) 205 DLSYM_FN (hsa_memory_copy) 206 DLSYM_FN (hsa_memory_free) 207 DLSYM_FN (hsa_signal_destroy) 208 DLSYM_FN (hsa_executable_get_symbol) 209 DLSYM_FN (hsa_executable_symbol_get_info) 210 DLSYM_FN (hsa_signal_wait_acquire) 211 DLSYM_FN (hsa_signal_wait_relaxed) 212 DLSYM_FN (hsa_signal_store_relaxed) 213 DLSYM_FN (hsa_queue_destroy) 214 DLSYM_FN (hsa_code_object_deserialize) 215 DLSYM_FN (hsa_queue_load_write_index_relaxed) 216 DLSYM_FN (hsa_queue_store_write_index_relaxed) 217 DLSYM_FN (hsa_shut_down) 218 219 return; 220 221 fail: 222 fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n"); 223 exit (1); 224 } 225 226 #undef DLSYM_FN 227 228 /* Report a fatal error STR together with the HSA error corresponding to 229 STATUS and terminate execution of the current process. */ 230 231 static void 232 hsa_fatal (const char *str, hsa_status_t status) 233 { 234 const char *hsa_error_msg; 235 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); 236 fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str, 237 hsa_error_msg); 238 exit (1); 239 } 240 241 /* Helper macros to ensure we check the return values from the HSA Runtime. 242 These just keep the rest of the code a bit cleaner. */ 243 244 #define XHSA_CMP(FN, CMP, MSG) \ 245 do { \ 246 hsa_status_t status = (FN); \ 247 if (!(CMP)) \ 248 hsa_fatal ((MSG), status); \ 249 else if (debug) \ 250 fprintf (stderr, "%s: OK\n", (MSG)); \ 251 } while (0) 252 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG) 253 254 /* Callback of hsa_iterate_agents. 255 Called once for each available device, and returns "break" when a 256 suitable one has been found. */ 257 258 static hsa_status_t 259 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused))) 260 { 261 hsa_device_type_t device_type; 262 XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE, 263 &device_type), 264 "Get agent type"); 265 266 /* Select only GPU devices. */ 267 /* TODO: support selecting from multiple GPUs. */ 268 if (HSA_DEVICE_TYPE_GPU == device_type) 269 { 270 device = agent; 271 return HSA_STATUS_INFO_BREAK; 272 } 273 274 /* The device was not suitable. */ 275 return HSA_STATUS_SUCCESS; 276 } 277 278 /* Callback of hsa_iterate_regions. 279 Called once for each available memory region, and returns "break" when a 280 suitable one has been found. */ 281 282 static hsa_status_t 283 get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused))) 284 { 285 /* Reject non-global regions. */ 286 hsa_region_segment_t segment; 287 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment); 288 if (HSA_REGION_SEGMENT_GLOBAL != segment) 289 return HSA_STATUS_SUCCESS; 290 291 /* Find a region with the KERNARG flag set. */ 292 hsa_region_global_flag_t flags; 293 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS, 294 &flags); 295 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) 296 { 297 kernargs_region = region; 298 return HSA_STATUS_INFO_BREAK; 299 } 300 301 /* The region was not suitable. */ 302 return HSA_STATUS_SUCCESS; 303 } 304 305 /* Initialize the HSA Runtime library and GPU device. */ 306 307 static void 308 init_device () 309 { 310 /* Load the shared library and find the API functions. */ 311 init_hsa_runtime_functions (); 312 313 /* Initialize the HSA Runtime. */ 314 XHSA (hsa_fns.hsa_init_fn (), 315 "Initialize run-time"); 316 317 /* Select a suitable device. 318 The call-back function, get_gpu_agent, does the selection. */ 319 XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL), 320 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, 321 "Find a device"); 322 323 /* Initialize the queue used for launching kernels. */ 324 uint32_t queue_size = 0; 325 XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, 326 &queue_size), 327 "Find max queue size"); 328 XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size, 329 HSA_QUEUE_TYPE_SINGLE, NULL, 330 NULL, UINT32_MAX, UINT32_MAX, &queue), 331 "Set up a device queue"); 332 333 /* Select a memory region for the kernel arguments. 334 The call-back function, get_kernarg_region, does the selection. */ 335 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region, 336 NULL), 337 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, 338 "Locate kernargs memory"); 339 } 340 341 342 /* Read a whole input file. 343 Code copied from mkoffload. */ 344 345 static char * 346 read_file (const char *filename, size_t *plen) 347 { 348 size_t alloc = 16384; 349 size_t base = 0; 350 char *buffer; 351 352 FILE *stream = fopen (filename, "rb"); 353 if (!stream) 354 { 355 perror (filename); 356 exit (1); 357 } 358 359 if (!fseek (stream, 0, SEEK_END)) 360 { 361 /* Get the file size. */ 362 long s = ftell (stream); 363 if (s >= 0) 364 alloc = s + 100; 365 fseek (stream, 0, SEEK_SET); 366 } 367 buffer = malloc (alloc); 368 369 for (;;) 370 { 371 size_t n = fread (buffer + base, 1, alloc - base - 1, stream); 372 373 if (!n) 374 break; 375 base += n; 376 if (base + 1 == alloc) 377 { 378 alloc *= 2; 379 buffer = realloc (buffer, alloc); 380 } 381 } 382 buffer[base] = 0; 383 *plen = base; 384 385 fclose (stream); 386 387 return buffer; 388 } 389 390 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */ 391 392 static void 393 load_image (const char *filename) 394 { 395 size_t image_size; 396 Elf64_Ehdr *image = (void *) read_file (filename, &image_size); 397 398 /* An "executable" consists of one or more code objects. */ 399 XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL, 400 HSA_EXECUTABLE_STATE_UNFROZEN, "", 401 &executable), 402 "Initialize GCN executable"); 403 404 /* Hide relocations from the HSA runtime loader. 405 Keep a copy of the unmodified section headers to use later. */ 406 Elf64_Shdr *image_sections = 407 (Elf64_Shdr *) ((char *) image + image->e_shoff); 408 Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum); 409 memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum); 410 for (int i = image->e_shnum - 1; i >= 0; i--) 411 { 412 if (image_sections[i].sh_type == SHT_RELA 413 || image_sections[i].sh_type == SHT_REL) 414 /* Change section type to something harmless. */ 415 image_sections[i].sh_type = SHT_NOTE; 416 } 417 418 /* Add the HSACO to the executable. */ 419 hsa_code_object_t co = { 0 }; 420 XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co), 421 "Deserialize GCN code object"); 422 XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co, 423 ""), 424 "Load GCN code object"); 425 426 /* We're done modifying he executable. */ 427 XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""), 428 "Freeze GCN executable"); 429 430 /* Locate the "main" function, and read the kernel's properties. */ 431 hsa_executable_symbol_t symbol; 432 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main", 433 device, 0, &symbol), 434 "Find 'main' function"); 435 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn 436 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel), 437 "Extract kernel object"); 438 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn 439 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, 440 &kernarg_segment_size), 441 "Extract kernarg segment size"); 442 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn 443 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, 444 &group_segment_size), 445 "Extract group segment size"); 446 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn 447 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, 448 &private_segment_size), 449 "Extract private segment size"); 450 451 /* Find main function in ELF, and calculate actual load offset. */ 452 Elf64_Addr load_offset; 453 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn 454 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, 455 &load_offset), 456 "Extract 'main' symbol address"); 457 for (int i = 0; i < image->e_shnum; i++) 458 if (sections[i].sh_type == SHT_SYMTAB) 459 { 460 Elf64_Shdr *strtab = §ions[sections[i].sh_link]; 461 char *strings = (char *) image + strtab->sh_offset; 462 463 for (size_t offset = 0; 464 offset < sections[i].sh_size; 465 offset += sections[i].sh_entsize) 466 { 467 Elf64_Sym *sym = (Elf64_Sym *) ((char *) image 468 + sections[i].sh_offset + offset); 469 if (strcmp ("main", strings + sym->st_name) == 0) 470 { 471 load_offset -= sym->st_value; 472 goto found_main; 473 } 474 } 475 } 476 /* We only get here when main was not found. 477 This should never happen. */ 478 fprintf (stderr, "Error: main function not found.\n"); 479 abort (); 480 found_main:; 481 482 /* Find dynamic symbol table. */ 483 Elf64_Shdr *dynsym = NULL; 484 for (int i = 0; i < image->e_shnum; i++) 485 if (sections[i].sh_type == SHT_DYNSYM) 486 { 487 dynsym = §ions[i]; 488 break; 489 } 490 491 /* Fix up relocations. */ 492 for (int i = 0; i < image->e_shnum; i++) 493 { 494 if (sections[i].sh_type == SHT_RELA) 495 for (size_t offset = 0; 496 offset < sections[i].sh_size; 497 offset += sections[i].sh_entsize) 498 { 499 Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image 500 + sections[i].sh_offset 501 + offset); 502 Elf64_Sym *sym = 503 (dynsym 504 ? (Elf64_Sym *) ((char *) image 505 + dynsym->sh_offset 506 + (dynsym->sh_entsize 507 * ELF64_R_SYM (reloc->r_info))) : NULL); 508 509 int64_t S = (sym ? sym->st_value : 0); 510 int64_t P = reloc->r_offset + load_offset; 511 int64_t A = reloc->r_addend; 512 int64_t B = load_offset; 513 int64_t V, size; 514 switch (ELF64_R_TYPE (reloc->r_info)) 515 { 516 case R_AMDGPU_ABS32_LO: 517 V = (S + A) & 0xFFFFFFFF; 518 size = 4; 519 break; 520 case R_AMDGPU_ABS32_HI: 521 V = (S + A) >> 32; 522 size = 4; 523 break; 524 case R_AMDGPU_ABS64: 525 V = S + A; 526 size = 8; 527 break; 528 case R_AMDGPU_REL32: 529 V = S + A - P; 530 size = 4; 531 break; 532 case R_AMDGPU_REL64: 533 /* FIXME 534 LLD seems to emit REL64 where the the assembler has ABS64. 535 This is clearly wrong because it's not what the compiler 536 is expecting. Let's assume, for now, that it's a bug. 537 In any case, GCN kernels are always self contained and 538 therefore relative relocations will have been resolved 539 already, so this should be a safe workaround. */ 540 V = S + A /* - P */ ; 541 size = 8; 542 break; 543 case R_AMDGPU_ABS32: 544 V = S + A; 545 size = 4; 546 break; 547 /* TODO R_AMDGPU_GOTPCREL */ 548 /* TODO R_AMDGPU_GOTPCREL32_LO */ 549 /* TODO R_AMDGPU_GOTPCREL32_HI */ 550 case R_AMDGPU_REL32_LO: 551 V = (S + A - P) & 0xFFFFFFFF; 552 size = 4; 553 break; 554 case R_AMDGPU_REL32_HI: 555 V = (S + A - P) >> 32; 556 size = 4; 557 break; 558 case R_AMDGPU_RELATIVE64: 559 V = B + A; 560 size = 8; 561 break; 562 default: 563 fprintf (stderr, "Error: unsupported relocation type.\n"); 564 exit (1); 565 } 566 XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size), 567 "Fix up relocation"); 568 } 569 } 570 } 571 572 /* Allocate some device memory from the kernargs region. 573 The returned address will be 32-bit (with excess zeroed on 64-bit host), 574 and accessible via the same address on both host and target (via 575 __flat_scalar GCN address space). */ 576 577 static void * 578 device_malloc (size_t size) 579 { 580 void *result; 581 XHSA (hsa_fns.hsa_memory_allocate_fn (kernargs_region, size, &result), 582 "Allocate device memory"); 583 return result; 584 } 585 586 /* These are the device pointers that will be transferred to the target. 587 The HSA Runtime points the kernargs register here. 588 They correspond to function signature: 589 int main (int argc, char *argv[], int *return_value) 590 The compiler expects this, for kernel functions, and will 591 automatically assign the exit value to *return_value. */ 592 struct kernargs 593 { 594 /* Kernargs. */ 595 int32_t argc; 596 int64_t argv; 597 int64_t out_ptr; 598 int64_t heap_ptr; 599 600 /* Output data. */ 601 struct output 602 { 603 int return_value; 604 unsigned int next_output; 605 struct printf_data 606 { 607 int written; 608 char msg[128]; 609 int type; 610 union 611 { 612 int64_t ivalue; 613 double dvalue; 614 char text[128]; 615 }; 616 } queue[1024]; 617 unsigned int consumed; 618 } output_data; 619 620 struct heap 621 { 622 int64_t size; 623 char data[0]; 624 } heap; 625 }; 626 627 /* Print any console output from the kernel. 628 We print all entries from "consumed" to the next entry without a "written" 629 flag, or "next_output" is reached. The buffer is circular, but the 630 indices are absolute. It is assumed the kernel will stop writing data 631 if "next_output" wraps (becomes smaller than "consumed"). */ 632 void 633 gomp_print_output (struct kernargs *kernargs, bool final) 634 { 635 unsigned int limit = (sizeof (kernargs->output_data.queue) 636 / sizeof (kernargs->output_data.queue[0])); 637 638 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed, 639 __ATOMIC_ACQUIRE); 640 unsigned int to = kernargs->output_data.next_output; 641 642 if (from > to) 643 { 644 /* Overflow. */ 645 if (final) 646 printf ("GCN print buffer overflowed.\n"); 647 return; 648 } 649 650 unsigned int i; 651 for (i = from; i < to; i++) 652 { 653 struct printf_data *data = &kernargs->output_data.queue[i%limit]; 654 655 if (!data->written && !final) 656 break; 657 658 switch (data->type) 659 { 660 case 0: 661 printf ("%.128s%ld\n", data->msg, data->ivalue); 662 break; 663 case 1: 664 printf ("%.128s%f\n", data->msg, data->dvalue); 665 break; 666 case 2: 667 printf ("%.128s%.128s\n", data->msg, data->text); 668 break; 669 case 3: 670 printf ("%.128s%.128s", data->msg, data->text); 671 break; 672 default: 673 printf ("GCN print buffer error!\n"); 674 break; 675 } 676 677 data->written = 0; 678 __atomic_store_n (&kernargs->output_data.consumed, i+1, 679 __ATOMIC_RELEASE); 680 } 681 fflush (stdout); 682 } 683 684 /* Execute an already-loaded kernel on the device. */ 685 686 static void 687 run (void *kernargs) 688 { 689 /* A "signal" is used to launch and monitor the kernel. */ 690 hsa_signal_t signal; 691 XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal), 692 "Create signal"); 693 694 /* Configure for a single-worker kernel. */ 695 uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue); 696 const uint32_t queueMask = queue->size - 1; 697 hsa_kernel_dispatch_packet_t *dispatch_packet = 698 &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index & 699 queueMask]); 700 dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; 701 dispatch_packet->workgroup_size_x = (uint16_t) 1; 702 dispatch_packet->workgroup_size_y = (uint16_t) 64; 703 dispatch_packet->workgroup_size_z = (uint16_t) 1; 704 dispatch_packet->grid_size_x = 1; 705 dispatch_packet->grid_size_y = 64; 706 dispatch_packet->grid_size_z = 1; 707 dispatch_packet->completion_signal = signal; 708 dispatch_packet->kernel_object = kernel; 709 dispatch_packet->kernarg_address = (void *) kernargs; 710 dispatch_packet->private_segment_size = private_segment_size; 711 dispatch_packet->group_segment_size = group_segment_size; 712 713 uint16_t header = 0; 714 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; 715 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; 716 header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; 717 718 __atomic_store_n ((uint32_t *) dispatch_packet, 719 header | (dispatch_packet->setup << 16), 720 __ATOMIC_RELEASE); 721 722 if (debug) 723 fprintf (stderr, "Launch kernel\n"); 724 725 hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1); 726 hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index); 727 /* Kernel running ...... */ 728 while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT, 729 1, 1000000, 730 HSA_WAIT_STATE_ACTIVE) != 0) 731 { 732 usleep (10000); 733 gomp_print_output (kernargs, false); 734 } 735 736 gomp_print_output (kernargs, true); 737 738 if (debug) 739 fprintf (stderr, "Kernel exited\n"); 740 741 XHSA (hsa_fns.hsa_signal_destroy_fn (signal), 742 "Clean up signal"); 743 } 744 745 int 746 main (int argc, char *argv[]) 747 { 748 int kernel_arg = 0; 749 for (int i = 1; i < argc; i++) 750 { 751 if (!strcmp (argv[i], "--help")) 752 { 753 usage (argv[0]); 754 return 0; 755 } 756 else if (!strcmp (argv[i], "--version")) 757 { 758 version (argv[0]); 759 return 0; 760 } 761 else if (!strcmp (argv[i], "--debug")) 762 debug = true; 763 else if (argv[i][0] == '-') 764 { 765 usage (argv[0]); 766 return 1; 767 } 768 else 769 { 770 kernel_arg = i; 771 break; 772 } 773 } 774 775 if (!kernel_arg) 776 { 777 /* No kernel arguments were found. */ 778 usage (argv[0]); 779 return 1; 780 } 781 782 /* The remaining arguments are for the GCN kernel. */ 783 int kernel_argc = argc - kernel_arg; 784 char **kernel_argv = &argv[kernel_arg]; 785 786 init_device (); 787 load_image (kernel_argv[0]); 788 789 /* Calculate size of function parameters + argv data. */ 790 size_t args_size = 0; 791 for (int i = 0; i < kernel_argc; i++) 792 args_size += strlen (kernel_argv[i]) + 1; 793 794 /* Allocate device memory for both function parameters and the argv 795 data. */ 796 size_t heap_size = 10 * 1024 * 1024; /* 10MB. */ 797 struct kernargs *kernargs = device_malloc (sizeof (*kernargs) + heap_size); 798 struct argdata 799 { 800 int64_t argv_data[kernel_argc]; 801 char strings[args_size]; 802 } *args = device_malloc (sizeof (struct argdata)); 803 804 /* Write the data to the target. */ 805 kernargs->argc = kernel_argc; 806 kernargs->argv = (int64_t) args->argv_data; 807 kernargs->out_ptr = (int64_t) &kernargs->output_data; 808 kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */ 809 kernargs->output_data.next_output = 0; 810 for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue) 811 / sizeof (kernargs->output_data.queue[0])); i++) 812 kernargs->output_data.queue[i].written = 0; 813 kernargs->output_data.consumed = 0; 814 int offset = 0; 815 for (int i = 0; i < kernel_argc; i++) 816 { 817 size_t arg_len = strlen (kernel_argv[i]) + 1; 818 args->argv_data[i] = (int64_t) &args->strings[offset]; 819 memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1); 820 offset += arg_len; 821 } 822 kernargs->heap_ptr = (int64_t) &kernargs->heap; 823 kernargs->heap.size = heap_size; 824 825 /* Run the kernel on the GPU. */ 826 run (kernargs); 827 unsigned int return_value = 828 (unsigned int) kernargs->output_data.return_value; 829 830 unsigned int upper = (return_value & ~0xffff) >> 16; 831 if (upper == 0xcafe) 832 { 833 printf ("Kernel exit value was never set\n"); 834 return_value = 0xff; 835 } 836 else if (upper == 0xffff) 837 ; /* Set by exit. */ 838 else if (upper == 0) 839 ; /* Set by return from main. */ 840 else 841 printf ("Possible kernel exit value corruption, 2 most significant bytes " 842 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value); 843 844 if (upper == 0xffff) 845 { 846 unsigned int signal = (return_value >> 8) & 0xff; 847 if (signal == SIGABRT) 848 printf ("Kernel aborted\n"); 849 else if (signal != 0) 850 printf ("Kernel received unkown signal\n"); 851 } 852 853 if (debug) 854 printf ("Kernel exit value: %d\n", return_value & 0xff); 855 856 /* Clean shut down. */ 857 XHSA (hsa_fns.hsa_memory_free_fn (kernargs), 858 "Clean up device memory"); 859 XHSA (hsa_fns.hsa_executable_destroy_fn (executable), 860 "Clean up GCN executable"); 861 XHSA (hsa_fns.hsa_queue_destroy_fn (queue), 862 "Clean up device queue"); 863 XHSA (hsa_fns.hsa_shut_down_fn (), 864 "Shut down run-time"); 865 866 return return_value & 0xff; 867 } 868