/* Plugin for NVPTX execution.

   Copyright (C) 2013-2016 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>

/* Convert a CUDA result code to a human-readable error string.  */

static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = cuGetErrorString (r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* An asynchronous queue.  Wraps a CUDA stream together with a small
   page-locked arena (D/H and the H_* cursors below) that is used to stage
   kernel argument blocks; see map_push and map_pop.  */

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};
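
/* An entry in a stream's argument-mapping arena.  MAPPINGS is the
   variable-length payload handed to the kernel; ASYNC and SIZE let map_pop
   retire entries in order once the associated operation has completed.  */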
struct map
{
  int async;
  size_t size;
  char mappings[0];
};

/* Allocate the page-locked argument-mapping arena for stream S and
   initialize its cursors.  */

static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}

/* Release the argument-mapping arena of stream S.  */

static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_error ("cuMemFreeHost error: %s", cuda_error (r));
}

/* Retire the oldest mapping in stream S's arena.  */

static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

/* Reserve SIZE bytes (plus a struct map header) in stream S's arena for an
   ASYNC operation.  *H and *D are set to the host and device addresses of
   the reserved chunk.  */

static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
};

/* A loaded PTX image.  */

struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};
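
/* Per-device (really per-context) state: the CUDA context and device handle,
   the streams associated with the device, a cache of interesting device
   attributes, and the list of images loaded onto it.  */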
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Create the null stream for PTX_DEV and set up the (initially empty) table
   of async streams, sized for CONCURRENCY entries.  */

static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}

/* Destroy all streams associated with PTX_DEV, including the null stream.  */

static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
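
/* Open PTX device N: create or share a CUDA context for it, cache the device
   attributes we care about, and set up its streams.  Returns the new
   ptx_device structure.  */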
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}

/* Tear down PTX_DEV: destroy its streams and, unless the CUDA context was
   shared, the context itself.  */

static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}

/* Return the number of usable CUDA devices, or 0 if none are available.  */

static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
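
/* JIT-link the NUM_OBJS PTX objects in PTX_OBJS into a single CUDA module
   using the cuLink* API, and load the result into *MODULE.  The JIT info and
   error logs are emitted as debug output.  */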
static void
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
          unsigned num_objs)
{
  CUjit_option opts[6];
  void *optvals[6];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  r = cuLinkCreate (6, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
         qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
                         ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
          GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
                             cuda_error (r));
        }
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));

  r = cuLinkDestroy (linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkDestroy error: %s", cuda_error (r));
}

/* Walk the global list of outstanding events for the current device and
   retire those that have completed: pop kernel argument mappings, unmap
   asynchronously-released variables (only when MEMMAP_LOCKABLE permits
   taking the memory-map lock), and free the event.  */

static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function gomp_plugin_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
              }
              break;
            }

          cuEventDestroy (*te);
          free ((void *)te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
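
/* Add a new event E of TYPE to the global list of outstanding events, tagged
   with the current device and the address H it relates to (a stream for
   kernel events, a host address for memory events, a target memory descriptor
   for async cleanup events).  */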
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}

/* Launch the kernel described by the function descriptor FN on async queue
   ASYNC, passing it the MAPNUM device addresses in DEVADDRS.  DIMS gives the
   gang/worker/vector launch dimensions; zero entries are filled in from the
   compiler-provided defaults or a fallback value.  */

void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
        dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
        seen_zero = 1;
    }

  if (seen_zero)
    {
      for (i = 0; i != GOMP_DIM_MAX; i++)
        if (!dims[i])
          dims[i] = /* TODO */ 32;
    }

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn,
                     dims[0], dims[1], dims[2]);

  // OpenACC        CUDA
  //
  // num_gangs      nctaid.x
  // num_workers    ntid.y
  // vector length  ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      dims[GOMP_DIM_GANG], 1, 1,
                      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void * openacc_get_current_cuda_context (void);

/* Allocate S bytes of device memory; return NULL if the device is out of
   memory.  */

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}

/* Free the device allocation starting at P.  */

static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
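
/* Copy S bytes from host address H to device address D, after sanity-checking
   both addresses.  If an async stream is current for this thread the copy is
   queued on it and an event recorded for later garbage collection; otherwise
   the copy is synchronous.  nvptx_dev2host below is the mirror image.  */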
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}

/* Copy S bytes from device address D to host address H; the mirror image of
   nvptx_host2dev above.  */

static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}

/* Set the current thread's active async queue to ASYNC, creating the
   underlying stream if it does not exist yet.  */

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
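
/* Return nonzero if all work queued on async queue ASYNC has completed,
   zero if work is still pending.  */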
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the call
         to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

/* Return nonzero if all work queued by the current host thread (or by any
   thread, for multithreaded streams) has completed.  */

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

/* Block until all work queued on async queue ASYNC has completed.  */

static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}

/* Make async queue ASYNC2 wait (on the device) for all work currently queued
   on async queue ASYNC1, without blocking the host.  */

static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}

/* Block until all work queued by this thread (or by any thread, for
   multithreaded streams) on the current device has completed.  */

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
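
/* Make async queue ASYNC wait for all work currently queued on the other
   active streams of this thread's device, without blocking the host.  */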
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
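
/* Adopt the user-supplied CUDA stream STREAM as async queue ASYNC for the
   calling thread, destroying whatever stream was previously associated with
   that async value.  Returns 1 on success.  */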
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */
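/* The returned table contains one entry per kernel, each pointing at its
   targ_fn_descriptor, followed by one entry per global variable giving the
   variable's device address and size.  */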

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
                       " (expected %u, received %u)",
                       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_objs, img_header->ptx_num);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}

/* Unload the program described by TARGET_DATA from device ORD, freeing the
   function descriptors allocated by GOMP_OFFLOAD_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
        *prev_p = image->next;
        cuModuleUnload (image->module);
        free (image->fns);
        free (image);
        break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs,
                               int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
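
/* Set up the per-thread OpenACC state (struct nvptx_thread) for device ORD,
   ensuring the device's CUDA context is current for the calling host
   thread.  */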
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}