/* Plugin for NVPTX execution.

   Copyright (C) 2013-2022 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.
*/

#define _GNU_SOURCE
#include "openacc.h"
#include "config.h"
#include "symcat.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
#include "oacc-int.h"

#include <pthread.h>
#if PLUGIN_NVPTX_DYNAMIC
# include "cuda/cuda.h"
#else
# include <cuda.h>
#endif
#include <stdbool.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>

/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
   block to cache between kernel invocations.  For soft-stacks blocks bigger
   than this, we will free the block before attempting another GPU memory
   allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
   we will free the cached soft-stacks block anyway then retry the
   allocation.  If that fails too, we lose.  */

#define SOFTSTACK_CACHE_LIMIT 134217728

/* Pre-6.0 CUDA headers lack cuGetErrorString and this attribute value;
   declare/define them ourselves.  */
#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
#endif

/* From 6.5 the headers #define cuLinkCreate/cuLinkAddData to their _v2
   variants; undo that and declare the plain names, or (older headers)
   declare the _v2 variants and the occupancy API ourselves.  */
#if CUDA_VERSION >= 6050
#undef cuLinkCreate
#undef cuLinkAddData
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
			const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
			   const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
					  CUoccupancyB2DSize, size_t, int);
#endif

#define DO_PRAGMA(x) _Pragma (#x)

#if PLUGIN_NVPTX_DYNAMIC
# include <dlfcn.h>

/* Table of function pointers into libcuda, filled by init_cuda_lib from
   the call list in "cuda-lib.def".  */
struct cuda_lib_s {

# define CUDA_ONE_CALL(call)			\
  __typeof (call) *call;
# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
  CUDA_ONE_CALL (call)
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_MAYBE_NULL

} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA runtime library and initialize function
   pointers, return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

  /* Resolve every entry point; a missing mandatory symbol fails the whole
     load, a CUDA_ONE_CALL_MAYBE_NULL symbol is allowed to stay NULL.  */
# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
# define CUDA_ONE_CALL_1(call, allow_null)	\
  cuda_lib.call = dlsym (h, #call);		\
  if (!allow_null && cuda_lib.call == NULL)	\
    return false;
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# undef CUDA_ONE_CALL_MAYBE_NULL

  cuda_lib_inited = true;
  return true;
}
# define CUDA_CALL_PREFIX cuda_lib.
#else

/* Statically linked against libcuda: optional entry points become weak
   symbols so their presence can be tested at run time.  */
# define CUDA_ONE_CALL(call)
# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL_MAYBE_NULL
#undef CUDA_ONE_CALL

# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif

#include "secure_getenv.h"

#undef MIN
#undef MAX
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))

/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_error (#FN " error: %s",	\
			   cuda_error (__r));	\
	return ERET;				\
      }						\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_fatal (#FN " error: %s",	\
			   cuda_error (__r));	\
      }						\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

#define CUDA_CALL_EXISTS(FN)			\
  CUDA_CALL_PREFIX FN

/* Return a human-readable description of CUDA error R, falling back to a
   generic message when cuGetErrorString is unavailable (unresolved weak
   symbol / not found by dlsym) or itself fails.  */

static const char *
cuda_error (CUresult r)
{
  const char *fallback = "unknown cuda error";
  const char *desc;

  if (!CUDA_CALL_EXISTS (cuGetErrorString))
    return fallback;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r == CUDA_SUCCESS)
    return desc;

  return fallback;
}

/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
   Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
static char cuda_driver_version_s[30];

/* Count of devices opened via GOMP_OFFLOAD_init_device; modified under
   PTX_DEV_LOCK.  */
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* NVPTX/CUDA specific definition of asynchronous queues.  */
struct goacc_asyncqueue
{
  CUstream cuda_stream;
};

struct nvptx_callback
{
  void (*fn) (void *);
  void *ptr;
  struct goacc_asyncqueue *aq;
  struct nvptx_callback *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  /* We currently have this embedded inside the plugin because libgomp manages
     devices through integer target_ids.  This might be better if using an
     opaque target-specific pointer directly from gomp_device_descr.
*/
  struct ptx_device *ptx_dev;
};

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

/* Node in the list of device allocations whose cuMemFree had to be deferred
   because it was requested from a CUDA callback context (see nvptx_free).  */
struct ptx_free_block
{
  void *ptr;
  struct ptx_free_block *next;
};

/* Per-device state: CUDA context/handle plus cached hardware attributes
   queried once at open time (see nvptx_open_device).  */
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;	/* True if CTX was pre-existing, not created here.  */
  CUdevice dev;

  int ord;		/* Device ordinal as passed to nvptx_open_device.  */
  bool overlap;
  bool map;
  bool concur;
  bool mkern;
  int mode;
  int clock_khz;
  int num_sms;
  int regs_per_block;
  int regs_per_sm;
  int warp_size;
  int max_threads_per_block;
  int max_threads_per_multiprocessor;
  int default_dims[GOMP_DIM_MAX];

  /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
  char name[256];

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_free_block *free_blocks;
  pthread_mutex_t free_blocks_lock;

  /* OpenMP stacks, cached between kernel invocations.
*/ 325 struct 326 { 327 CUdeviceptr ptr; 328 size_t size; 329 pthread_mutex_t lock; 330 } omp_stacks; 331 332 struct ptx_device *next; 333 }; 334 335 static struct ptx_device **ptx_devices; 336 337 static inline struct nvptx_thread * 338 nvptx_thread (void) 339 { 340 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); 341 } 342 343 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK 344 should be locked on entry and remains locked on exit. */ 345 346 static bool 347 nvptx_init (void) 348 { 349 int ndevs; 350 351 if (instantiated_devices != 0) 352 return true; 353 354 if (!init_cuda_lib ()) 355 return false; 356 357 CUDA_CALL (cuInit, 0); 358 359 int cuda_driver_version; 360 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version); 361 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s, 362 "CUDA Driver %u.%u", 363 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10); 364 365 CUDA_CALL (cuDeviceGetCount, &ndevs); 366 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *) 367 * ndevs); 368 369 return true; 370 } 371 372 /* Select the N'th PTX device for the current host thread. The device must 373 have been previously opened before calling this function. */ 374 375 static bool 376 nvptx_attach_host_thread_to_device (int n) 377 { 378 CUdevice dev; 379 CUresult r; 380 struct ptx_device *ptx_dev; 381 CUcontext thd_ctx; 382 383 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev); 384 if (r == CUDA_ERROR_NOT_PERMITTED) 385 { 386 /* Assume we're in a CUDA callback, just return true. 
*/ 387 return true; 388 } 389 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) 390 { 391 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r)); 392 return false; 393 } 394 395 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n) 396 return true; 397 else 398 { 399 CUcontext old_ctx; 400 401 ptx_dev = ptx_devices[n]; 402 if (!ptx_dev) 403 { 404 GOMP_PLUGIN_error ("device %d not found", n); 405 return false; 406 } 407 408 CUDA_CALL (cuCtxGetCurrent, &thd_ctx); 409 410 /* We don't necessarily have a current context (e.g. if it has been 411 destroyed. Pop it if we do though. */ 412 if (thd_ctx != NULL) 413 CUDA_CALL (cuCtxPopCurrent, &old_ctx); 414 415 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx); 416 } 417 return true; 418 } 419 420 static struct ptx_device * 421 nvptx_open_device (int n) 422 { 423 struct ptx_device *ptx_dev; 424 CUdevice dev, ctx_dev; 425 CUresult r; 426 int async_engines, pi; 427 428 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n); 429 430 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device)); 431 432 ptx_dev->ord = n; 433 ptx_dev->dev = dev; 434 ptx_dev->ctx_shared = false; 435 436 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev); 437 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) 438 { 439 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r)); 440 return NULL; 441 } 442 443 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev) 444 { 445 /* The current host thread has an active context for a different device. 446 Detach it. 
*/
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  /* Reuse an existing context if the thread has one, else create our own
     (and remember which case applies for teardown).  */
  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
			 dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }
  ptx_dev->warp_size = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
  ptx_dev->max_threads_per_block = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
  ptx_dev->max_threads_per_multiprocessor = pi;

  /* Not fatal if this attribute is unavailable; assume one engine.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  /* Zero means "not yet computed"; filled in lazily by nvptx_exec.  */
  for (int i = 0; i != GOMP_DIM_MAX; i++)
    ptx_dev->default_dims[i] = 0;

  CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
		  dev);

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  ptx_dev->free_blocks = NULL;
  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);

  ptx_dev->omp_stacks.ptr = 0;
  ptx_dev->omp_stacks.size = 0;
  pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);

  return ptx_dev;
}

/* Release all resources held by PTX_DEV: deferred free blocks, its mutexes,
   the cached soft-stacks block, and (unless shared) its CUDA context.
   Return TRUE on success.  */

static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
    {
      struct ptx_free_block *b_next = b->next;
      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
      free (b);
      b = b_next;
    }

  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
  pthread_mutex_destroy (&ptx_dev->image_lock);

  pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);

  if (ptx_dev->omp_stacks.ptr)
    CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}

/* Return the number of available CUDA devices, or 0 if the CUDA library
   cannot be loaded or initialized; -1 on a cuDeviceGetCount error.  */

static int
nvptx_get_num_devices (void)
{
  int n;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
			     cuda_error (r));
	  return 0;
	}
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}

/* Log the value (ENV_VAR, possibly NULL) of environment variable VAR_NAME
   through the libgomp debug facility.  */

static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}

/* Parse the GOMP_NVPTX_JIT environment variable.  The only recognized
   option is "-O[0-4]", whose level is stored into *GOMP_NVPTX_O; anything
   else reports a parse error and stops scanning.  */

static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      while (*c == ' ')
	c++;

      if (c[0] == '-' && c[1] == 'O'
	  && '0' <= c[2] && c[2] <= '4'
	  && (c[3] == '\0' || c[3] == ' '))
	{
	  *gomp_nvptx_o = c[2] - '0';
	  c += 3;
	  continue;
	}

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}

/* JIT-link the NUM_OBJS PTX objects PTX_OBJS and load the result into
   *MODULE.  Return true on success, false (with errors reported) on
   failure.  */

static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float
elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  /* JIT options: wall time, info/error log buffers, verbose logging; slots
     0-5 are fixed, slot 6 is the optional optimization level below.  */
  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  static intptr_t gomp_nvptx_o = -1;

  /* Parse GOMP_NVPTX_JIT only once; the result is cached for later links.  */
  static bool init_done = false;
  if (!init_done)
    {
      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
      init_done = true;
    }

  int nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

/* Launch the OpenACC kernel described by FN (really a targ_fn_descriptor)
   on STREAM, passing &DP as the single kernel argument.  DIMS[] supplies
   the gang/worker/vector launch dimensions; zero entries are filled in with
   computed defaults below.  MAPNUM, HOSTADDRS, DEVADDRS and TARG_MEM_DESC
   are part of the launch ABI but are not referenced in this function.  */

static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    unsigned *dims, void *targ_mem_desc,
	    CUdeviceptr dp, CUstream stream)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  int i;
  void *kargs[1];
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;

  function = targ_fn->fn;

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.
*/
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  /* At least one dimension was left for the runtime to choose.  */
  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      /* Compute this device's default dimensions once, from its cached
	 hardware attributes.  */
      if (!nvthd->ptx_dev->default_dims[0])
	{
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);

      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    /* No occupancy API available: fall back to the per-device
	       defaults, clamped to the function's thread limit.  */
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       it.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);

	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
		workers = MAX (workers, 1);
		/* If we need a per-worker barrier ... .  */
		if (actual_vectors > 32)
		  /* Don't use more barriers than available.  */
		  workers = MIN (workers, 15);
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
	   " with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x and vector_length = y'"
	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
	   " x * y <= %d"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
    }

  /* Check if the accelerator has sufficient barrier resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient barrier resources to launch"
	   " '%s' with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x' on that offloaded"
	   " region or '-fopenacc-dim=:x:' where x <= 15"
	   "; "
	   "or, recompile the program with 'vector_length = 32' on that"
	   " offloaded region or '-fopenacc-dim=::32'"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_p)
    {
      /* Fill in and dispatch the acc_ev_enqueue_launch_start event.  */
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
      enqueue_launch_event_info.launch_event.num_gangs
	= dims[GOMP_DIM_GANG];
      enqueue_launch_event_info.launch_event.num_workers
	= dims[GOMP_DIM_WORKER];
      enqueue_launch_event_info.launch_event.vector_length
	= dims[GOMP_DIM_VECTOR];
api_info->device_api = acc_device_api_cuda;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
					    api_info);
    }

  /* Launch: gangs map to grid dim x, vectors to block dim x, workers to
     block dim y; DP is the single kernel argument.  */
  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, stream, kargs, 0);

  if (profiling_p)
    {
      /* Dispatch the matching acc_ev_enqueue_launch_end event.  */
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
					    api_info);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);
}

void * openacc_get_current_cuda_context (void);

/* Dispatch an acc_ev_alloc profiling event for the allocation of S bytes at
   device pointer DP, on behalf of thread THR (whose prof_info must be
   non-NULL).  */

static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_alloc;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = s;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = dp;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
   size threshold, or if FORCE is true.
*/

static void
nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
{
  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  if (ptx_dev->omp_stacks.ptr
      && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
    {
      CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
      ptx_dev->omp_stacks.ptr = 0;
      ptx_dev->omp_stacks.size = 0;
    }
  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}

/* Allocate S bytes of device memory and return the device pointer (as a
   host void *), or NULL on failure.  When SUPPRESS_ERRORS is set, an
   out-of-memory condition returns NULL without logging an error, so the
   caller can retry (e.g. after dropping the cached soft-stacks block).  */

static void *
nvptx_alloc (size_t s, bool suppress_errors)
{
  CUdeviceptr d;

  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
  if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
    return NULL;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
      return NULL;
    }

  /* NOTE: We only do profiling stuff if the memory allocation succeeds.
*/
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);

  return (void *) d;
}

/* Dispatch an acc_ev_free profiling event for the deallocation of device
   pointer P, on behalf of thread THR (whose prof_info must be non-NULL).  */

static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_free;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = -1;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = p;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

/* Free device memory P belonging to PTX_DEV; P must be the base address of
   an allocation.  Return true on success (including the deferred case
   below).  */

static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
  CUdeviceptr pb;
  size_t ps;

  CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
				  (CUdeviceptr) p);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* We assume that this error indicates we are in a CUDA callback context,
	 where all CUDA calls are not allowed (see cuStreamAddCallback
	 documentation for description).  Arrange to free this piece of device
	 memory later.
*/ 1105 struct ptx_free_block *n 1106 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block)); 1107 n->ptr = p; 1108 pthread_mutex_lock (&ptx_dev->free_blocks_lock); 1109 n->next = ptx_dev->free_blocks; 1110 ptx_dev->free_blocks = n; 1111 pthread_mutex_unlock (&ptx_dev->free_blocks_lock); 1112 return true; 1113 } 1114 else if (r != CUDA_SUCCESS) 1115 { 1116 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r)); 1117 return false; 1118 } 1119 if ((CUdeviceptr) p != pb) 1120 { 1121 GOMP_PLUGIN_error ("invalid device address"); 1122 return false; 1123 } 1124 1125 CUDA_CALL (cuMemFree, (CUdeviceptr) p); 1126 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); 1127 bool profiling_p 1128 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false); 1129 if (profiling_p) 1130 goacc_profiling_acc_ev_free (thr, p); 1131 1132 return true; 1133 } 1134 1135 static void * 1136 nvptx_get_current_cuda_device (void) 1137 { 1138 struct nvptx_thread *nvthd = nvptx_thread (); 1139 1140 if (!nvthd || !nvthd->ptx_dev) 1141 return NULL; 1142 1143 return &nvthd->ptx_dev->dev; 1144 } 1145 1146 static void * 1147 nvptx_get_current_cuda_context (void) 1148 { 1149 struct nvptx_thread *nvthd = nvptx_thread (); 1150 1151 if (!nvthd || !nvthd->ptx_dev) 1152 return NULL; 1153 1154 return nvthd->ptx_dev->ctx; 1155 } 1156 1157 /* Plugin entry points. 
*/ 1158 1159 const char * 1160 GOMP_OFFLOAD_get_name (void) 1161 { 1162 return "nvptx"; 1163 } 1164 1165 unsigned int 1166 GOMP_OFFLOAD_get_caps (void) 1167 { 1168 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400; 1169 } 1170 1171 int 1172 GOMP_OFFLOAD_get_type (void) 1173 { 1174 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX; 1175 } 1176 1177 int 1178 GOMP_OFFLOAD_get_num_devices (void) 1179 { 1180 return nvptx_get_num_devices (); 1181 } 1182 1183 bool 1184 GOMP_OFFLOAD_init_device (int n) 1185 { 1186 struct ptx_device *dev; 1187 1188 pthread_mutex_lock (&ptx_dev_lock); 1189 1190 if (!nvptx_init () || ptx_devices[n] != NULL) 1191 { 1192 pthread_mutex_unlock (&ptx_dev_lock); 1193 return false; 1194 } 1195 1196 dev = nvptx_open_device (n); 1197 if (dev) 1198 { 1199 ptx_devices[n] = dev; 1200 instantiated_devices++; 1201 } 1202 1203 pthread_mutex_unlock (&ptx_dev_lock); 1204 1205 return dev != NULL; 1206 } 1207 1208 bool 1209 GOMP_OFFLOAD_fini_device (int n) 1210 { 1211 pthread_mutex_lock (&ptx_dev_lock); 1212 1213 if (ptx_devices[n] != NULL) 1214 { 1215 if (!nvptx_attach_host_thread_to_device (n) 1216 || !nvptx_close_device (ptx_devices[n])) 1217 { 1218 pthread_mutex_unlock (&ptx_dev_lock); 1219 return false; 1220 } 1221 ptx_devices[n] = NULL; 1222 instantiated_devices--; 1223 } 1224 1225 if (instantiated_devices == 0) 1226 { 1227 free (ptx_devices); 1228 ptx_devices = NULL; 1229 } 1230 1231 pthread_mutex_unlock (&ptx_dev_lock); 1232 return true; 1233 } 1234 1235 /* Return the libgomp version number we're compatible with. There is 1236 no requirement for cross-version compatibility. */ 1237 1238 unsigned 1239 GOMP_OFFLOAD_version (void) 1240 { 1241 return GOMP_VERSION; 1242 } 1243 1244 /* Initialize __nvptx_clocktick, if present in MODULE. 
   The value written is the length of one device clock tick in seconds
   (DEV->clock_khz is in kHz, hence 1e-3 / clock_khz).  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  /* An image that does not use the symbol simply does not have it.  */
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  Returns the number
   of entries written to *TARGET_TABLE, or -1 on error.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, other_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  /* Currently, the only other entry kind is 'device number'.  */
  other_entries = 1;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries + other_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  /* Register the image on the device so it can be unloaded later.  */
  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  /* Functions: record the CUfunction plus per-function register count and
     thread limit, used later for launch-bound tuning.  */
  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      /* Function entries span a single byte: start points at the
	 descriptor, not at device code.  */
      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  /* Variables: record each global's device address range.  */
  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  /* The 'other' entry: the device-number variable, if present.  */
  CUdeviceptr device_num_varptr;
  size_t device_num_varsize;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &device_num_varptr,
				  &device_num_varsize, module,
				  XSTRING (GOMP_DEVICE_NUM_VAR));
  if (r == CUDA_SUCCESS)
    {
      targ_tbl->start = (uintptr_t) device_num_varptr;
      targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
    }
  else
    /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
    targ_tbl->start = targ_tbl->end = 0;
  targ_tbl++;

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries + other_entries;
}

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  /* Unlink and release the matching image, if it is registered.  */
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}

/* Allocate SIZE bytes of device memory on device ORD.  First drain the
   deferred-free list (see nvptx_free) and possibly release the cached
   soft-stacks block, to make memory available.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;

  struct ptx_device *ptx_dev = ptx_devices[ord];
  struct ptx_free_block *blocks, *tmp;

  /* Detach the deferred-free list under the lock, then process it below
     without holding the lock.  */
  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
  blocks = ptx_dev->free_blocks;
  ptx_dev->free_blocks = NULL;
  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);

  /* Drop an over-limit soft-stacks cache (see SOFTSTACK_CACHE_LIMIT).  */
  nvptx_stacks_free (ptx_dev, false);

  while (blocks)
    {
      tmp = blocks->next;
      nvptx_free (blocks->ptr, ptx_dev);
      free (blocks);
      blocks = tmp;
    }

  /* First attempt: suppress the out-of-memory report so we can retry.  */
  void *d = nvptx_alloc (size, true);
  if (d)
    return d;
  else
    {
      /* Memory allocation failed.  Try freeing the stacks block, and
	 retrying.  */
      nvptx_stacks_free (ptx_dev, true);
      return nvptx_alloc (size, false);
    }
}

/* Free device pointer PTR on device ORD (or queue it for deferred
   freeing if CUDA calls are currently not permitted).  */

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr, ptx_devices[ord]));
}

/* Launch the OpenACC offload function FN synchronously.  MAPNUM
   argument pointers are taken from DEVADDRS (falling back to HOSTADDRS)
   and marshalled into a freshly-allocated device buffer first; DIMS
   gives the launch geometry.  Profiling events are dispatched if a
   profiling-enabled goacc thread is active.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   unsigned *dims, void *targ_mem_desc)
{
  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      /* Host copy can live on the stack: the upload below is synchronous.  */
      hp = alloca (s);
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).
 */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
			mapnum * sizeof (void *));
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  /* Run on the default (NULL) stream and wait for completion below.  */
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, NULL);

  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
  const char *maybe_abort_msg = "(perhaps abort was called)";
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  CUDA_CALL_ASSERT (cuMemFree, dp);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, (void *) dp);
}

/* Async-queue callback: free the device argument buffer and the
   host-side holder allocated by GOMP_OFFLOAD_openacc_async_exec.
   PTR is the two-slot block: [0] device pointer, [1] ptx_device.  */

static void
cuda_free_argmem (void *ptr)
{
  void **block = (void **) ptr;
  nvptx_free (block[0], (struct ptx_device *) block[1]);
  free (block);
}

/* Launch the OpenACC offload function FN on async queue AQ.  Like
   GOMP_OFFLOAD_openacc_exec, but the argument buffer is heap-allocated
   (it must outlive this call) and is released via a stream callback.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;
  void **block = NULL;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      /* Layout: block[0] = device buffer, block[1] = ptx_device,
	 block + 2 = host copy of the argument pointers (must survive
	 until the async upload completes).  */
      block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
      hp = block + 2;
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.
 */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}

      /* Enqueue the upload on AQ's stream; HP lives inside BLOCK, which
	 stays allocated until cuda_free_argmem runs.  */
      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
			mapnum * sizeof (void *), aq->cuda_stream);
      block[0] = (void *) dp;

      struct nvptx_thread *nvthd =
	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
      block[1] = (void *) nvthd->ptx_dev;

      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, aq->cuda_stream);

  /* Free the argument memory only once the stream has drained past the
     kernel launch.  */
  if (mapnum > 0)
    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
}

/* Create per-thread OpenACC state for device ORD, making sure the
   calling host thread has a current CUDA context.  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  /* Adopt the device's context if this thread has none yet.  */
  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

/* Release the per-thread state created above.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* Expose the current CUDA device handle (a CUdevice *) to OpenACC.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void) 1657 { 1658 return nvptx_get_current_cuda_context (); 1659 } 1660 1661 /* This returns a CUstream. */ 1662 void * 1663 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq) 1664 { 1665 return (void *) aq->cuda_stream; 1666 } 1667 1668 /* This takes a CUstream. */ 1669 int 1670 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream) 1671 { 1672 if (aq->cuda_stream) 1673 { 1674 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream); 1675 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream); 1676 } 1677 1678 aq->cuda_stream = (CUstream) stream; 1679 return 1; 1680 } 1681 1682 struct goacc_asyncqueue * 1683 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused))) 1684 { 1685 CUstream stream = NULL; 1686 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT); 1687 1688 struct goacc_asyncqueue *aq 1689 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue)); 1690 aq->cuda_stream = stream; 1691 return aq; 1692 } 1693 1694 bool 1695 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq) 1696 { 1697 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream); 1698 free (aq); 1699 return true; 1700 } 1701 1702 int 1703 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) 1704 { 1705 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream); 1706 if (r == CUDA_SUCCESS) 1707 return 1; 1708 if (r == CUDA_ERROR_NOT_READY) 1709 return 0; 1710 1711 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r)); 1712 return -1; 1713 } 1714 1715 bool 1716 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq) 1717 { 1718 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream); 1719 return true; 1720 } 1721 1722 bool 1723 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1, 1724 struct goacc_asyncqueue *aq2) 1725 { 1726 CUevent e; 1727 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING); 1728 
CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream); 1729 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0); 1730 return true; 1731 } 1732 1733 static void 1734 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr) 1735 { 1736 if (res != CUDA_SUCCESS) 1737 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res)); 1738 struct nvptx_callback *cb = (struct nvptx_callback *) ptr; 1739 cb->fn (cb->ptr); 1740 free (ptr); 1741 } 1742 1743 void 1744 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, 1745 void (*callback_fn)(void *), 1746 void *userptr) 1747 { 1748 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b)); 1749 b->fn = callback_fn; 1750 b->ptr = userptr; 1751 b->aq = aq; 1752 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream, 1753 cuda_callback_wrapper, (void *) b, 0); 1754 } 1755 1756 static bool 1757 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s) 1758 { 1759 CUdeviceptr pb; 1760 size_t ps; 1761 if (!s) 1762 return true; 1763 if (!d) 1764 { 1765 GOMP_PLUGIN_error ("invalid device address"); 1766 return false; 1767 } 1768 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d); 1769 if (!pb) 1770 { 1771 GOMP_PLUGIN_error ("invalid device address"); 1772 return false; 1773 } 1774 if (!h) 1775 { 1776 GOMP_PLUGIN_error ("invalid host address"); 1777 return false; 1778 } 1779 if (d == h) 1780 { 1781 GOMP_PLUGIN_error ("invalid host or device address"); 1782 return false; 1783 } 1784 if ((void *)(d + s) > (void *)(pb + ps)) 1785 { 1786 GOMP_PLUGIN_error ("invalid size"); 1787 return false; 1788 } 1789 return true; 1790 } 1791 1792 bool 1793 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) 1794 { 1795 if (!nvptx_attach_host_thread_to_device (ord) 1796 || !cuda_memcpy_sanity_check (src, dst, n)) 1797 return false; 1798 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n); 1799 return true; 1800 } 1801 1802 bool 1803 GOMP_OFFLOAD_dev2host (int ord, 
		       void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
  return true;
}

/* Copy N bytes from device SRC to device DST.  The copy is enqueued on
   the default stream without a sanity check and is not awaited here.  */

bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
  return true;
}

/* Asynchronously copy N bytes from host SRC to device DST on AQ's
   stream.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}

/* Asynchronously copy N bytes from device SRC to host DST on AQ's
   stream.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}

/* Return the value of OpenACC device property PROP for device N, or a
   zero/NULL value if the device is invalid, not open, or the property
   is unsupported.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
	size_t total_mem;

	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
	propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
	/* cuMemGetInfo queries the *current* context's device, so we may
	   need to switch contexts below.  */
	size_t total_mem;
	size_t free_mem;
	CUdevice ctxdev;

	CUDA_CALL_ERET (propval,
			cuCtxGetDevice, &ctxdev);
	if (ptx_dev->dev == ctxdev)
	  /* The device's context is already current: query directly.  */
	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	else if (ptx_dev->ctx)
	  {
	    /* Temporarily make the device's context current.  */
	    CUcontext old_ctx;

	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
	  }
	else
	  {
	    /* No context exists yet: create a throwaway one just for
	       this query.  */
	    CUcontext new_ctx;

	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
			    ptx_dev->dev);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
	  }
	propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}

/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that number of warps does not exceed CUDA limits as well as GCC's
   own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in NVPTX backend
     and libgcc, which matches documented limit of all GPUs as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  /* Registers used by one block: *THREADS_P warps of 32 threads each.  */
  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     Actual limit, which may be lower, can be queried with "occupancy control"
     driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
   the storage should be held on entry, and remains held on exit.  */

static void *
nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
  /* Reuse the cached block when it is big enough.  */
  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    return (void *) ptx_dev->omp_stacks.ptr;

  /* Free the old, too-small stacks.  */
  if (ptx_dev->omp_stacks.ptr)
    {
      /* Drain the device first: a still-running kernel may be using the
	 old stacks block.  */
      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    }

  /* Make new and bigger stacks, and remember where we put them and how big
     they are.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
				  size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));

  ptx_dev->omp_stacks.size = size * num;

  return (void *) ptx_dev->omp_stacks.ptr;
}

/* Run the OpenMP offload function TGT_FN on device ORD with argument
   block TGT_VARS.  ARGS is the libgomp target-argument list; it is
   scanned for device-wide team and thread-limit requests.  Blocks until
   the kernel completes.  */

void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  struct targ_fn_descriptor *tgt_fn_desc
    = (struct targ_fn_descriptor *) tgt_fn;
  CUfunction function = tgt_fn_desc->fn;
  const struct targ_fn_launch *launch = tgt_fn_desc->launch;
  const char *fn_name = launch->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  /* Decode the ID/value pairs, looking for device-wide NUM_TEAMS and
     THREAD_LIMIT settings.  */
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();

  /* Hold the stacks lock across the launch so no other thread frees or
     reallocates the soft-stacks block while the kernel is running.  */
  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
		     __FUNCTION__, fn_name, teams, threads);
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}

/* TODO: Implement GOMP_OFFLOAD_async_run.  */