1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2022 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "symcat.h"
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
41 #include "oacc-int.h"
42
43 #include <pthread.h>
44 #if PLUGIN_NVPTX_DYNAMIC
45 # include "cuda/cuda.h"
46 #else
47 # include <cuda.h>
48 #endif
49 #include <stdbool.h>
50 #include <limits.h>
51 #include <string.h>
52 #include <stdio.h>
53 #include <unistd.h>
54 #include <assert.h>
55 #include <errno.h>
56
57 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
58 block to cache between kernel invocations. For soft-stacks blocks bigger
59 than this, we will free the block before attempting another GPU memory
60 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
61 we will free the cached soft-stacks block anyway and then retry the
62 allocation. If that fails too, we lose. */
63
64 #define SOFTSTACK_CACHE_LIMIT 134217728
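/* For reference, that limit is 128 MiB expressed in bytes:
   128 * 1024 * 1024 = 134217728.  */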
65
66 #if CUDA_VERSION < 6000
67 extern CUresult cuGetErrorString (CUresult, const char **);
68 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
69 #endif
70
71 #if CUDA_VERSION >= 6050
72 #undef cuLinkCreate
73 #undef cuLinkAddData
74 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
75 const char *, unsigned, CUjit_option *, void **);
76 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
77 #else
78 typedef size_t (*CUoccupancyB2DSize)(int);
79 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
80 const char *, unsigned, CUjit_option *, void **);
81 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
82 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
83 CUoccupancyB2DSize, size_t, int);
84 #endif
85
86 #define DO_PRAGMA(x) _Pragma (#x)
87
88 #if PLUGIN_NVPTX_DYNAMIC
89 # include <dlfcn.h>
90
91 struct cuda_lib_s {
92
93 # define CUDA_ONE_CALL(call) \
94 __typeof (call) *call;
95 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
96 CUDA_ONE_CALL (call)
97 #include "cuda-lib.def"
98 # undef CUDA_ONE_CALL
99 # undef CUDA_ONE_CALL_MAYBE_NULL
100
101 } cuda_lib;
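/* Illustrative expansion: assuming "cuda-lib.def" contains, e.g.,
   CUDA_ONE_CALL (cuMemAlloc), the struct above gains the member

     __typeof (cuMemAlloc) *cuMemAlloc;

   i.e. one function pointer per CUDA entry point, to be filled in by
   init_cuda_lib below.  */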
102
103 /* -1 if init_cuda_lib has not been called yet, false
104 if it has been and failed, true if it has been and succeeded. */
105 static signed char cuda_lib_inited = -1;
106
107 /* Dynamically load the CUDA runtime library and initialize function
108 pointers; return false if unsuccessful, true if successful. */
109 static bool
110 init_cuda_lib (void)
111 {
112 if (cuda_lib_inited != -1)
113 return cuda_lib_inited;
114 const char *cuda_runtime_lib = "libcuda.so.1";
115 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
116 cuda_lib_inited = false;
117 if (h == NULL)
118 return false;
119
120 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
121 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
122 # define CUDA_ONE_CALL_1(call, allow_null) \
123 cuda_lib.call = dlsym (h, #call); \
124 if (!allow_null && cuda_lib.call == NULL) \
125 return false;
126 #include "cuda-lib.def"
127 # undef CUDA_ONE_CALL
128 # undef CUDA_ONE_CALL_1
129 # undef CUDA_ONE_CALL_MAYBE_NULL
130
131 cuda_lib_inited = true;
132 return true;
133 }
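/* Illustrative expansion of the dlsym block above for one entry (again
   assuming CUDA_ONE_CALL (cuMemAlloc) in "cuda-lib.def"):

     cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");
     if (!false && cuda_lib.cuMemAlloc == NULL)
       return false;

   CUDA_ONE_CALL_MAYBE_NULL entries pass true for allow_null instead and
   therefore tolerate a missing symbol.  */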
134 # define CUDA_CALL_PREFIX cuda_lib.
135 #else
136
137 # define CUDA_ONE_CALL(call)
138 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
139 #include "cuda-lib.def"
140 #undef CUDA_ONE_CALL_MAYBE_NULL
141 #undef CUDA_ONE_CALL
142
143 # define CUDA_CALL_PREFIX
144 # define init_cuda_lib() true
145 #endif
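/* In this direct-link configuration, CUDA_ONE_CALL_MAYBE_NULL (for
   example, applied to cuGetErrorString) expands via DO_PRAGMA to

     #pragma weak cuGetErrorString

   so CUDA_CALL_EXISTS below can test at run time whether the symbol was
   resolved.  */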
146
147 #include "secure_getenv.h"
148
149 #undef MIN
150 #undef MAX
151 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
152 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
153
154 /* Convenience macros for the frequently used CUDA library call and
155 error handling sequence as well as CUDA library calls that
156 do the error checking themselves or don't do it at all. */
157
158 #define CUDA_CALL_ERET(ERET, FN, ...) \
159 do { \
160 unsigned __r \
161 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
162 if (__r != CUDA_SUCCESS) \
163 { \
164 GOMP_PLUGIN_error (#FN " error: %s", \
165 cuda_error (__r)); \
166 return ERET; \
167 } \
168 } while (0)
169
170 #define CUDA_CALL(FN, ...) \
171 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
172
173 #define CUDA_CALL_ASSERT(FN, ...) \
174 do { \
175 unsigned __r \
176 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
177 if (__r != CUDA_SUCCESS) \
178 { \
179 GOMP_PLUGIN_fatal (#FN " error: %s", \
180 cuda_error (__r)); \
181 } \
182 } while (0)
183
184 #define CUDA_CALL_NOCHECK(FN, ...) \
185 CUDA_CALL_PREFIX FN (__VA_ARGS__)
186
187 #define CUDA_CALL_EXISTS(FN) \
188 CUDA_CALL_PREFIX FN
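/* For illustration, a use such as

     CUDA_CALL (cuMemFree, ptr);

   expands to the call-and-check sequence above: the function is invoked
   through CUDA_CALL_PREFIX (i.e. via the cuda_lib function pointers when
   PLUGIN_NVPTX_DYNAMIC), and a failure is reported with GOMP_PLUGIN_error
   and makes the enclosing function return false; CUDA_CALL_ASSERT instead
   is fatal on failure, and CUDA_CALL_NOCHECK leaves checking to the
   caller.  */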
189
190 static const char *
191 cuda_error (CUresult r)
192 {
193 const char *fallback = "unknown cuda error";
194 const char *desc;
195
196 if (!CUDA_CALL_EXISTS (cuGetErrorString))
197 return fallback;
198
199 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
200 if (r == CUDA_SUCCESS)
201 return desc;
202
203 return fallback;
204 }
205
206 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
207 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
208 static char cuda_driver_version_s[30];
209
210 static unsigned int instantiated_devices = 0;
211 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
212
213 /* NVPTX/CUDA specific definition of asynchronous queues. */
214 struct goacc_asyncqueue
215 {
216 CUstream cuda_stream;
217 };
218
219 struct nvptx_callback
220 {
221 void (*fn) (void *);
222 void *ptr;
223 struct goacc_asyncqueue *aq;
224 struct nvptx_callback *next;
225 };
226
227 /* Thread-specific data for PTX. */
228
229 struct nvptx_thread
230 {
231 /* We currently have this embedded inside the plugin because libgomp manages
232 devices through integer target_ids. This might be better if using an
233 opaque target-specific pointer directly from gomp_device_descr. */
234 struct ptx_device *ptx_dev;
235 };
236
237 /* Target data function launch information. */
238
239 struct targ_fn_launch
240 {
241 const char *fn;
242 unsigned short dim[GOMP_DIM_MAX];
243 };
244
245 /* Target PTX object information. */
246
247 struct targ_ptx_obj
248 {
249 const char *code;
250 size_t size;
251 };
252
253 /* Target data image information. */
254
255 typedef struct nvptx_tdata
256 {
257 const struct targ_ptx_obj *ptx_objs;
258 unsigned ptx_num;
259
260 const char *const *var_names;
261 unsigned var_num;
262
263 const struct targ_fn_launch *fn_descs;
264 unsigned fn_num;
265 } nvptx_tdata_t;
266
267 /* Descriptor of a loaded function. */
268
269 struct targ_fn_descriptor
270 {
271 CUfunction fn;
272 const struct targ_fn_launch *launch;
273 int regs_per_thread;
274 int max_threads_per_block;
275 };
276
277 /* A loaded PTX image. */
278 struct ptx_image_data
279 {
280 const void *target_data;
281 CUmodule module;
282
283 struct targ_fn_descriptor *fns; /* Array of functions. */
284
285 struct ptx_image_data *next;
286 };
287
288 struct ptx_free_block
289 {
290 void *ptr;
291 struct ptx_free_block *next;
292 };
293
294 struct ptx_device
295 {
296 CUcontext ctx;
297 bool ctx_shared;
298 CUdevice dev;
299
300 int ord;
301 bool overlap;
302 bool map;
303 bool concur;
304 bool mkern;
305 int mode;
306 int clock_khz;
307 int num_sms;
308 int regs_per_block;
309 int regs_per_sm;
310 int warp_size;
311 int max_threads_per_block;
312 int max_threads_per_multiprocessor;
313 int default_dims[GOMP_DIM_MAX];
314
315 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
316 char name[256];
317
318 struct ptx_image_data *images; /* Images loaded on device. */
319 pthread_mutex_t image_lock; /* Lock for above list. */
320
321 struct ptx_free_block *free_blocks;
322 pthread_mutex_t free_blocks_lock;
323
324 /* OpenMP stacks, cached between kernel invocations. */
325 struct
326 {
327 CUdeviceptr ptr;
328 size_t size;
329 pthread_mutex_t lock;
330 } omp_stacks;
331
332 struct ptx_device *next;
333 };
334
335 static struct ptx_device **ptx_devices;
336
337 static inline struct nvptx_thread *
338 nvptx_thread (void)
339 {
340 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
341 }
342
343 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
344 should be locked on entry and remains locked on exit. */
345
346 static bool
347 nvptx_init (void)
348 {
349 int ndevs;
350
351 if (instantiated_devices != 0)
352 return true;
353
354 if (!init_cuda_lib ())
355 return false;
356
357 CUDA_CALL (cuInit, 0);
358
359 int cuda_driver_version;
360 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
361 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
362 "CUDA Driver %u.%u",
363 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
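/* For example, a reported cuda_driver_version of 11040 yields the string
   "CUDA Driver 11.4" (11040 / 1000 == 11, 11040 % 1000 / 10 == 4).  */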
364
365 CUDA_CALL (cuDeviceGetCount, &ndevs);
366 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
367 * ndevs);
368
369 return true;
370 }
371
372 /* Select the N'th PTX device for the current host thread. The device must
373 have been opened before calling this function. */
374
375 static bool
376 nvptx_attach_host_thread_to_device (int n)
377 {
378 CUdevice dev;
379 CUresult r;
380 struct ptx_device *ptx_dev;
381 CUcontext thd_ctx;
382
383 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
384 if (r == CUDA_ERROR_NOT_PERMITTED)
385 {
386 /* Assume we're in a CUDA callback, just return true. */
387 return true;
388 }
389 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
390 {
391 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
392 return false;
393 }
394
395 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
396 return true;
397 else
398 {
399 CUcontext old_ctx;
400
401 ptx_dev = ptx_devices[n];
402 if (!ptx_dev)
403 {
404 GOMP_PLUGIN_error ("device %d not found", n);
405 return false;
406 }
407
408 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
409
410 /* We don't necessarily have a current context (e.g. if it has been
411 destroyed). Pop it if we do, though. */
412 if (thd_ctx != NULL)
413 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
414
415 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
416 }
417 return true;
418 }
419
420 static struct ptx_device *
421 nvptx_open_device (int n)
422 {
423 struct ptx_device *ptx_dev;
424 CUdevice dev, ctx_dev;
425 CUresult r;
426 int async_engines, pi;
427
428 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
429
430 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
431
432 ptx_dev->ord = n;
433 ptx_dev->dev = dev;
434 ptx_dev->ctx_shared = false;
435
436 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
437 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
438 {
439 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
440 return NULL;
441 }
442
443 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
444 {
445 /* The current host thread has an active context for a different device.
446 Detach it. */
447 CUcontext old_ctx;
448 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
449 }
450
451 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
452
453 if (!ptx_dev->ctx)
454 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
455 else
456 ptx_dev->ctx_shared = true;
457
458 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
459 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
460 ptx_dev->overlap = pi;
461
462 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
463 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
464 ptx_dev->map = pi;
465
466 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
467 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
468 ptx_dev->concur = pi;
469
470 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
471 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
472 ptx_dev->mode = pi;
473
474 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
475 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
476 ptx_dev->mkern = pi;
477
478 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
479 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
480 ptx_dev->clock_khz = pi;
481
482 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
483 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
484 ptx_dev->num_sms = pi;
485
486 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
487 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
488 ptx_dev->regs_per_block = pi;
489
490 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
491 in CUDA 6.0 and newer. */
492 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
493 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
494 dev);
495 /* Fallback: use limit of registers per block, which is usually equal. */
496 if (r == CUDA_ERROR_INVALID_VALUE)
497 pi = ptx_dev->regs_per_block;
498 else if (r != CUDA_SUCCESS)
499 {
500 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
501 return NULL;
502 }
503 ptx_dev->regs_per_sm = pi;
504
505 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
506 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
507 if (pi != 32)
508 {
509 GOMP_PLUGIN_error ("Only warp size 32 is supported");
510 return NULL;
511 }
512 ptx_dev->warp_size = pi;
513
514 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
515 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
516 ptx_dev->max_threads_per_block = pi;
517
518 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
519 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
520 ptx_dev->max_threads_per_multiprocessor = pi;
521
522 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
523 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
524 if (r != CUDA_SUCCESS)
525 async_engines = 1;
526
527 for (int i = 0; i != GOMP_DIM_MAX; i++)
528 ptx_dev->default_dims[i] = 0;
529
530 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
531 dev);
532
533 ptx_dev->images = NULL;
534 pthread_mutex_init (&ptx_dev->image_lock, NULL);
535
536 ptx_dev->free_blocks = NULL;
537 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
538
539 ptx_dev->omp_stacks.ptr = 0;
540 ptx_dev->omp_stacks.size = 0;
541 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
542
543 return ptx_dev;
544 }
545
546 static bool
547 nvptx_close_device (struct ptx_device *ptx_dev)
548 {
549 if (!ptx_dev)
550 return true;
551
552 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
553 {
554 struct ptx_free_block *b_next = b->next;
555 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
556 free (b);
557 b = b_next;
558 }
559
560 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
561 pthread_mutex_destroy (&ptx_dev->image_lock);
562
563 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
564
565 if (ptx_dev->omp_stacks.ptr)
566 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
567
568 if (!ptx_dev->ctx_shared)
569 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
570
571 free (ptx_dev);
572 return true;
573 }
574
575 static int
576 nvptx_get_num_devices (void)
577 {
578 int n;
579
580 /* This function will be called before the plugin has been initialized in
581 order to enumerate available devices, but CUDA API routines can't be used
582 until cuInit has been called. Just call it now (but don't yet do any
583 further initialization). */
584 if (instantiated_devices == 0)
585 {
586 if (!init_cuda_lib ())
587 return 0;
588 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
589 /* This is not an error: e.g. we may have CUDA libraries installed but
590 no devices available. */
591 if (r != CUDA_SUCCESS)
592 {
593 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
594 cuda_error (r));
595 return 0;
596 }
597 }
598
599 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
600 return n;
601 }
602
603 static void
604 notify_var (const char *var_name, const char *env_var)
605 {
606 if (env_var == NULL)
607 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
608 else
609 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
610 }
611
612 static void
613 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
614 {
615 const char *var_name = "GOMP_NVPTX_JIT";
616 const char *env_var = secure_getenv (var_name);
617 notify_var (var_name, env_var);
618
619 if (env_var == NULL)
620 return;
621
622 const char *c = env_var;
623 while (*c != '\0')
624 {
625 while (*c == ' ')
626 c++;
627
628 if (c[0] == '-' && c[1] == 'O'
629 && '0' <= c[2] && c[2] <= '4'
630 && (c[3] == '\0' || c[3] == ' '))
631 {
632 *gomp_nvptx_o = c[2] - '0';
633 c += 3;
634 continue;
635 }
636
637 GOMP_PLUGIN_error ("Error parsing %s", var_name);
638 break;
639 }
640 }
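/* For example, GOMP_NVPTX_JIT=-O3 in the environment makes the parser above
   store 3 into *gomp_nvptx_o, which link_ptx below forwards to the JIT as
   CU_JIT_OPTIMIZATION_LEVEL.  */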
641
642 static bool
643 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
644 unsigned num_objs)
645 {
646 CUjit_option opts[7];
647 void *optvals[7];
648 float elapsed = 0.0;
649 char elog[1024];
650 char ilog[16384];
651 CUlinkState linkstate;
652 CUresult r;
653 void *linkout;
654 size_t linkoutsize __attribute__ ((unused));
655
656 opts[0] = CU_JIT_WALL_TIME;
657 optvals[0] = &elapsed;
658
659 opts[1] = CU_JIT_INFO_LOG_BUFFER;
660 optvals[1] = &ilog[0];
661
662 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
663 optvals[2] = (void *) sizeof ilog;
664
665 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
666 optvals[3] = &elog[0];
667
668 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
669 optvals[4] = (void *) sizeof elog;
670
671 opts[5] = CU_JIT_LOG_VERBOSE;
672 optvals[5] = (void *) 1;
673
674 static intptr_t gomp_nvptx_o = -1;
675
676 static bool init_done = false;
677 if (!init_done)
678 {
679 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
680 init_done = true;
681 }
682
683 int nopts = 6;
684 if (gomp_nvptx_o != -1)
685 {
686 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
687 optvals[nopts] = (void *) gomp_nvptx_o;
688 nopts++;
689 }
690
691 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
692 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
693 else
694 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
695
696 for (; num_objs--; ptx_objs++)
697 {
698 /* cuLinkAddData's 'data' argument erroneously omits the const
699 qualifier. */
700 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
701 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
702 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
703 (char *) ptx_objs->code, ptx_objs->size,
704 0, 0, 0, 0);
705 else
706 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
707 (char *) ptx_objs->code, ptx_objs->size,
708 0, 0, 0, 0);
709 if (r != CUDA_SUCCESS)
710 {
711 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
712 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
713 cuda_error (r));
714 return false;
715 }
716 }
717
718 GOMP_PLUGIN_debug (0, "Linking\n");
719 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
720
721 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
722 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
723
724 if (r != CUDA_SUCCESS)
725 {
726 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
727 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
728 return false;
729 }
730
731 CUDA_CALL (cuModuleLoadData, module, linkout);
732 CUDA_CALL (cuLinkDestroy, linkstate);
733 return true;
734 }
735
736 static void
737 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
738 unsigned *dims, void *targ_mem_desc,
739 CUdeviceptr dp, CUstream stream)
740 {
741 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
742 CUfunction function;
743 int i;
744 void *kargs[1];
745 struct nvptx_thread *nvthd = nvptx_thread ();
746 int warp_size = nvthd->ptx_dev->warp_size;
747
748 function = targ_fn->fn;
749
750 /* Initialize the launch dimensions. Typically this is constant,
751 provided by the device compiler, but we must permit runtime
752 values. */
753 int seen_zero = 0;
754 for (i = 0; i != GOMP_DIM_MAX; i++)
755 {
756 if (targ_fn->launch->dim[i])
757 dims[i] = targ_fn->launch->dim[i];
758 if (!dims[i])
759 seen_zero = 1;
760 }
761
762 if (seen_zero)
763 {
764 pthread_mutex_lock (&ptx_dev_lock);
765
766 static int gomp_openacc_dims[GOMP_DIM_MAX];
767 if (!gomp_openacc_dims[0])
768 {
769 /* See if the user provided GOMP_OPENACC_DIM environment
770 variable to specify runtime defaults. */
771 for (int i = 0; i < GOMP_DIM_MAX; ++i)
772 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
773 }
774
775 if (!nvthd->ptx_dev->default_dims[0])
776 {
777 int default_dims[GOMP_DIM_MAX];
778 for (int i = 0; i < GOMP_DIM_MAX; ++i)
779 default_dims[i] = gomp_openacc_dims[i];
780
781 int gang, worker, vector;
782 {
783 int block_size = nvthd->ptx_dev->max_threads_per_block;
784 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
785 int dev_size = nvthd->ptx_dev->num_sms;
786 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
787 " dev_size=%d, cpu_size=%d\n",
788 warp_size, block_size, dev_size, cpu_size);
789
790 gang = (cpu_size / block_size) * dev_size;
791 worker = block_size / warp_size;
792 vector = warp_size;
793 }
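/* Worked example with hypothetical device limits: num_sms = 80,
   max_threads_per_block = 1024, max_threads_per_multiprocessor = 2048 and
   warp_size = 32 give gang = (2048 / 1024) * 80 = 160,
   worker = 1024 / 32 = 32 and vector = 32.  */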
794
795 /* There is no upper bound on the gang size. The best size
796 matches the hardware configuration. Logical gangs are
797 scheduled onto physical hardware. To maximize usage, we
798 should guess a large number. */
799 if (default_dims[GOMP_DIM_GANG] < 1)
800 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
801 /* The worker size must not exceed the hardware. */
802 if (default_dims[GOMP_DIM_WORKER] < 1
803 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
804 default_dims[GOMP_DIM_WORKER] = worker;
805 /* The vector size must exactly match the hardware. */
806 if (default_dims[GOMP_DIM_VECTOR] < 1
807 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
808 default_dims[GOMP_DIM_VECTOR] = vector;
809
810 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
811 default_dims[GOMP_DIM_GANG],
812 default_dims[GOMP_DIM_WORKER],
813 default_dims[GOMP_DIM_VECTOR]);
814
815 for (i = 0; i != GOMP_DIM_MAX; i++)
816 nvthd->ptx_dev->default_dims[i] = default_dims[i];
817 }
818 pthread_mutex_unlock (&ptx_dev_lock);
819
820 {
821 bool default_dim_p[GOMP_DIM_MAX];
822 for (i = 0; i != GOMP_DIM_MAX; i++)
823 default_dim_p[i] = !dims[i];
824
825 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
826 {
827 for (i = 0; i != GOMP_DIM_MAX; i++)
828 if (default_dim_p[i])
829 dims[i] = nvthd->ptx_dev->default_dims[i];
830
831 if (default_dim_p[GOMP_DIM_VECTOR])
832 dims[GOMP_DIM_VECTOR]
833 = MIN (dims[GOMP_DIM_VECTOR],
834 (targ_fn->max_threads_per_block / warp_size
835 * warp_size));
836
837 if (default_dim_p[GOMP_DIM_WORKER])
838 dims[GOMP_DIM_WORKER]
839 = MIN (dims[GOMP_DIM_WORKER],
840 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
841 }
842 else
843 {
844 /* Handle the case that the compiler allows the runtime to choose
845 the vector-length conservatively, by ignoring
846 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
847 it. */
848 int vectors = 0;
849 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
850 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
851 exceed targ_fn->max_threads_per_block. */
852 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
853 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
854 int grids, blocks;
855
856 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
857 &blocks, function, NULL, 0,
858 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
859 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
860 "grid = %d, block = %d\n", grids, blocks);
861
862 /* Keep the num_gangs proportional to the block size. In
863 the case where a block size is limited by shared-memory
864 or the register file capacity, the runtime will not
865 excessively over-assign gangs to the multiprocessor
866 units if their state is going to be swapped out even
867 more than necessary. The constant factor 2 is there to
868 prevent threads from idling when there is insufficient
869 work for them. */
870 if (gangs == 0)
871 gangs = 2 * grids * (blocks / warp_size);
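/* E.g. if cuOccupancyMaxPotentialBlockSize suggested grids = 40 and
   blocks = 512, then with warp_size = 32 this picks
   gangs = 2 * 40 * (512 / 32) = 1280.  */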
872
873 if (vectors == 0)
874 vectors = warp_size;
875
876 if (workers == 0)
877 {
878 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
879 ? vectors
880 : dims[GOMP_DIM_VECTOR]);
881 workers = blocks / actual_vectors;
882 workers = MAX (workers, 1);
883 /* If we need a per-worker barrier ... . */
884 if (actual_vectors > 32)
885 /* Don't use more barriers than available. */
886 workers = MIN (workers, 15);
887 }
888
889 for (i = 0; i != GOMP_DIM_MAX; i++)
890 if (default_dim_p[i])
891 switch (i)
892 {
893 case GOMP_DIM_GANG: dims[i] = gangs; break;
894 case GOMP_DIM_WORKER: dims[i] = workers; break;
895 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
896 default: GOMP_PLUGIN_fatal ("invalid dim");
897 }
898 }
899 }
900 }
901
902 /* Check if the accelerator has sufficient hardware resources to
903 launch the offloaded kernel. */
904 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
905 > targ_fn->max_threads_per_block)
906 {
907 const char *msg
908 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
909 " with num_workers = %d and vector_length = %d"
910 "; "
911 "recompile the program with 'num_workers = x and vector_length = y'"
912 " on that offloaded region or '-fopenacc-dim=:x:y' where"
913 " x * y <= %d"
914 ".\n");
915 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
916 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
917 }
918
919 /* Check if the accelerator has sufficient barrier resources to
920 launch the offloaded kernel. */
921 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
922 {
923 const char *msg
924 = ("The Nvidia accelerator has insufficient barrier resources to launch"
925 " '%s' with num_workers = %d and vector_length = %d"
926 "; "
927 "recompile the program with 'num_workers = x' on that offloaded"
928 " region or '-fopenacc-dim=:x:' where x <= 15"
929 "; "
930 "or, recompile the program with 'vector_length = 32' on that"
931 " offloaded region or '-fopenacc-dim=::32'"
932 ".\n");
933 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
934 dims[GOMP_DIM_VECTOR]);
935 }
936
937 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
938 " gangs=%u, workers=%u, vectors=%u\n",
939 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
940 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
941
942 // OpenACC CUDA
943 //
944 // num_gangs nctaid.x
945 // num_workers ntid.y
946 // vector length ntid.x
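// In terms of the cuLaunchKernel call issued below, this means
// gridDimX = dims[GOMP_DIM_GANG], blockDimX = dims[GOMP_DIM_VECTOR] and
// blockDimY = dims[GOMP_DIM_WORKER].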
947
948 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
949 acc_prof_info *prof_info = thr->prof_info;
950 acc_event_info enqueue_launch_event_info;
951 acc_api_info *api_info = thr->api_info;
952 bool profiling_p = __builtin_expect (prof_info != NULL, false);
953 if (profiling_p)
954 {
955 prof_info->event_type = acc_ev_enqueue_launch_start;
956
957 enqueue_launch_event_info.launch_event.event_type
958 = prof_info->event_type;
959 enqueue_launch_event_info.launch_event.valid_bytes
960 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
961 enqueue_launch_event_info.launch_event.parent_construct
962 = acc_construct_parallel;
963 enqueue_launch_event_info.launch_event.implicit = 1;
964 enqueue_launch_event_info.launch_event.tool_info = NULL;
965 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
966 enqueue_launch_event_info.launch_event.num_gangs
967 = dims[GOMP_DIM_GANG];
968 enqueue_launch_event_info.launch_event.num_workers
969 = dims[GOMP_DIM_WORKER];
970 enqueue_launch_event_info.launch_event.vector_length
971 = dims[GOMP_DIM_VECTOR];
972
973 api_info->device_api = acc_device_api_cuda;
974
975 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
976 api_info);
977 }
978
979 kargs[0] = &dp;
980 CUDA_CALL_ASSERT (cuLaunchKernel, function,
981 dims[GOMP_DIM_GANG], 1, 1,
982 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
983 0, stream, kargs, 0);
984
985 if (profiling_p)
986 {
987 prof_info->event_type = acc_ev_enqueue_launch_end;
988 enqueue_launch_event_info.launch_event.event_type
989 = prof_info->event_type;
990 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
991 api_info);
992 }
993
994 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
995 targ_fn->launch->fn);
996 }
997
998 void * openacc_get_current_cuda_context (void);
999
1000 static void
1001 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1002 {
1003 acc_prof_info *prof_info = thr->prof_info;
1004 acc_event_info data_event_info;
1005 acc_api_info *api_info = thr->api_info;
1006
1007 prof_info->event_type = acc_ev_alloc;
1008
1009 data_event_info.data_event.event_type = prof_info->event_type;
1010 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1011 data_event_info.data_event.parent_construct = acc_construct_parallel;
1012 data_event_info.data_event.implicit = 1;
1013 data_event_info.data_event.tool_info = NULL;
1014 data_event_info.data_event.var_name = NULL;
1015 data_event_info.data_event.bytes = s;
1016 data_event_info.data_event.host_ptr = NULL;
1017 data_event_info.data_event.device_ptr = dp;
1018
1019 api_info->device_api = acc_device_api_cuda;
1020
1021 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1022 }
1023
1024 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1025 size threshold, or if FORCE is true. */
1026
1027 static void
1028 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1029 {
1030 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1031 if (ptx_dev->omp_stacks.ptr
1032 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1033 {
1034 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1035 if (r != CUDA_SUCCESS)
1036 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1037 ptx_dev->omp_stacks.ptr = 0;
1038 ptx_dev->omp_stacks.size = 0;
1039 }
1040 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1041 }
1042
1043 static void *
1044 nvptx_alloc (size_t s, bool suppress_errors)
1045 {
1046 CUdeviceptr d;
1047
1048 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1049 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1050 return NULL;
1051 else if (r != CUDA_SUCCESS)
1052 {
1053 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1054 return NULL;
1055 }
1056
1057 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1058 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1059 bool profiling_p
1060 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1061 if (profiling_p)
1062 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1063
1064 return (void *) d;
1065 }
1066
1067 static void
1068 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1069 {
1070 acc_prof_info *prof_info = thr->prof_info;
1071 acc_event_info data_event_info;
1072 acc_api_info *api_info = thr->api_info;
1073
1074 prof_info->event_type = acc_ev_free;
1075
1076 data_event_info.data_event.event_type = prof_info->event_type;
1077 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1078 data_event_info.data_event.parent_construct = acc_construct_parallel;
1079 data_event_info.data_event.implicit = 1;
1080 data_event_info.data_event.tool_info = NULL;
1081 data_event_info.data_event.var_name = NULL;
1082 data_event_info.data_event.bytes = -1;
1083 data_event_info.data_event.host_ptr = NULL;
1084 data_event_info.data_event.device_ptr = p;
1085
1086 api_info->device_api = acc_device_api_cuda;
1087
1088 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1089 }
1090
1091 static bool
1092 nvptx_free (void *p, struct ptx_device *ptx_dev)
1093 {
1094 CUdeviceptr pb;
1095 size_t ps;
1096
1097 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1098 (CUdeviceptr) p);
1099 if (r == CUDA_ERROR_NOT_PERMITTED)
1100 {
1101 /* We assume that this error indicates we are in a CUDA callback context,
1102 where CUDA calls are not allowed (see cuStreamAddCallback
1103 documentation for description). Arrange to free this piece of device
1104 memory later. */
1105 struct ptx_free_block *n
1106 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1107 n->ptr = p;
1108 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1109 n->next = ptx_dev->free_blocks;
1110 ptx_dev->free_blocks = n;
1111 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1112 return true;
1113 }
1114 else if (r != CUDA_SUCCESS)
1115 {
1116 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1117 return false;
1118 }
1119 if ((CUdeviceptr) p != pb)
1120 {
1121 GOMP_PLUGIN_error ("invalid device address");
1122 return false;
1123 }
1124
1125 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1126 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1127 bool profiling_p
1128 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1129 if (profiling_p)
1130 goacc_profiling_acc_ev_free (thr, p);
1131
1132 return true;
1133 }
1134
1135 static void *
1136 nvptx_get_current_cuda_device (void)
1137 {
1138 struct nvptx_thread *nvthd = nvptx_thread ();
1139
1140 if (!nvthd || !nvthd->ptx_dev)
1141 return NULL;
1142
1143 return &nvthd->ptx_dev->dev;
1144 }
1145
1146 static void *
1147 nvptx_get_current_cuda_context (void)
1148 {
1149 struct nvptx_thread *nvthd = nvptx_thread ();
1150
1151 if (!nvthd || !nvthd->ptx_dev)
1152 return NULL;
1153
1154 return nvthd->ptx_dev->ctx;
1155 }
1156
1157 /* Plugin entry points. */
1158
1159 const char *
1160 GOMP_OFFLOAD_get_name (void)
1161 {
1162 return "nvptx";
1163 }
1164
1165 unsigned int
1166 GOMP_OFFLOAD_get_caps (void)
1167 {
1168 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1169 }
1170
1171 int
1172 GOMP_OFFLOAD_get_type (void)
1173 {
1174 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1175 }
1176
1177 int
1178 GOMP_OFFLOAD_get_num_devices (void)
1179 {
1180 return nvptx_get_num_devices ();
1181 }
1182
1183 bool
1184 GOMP_OFFLOAD_init_device (int n)
1185 {
1186 struct ptx_device *dev;
1187
1188 pthread_mutex_lock (&ptx_dev_lock);
1189
1190 if (!nvptx_init () || ptx_devices[n] != NULL)
1191 {
1192 pthread_mutex_unlock (&ptx_dev_lock);
1193 return false;
1194 }
1195
1196 dev = nvptx_open_device (n);
1197 if (dev)
1198 {
1199 ptx_devices[n] = dev;
1200 instantiated_devices++;
1201 }
1202
1203 pthread_mutex_unlock (&ptx_dev_lock);
1204
1205 return dev != NULL;
1206 }
1207
1208 bool
1209 GOMP_OFFLOAD_fini_device (int n)
1210 {
1211 pthread_mutex_lock (&ptx_dev_lock);
1212
1213 if (ptx_devices[n] != NULL)
1214 {
1215 if (!nvptx_attach_host_thread_to_device (n)
1216 || !nvptx_close_device (ptx_devices[n]))
1217 {
1218 pthread_mutex_unlock (&ptx_dev_lock);
1219 return false;
1220 }
1221 ptx_devices[n] = NULL;
1222 instantiated_devices--;
1223 }
1224
1225 if (instantiated_devices == 0)
1226 {
1227 free (ptx_devices);
1228 ptx_devices = NULL;
1229 }
1230
1231 pthread_mutex_unlock (&ptx_dev_lock);
1232 return true;
1233 }
1234
1235 /* Return the libgomp version number we're compatible with. There is
1236 no requirement for cross-version compatibility. */
1237
1238 unsigned
1239 GOMP_OFFLOAD_version (void)
1240 {
1241 return GOMP_VERSION;
1242 }
1243
1244 /* Initialize __nvptx_clocktick, if present in MODULE. */
1245
1246 static void
1247 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1248 {
1249 CUdeviceptr dptr;
1250 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1251 module, "__nvptx_clocktick");
1252 if (r == CUDA_ERROR_NOT_FOUND)
1253 return;
1254 if (r != CUDA_SUCCESS)
1255 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1256 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1257 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1258 sizeof (__nvptx_clocktick));
1259 if (r != CUDA_SUCCESS)
1260 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1261 }
1262
1263 /* Load the (partial) program described by TARGET_DATA to device
1264 number ORD. Allocate and return TARGET_TABLE. */
1265
1266 int
1267 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1268 struct addr_pair **target_table)
1269 {
1270 CUmodule module;
1271 const char *const *var_names;
1272 const struct targ_fn_launch *fn_descs;
1273 unsigned int fn_entries, var_entries, other_entries, i, j;
1274 struct targ_fn_descriptor *targ_fns;
1275 struct addr_pair *targ_tbl;
1276 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1277 struct ptx_image_data *new_image;
1278 struct ptx_device *dev;
1279
1280 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1281 {
1282 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1283 " (expected %u, received %u)",
1284 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1285 return -1;
1286 }
1287
1288 if (!nvptx_attach_host_thread_to_device (ord)
1289 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1290 return -1;
1291
1292 dev = ptx_devices[ord];
1293
1294 /* The mkoffload utility emits a struct of pointers/integers at the
1295 start of each offload image. The array of kernel names and the
1296 function addresses form a one-to-one correspondence. */
1297
1298 var_entries = img_header->var_num;
1299 var_names = img_header->var_names;
1300 fn_entries = img_header->fn_num;
1301 fn_descs = img_header->fn_descs;
1302
1303 /* Currently, the only other entry kind is 'device number'. */
1304 other_entries = 1;
1305
1306 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1307 * (fn_entries + var_entries + other_entries));
1308 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1309 * fn_entries);
1310
1311 *target_table = targ_tbl;
1312
1313 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1314 new_image->target_data = target_data;
1315 new_image->module = module;
1316 new_image->fns = targ_fns;
1317
1318 pthread_mutex_lock (&dev->image_lock);
1319 new_image->next = dev->images;
1320 dev->images = new_image;
1321 pthread_mutex_unlock (&dev->image_lock);
1322
1323 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1324 {
1325 CUfunction function;
1326 int nregs, mthrs;
1327
1328 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1329 fn_descs[i].fn);
1330 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1331 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1332 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1333 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1334
1335 targ_fns->fn = function;
1336 targ_fns->launch = &fn_descs[i];
1337 targ_fns->regs_per_thread = nregs;
1338 targ_fns->max_threads_per_block = mthrs;
1339
1340 targ_tbl->start = (uintptr_t) targ_fns;
1341 targ_tbl->end = targ_tbl->start + 1;
1342 }
1343
1344 for (j = 0; j < var_entries; j++, targ_tbl++)
1345 {
1346 CUdeviceptr var;
1347 size_t bytes;
1348
1349 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1350 &var, &bytes, module, var_names[j]);
1351
1352 targ_tbl->start = (uintptr_t) var;
1353 targ_tbl->end = targ_tbl->start + bytes;
1354 }
1355
1356 CUdeviceptr device_num_varptr;
1357 size_t device_num_varsize;
1358 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &device_num_varptr,
1359 &device_num_varsize, module,
1360 XSTRING (GOMP_DEVICE_NUM_VAR));
1361 if (r == CUDA_SUCCESS)
1362 {
1363 targ_tbl->start = (uintptr_t) device_num_varptr;
1364 targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
1365 }
1366 else
1367 /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image. */
1368 targ_tbl->start = targ_tbl->end = 0;
1369 targ_tbl++;
1370
1371 nvptx_set_clocktick (module, dev);
1372
1373 return fn_entries + var_entries + other_entries;
1374 }
1375
1376 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1377 array of function descriptors allocated by G_O_load_image. */
1378
1379 bool
1380 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1381 {
1382 struct ptx_image_data *image, **prev_p;
1383 struct ptx_device *dev = ptx_devices[ord];
1384
1385 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1386 {
1387 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1388 " (expected %u, received %u)",
1389 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1390 return false;
1391 }
1392
1393 bool ret = true;
1394 pthread_mutex_lock (&dev->image_lock);
1395 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1396 if (image->target_data == target_data)
1397 {
1398 *prev_p = image->next;
1399 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1400 ret = false;
1401 free (image->fns);
1402 free (image);
1403 break;
1404 }
1405 pthread_mutex_unlock (&dev->image_lock);
1406 return ret;
1407 }
1408
1409 void *
1410 GOMP_OFFLOAD_alloc (int ord, size_t size)
1411 {
1412 if (!nvptx_attach_host_thread_to_device (ord))
1413 return NULL;
1414
1415 struct ptx_device *ptx_dev = ptx_devices[ord];
1416 struct ptx_free_block *blocks, *tmp;
1417
1418 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1419 blocks = ptx_dev->free_blocks;
1420 ptx_dev->free_blocks = NULL;
1421 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1422
1423 nvptx_stacks_free (ptx_dev, false);
1424
1425 while (blocks)
1426 {
1427 tmp = blocks->next;
1428 nvptx_free (blocks->ptr, ptx_dev);
1429 free (blocks);
1430 blocks = tmp;
1431 }
1432
1433 void *d = nvptx_alloc (size, true);
1434 if (d)
1435 return d;
1436 else
1437 {
1438 /* Memory allocation failed. Try freeing the stacks block, and
1439 retrying. */
1440 nvptx_stacks_free (ptx_dev, true);
1441 return nvptx_alloc (size, false);
1442 }
1443 }
1444
1445 bool
1446 GOMP_OFFLOAD_free (int ord, void *ptr)
1447 {
1448 return (nvptx_attach_host_thread_to_device (ord)
1449 && nvptx_free (ptr, ptx_devices[ord]));
1450 }
1451
1452 void
1453 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1454 void **hostaddrs, void **devaddrs,
1455 unsigned *dims, void *targ_mem_desc)
1456 {
1457 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1458
1459 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1460 acc_prof_info *prof_info = thr->prof_info;
1461 acc_event_info data_event_info;
1462 acc_api_info *api_info = thr->api_info;
1463 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1464
1465 void **hp = NULL;
1466 CUdeviceptr dp = 0;
1467
1468 if (mapnum > 0)
1469 {
1470 size_t s = mapnum * sizeof (void *);
1471 hp = alloca (s);
1472 for (int i = 0; i < mapnum; i++)
1473 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1474 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1475 if (profiling_p)
1476 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1477 }
1478
1479 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1480 fact have the same value on a unified-memory system). */
1481 if (mapnum > 0)
1482 {
1483 if (profiling_p)
1484 {
1485 prof_info->event_type = acc_ev_enqueue_upload_start;
1486
1487 data_event_info.data_event.event_type = prof_info->event_type;
1488 data_event_info.data_event.valid_bytes
1489 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1490 data_event_info.data_event.parent_construct
1491 = acc_construct_parallel;
1492 data_event_info.data_event.implicit = 1; /* Always implicit. */
1493 data_event_info.data_event.tool_info = NULL;
1494 data_event_info.data_event.var_name = NULL;
1495 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1496 data_event_info.data_event.host_ptr = hp;
1497 data_event_info.data_event.device_ptr = (const void *) dp;
1498
1499 api_info->device_api = acc_device_api_cuda;
1500
1501 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1502 api_info);
1503 }
1504 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1505 mapnum * sizeof (void *));
1506 if (profiling_p)
1507 {
1508 prof_info->event_type = acc_ev_enqueue_upload_end;
1509 data_event_info.data_event.event_type = prof_info->event_type;
1510 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1511 api_info);
1512 }
1513 }
1514
1515 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1516 dp, NULL);
1517
1518 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1519 const char *maybe_abort_msg = "(perhaps abort was called)";
1520 if (r == CUDA_ERROR_LAUNCH_FAILED)
1521 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1522 maybe_abort_msg);
1523 else if (r != CUDA_SUCCESS)
1524 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1525
1526 CUDA_CALL_ASSERT (cuMemFree, dp);
1527 if (profiling_p)
1528 goacc_profiling_acc_ev_free (thr, (void *) dp);
1529 }
1530
1531 static void
1532 cuda_free_argmem (void *ptr)
1533 {
1534 void **block = (void **) ptr;
1535 nvptx_free (block[0], (struct ptx_device *) block[1]);
1536 free (block);
1537 }
1538
1539 void
1540 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1541 void **hostaddrs, void **devaddrs,
1542 unsigned *dims, void *targ_mem_desc,
1543 struct goacc_asyncqueue *aq)
1544 {
1545 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1546
1547 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1548 acc_prof_info *prof_info = thr->prof_info;
1549 acc_event_info data_event_info;
1550 acc_api_info *api_info = thr->api_info;
1551 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1552
1553 void **hp = NULL;
1554 CUdeviceptr dp = 0;
1555 void **block = NULL;
1556
1557 if (mapnum > 0)
1558 {
1559 size_t s = mapnum * sizeof (void *);
1560 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1561 hp = block + 2;
1562 for (int i = 0; i < mapnum; i++)
1563 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1564 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1565 if (profiling_p)
1566 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1567 }
1568
1569 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1570 fact have the same value on a unified-memory system). */
1571 if (mapnum > 0)
1572 {
1573 if (profiling_p)
1574 {
1575 prof_info->event_type = acc_ev_enqueue_upload_start;
1576
1577 data_event_info.data_event.event_type = prof_info->event_type;
1578 data_event_info.data_event.valid_bytes
1579 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1580 data_event_info.data_event.parent_construct
1581 = acc_construct_parallel;
1582 data_event_info.data_event.implicit = 1; /* Always implicit. */
1583 data_event_info.data_event.tool_info = NULL;
1584 data_event_info.data_event.var_name = NULL;
1585 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1586 data_event_info.data_event.host_ptr = hp;
1587 data_event_info.data_event.device_ptr = (const void *) dp;
1588
1589 api_info->device_api = acc_device_api_cuda;
1590
1591 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1592 api_info);
1593 }
1594
1595 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1596 mapnum * sizeof (void *), aq->cuda_stream);
1597 block[0] = (void *) dp;
1598
1599 struct nvptx_thread *nvthd =
1600 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1601 block[1] = (void *) nvthd->ptx_dev;
1602
1603 if (profiling_p)
1604 {
1605 prof_info->event_type = acc_ev_enqueue_upload_end;
1606 data_event_info.data_event.event_type = prof_info->event_type;
1607 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1608 api_info);
1609 }
1610 }
1611
1612 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1613 dp, aq->cuda_stream);
1614
1615 if (mapnum > 0)
1616 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1617 }
1618
1619 void *
1620 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1621 {
1622 struct ptx_device *ptx_dev;
1623 struct nvptx_thread *nvthd
1624 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1625 CUcontext thd_ctx;
1626
1627 ptx_dev = ptx_devices[ord];
1628
1629 assert (ptx_dev);
1630
1631 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1632
1633 assert (ptx_dev->ctx);
1634
1635 if (!thd_ctx)
1636 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1637
1638 nvthd->ptx_dev = ptx_dev;
1639
1640 return (void *) nvthd;
1641 }
1642
1643 void
1644 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1645 {
1646 free (data);
1647 }
1648
1649 void *
1650 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1651 {
1652 return nvptx_get_current_cuda_device ();
1653 }
1654
1655 void *
1656 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1657 {
1658 return nvptx_get_current_cuda_context ();
1659 }
1660
1661 /* This returns a CUstream. */
1662 void *
1663 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1664 {
1665 return (void *) aq->cuda_stream;
1666 }
1667
1668 /* This takes a CUstream. */
1669 int
1670 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1671 {
1672 if (aq->cuda_stream)
1673 {
1674 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1675 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1676 }
1677
1678 aq->cuda_stream = (CUstream) stream;
1679 return 1;
1680 }
1681
1682 struct goacc_asyncqueue *
1683 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1684 {
1685 CUstream stream = NULL;
1686 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1687
1688 struct goacc_asyncqueue *aq
1689 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1690 aq->cuda_stream = stream;
1691 return aq;
1692 }
1693
1694 bool
1695 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1696 {
1697 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1698 free (aq);
1699 return true;
1700 }
1701
1702 int
1703 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1704 {
1705 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1706 if (r == CUDA_SUCCESS)
1707 return 1;
1708 if (r == CUDA_ERROR_NOT_READY)
1709 return 0;
1710
1711 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1712 return -1;
1713 }
1714
1715 bool
1716 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1717 {
1718 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1719 return true;
1720 }
1721
1722 bool
1723 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1724 struct goacc_asyncqueue *aq2)
1725 {
1726 CUevent e;
1727 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1728 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1729 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1730 return true;
1731 }
1732
1733 static void
1734 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1735 {
1736 if (res != CUDA_SUCCESS)
1737 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1738 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1739 cb->fn (cb->ptr);
1740 free (ptr);
1741 }
1742
1743 void
1744 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1745 void (*callback_fn)(void *),
1746 void *userptr)
1747 {
1748 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1749 b->fn = callback_fn;
1750 b->ptr = userptr;
1751 b->aq = aq;
1752 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1753 cuda_callback_wrapper, (void *) b, 0);
1754 }
1755
1756 static bool
1757 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1758 {
1759 CUdeviceptr pb;
1760 size_t ps;
1761 if (!s)
1762 return true;
1763 if (!d)
1764 {
1765 GOMP_PLUGIN_error ("invalid device address");
1766 return false;
1767 }
1768 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1769 if (!pb)
1770 {
1771 GOMP_PLUGIN_error ("invalid device address");
1772 return false;
1773 }
1774 if (!h)
1775 {
1776 GOMP_PLUGIN_error ("invalid host address");
1777 return false;
1778 }
1779 if (d == h)
1780 {
1781 GOMP_PLUGIN_error ("invalid host or device address");
1782 return false;
1783 }
1784 if ((void *)(d + s) > (void *)(pb + ps))
1785 {
1786 GOMP_PLUGIN_error ("invalid size");
1787 return false;
1788 }
1789 return true;
1790 }
1791
1792 bool
1793 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1794 {
1795 if (!nvptx_attach_host_thread_to_device (ord)
1796 || !cuda_memcpy_sanity_check (src, dst, n))
1797 return false;
1798 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1799 return true;
1800 }
1801
1802 bool
1803 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1804 {
1805 if (!nvptx_attach_host_thread_to_device (ord)
1806 || !cuda_memcpy_sanity_check (dst, src, n))
1807 return false;
1808 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1809 return true;
1810 }
1811
1812 bool
1813 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1814 {
1815 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1816 return true;
1817 }
1818
1819 bool
1820 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1821 size_t n, struct goacc_asyncqueue *aq)
1822 {
1823 if (!nvptx_attach_host_thread_to_device (ord)
1824 || !cuda_memcpy_sanity_check (src, dst, n))
1825 return false;
1826 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1827 return true;
1828 }
1829
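/* As GOMP_OFFLOAD_dev2host, but enqueue the copy on queue AQ's stream rather
   than blocking.  */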
bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}

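/* Return the value of the OpenACC device property PROP for device N, or a
   zero-initialized value if N is not a valid device or PROP is not
   supported.  */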
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
        size_t total_mem;

        CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
        propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
        size_t total_mem;
        size_t free_mem;
        CUdevice ctxdev;

        CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
        if (ptx_dev->dev == ctxdev)
          CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
        else if (ptx_dev->ctx)
          {
            CUcontext old_ctx;

            CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
            CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
            CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
          }
        else
          {
            CUcontext new_ctx;

            CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
                            ptx_dev->dev);
            CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
            CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
          }
        propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}

/* Adjust launch dimensions: pick good values for the number of blocks and
   warps, and ensure that the number of warps does not exceed CUDA limits or
   GCC's own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
                            struct ptx_device *ptx_dev,
                            int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     back end and libgcc, and matches the documented limit of all GPUs as of
     2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     with the "occupancy control" driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

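/* Illustrative sketch only, compiled out: since CUDA 6.0 the occupancy API
   mentioned above can suggest a launch configuration directly, which could in
   principle replace the register-based estimate in
   nvptx_adjust_launch_bounds.  The helper name and its use here are
   hypothetical, not part of the plugin.  */
#if 0
static void
nvptx_occupancy_suggest (CUfunction function, int *blocks_p, int *warps_p)
{
  int min_grid_size, block_size;
  /* Ask the driver for a block size that maximizes occupancy, assuming no
     dynamic shared memory and no upper bound on the block size.  */
  CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &min_grid_size,
                    &block_size, function, NULL, 0, 0);
  *blocks_p = min_grid_size;
  /* Kernels are launched with 32 lanes per warp (see GOMP_OFFLOAD_run).  */
  *warps_p = block_size / 32;
}
#endif
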
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each SIZE bytes, reusing the
   cached block if it is large enough.  The lock for the storage should be
   held on entry, and remains held on exit.  */

static void *
nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    return (void *) ptx_dev->omp_stacks.ptr;

  /* Free the old, too-small stacks.  */
  if (ptx_dev->omp_stacks.ptr)
    {
      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    }

  /* Make new and bigger stacks, and remember where we put them and how big
     they are.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
                                  size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));

  ptx_dev->omp_stacks.size = size * num;

  return (void *) ptx_dev->omp_stacks.ptr;
}

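/* Launch the OpenMP target region TGT_FN with argument block TGT_VARS on
   device ORD, honoring any num_teams/thread_limit values passed via ARGS,
   and wait for the kernel to complete.  */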
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  struct targ_fn_descriptor *tgt_fn_desc
    = (struct targ_fn_descriptor *) tgt_fn;
  CUfunction function = tgt_fn_desc->fn;
  const struct targ_fn_launch *launch = tgt_fn_desc->launch;
  const char *fn_name = launch->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
        val = (intptr_t) *args++;
      else
        val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
        continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
        teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
        threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();

  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
                     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
                     __FUNCTION__, fn_name, teams, threads);
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
                         32, threads, 1, 0, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}

/* TODO: Implement GOMP_OFFLOAD_async_run.  */