1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2020 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
40 #include "oacc-int.h"
41
42 #include <pthread.h>
43 #include <cuda.h>
44 #include <stdbool.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
51
52 #if CUDA_VERSION < 6000
53 extern CUresult cuGetErrorString (CUresult, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
55 #endif
56
57 #if CUDA_VERSION >= 6050
58 #undef cuLinkCreate
59 #undef cuLinkAddData
60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
61 const char *, unsigned, CUjit_option *, void **);
62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
63 #else
64 typedef size_t (*CUoccupancyB2DSize)(int);
65 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
66 const char *, unsigned, CUjit_option *, void **);
67 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
68 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
69 CUoccupancyB2DSize, size_t, int);
70 #endif
71
72 #define DO_PRAGMA(x) _Pragma (#x)
73
74 #if PLUGIN_NVPTX_DYNAMIC
75 # include <dlfcn.h>
76
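/* Table of CUDA driver API entry points, filled in by init_cuda_lib from
   the function list in cuda-lib.def.  */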
77 struct cuda_lib_s {
78
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
82 CUDA_ONE_CALL (call)
83 #include "cuda-lib.def"
84 # undef CUDA_ONE_CALL
85 # undef CUDA_ONE_CALL_MAYBE_NULL
86
87 } cuda_lib;
88
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited = -1;
92
/* Dynamically load the CUDA driver library and initialize function
   pointers; return false if unsuccessful, true if successful.  */
95 static bool
init_cuda_lib (void)
97 {
98 if (cuda_lib_inited != -1)
99 return cuda_lib_inited;
100 const char *cuda_runtime_lib = "libcuda.so.1";
101 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
102 cuda_lib_inited = false;
103 if (h == NULL)
104 return false;
105
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
111 return false;
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
116
117 cuda_lib_inited = true;
118 return true;
119 }
120 # define CUDA_CALL_PREFIX cuda_lib.
121 #else
122
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
127 #undef CUDA_ONE_CALL
128
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
131 #endif
132
133 #include "secure_getenv.h"
134
135 #undef MIN
136 #undef MAX
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
139
/* Convenience macros for the frequently used CUDA library call plus
   error-handling sequence, as well as for CUDA library calls that do the
   error checking themselves or do not need it at all.  */
143
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
145 do { \
146 unsigned __r \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
149 { \
150 GOMP_PLUGIN_error (#FN " error: %s", \
151 cuda_error (__r)); \
152 return ERET; \
153 } \
154 } while (0)
155
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
158
159 #define CUDA_CALL_ASSERT(FN, ...) \
160 do { \
161 unsigned __r \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
164 { \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
166 cuda_error (__r)); \
167 } \
168 } while (0)
169
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
172
173 #define CUDA_CALL_EXISTS(FN) \
174 CUDA_CALL_PREFIX FN
175
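/* Return a human-readable string for CUDA result code R, falling back to a
   generic message if cuGetErrorString is not available.  */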
176 static const char *
cuda_error (CUresult r)
178 {
179 const char *fallback = "unknown cuda error";
180 const char *desc;
181
182 if (!CUDA_CALL_EXISTS (cuGetErrorString))
183 return fallback;
184
185 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
186 if (r == CUDA_SUCCESS)
187 return desc;
188
189 return fallback;
190 }
191
192 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
193 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
194 static char cuda_driver_version_s[30];
195
196 static unsigned int instantiated_devices = 0;
197 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
198
199 /* NVPTX/CUDA specific definition of asynchronous queues. */
200 struct goacc_asyncqueue
201 {
202 CUstream cuda_stream;
203 };
204
205 struct nvptx_callback
206 {
207 void (*fn) (void *);
208 void *ptr;
209 struct goacc_asyncqueue *aq;
210 struct nvptx_callback *next;
211 };
212
213 /* Thread-specific data for PTX. */
214
215 struct nvptx_thread
216 {
217 /* We currently have this embedded inside the plugin because libgomp manages
218 devices through integer target_ids. This might be better if using an
219 opaque target-specific pointer directly from gomp_device_descr. */
220 struct ptx_device *ptx_dev;
221 };
222
223 /* Target data function launch information. */
224
225 struct targ_fn_launch
226 {
227 const char *fn;
228 unsigned short dim[GOMP_DIM_MAX];
229 };
230
231 /* Target PTX object information. */
232
233 struct targ_ptx_obj
234 {
235 const char *code;
236 size_t size;
237 };
238
239 /* Target data image information. */
240
241 typedef struct nvptx_tdata
242 {
243 const struct targ_ptx_obj *ptx_objs;
244 unsigned ptx_num;
245
246 const char *const *var_names;
247 unsigned var_num;
248
249 const struct targ_fn_launch *fn_descs;
250 unsigned fn_num;
251 } nvptx_tdata_t;
252
253 /* Descriptor of a loaded function. */
254
255 struct targ_fn_descriptor
256 {
257 CUfunction fn;
258 const struct targ_fn_launch *launch;
259 int regs_per_thread;
260 int max_threads_per_block;
261 };
262
263 /* A loaded PTX image. */
264 struct ptx_image_data
265 {
266 const void *target_data;
267 CUmodule module;
268
269 struct targ_fn_descriptor *fns; /* Array of functions. */
270
271 struct ptx_image_data *next;
272 };
273
274 struct ptx_free_block
275 {
276 void *ptr;
277 struct ptx_free_block *next;
278 };
279
280 struct ptx_device
281 {
282 CUcontext ctx;
283 bool ctx_shared;
284 CUdevice dev;
285
286 int ord;
287 bool overlap;
288 bool map;
289 bool concur;
290 bool mkern;
291 int mode;
292 int clock_khz;
293 int num_sms;
294 int regs_per_block;
295 int regs_per_sm;
296 int warp_size;
297 int max_threads_per_block;
298 int max_threads_per_multiprocessor;
299 int default_dims[GOMP_DIM_MAX];
300
301 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
302 char name[256];
303
304 struct ptx_image_data *images; /* Images loaded on device. */
305 pthread_mutex_t image_lock; /* Lock for above list. */
306
307 struct ptx_free_block *free_blocks;
308 pthread_mutex_t free_blocks_lock;
309
310 struct ptx_device *next;
311 };
312
313 static struct ptx_device **ptx_devices;
314
315 static inline struct nvptx_thread *
nvptx_thread (void)
317 {
318 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
319 }
320
321 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
322 should be locked on entry and remains locked on exit. */
323
324 static bool
nvptx_init (void)
326 {
327 int ndevs;
328
329 if (instantiated_devices != 0)
330 return true;
331
332 if (!init_cuda_lib ())
333 return false;
334
335 CUDA_CALL (cuInit, 0);
336
337 int cuda_driver_version;
338 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
339 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
340 "CUDA Driver %u.%u",
341 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
342
343 CUDA_CALL (cuDeviceGetCount, &ndevs);
344 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
345 * ndevs);
346
347 return true;
348 }
349
/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */
352
353 static bool
nvptx_attach_host_thread_to_device (int n)
355 {
356 CUdevice dev;
357 CUresult r;
358 struct ptx_device *ptx_dev;
359 CUcontext thd_ctx;
360
361 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
362 if (r == CUDA_ERROR_NOT_PERMITTED)
363 {
364 /* Assume we're in a CUDA callback, just return true. */
365 return true;
366 }
367 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
368 {
369 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
370 return false;
371 }
372
373 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
374 return true;
375 else
376 {
377 CUcontext old_ctx;
378
379 ptx_dev = ptx_devices[n];
380 if (!ptx_dev)
381 {
382 GOMP_PLUGIN_error ("device %d not found", n);
383 return false;
384 }
385
386 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
387
      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
390 if (thd_ctx != NULL)
391 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
392
393 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
394 }
395 return true;
396 }
397
398 static struct ptx_device *
nvptx_open_device (int n)
400 {
401 struct ptx_device *ptx_dev;
402 CUdevice dev, ctx_dev;
403 CUresult r;
404 int async_engines, pi;
405
406 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
407
408 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
409
410 ptx_dev->ord = n;
411 ptx_dev->dev = dev;
412 ptx_dev->ctx_shared = false;
413
414 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
415 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
416 {
417 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
418 return NULL;
419 }
420
421 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
422 {
423 /* The current host thread has an active context for a different device.
424 Detach it. */
425 CUcontext old_ctx;
426 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
427 }
428
429 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
430
431 if (!ptx_dev->ctx)
432 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
433 else
434 ptx_dev->ctx_shared = true;
435
436 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
437 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
438 ptx_dev->overlap = pi;
439
440 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
441 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
442 ptx_dev->map = pi;
443
444 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
445 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
446 ptx_dev->concur = pi;
447
448 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
449 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
450 ptx_dev->mode = pi;
451
452 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
453 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
454 ptx_dev->mkern = pi;
455
456 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
457 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
458 ptx_dev->clock_khz = pi;
459
460 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
461 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
462 ptx_dev->num_sms = pi;
463
464 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
465 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
466 ptx_dev->regs_per_block = pi;
467
468 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
469 in CUDA 6.0 and newer. */
470 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
471 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
472 dev);
473 /* Fallback: use limit of registers per block, which is usually equal. */
474 if (r == CUDA_ERROR_INVALID_VALUE)
475 pi = ptx_dev->regs_per_block;
476 else if (r != CUDA_SUCCESS)
477 {
478 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
479 return NULL;
480 }
481 ptx_dev->regs_per_sm = pi;
482
483 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
484 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
485 if (pi != 32)
486 {
487 GOMP_PLUGIN_error ("Only warp size 32 is supported");
488 return NULL;
489 }
490 ptx_dev->warp_size = pi;
491
492 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
493 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
494 ptx_dev->max_threads_per_block = pi;
495
496 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
497 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
498 ptx_dev->max_threads_per_multiprocessor = pi;
499
500 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
501 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
502 if (r != CUDA_SUCCESS)
503 async_engines = 1;
504
505 for (int i = 0; i != GOMP_DIM_MAX; i++)
506 ptx_dev->default_dims[i] = 0;
507
508 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
509 dev);
510
511 ptx_dev->images = NULL;
512 pthread_mutex_init (&ptx_dev->image_lock, NULL);
513
514 ptx_dev->free_blocks = NULL;
515 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
516
517 return ptx_dev;
518 }
519
520 static bool
nvptx_close_device (struct ptx_device *ptx_dev)
522 {
523 if (!ptx_dev)
524 return true;
525
526 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
527 {
528 struct ptx_free_block *b_next = b->next;
529 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
530 free (b);
531 b = b_next;
532 }
533
534 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
535 pthread_mutex_destroy (&ptx_dev->image_lock);
536
537 if (!ptx_dev->ctx_shared)
538 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
539
540 free (ptx_dev);
541 return true;
542 }
543
544 static int
nvptx_get_num_devices (void)
546 {
547 int n;
548
549 /* This function will be called before the plugin has been initialized in
550 order to enumerate available devices, but CUDA API routines can't be used
551 until cuInit has been called. Just call it now (but don't yet do any
552 further initialization). */
553 if (instantiated_devices == 0)
554 {
555 if (!init_cuda_lib ())
556 return 0;
557 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
558 /* This is not an error: e.g. we may have CUDA libraries installed but
559 no devices available. */
560 if (r != CUDA_SUCCESS)
561 {
562 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
563 cuda_error (r));
564 return 0;
565 }
566 }
567
568 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
569 return n;
570 }
571
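/* Report in the debug output whether environment variable VAR_NAME is set,
   and to what value (ENV_VAR).  */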
572 static void
notify_var (const char *var_name, const char *env_var)
574 {
575 if (env_var == NULL)
576 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
577 else
578 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
579 }
580
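/* Parse the GOMP_NVPTX_JIT environment variable.  The only setting currently
   recognized is '-O<level>' with <level> in the range 0..4, which is stored
   in *GOMP_NVPTX_O; anything else is diagnosed as a parse error.  */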
581 static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
583 {
584 const char *var_name = "GOMP_NVPTX_JIT";
585 const char *env_var = secure_getenv (var_name);
586 notify_var (var_name, env_var);
587
588 if (env_var == NULL)
589 return;
590
591 const char *c = env_var;
592 while (*c != '\0')
593 {
594 while (*c == ' ')
595 c++;
596
597 if (c[0] == '-' && c[1] == 'O'
598 && '0' <= c[2] && c[2] <= '4'
599 && (c[3] == '\0' || c[3] == ' '))
600 {
601 *gomp_nvptx_o = c[2] - '0';
602 c += 3;
603 continue;
604 }
605
606 GOMP_PLUGIN_error ("Error parsing %s", var_name);
607 break;
608 }
609 }
610
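/* JIT-link the NUM_OBJS PTX objects in PTX_OBJS into a CUDA module, returned
   in *MODULE.  Return true on success, false (with an error reported) on
   failure.  */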
611 static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
          unsigned num_objs)
614 {
615 CUjit_option opts[7];
616 void *optvals[7];
617 float elapsed = 0.0;
618 char elog[1024];
619 char ilog[16384];
620 CUlinkState linkstate;
621 CUresult r;
622 void *linkout;
623 size_t linkoutsize __attribute__ ((unused));
624
625 opts[0] = CU_JIT_WALL_TIME;
626 optvals[0] = &elapsed;
627
628 opts[1] = CU_JIT_INFO_LOG_BUFFER;
629 optvals[1] = &ilog[0];
630
631 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
632 optvals[2] = (void *) sizeof ilog;
633
634 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
635 optvals[3] = &elog[0];
636
637 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
638 optvals[4] = (void *) sizeof elog;
639
640 opts[5] = CU_JIT_LOG_VERBOSE;
641 optvals[5] = (void *) 1;
642
643 static intptr_t gomp_nvptx_o = -1;
644
645 static bool init_done = false;
646 if (!init_done)
647 {
648 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
649 init_done = true;
650 }
651
652 int nopts = 6;
653 if (gomp_nvptx_o != -1)
654 {
655 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
656 optvals[nopts] = (void *) gomp_nvptx_o;
657 nopts++;
658 }
659
660 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
661 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
662 else
663 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
664
665 for (; num_objs--; ptx_objs++)
666 {
667 /* cuLinkAddData's 'data' argument erroneously omits the const
668 qualifier. */
669 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
670 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
671 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
672 (char *) ptx_objs->code, ptx_objs->size,
673 0, 0, 0, 0);
674 else
675 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
676 (char *) ptx_objs->code, ptx_objs->size,
677 0, 0, 0, 0);
678 if (r != CUDA_SUCCESS)
679 {
680 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
681 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
682 cuda_error (r));
683 return false;
684 }
685 }
686
687 GOMP_PLUGIN_debug (0, "Linking\n");
688 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
689
690 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
691 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
692
693 if (r != CUDA_SUCCESS)
694 {
695 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
696 return false;
697 }
698
699 CUDA_CALL (cuModuleLoadData, module, linkout);
700 CUDA_CALL (cuLinkDestroy, linkstate);
701 return true;
702 }
703
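/* Launch the kernel described by FN (a targ_fn_descriptor) on STREAM with
   the single device-pointer argument DP.  Launch dimensions come from the
   compiler-recorded values in DIMS, with zeros filled in from runtime
   defaults.  */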
704 static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            unsigned *dims, void *targ_mem_desc,
            CUdeviceptr dp, CUstream stream)
708 {
709 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
710 CUfunction function;
711 int i;
712 void *kargs[1];
713 struct nvptx_thread *nvthd = nvptx_thread ();
714 int warp_size = nvthd->ptx_dev->warp_size;
715
716 function = targ_fn->fn;
717
718 /* Initialize the launch dimensions. Typically this is constant,
719 provided by the device compiler, but we must permit runtime
720 values. */
721 int seen_zero = 0;
722 for (i = 0; i != GOMP_DIM_MAX; i++)
723 {
724 if (targ_fn->launch->dim[i])
725 dims[i] = targ_fn->launch->dim[i];
726 if (!dims[i])
727 seen_zero = 1;
728 }
729
730 if (seen_zero)
731 {
732 pthread_mutex_lock (&ptx_dev_lock);
733
734 static int gomp_openacc_dims[GOMP_DIM_MAX];
735 if (!gomp_openacc_dims[0])
736 {
	  /* See if the user provided the GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
739 for (int i = 0; i < GOMP_DIM_MAX; ++i)
740 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
741 }
742
743 if (!nvthd->ptx_dev->default_dims[0])
744 {
745 int default_dims[GOMP_DIM_MAX];
746 for (int i = 0; i < GOMP_DIM_MAX; ++i)
747 default_dims[i] = gomp_openacc_dims[i];
748
749 int gang, worker, vector;
750 {
751 int block_size = nvthd->ptx_dev->max_threads_per_block;
752 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
753 int dev_size = nvthd->ptx_dev->num_sms;
754 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
755 " dev_size=%d, cpu_size=%d\n",
756 warp_size, block_size, dev_size, cpu_size);
757
758 gang = (cpu_size / block_size) * dev_size;
759 worker = block_size / warp_size;
760 vector = warp_size;
761 }
762
763 /* There is no upper bound on the gang size. The best size
764 matches the hardware configuration. Logical gangs are
765 scheduled onto physical hardware. To maximize usage, we
766 should guess a large number. */
767 if (default_dims[GOMP_DIM_GANG] < 1)
768 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
769 /* The worker size must not exceed the hardware. */
770 if (default_dims[GOMP_DIM_WORKER] < 1
771 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
772 default_dims[GOMP_DIM_WORKER] = worker;
773 /* The vector size must exactly match the hardware. */
774 if (default_dims[GOMP_DIM_VECTOR] < 1
775 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
776 default_dims[GOMP_DIM_VECTOR] = vector;
777
778 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
779 default_dims[GOMP_DIM_GANG],
780 default_dims[GOMP_DIM_WORKER],
781 default_dims[GOMP_DIM_VECTOR]);
782
783 for (i = 0; i != GOMP_DIM_MAX; i++)
784 nvthd->ptx_dev->default_dims[i] = default_dims[i];
785 }
786 pthread_mutex_unlock (&ptx_dev_lock);
787
788 {
789 bool default_dim_p[GOMP_DIM_MAX];
790 for (i = 0; i != GOMP_DIM_MAX; i++)
791 default_dim_p[i] = !dims[i];
792
793 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
794 {
795 for (i = 0; i != GOMP_DIM_MAX; i++)
796 if (default_dim_p[i])
797 dims[i] = nvthd->ptx_dev->default_dims[i];
798
799 if (default_dim_p[GOMP_DIM_VECTOR])
800 dims[GOMP_DIM_VECTOR]
801 = MIN (dims[GOMP_DIM_VECTOR],
802 (targ_fn->max_threads_per_block / warp_size
803 * warp_size));
804
805 if (default_dim_p[GOMP_DIM_WORKER])
806 dims[GOMP_DIM_WORKER]
807 = MIN (dims[GOMP_DIM_WORKER],
808 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
809 }
810 else
811 {
812 /* Handle the case that the compiler allows the runtime to choose
813 the vector-length conservatively, by ignoring
814 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
815 it. */
816 int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
820 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
821 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
822 int grids, blocks;
823
824 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
825 &blocks, function, NULL, 0,
826 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
827 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
828 "grid = %d, block = %d\n", grids, blocks);
829
	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over-assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
838 if (gangs == 0)
839 gangs = 2 * grids * (blocks / warp_size);
840
841 if (vectors == 0)
842 vectors = warp_size;
843
844 if (workers == 0)
845 {
846 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
847 ? vectors
848 : dims[GOMP_DIM_VECTOR]);
849 workers = blocks / actual_vectors;
850 workers = MAX (workers, 1);
851 /* If we need a per-worker barrier ... . */
852 if (actual_vectors > 32)
853 /* Don't use more barriers than available. */
854 workers = MIN (workers, 15);
855 }
856
857 for (i = 0; i != GOMP_DIM_MAX; i++)
858 if (default_dim_p[i])
859 switch (i)
860 {
861 case GOMP_DIM_GANG: dims[i] = gangs; break;
862 case GOMP_DIM_WORKER: dims[i] = workers; break;
863 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
864 default: GOMP_PLUGIN_fatal ("invalid dim");
865 }
866 }
867 }
868 }
869
870 /* Check if the accelerator has sufficient hardware resources to
871 launch the offloaded kernel. */
872 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
873 > targ_fn->max_threads_per_block)
874 {
875 const char *msg
876 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
877 " with num_workers = %d and vector_length = %d"
878 "; "
879 "recompile the program with 'num_workers = x and vector_length = y'"
880 " on that offloaded region or '-fopenacc-dim=:x:y' where"
881 " x * y <= %d"
882 ".\n");
883 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
884 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
885 }
886
887 /* Check if the accelerator has sufficient barrier resources to
888 launch the offloaded kernel. */
889 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
890 {
891 const char *msg
892 = ("The Nvidia accelerator has insufficient barrier resources to launch"
893 " '%s' with num_workers = %d and vector_length = %d"
894 "; "
895 "recompile the program with 'num_workers = x' on that offloaded"
896 " region or '-fopenacc-dim=:x:' where x <= 15"
897 "; "
898 "or, recompile the program with 'vector_length = 32' on that"
899 " offloaded region or '-fopenacc-dim=::32'"
900 ".\n");
901 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
902 dims[GOMP_DIM_VECTOR]);
903 }
904
905 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
906 " gangs=%u, workers=%u, vectors=%u\n",
907 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
908 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
909
910 // OpenACC CUDA
911 //
912 // num_gangs nctaid.x
913 // num_workers ntid.y
914 // vector length ntid.x
915
916 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
917 acc_prof_info *prof_info = thr->prof_info;
918 acc_event_info enqueue_launch_event_info;
919 acc_api_info *api_info = thr->api_info;
920 bool profiling_p = __builtin_expect (prof_info != NULL, false);
921 if (profiling_p)
922 {
923 prof_info->event_type = acc_ev_enqueue_launch_start;
924
925 enqueue_launch_event_info.launch_event.event_type
926 = prof_info->event_type;
927 enqueue_launch_event_info.launch_event.valid_bytes
928 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
929 enqueue_launch_event_info.launch_event.parent_construct
930 = acc_construct_parallel;
931 enqueue_launch_event_info.launch_event.implicit = 1;
932 enqueue_launch_event_info.launch_event.tool_info = NULL;
933 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
934 enqueue_launch_event_info.launch_event.num_gangs
935 = dims[GOMP_DIM_GANG];
936 enqueue_launch_event_info.launch_event.num_workers
937 = dims[GOMP_DIM_WORKER];
938 enqueue_launch_event_info.launch_event.vector_length
939 = dims[GOMP_DIM_VECTOR];
940
941 api_info->device_api = acc_device_api_cuda;
942
943 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
944 api_info);
945 }
946
947 kargs[0] = &dp;
948 CUDA_CALL_ASSERT (cuLaunchKernel, function,
949 dims[GOMP_DIM_GANG], 1, 1,
950 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
951 0, stream, kargs, 0);
952
953 if (profiling_p)
954 {
955 prof_info->event_type = acc_ev_enqueue_launch_end;
956 enqueue_launch_event_info.launch_event.event_type
957 = prof_info->event_type;
958 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
959 api_info);
960 }
961
962 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
963 targ_fn->launch->fn);
964 }
965
966 void * openacc_get_current_cuda_context (void);
967
968 static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
970 {
971 acc_prof_info *prof_info = thr->prof_info;
972 acc_event_info data_event_info;
973 acc_api_info *api_info = thr->api_info;
974
975 prof_info->event_type = acc_ev_alloc;
976
977 data_event_info.data_event.event_type = prof_info->event_type;
978 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
979 data_event_info.data_event.parent_construct = acc_construct_parallel;
980 data_event_info.data_event.implicit = 1;
981 data_event_info.data_event.tool_info = NULL;
982 data_event_info.data_event.var_name = NULL;
983 data_event_info.data_event.bytes = s;
984 data_event_info.data_event.host_ptr = NULL;
985 data_event_info.data_event.device_ptr = dp;
986
987 api_info->device_api = acc_device_api_cuda;
988
989 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
990 }
991
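/* Allocate S bytes of device memory, emitting an acc_ev_alloc profiling
   event if profiling is enabled.  Return the device pointer, or NULL on
   failure.  */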
992 static void *
nvptx_alloc (size_t s)
994 {
995 CUdeviceptr d;
996
997 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
998 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
999 bool profiling_p
1000 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1001 if (profiling_p)
1002 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1003
1004 return (void *) d;
1005 }
1006
1007 static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1009 {
1010 acc_prof_info *prof_info = thr->prof_info;
1011 acc_event_info data_event_info;
1012 acc_api_info *api_info = thr->api_info;
1013
1014 prof_info->event_type = acc_ev_free;
1015
1016 data_event_info.data_event.event_type = prof_info->event_type;
1017 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1018 data_event_info.data_event.parent_construct = acc_construct_parallel;
1019 data_event_info.data_event.implicit = 1;
1020 data_event_info.data_event.tool_info = NULL;
1021 data_event_info.data_event.var_name = NULL;
1022 data_event_info.data_event.bytes = -1;
1023 data_event_info.data_event.host_ptr = NULL;
1024 data_event_info.data_event.device_ptr = p;
1025
1026 api_info->device_api = acc_device_api_cuda;
1027
1028 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1029 }
1030
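/* Free device memory P belonging to PTX_DEV.  If we appear to be inside a
   CUDA callback, where CUDA API calls are not permitted, queue the block on
   the device's free list to be released later.  */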
1031 static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
1033 {
1034 CUdeviceptr pb;
1035 size_t ps;
1036
1037 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1038 (CUdeviceptr) p);
1039 if (r == CUDA_ERROR_NOT_PERMITTED)
1040 {
      /* We assume that this error indicates we are in a CUDA callback
	 context, where no CUDA API calls are allowed (see the
	 cuStreamAddCallback documentation for details).  Arrange to free
	 this piece of device memory later.  */
1045 struct ptx_free_block *n
1046 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1047 n->ptr = p;
1048 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1049 n->next = ptx_dev->free_blocks;
1050 ptx_dev->free_blocks = n;
1051 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1052 return true;
1053 }
1054 else if (r != CUDA_SUCCESS)
1055 {
1056 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1057 return false;
1058 }
1059 if ((CUdeviceptr) p != pb)
1060 {
1061 GOMP_PLUGIN_error ("invalid device address");
1062 return false;
1063 }
1064
1065 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1066 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1067 bool profiling_p
1068 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1069 if (profiling_p)
1070 goacc_profiling_acc_ev_free (thr, p);
1071
1072 return true;
1073 }
1074
1075 static void *
nvptx_get_current_cuda_device (void)
1077 {
1078 struct nvptx_thread *nvthd = nvptx_thread ();
1079
1080 if (!nvthd || !nvthd->ptx_dev)
1081 return NULL;
1082
1083 return &nvthd->ptx_dev->dev;
1084 }
1085
1086 static void *
nvptx_get_current_cuda_context (void)
1088 {
1089 struct nvptx_thread *nvthd = nvptx_thread ();
1090
1091 if (!nvthd || !nvthd->ptx_dev)
1092 return NULL;
1093
1094 return nvthd->ptx_dev->ctx;
1095 }
1096
1097 /* Plugin entry points. */
1098
1099 const char *
GOMP_OFFLOAD_get_name (void)
1101 {
1102 return "nvptx";
1103 }
1104
1105 unsigned int
GOMP_OFFLOAD_get_caps (void)
1107 {
1108 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1109 }
1110
1111 int
GOMP_OFFLOAD_get_type (void)
1113 {
1114 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1115 }
1116
1117 int
GOMP_OFFLOAD_get_num_devices (void)
1119 {
1120 return nvptx_get_num_devices ();
1121 }
1122
1123 bool
GOMP_OFFLOAD_init_device (int n)
1125 {
1126 struct ptx_device *dev;
1127
1128 pthread_mutex_lock (&ptx_dev_lock);
1129
1130 if (!nvptx_init () || ptx_devices[n] != NULL)
1131 {
1132 pthread_mutex_unlock (&ptx_dev_lock);
1133 return false;
1134 }
1135
1136 dev = nvptx_open_device (n);
1137 if (dev)
1138 {
1139 ptx_devices[n] = dev;
1140 instantiated_devices++;
1141 }
1142
1143 pthread_mutex_unlock (&ptx_dev_lock);
1144
1145 return dev != NULL;
1146 }
1147
1148 bool
GOMP_OFFLOAD_fini_device (int n)
1150 {
1151 pthread_mutex_lock (&ptx_dev_lock);
1152
1153 if (ptx_devices[n] != NULL)
1154 {
1155 if (!nvptx_attach_host_thread_to_device (n)
1156 || !nvptx_close_device (ptx_devices[n]))
1157 {
1158 pthread_mutex_unlock (&ptx_dev_lock);
1159 return false;
1160 }
1161 ptx_devices[n] = NULL;
1162 instantiated_devices--;
1163 }
1164
1165 if (instantiated_devices == 0)
1166 {
1167 free (ptx_devices);
1168 ptx_devices = NULL;
1169 }
1170
1171 pthread_mutex_unlock (&ptx_dev_lock);
1172 return true;
1173 }
1174
1175 /* Return the libgomp version number we're compatible with. There is
1176 no requirement for cross-version compatibility. */
1177
1178 unsigned
GOMP_OFFLOAD_version (void)
1180 {
1181 return GOMP_VERSION;
1182 }
1183
1184 /* Initialize __nvptx_clocktick, if present in MODULE. */
1185
1186 static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1188 {
1189 CUdeviceptr dptr;
1190 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1191 module, "__nvptx_clocktick");
1192 if (r == CUDA_ERROR_NOT_FOUND)
1193 return;
1194 if (r != CUDA_SUCCESS)
1195 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1196 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1197 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1198 sizeof (__nvptx_clocktick));
1199 if (r != CUDA_SUCCESS)
1200 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1201 }
1202
1203 /* Load the (partial) program described by TARGET_DATA to device
1204 number ORD. Allocate and return TARGET_TABLE. */
1205
1206 int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table)
1209 {
1210 CUmodule module;
1211 const char *const *var_names;
1212 const struct targ_fn_launch *fn_descs;
1213 unsigned int fn_entries, var_entries, i, j;
1214 struct targ_fn_descriptor *targ_fns;
1215 struct addr_pair *targ_tbl;
1216 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1217 struct ptx_image_data *new_image;
1218 struct ptx_device *dev;
1219
1220 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1221 {
1222 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1223 " (expected %u, received %u)",
1224 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1225 return -1;
1226 }
1227
1228 if (!nvptx_attach_host_thread_to_device (ord)
1229 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1230 return -1;
1231
1232 dev = ptx_devices[ord];
1233
  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */
1237
1238 var_entries = img_header->var_num;
1239 var_names = img_header->var_names;
1240 fn_entries = img_header->fn_num;
1241 fn_descs = img_header->fn_descs;
1242
1243 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1244 * (fn_entries + var_entries));
1245 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1246 * fn_entries);
1247
1248 *target_table = targ_tbl;
1249
1250 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1251 new_image->target_data = target_data;
1252 new_image->module = module;
1253 new_image->fns = targ_fns;
1254
1255 pthread_mutex_lock (&dev->image_lock);
1256 new_image->next = dev->images;
1257 dev->images = new_image;
1258 pthread_mutex_unlock (&dev->image_lock);
1259
1260 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1261 {
1262 CUfunction function;
1263 int nregs, mthrs;
1264
1265 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1266 fn_descs[i].fn);
1267 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1268 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1269 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1270 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1271
1272 targ_fns->fn = function;
1273 targ_fns->launch = &fn_descs[i];
1274 targ_fns->regs_per_thread = nregs;
1275 targ_fns->max_threads_per_block = mthrs;
1276
1277 targ_tbl->start = (uintptr_t) targ_fns;
1278 targ_tbl->end = targ_tbl->start + 1;
1279 }
1280
1281 for (j = 0; j < var_entries; j++, targ_tbl++)
1282 {
1283 CUdeviceptr var;
1284 size_t bytes;
1285
1286 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1287 &var, &bytes, module, var_names[j]);
1288
1289 targ_tbl->start = (uintptr_t) var;
1290 targ_tbl->end = targ_tbl->start + bytes;
1291 }
1292
1293 nvptx_set_clocktick (module, dev);
1294
1295 return fn_entries + var_entries;
1296 }
1297
1298 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1299 function descriptors allocated by G_O_load_image. */
1300
1301 bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1303 {
1304 struct ptx_image_data *image, **prev_p;
1305 struct ptx_device *dev = ptx_devices[ord];
1306
1307 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1308 {
1309 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1310 " (expected %u, received %u)",
1311 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1312 return false;
1313 }
1314
1315 bool ret = true;
1316 pthread_mutex_lock (&dev->image_lock);
1317 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1318 if (image->target_data == target_data)
1319 {
1320 *prev_p = image->next;
1321 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1322 ret = false;
1323 free (image->fns);
1324 free (image);
1325 break;
1326 }
1327 pthread_mutex_unlock (&dev->image_lock);
1328 return ret;
1329 }
1330
1331 void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
1333 {
1334 if (!nvptx_attach_host_thread_to_device (ord))
1335 return NULL;
1336
1337 struct ptx_device *ptx_dev = ptx_devices[ord];
1338 struct ptx_free_block *blocks, *tmp;
1339
1340 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1341 blocks = ptx_dev->free_blocks;
1342 ptx_dev->free_blocks = NULL;
1343 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1344
1345 while (blocks)
1346 {
1347 tmp = blocks->next;
1348 nvptx_free (blocks->ptr, ptx_dev);
1349 free (blocks);
1350 blocks = tmp;
1351 }
1352
1353 return nvptx_alloc (size);
1354 }
1355
1356 bool
GOMP_OFFLOAD_free (int ord, void *ptr)
1358 {
1359 return (nvptx_attach_host_thread_to_device (ord)
1360 && nvptx_free (ptr, ptx_devices[ord]));
1361 }
1362
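/* Execute an OpenACC kernel synchronously: stage the argument pointers in a
   device buffer, launch via nvptx_exec on the default stream, wait for
   completion, and release the buffer.  */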
1363 void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
                           void **hostaddrs, void **devaddrs,
                           unsigned *dims, void *targ_mem_desc)
1367 {
1368 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1369
1370 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1371 acc_prof_info *prof_info = thr->prof_info;
1372 acc_event_info data_event_info;
1373 acc_api_info *api_info = thr->api_info;
1374 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1375
1376 void **hp = NULL;
1377 CUdeviceptr dp = 0;
1378
1379 if (mapnum > 0)
1380 {
1381 size_t s = mapnum * sizeof (void *);
1382 hp = alloca (s);
1383 for (int i = 0; i < mapnum; i++)
1384 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1385 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1386 if (profiling_p)
1387 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1388 }
1389
1390 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1391 fact have the same value on a unified-memory system). */
1392 if (mapnum > 0)
1393 {
1394 if (profiling_p)
1395 {
1396 prof_info->event_type = acc_ev_enqueue_upload_start;
1397
1398 data_event_info.data_event.event_type = prof_info->event_type;
1399 data_event_info.data_event.valid_bytes
1400 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1401 data_event_info.data_event.parent_construct
1402 = acc_construct_parallel;
1403 data_event_info.data_event.implicit = 1; /* Always implicit. */
1404 data_event_info.data_event.tool_info = NULL;
1405 data_event_info.data_event.var_name = NULL;
1406 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1407 data_event_info.data_event.host_ptr = hp;
1408 data_event_info.data_event.device_ptr = (const void *) dp;
1409
1410 api_info->device_api = acc_device_api_cuda;
1411
1412 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1413 api_info);
1414 }
1415 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1416 mapnum * sizeof (void *));
1417 if (profiling_p)
1418 {
1419 prof_info->event_type = acc_ev_enqueue_upload_end;
1420 data_event_info.data_event.event_type = prof_info->event_type;
1421 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1422 api_info);
1423 }
1424 }
1425
1426 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1427 dp, NULL);
1428
1429 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1430 const char *maybe_abort_msg = "(perhaps abort was called)";
1431 if (r == CUDA_ERROR_LAUNCH_FAILED)
1432 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1433 maybe_abort_msg);
1434 else if (r != CUDA_SUCCESS)
1435 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1436
1437 CUDA_CALL_ASSERT (cuMemFree, dp);
1438 if (profiling_p)
1439 goacc_profiling_acc_ev_free (thr, (void *) dp);
1440 }
1441
1442 static void
cuda_free_argmem (void *ptr)
1444 {
1445 void **block = (void **) ptr;
1446 nvptx_free (block[0], (struct ptx_device *) block[1]);
1447 free (block);
1448 }
1449
1450 void
GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
                                 void **hostaddrs, void **devaddrs,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
1455 {
1456 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1457
1458 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1459 acc_prof_info *prof_info = thr->prof_info;
1460 acc_event_info data_event_info;
1461 acc_api_info *api_info = thr->api_info;
1462 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1463
1464 void **hp = NULL;
1465 CUdeviceptr dp = 0;
1466 void **block = NULL;
1467
1468 if (mapnum > 0)
1469 {
1470 size_t s = mapnum * sizeof (void *);
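      /* The first two slots of BLOCK record the device argument buffer and
	 the owning ptx_device; the host copy of the argument pointers
	 follows at BLOCK + 2.  The whole block is released again by
	 cuda_free_argmem once the kernel has completed.  */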
1471 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1472 hp = block + 2;
1473 for (int i = 0; i < mapnum; i++)
1474 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1475 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1476 if (profiling_p)
1477 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1478 }
1479
1480 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1481 fact have the same value on a unified-memory system). */
1482 if (mapnum > 0)
1483 {
1484 if (profiling_p)
1485 {
1486 prof_info->event_type = acc_ev_enqueue_upload_start;
1487
1488 data_event_info.data_event.event_type = prof_info->event_type;
1489 data_event_info.data_event.valid_bytes
1490 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1491 data_event_info.data_event.parent_construct
1492 = acc_construct_parallel;
1493 data_event_info.data_event.implicit = 1; /* Always implicit. */
1494 data_event_info.data_event.tool_info = NULL;
1495 data_event_info.data_event.var_name = NULL;
1496 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1497 data_event_info.data_event.host_ptr = hp;
1498 data_event_info.data_event.device_ptr = (const void *) dp;
1499
1500 api_info->device_api = acc_device_api_cuda;
1501
1502 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1503 api_info);
1504 }
1505
1506 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1507 mapnum * sizeof (void *), aq->cuda_stream);
1508 block[0] = (void *) dp;
1509
1510 struct nvptx_thread *nvthd =
1511 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1512 block[1] = (void *) nvthd->ptx_dev;
1513
1514 if (profiling_p)
1515 {
1516 prof_info->event_type = acc_ev_enqueue_upload_end;
1517 data_event_info.data_event.event_type = prof_info->event_type;
1518 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1519 api_info);
1520 }
1521 }
1522
1523 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1524 dp, aq->cuda_stream);
1525
1526 if (mapnum > 0)
1527 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1528 }
1529
1530 void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1532 {
1533 struct ptx_device *ptx_dev;
1534 struct nvptx_thread *nvthd
1535 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1536 CUcontext thd_ctx;
1537
1538 ptx_dev = ptx_devices[ord];
1539
1540 assert (ptx_dev);
1541
1542 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1543
1544 assert (ptx_dev->ctx);
1545
1546 if (!thd_ctx)
1547 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1548
1549 nvthd->ptx_dev = ptx_dev;
1550
1551 return (void *) nvthd;
1552 }
1553
1554 void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1556 {
1557 free (data);
1558 }
1559
1560 void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1562 {
1563 return nvptx_get_current_cuda_device ();
1564 }
1565
1566 void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1568 {
1569 return nvptx_get_current_cuda_context ();
1570 }
1571
1572 /* This returns a CUstream. */
1573 void *
GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1575 {
1576 return (void *) aq->cuda_stream;
1577 }
1578
1579 /* This takes a CUstream. */
1580 int
GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1582 {
1583 if (aq->cuda_stream)
1584 {
1585 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1586 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1587 }
1588
1589 aq->cuda_stream = (CUstream) stream;
1590 return 1;
1591 }
1592
1593 struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1595 {
1596 CUstream stream = NULL;
1597 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1598
1599 struct goacc_asyncqueue *aq
1600 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1601 aq->cuda_stream = stream;
1602 return aq;
1603 }
1604
1605 bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1607 {
1608 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1609 free (aq);
1610 return true;
1611 }
1612
1613 int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1615 {
1616 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1617 if (r == CUDA_SUCCESS)
1618 return 1;
1619 if (r == CUDA_ERROR_NOT_READY)
1620 return 0;
1621
1622 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1623 return -1;
1624 }
1625
1626 bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1628 {
1629 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1630 return true;
1631 }
1632
1633 bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
1636 {
1637 CUevent e;
1638 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1639 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1640 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1641 return true;
1642 }
1643
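/* Trampoline registered with cuStreamAddCallback: run the user callback
   recorded in the nvptx_callback structure PTR, then free it.  */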
1644 static void
cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1646 {
1647 if (res != CUDA_SUCCESS)
1648 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1649 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1650 cb->fn (cb->ptr);
1651 free (ptr);
1652 }
1653
1654 void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*callback_fn)(void *),
                                           void *userptr)
1658 {
1659 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1660 b->fn = callback_fn;
1661 b->ptr = userptr;
1662 b->aq = aq;
1663 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1664 cuda_callback_wrapper, (void *) b, 0);
1665 }
1666
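/* Sanity-check a host/device copy of S bytes: D must be a device address
   within a single allocation large enough to hold S bytes, and H a valid
   host address distinct from D.  Zero-sized copies trivially succeed.  */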
1667 static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1669 {
1670 CUdeviceptr pb;
1671 size_t ps;
1672 if (!s)
1673 return true;
1674 if (!d)
1675 {
1676 GOMP_PLUGIN_error ("invalid device address");
1677 return false;
1678 }
1679 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1680 if (!pb)
1681 {
1682 GOMP_PLUGIN_error ("invalid device address");
1683 return false;
1684 }
1685 if (!h)
1686 {
1687 GOMP_PLUGIN_error ("invalid host address");
1688 return false;
1689 }
1690 if (d == h)
1691 {
1692 GOMP_PLUGIN_error ("invalid host or device address");
1693 return false;
1694 }
1695 if ((void *)(d + s) > (void *)(pb + ps))
1696 {
1697 GOMP_PLUGIN_error ("invalid size");
1698 return false;
1699 }
1700 return true;
1701 }
1702
1703 bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1705 {
1706 if (!nvptx_attach_host_thread_to_device (ord)
1707 || !cuda_memcpy_sanity_check (src, dst, n))
1708 return false;
1709 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1710 return true;
1711 }
1712
1713 bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1715 {
1716 if (!nvptx_attach_host_thread_to_device (ord)
1717 || !cuda_memcpy_sanity_check (dst, src, n))
1718 return false;
1719 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1720 return true;
1721 }
1722
1723 bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1725 {
1726 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1727 return true;
1728 }
1729
1730 bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
1733 {
1734 if (!nvptx_attach_host_thread_to_device (ord)
1735 || !cuda_memcpy_sanity_check (src, dst, n))
1736 return false;
1737 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1738 return true;
1739 }
1740
1741 bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
1744 {
1745 if (!nvptx_attach_host_thread_to_device (ord)
1746 || !cuda_memcpy_sanity_check (dst, src, n))
1747 return false;
1748 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1749 return true;
1750 }
1751
1752 union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1754 {
1755 union goacc_property_value propval = { .val = 0 };
1756
1757 pthread_mutex_lock (&ptx_dev_lock);
1758
1759 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1760 {
1761 pthread_mutex_unlock (&ptx_dev_lock);
1762 return propval;
1763 }
1764
1765 struct ptx_device *ptx_dev = ptx_devices[n];
1766 switch (prop)
1767 {
1768 case GOACC_PROPERTY_MEMORY:
1769 {
1770 size_t total_mem;
1771
1772 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1773 propval.val = total_mem;
1774 }
1775 break;
1776 case GOACC_PROPERTY_FREE_MEMORY:
1777 {
1778 size_t total_mem;
1779 size_t free_mem;
1780 CUdevice ctxdev;
1781
1782 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1783 if (ptx_dev->dev == ctxdev)
1784 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1785 else if (ptx_dev->ctx)
1786 {
1787 CUcontext old_ctx;
1788
1789 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1790 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1791 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1792 }
1793 else
1794 {
1795 CUcontext new_ctx;
1796
1797 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1798 ptx_dev->dev);
1799 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1800 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1801 }
1802 propval.val = free_mem;
1803 }
1804 break;
1805 case GOACC_PROPERTY_NAME:
1806 propval.ptr = ptx_dev->name;
1807 break;
1808 case GOACC_PROPERTY_VENDOR:
1809 propval.ptr = "Nvidia";
1810 break;
1811 case GOACC_PROPERTY_DRIVER:
1812 propval.ptr = cuda_driver_version_s;
1813 break;
1814 default:
1815 break;
1816 }
1817
1818 pthread_mutex_unlock (&ptx_dev_lock);
1819 return propval;
1820 }
1821
1822 /* Adjust launch dimensions: pick good values for number of blocks and warps
1823 and ensure that number of warps does not exceed CUDA limits as well as GCC's
1824 own limits. */
1825
1826 static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
                            struct ptx_device *ptx_dev,
                            int *teams_p, int *threads_p)
1830 {
1831 int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     backend and in libgcc, which matches the documented limit of all GPUs as
     of 2015.  */
1834 if (max_warps_block > 32)
1835 max_warps_block = 32;
1836 if (*threads_p <= 0)
1837 *threads_p = 8;
1838 if (*threads_p > max_warps_block)
1839 *threads_p = max_warps_block;
1840
1841 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     via the "occupancy control" driver interface (available since
     CUDA 6.0).  */
1845 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1846 if (*teams_p <= 0 || *teams_p > max_blocks)
1847 *teams_p = max_blocks;
1848 }
1849
1850 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1851 target regions. */
1852
1853 static size_t
nvptx_stacks_size ()
1855 {
1856 return 128 * 1024;
1857 }
1858
1859 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
1860
1861 static void *
nvptx_stacks_alloc (size_t size, int num)
1863 {
1864 CUdeviceptr stacks;
1865 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
1866 if (r != CUDA_SUCCESS)
1867 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1868 return (void *) stacks;
1869 }
1870
1871 /* Release storage previously allocated by nvptx_stacks_alloc. */
1872
1873 static void
nvptx_stacks_free (void *p, int num)
1875 {
1876 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
1877 if (r != CUDA_SUCCESS)
1878 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1879 }
1880
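/* Run the OpenMP target region TGT_FN on device ORD with argument block
   TGT_VARS.  Launch geometry is derived from the num_teams/thread_limit
   values passed in ARGS and adjusted by nvptx_adjust_launch_bounds; per-warp
   soft stacks are allocated for the duration of the launch.  */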
1881 void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1883 {
1884 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
1885 CUresult r;
1886 struct ptx_device *ptx_dev = ptx_devices[ord];
1887 const char *maybe_abort_msg = "(perhaps abort was called)";
1888 int teams = 0, threads = 0;
1889
1890 if (!args)
1891 GOMP_PLUGIN_fatal ("No target arguments provided");
1892 while (*args)
1893 {
1894 intptr_t id = (intptr_t) *args++, val;
1895 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1896 val = (intptr_t) *args++;
1897 else
1898 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1899 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1900 continue;
1901 val = val > INT_MAX ? INT_MAX : val;
1902 id &= GOMP_TARGET_ARG_ID_MASK;
1903 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
1904 teams = val;
1905 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
1906 threads = val;
1907 }
1908 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
1909
1910 size_t stack_size = nvptx_stacks_size ();
1911 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
1912 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
1913 size_t fn_args_size = sizeof fn_args;
1914 void *config[] = {
1915 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
1916 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
1917 CU_LAUNCH_PARAM_END
1918 };
1919 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
1920 32, threads, 1, 0, NULL, NULL, config);
1921 if (r != CUDA_SUCCESS)
1922 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
1923
1924 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1925 if (r == CUDA_ERROR_LAUNCH_FAILED)
1926 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1927 maybe_abort_msg);
1928 else if (r != CUDA_SUCCESS)
1929 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1930 nvptx_stacks_free (stacks, teams * threads);
1931 }
1932
1933 /* TODO: Implement GOMP_OFFLOAD_async_run. */
1934