xref: /netbsd-src/external/gpl3/gcc/dist/libgomp/plugin/plugin-nvptx.c (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 /* Plugin for NVPTX execution.
2 
3    Copyright (C) 2013-2022 Free Software Foundation, Inc.
4 
5    Contributed by Mentor Embedded.
6 
7    This file is part of the GNU Offloading and Multi Processing Library
8    (libgomp).
9 
10    Libgomp is free software; you can redistribute it and/or modify it
11    under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 3, or (at your option)
13    any later version.
14 
15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
18    more details.
19 
20    Under Section 7 of GPL version 3, you are granted additional
21    permissions described in the GCC Runtime Library Exception, version
22    3.1, as published by the Free Software Foundation.
23 
24    You should have received a copy of the GNU General Public License and
25    a copy of the GCC Runtime Library Exception along with this program;
26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
27    <http://www.gnu.org/licenses/>.  */
28 
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30    library appears to hold some implicit state, but the documentation
31    is not clear as to what that state might be, or how one might
32    propagate it from one thread to another.  */
33 
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "symcat.h"
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
41 #include "oacc-int.h"
42 
43 #include <pthread.h>
44 #if PLUGIN_NVPTX_DYNAMIC
45 # include "cuda/cuda.h"
46 #else
47 # include <cuda.h>
48 #endif
49 #include <stdbool.h>
50 #include <limits.h>
51 #include <string.h>
52 #include <stdio.h>
53 #include <unistd.h>
54 #include <assert.h>
55 #include <errno.h>
56 
57 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
58    block to cache between kernel invocations.  For soft-stacks blocks bigger
59    than this, we will free the block before attempting another GPU memory
60    allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
61    we will free the cached soft-stacks block anyway then retry the
62    allocation.  If that fails too, we lose.  */
63 
64 #define SOFTSTACK_CACHE_LIMIT 134217728
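/* (For reference: 134217728 == 128 * 1024 * 1024, i.e. the 128MB limit
   mentioned above.)  */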
65 
66 #if CUDA_VERSION < 6000
67 extern CUresult cuGetErrorString (CUresult, const char **);
68 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
69 #endif
70 
71 #if CUDA_VERSION >= 6050
72 #undef cuLinkCreate
73 #undef cuLinkAddData
74 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
75 			const char *, unsigned, CUjit_option *, void **);
76 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
77 #else
78 typedef size_t (*CUoccupancyB2DSize)(int);
79 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
80 			   const char *, unsigned, CUjit_option *, void **);
81 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
82 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
83 					  CUoccupancyB2DSize, size_t, int);
84 #endif
85 
86 #define DO_PRAGMA(x) _Pragma (#x)
87 
88 #if PLUGIN_NVPTX_DYNAMIC
89 # include <dlfcn.h>
90 
91 struct cuda_lib_s {
92 
93 # define CUDA_ONE_CALL(call)			\
94   __typeof (call) *call;
95 # define CUDA_ONE_CALL_MAYBE_NULL(call)		\
96   CUDA_ONE_CALL (call)
97 #include "cuda-lib.def"
98 # undef CUDA_ONE_CALL
99 # undef CUDA_ONE_CALL_MAYBE_NULL
100 
101 } cuda_lib;
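/* Illustrative expansion (a sketch; the real entry points come from
   cuda-lib.def): assuming cuMemAlloc is listed there,

     CUDA_ONE_CALL (cuMemAlloc)

   expands to

     __typeof (cuMemAlloc) *cuMemAlloc;

   so 'cuda_lib' holds one function pointer per CUDA entry point, filled in
   by init_cuda_lib below via dlsym.  */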
102 
103 /* -1 if init_cuda_lib has not been called yet, false
104    if it has been and failed, true if it has been and succeeded.  */
105 static signed char cuda_lib_inited = -1;
106 
107 /* Dynamically load the CUDA driver library and initialize function
108    pointers; return false if unsuccessful, true if successful.  */
109 static bool
110 init_cuda_lib (void)
111 {
112   if (cuda_lib_inited != -1)
113     return cuda_lib_inited;
114   const char *cuda_runtime_lib = "libcuda.so.1";
115   void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
116   cuda_lib_inited = false;
117   if (h == NULL)
118     return false;
119 
120 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
121 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
122 # define CUDA_ONE_CALL_1(call, allow_null)		\
123   cuda_lib.call = dlsym (h, #call);	\
124   if (!allow_null && cuda_lib.call == NULL)		\
125     return false;
126 #include "cuda-lib.def"
127 # undef CUDA_ONE_CALL
128 # undef CUDA_ONE_CALL_1
129 # undef CUDA_ONE_CALL_MAYBE_NULL
130 
131   cuda_lib_inited = true;
132   return true;
133 }
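/* Illustrative expansion of the loader above (a sketch, again assuming a
   cuMemAlloc entry in cuda-lib.def): CUDA_ONE_CALL (cuMemAlloc) becomes

     cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");
     if (!false && cuda_lib.cuMemAlloc == NULL)
       return false;

   whereas CUDA_ONE_CALL_MAYBE_NULL entries pass allow_null == true and so
   tolerate a missing symbol.  */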
134 # define CUDA_CALL_PREFIX cuda_lib.
135 #else
136 
137 # define CUDA_ONE_CALL(call)
138 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
139 #include "cuda-lib.def"
140 #undef CUDA_ONE_CALL_MAYBE_NULL
141 #undef CUDA_ONE_CALL
142 
143 # define CUDA_CALL_PREFIX
144 # define init_cuda_lib() true
145 #endif
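/* In the non-dynamic case above, CUDA_ONE_CALL_MAYBE_NULL (FN) emits
   '#pragma weak FN' for each such entry in cuda-lib.def, so that
   CUDA_CALL_EXISTS below can test at run time whether the symbol was
   actually resolved when linking directly against libcuda.  */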
146 
147 #include "secure_getenv.h"
148 
149 #undef MIN
150 #undef MAX
151 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
152 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
153 
154 /* Convenience macros for the frequently used sequence of CUDA library
155    call plus error handling, as well as for CUDA library calls that
156    do the error checking themselves or don't do it at all.  */
157 
158 #define CUDA_CALL_ERET(ERET, FN, ...)		\
159   do {						\
160     unsigned __r				\
161       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
162     if (__r != CUDA_SUCCESS)			\
163       {						\
164 	GOMP_PLUGIN_error (#FN " error: %s",	\
165 			   cuda_error (__r));	\
166 	return ERET;				\
167       }						\
168   } while (0)
169 
170 #define CUDA_CALL(FN, ...)			\
171   CUDA_CALL_ERET (false, FN, __VA_ARGS__)
172 
173 #define CUDA_CALL_ASSERT(FN, ...)		\
174   do {						\
175     unsigned __r				\
176       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
177     if (__r != CUDA_SUCCESS)			\
178       {						\
179 	GOMP_PLUGIN_fatal (#FN " error: %s",	\
180 			   cuda_error (__r));	\
181       }						\
182   } while (0)
183 
184 #define CUDA_CALL_NOCHECK(FN, ...)		\
185   CUDA_CALL_PREFIX FN (__VA_ARGS__)
186 
187 #define CUDA_CALL_EXISTS(FN)			\
188   CUDA_CALL_PREFIX FN
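/* Usage sketch: inside a function returning bool,

     CUDA_CALL (cuMemFree, ptr);

   performs the call and, if the result is not CUDA_SUCCESS, reports
   "cuMemFree error: ..." via GOMP_PLUGIN_error and returns false from the
   enclosing function.  CUDA_CALL_ERET lets the caller pick the error return
   value (e.g. NULL or -1), CUDA_CALL_ASSERT aborts through GOMP_PLUGIN_fatal,
   and CUDA_CALL_NOCHECK leaves the result for the caller to inspect.  */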
189 
190 static const char *
191 cuda_error (CUresult r)
192 {
193   const char *fallback = "unknown cuda error";
194   const char *desc;
195 
196   if (!CUDA_CALL_EXISTS (cuGetErrorString))
197     return fallback;
198 
199   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
200   if (r == CUDA_SUCCESS)
201     return desc;
202 
203   return fallback;
204 }
205 
206 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
207    Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
208 static char cuda_driver_version_s[30];
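/* For example, a driver reporting version 11040 from cuDriverGetVersion is
   rendered by nvptx_init below as "CUDA Driver 11.4" (the value 11040 is
   purely illustrative).  */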
209 
210 static unsigned int instantiated_devices = 0;
211 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
212 
213 /* NVPTX/CUDA specific definition of asynchronous queues.  */
214 struct goacc_asyncqueue
215 {
216   CUstream cuda_stream;
217 };
218 
219 struct nvptx_callback
220 {
221   void (*fn) (void *);
222   void *ptr;
223   struct goacc_asyncqueue *aq;
224   struct nvptx_callback *next;
225 };
226 
227 /* Thread-specific data for PTX.  */
228 
229 struct nvptx_thread
230 {
231   /* We currently have this embedded inside the plugin because libgomp manages
232      devices through integer target_ids.  This might be better if using an
233      opaque target-specific pointer directly from gomp_device_descr.  */
234   struct ptx_device *ptx_dev;
235 };
236 
237 /* Target data function launch information.  */
238 
239 struct targ_fn_launch
240 {
241   const char *fn;
242   unsigned short dim[GOMP_DIM_MAX];
243 };
244 
245 /* Target PTX object information.  */
246 
247 struct targ_ptx_obj
248 {
249   const char *code;
250   size_t size;
251 };
252 
253 /* Target data image information.  */
254 
255 typedef struct nvptx_tdata
256 {
257   const struct targ_ptx_obj *ptx_objs;
258   unsigned ptx_num;
259 
260   const char *const *var_names;
261   unsigned var_num;
262 
263   const struct targ_fn_launch *fn_descs;
264   unsigned fn_num;
265 } nvptx_tdata_t;
266 
267 /* Descriptor of a loaded function.  */
268 
269 struct targ_fn_descriptor
270 {
271   CUfunction fn;
272   const struct targ_fn_launch *launch;
273   int regs_per_thread;
274   int max_threads_per_block;
275 };
276 
277 /* A loaded PTX image.  */
278 struct ptx_image_data
279 {
280   const void *target_data;
281   CUmodule module;
282 
283   struct targ_fn_descriptor *fns;  /* Array of functions.  */
284 
285   struct ptx_image_data *next;
286 };
287 
288 struct ptx_free_block
289 {
290   void *ptr;
291   struct ptx_free_block *next;
292 };
293 
294 struct ptx_device
295 {
296   CUcontext ctx;
297   bool ctx_shared;
298   CUdevice dev;
299 
300   int ord;
301   bool overlap;
302   bool map;
303   bool concur;
304   bool mkern;
305   int mode;
306   int clock_khz;
307   int num_sms;
308   int regs_per_block;
309   int regs_per_sm;
310   int warp_size;
311   int max_threads_per_block;
312   int max_threads_per_multiprocessor;
313   int default_dims[GOMP_DIM_MAX];
314 
315   /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
316   char name[256];
317 
318   struct ptx_image_data *images;  /* Images loaded on device.  */
319   pthread_mutex_t image_lock;     /* Lock for above list.  */
320 
321   struct ptx_free_block *free_blocks;
322   pthread_mutex_t free_blocks_lock;
323 
324   /* OpenMP stacks, cached between kernel invocations.  */
325   struct
326     {
327       CUdeviceptr ptr;
328       size_t size;
329       pthread_mutex_t lock;
330     } omp_stacks;
331 
332   struct ptx_device *next;
333 };
334 
335 static struct ptx_device **ptx_devices;
336 
337 static inline struct nvptx_thread *
338 nvptx_thread (void)
339 {
340   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
341 }
342 
343 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
344    should be locked on entry and remains locked on exit.  */
345 
346 static bool
347 nvptx_init (void)
348 {
349   int ndevs;
350 
351   if (instantiated_devices != 0)
352     return true;
353 
354   if (!init_cuda_lib ())
355     return false;
356 
357   CUDA_CALL (cuInit, 0);
358 
359   int cuda_driver_version;
360   CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
361   snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
362 	    "CUDA Driver %u.%u",
363 	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
364 
365   CUDA_CALL (cuDeviceGetCount, &ndevs);
366   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
367 					    * ndevs);
368 
369   return true;
370 }
371 
372 /* Select the N'th PTX device for the current host thread.  The device must
373    have been opened before calling this function.  */
374 
375 static bool
376 nvptx_attach_host_thread_to_device (int n)
377 {
378   CUdevice dev;
379   CUresult r;
380   struct ptx_device *ptx_dev;
381   CUcontext thd_ctx;
382 
383   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
384   if (r == CUDA_ERROR_NOT_PERMITTED)
385     {
386       /* Assume we're in a CUDA callback, just return true.  */
387       return true;
388     }
389   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
390     {
391       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
392       return false;
393     }
394 
395   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
396     return true;
397   else
398     {
399       CUcontext old_ctx;
400 
401       ptx_dev = ptx_devices[n];
402       if (!ptx_dev)
403 	{
404 	  GOMP_PLUGIN_error ("device %d not found", n);
405 	  return false;
406 	}
407 
408       CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
409 
410       /* We don't necessarily have a current context (e.g. if it has been
411          destroyed).  Pop it if we do, though.  */
412       if (thd_ctx != NULL)
413 	CUDA_CALL (cuCtxPopCurrent, &old_ctx);
414 
415       CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
416     }
417   return true;
418 }
419 
420 static struct ptx_device *
421 nvptx_open_device (int n)
422 {
423   struct ptx_device *ptx_dev;
424   CUdevice dev, ctx_dev;
425   CUresult r;
426   int async_engines, pi;
427 
428   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
429 
430   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
431 
432   ptx_dev->ord = n;
433   ptx_dev->dev = dev;
434   ptx_dev->ctx_shared = false;
435 
436   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
437   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
438     {
439       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
440       return NULL;
441     }
442 
443   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
444     {
445       /* The current host thread has an active context for a different device.
446          Detach it.  */
447       CUcontext old_ctx;
448       CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
449     }
450 
451   CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
452 
453   if (!ptx_dev->ctx)
454     CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
455   else
456     ptx_dev->ctx_shared = true;
457 
458   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
459 		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
460   ptx_dev->overlap = pi;
461 
462   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
463 		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
464   ptx_dev->map = pi;
465 
466   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
467 		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
468   ptx_dev->concur = pi;
469 
470   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
471 		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
472   ptx_dev->mode = pi;
473 
474   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
475 		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
476   ptx_dev->mkern = pi;
477 
478   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
479 		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
480   ptx_dev->clock_khz = pi;
481 
482   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
483 		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
484   ptx_dev->num_sms = pi;
485 
486   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
487 		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
488   ptx_dev->regs_per_block = pi;
489 
490   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
491      in CUDA 6.0 and newer.  */
492   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
493 			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
494 			 dev);
495   /* Fallback: use limit of registers per block, which is usually equal.  */
496   if (r == CUDA_ERROR_INVALID_VALUE)
497     pi = ptx_dev->regs_per_block;
498   else if (r != CUDA_SUCCESS)
499     {
500       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
501       return NULL;
502     }
503   ptx_dev->regs_per_sm = pi;
504 
505   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
506 		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
507   if (pi != 32)
508     {
509       GOMP_PLUGIN_error ("Only warp size 32 is supported");
510       return NULL;
511     }
512   ptx_dev->warp_size = pi;
513 
514   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
515 		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
516   ptx_dev->max_threads_per_block = pi;
517 
518   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
519 		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
520   ptx_dev->max_threads_per_multiprocessor = pi;
521 
522   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
523 			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
524   if (r != CUDA_SUCCESS)
525     async_engines = 1;
526 
527   for (int i = 0; i != GOMP_DIM_MAX; i++)
528     ptx_dev->default_dims[i] = 0;
529 
530   CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
531 		  dev);
532 
533   ptx_dev->images = NULL;
534   pthread_mutex_init (&ptx_dev->image_lock, NULL);
535 
536   ptx_dev->free_blocks = NULL;
537   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
538 
539   ptx_dev->omp_stacks.ptr = 0;
540   ptx_dev->omp_stacks.size = 0;
541   pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
542 
543   return ptx_dev;
544 }
545 
546 static bool
547 nvptx_close_device (struct ptx_device *ptx_dev)
548 {
549   if (!ptx_dev)
550     return true;
551 
552   for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
553     {
554       struct ptx_free_block *b_next = b->next;
555       CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
556       free (b);
557       b = b_next;
558     }
559 
560   pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
561   pthread_mutex_destroy (&ptx_dev->image_lock);
562 
563   pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
564 
565   if (ptx_dev->omp_stacks.ptr)
566     CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
567 
568   if (!ptx_dev->ctx_shared)
569     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
570 
571   free (ptx_dev);
572   return true;
573 }
574 
575 static int
576 nvptx_get_num_devices (void)
577 {
578   int n;
579 
580   /* This function will be called before the plugin has been initialized in
581      order to enumerate available devices, but CUDA API routines can't be used
582      until cuInit has been called.  Just call it now (but don't yet do any
583      further initialization).  */
584   if (instantiated_devices == 0)
585     {
586       if (!init_cuda_lib ())
587 	return 0;
588       CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
589       /* This is not an error: e.g. we may have CUDA libraries installed but
590          no devices available.  */
591       if (r != CUDA_SUCCESS)
592 	{
593 	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
594 			     cuda_error (r));
595 	  return 0;
596 	}
597     }
598 
599   CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
600   return n;
601 }
602 
603 static void
604 notify_var (const char *var_name, const char *env_var)
605 {
606   if (env_var == NULL)
607     GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
608   else
609     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
610 }
611 
612 static void
613 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
614 {
615   const char *var_name = "GOMP_NVPTX_JIT";
616   const char *env_var = secure_getenv (var_name);
617   notify_var (var_name, env_var);
618 
619   if (env_var == NULL)
620     return;
621 
622   const char *c = env_var;
623   while (*c != '\0')
624     {
625       while (*c == ' ')
626 	c++;
627 
628       if (c[0] == '-' && c[1] == 'O'
629 	  && '0' <= c[2] && c[2] <= '4'
630 	  && (c[3] == '\0' || c[3] == ' '))
631 	{
632 	  *gomp_nvptx_o = c[2] - '0';
633 	  c += 3;
634 	  continue;
635 	}
636 
637       GOMP_PLUGIN_error ("Error parsing %s", var_name);
638       break;
639     }
640 }
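/* For example, setting GOMP_NVPTX_JIT=-O2 in the environment makes link_ptx
   below pass CU_JIT_OPTIMIZATION_LEVEL == 2 to the CUDA JIT; only values
   -O0 through -O4 are accepted.  */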
641 
642 static bool
643 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
644 	  unsigned num_objs)
645 {
646   CUjit_option opts[7];
647   void *optvals[7];
648   float elapsed = 0.0;
649   char elog[1024];
650   char ilog[16384];
651   CUlinkState linkstate;
652   CUresult r;
653   void *linkout;
654   size_t linkoutsize __attribute__ ((unused));
655 
656   opts[0] = CU_JIT_WALL_TIME;
657   optvals[0] = &elapsed;
658 
659   opts[1] = CU_JIT_INFO_LOG_BUFFER;
660   optvals[1] = &ilog[0];
661 
662   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
663   optvals[2] = (void *) sizeof ilog;
664 
665   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
666   optvals[3] = &elog[0];
667 
668   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
669   optvals[4] = (void *) sizeof elog;
670 
671   opts[5] = CU_JIT_LOG_VERBOSE;
672   optvals[5] = (void *) 1;
673 
674   static intptr_t gomp_nvptx_o = -1;
675 
676   static bool init_done = false;
677   if (!init_done)
678     {
679       process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
680       init_done = true;
681   }
682 
683   int nopts = 6;
684   if (gomp_nvptx_o != -1)
685     {
686       opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
687       optvals[nopts] = (void *) gomp_nvptx_o;
688       nopts++;
689     }
690 
691   if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
692     CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
693   else
694     CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
695 
696   for (; num_objs--; ptx_objs++)
697     {
698       /* cuLinkAddData's 'data' argument erroneously omits the const
699 	 qualifier.  */
700       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
701       if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
702 	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
703 			       (char *) ptx_objs->code, ptx_objs->size,
704 			       0, 0, 0, 0);
705       else
706 	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
707 			       (char *) ptx_objs->code, ptx_objs->size,
708 			       0, 0, 0, 0);
709       if (r != CUDA_SUCCESS)
710 	{
711 	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
712 	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
713 			     cuda_error (r));
714 	  return false;
715 	}
716     }
717 
718   GOMP_PLUGIN_debug (0, "Linking\n");
719   r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
720 
721   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
722   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
723 
724   if (r != CUDA_SUCCESS)
725     {
726       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
727       GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
728       return false;
729     }
730 
731   CUDA_CALL (cuModuleLoadData, module, linkout);
732   CUDA_CALL (cuLinkDestroy, linkstate);
733   return true;
734 }
735 
736 static void
737 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
738 	    unsigned *dims, void *targ_mem_desc,
739 	    CUdeviceptr dp, CUstream stream)
740 {
741   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
742   CUfunction function;
743   int i;
744   void *kargs[1];
745   struct nvptx_thread *nvthd = nvptx_thread ();
746   int warp_size = nvthd->ptx_dev->warp_size;
747 
748   function = targ_fn->fn;
749 
750   /* Initialize the launch dimensions.  Typically this is constant,
751      provided by the device compiler, but we must permit runtime
752      values.  */
753   int seen_zero = 0;
754   for (i = 0; i != GOMP_DIM_MAX; i++)
755     {
756       if (targ_fn->launch->dim[i])
757        dims[i] = targ_fn->launch->dim[i];
758       if (!dims[i])
759        seen_zero = 1;
760     }
761 
762   if (seen_zero)
763     {
764       pthread_mutex_lock (&ptx_dev_lock);
765 
766       static int gomp_openacc_dims[GOMP_DIM_MAX];
767       if (!gomp_openacc_dims[0])
768 	{
769 	  /* See if the user provided GOMP_OPENACC_DIM environment
770 	     variable to specify runtime defaults.  */
771 	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
772 	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
773 	}
774 
775       if (!nvthd->ptx_dev->default_dims[0])
776 	{
777 	  int default_dims[GOMP_DIM_MAX];
778 	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
779 	    default_dims[i] = gomp_openacc_dims[i];
780 
781 	  int gang, worker, vector;
782 	  {
783 	    int block_size = nvthd->ptx_dev->max_threads_per_block;
784 	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
785 	    int dev_size = nvthd->ptx_dev->num_sms;
786 	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
787 			       " dev_size=%d, cpu_size=%d\n",
788 			       warp_size, block_size, dev_size, cpu_size);
789 
790 	    gang = (cpu_size / block_size) * dev_size;
791 	    worker = block_size / warp_size;
792 	    vector = warp_size;
793 	  }
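	  /* For example, on a hypothetical device with block_size == 1024,
	     cpu_size == 2048, dev_size == 80 and warp_size == 32, this
	     computes gang == 160, worker == 32 and vector == 32.  */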
794 
795 	  /* There is no upper bound on the gang size.  The best size
796 	     matches the hardware configuration.  Logical gangs are
797 	     scheduled onto physical hardware.  To maximize usage, we
798 	     should guess a large number.  */
799 	  if (default_dims[GOMP_DIM_GANG] < 1)
800 	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
801 	  /* The worker size must not exceed the hardware.  */
802 	  if (default_dims[GOMP_DIM_WORKER] < 1
803 	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
804 	    default_dims[GOMP_DIM_WORKER] = worker;
805 	  /* The vector size must exactly match the hardware.  */
806 	  if (default_dims[GOMP_DIM_VECTOR] < 1
807 	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
808 	    default_dims[GOMP_DIM_VECTOR] = vector;
809 
810 	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
811 			     default_dims[GOMP_DIM_GANG],
812 			     default_dims[GOMP_DIM_WORKER],
813 			     default_dims[GOMP_DIM_VECTOR]);
814 
815 	  for (i = 0; i != GOMP_DIM_MAX; i++)
816 	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
817 	}
818       pthread_mutex_unlock (&ptx_dev_lock);
819 
820       {
821 	bool default_dim_p[GOMP_DIM_MAX];
822 	for (i = 0; i != GOMP_DIM_MAX; i++)
823 	  default_dim_p[i] = !dims[i];
824 
825 	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
826 	  {
827 	    for (i = 0; i != GOMP_DIM_MAX; i++)
828 	      if (default_dim_p[i])
829 		dims[i] = nvthd->ptx_dev->default_dims[i];
830 
831 	    if (default_dim_p[GOMP_DIM_VECTOR])
832 	      dims[GOMP_DIM_VECTOR]
833 		= MIN (dims[GOMP_DIM_VECTOR],
834 		       (targ_fn->max_threads_per_block / warp_size
835 			* warp_size));
836 
837 	    if (default_dim_p[GOMP_DIM_WORKER])
838 	      dims[GOMP_DIM_WORKER]
839 		= MIN (dims[GOMP_DIM_WORKER],
840 		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
841 	  }
842 	else
843 	  {
844 	    /* Handle the case that the compiler allows the runtime to choose
845 	       the vector-length conservatively, by ignoring
846 	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
847 	       it.  */
848 	    int vectors = 0;
849 	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
850 	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
851 	       exceed targ_fn->max_threads_per_block. */
852 	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
853 	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
854 	    int grids, blocks;
855 
856 	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
857 			      &blocks, function, NULL, 0,
858 			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
859 	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
860 			       "grid = %d, block = %d\n", grids, blocks);
861 
862 	    /* Keep the num_gangs proportional to the block size.  In
863 	       the case where a block size is limited by shared memory
864 	       or the register file capacity, the runtime will not
865 	       excessively over-assign gangs to the multiprocessor
866 	       units if their state is going to be swapped out even
867 	       more than necessary. The constant factor 2 is there to
868 	       prevent threads from idling when there is insufficient
869 	       work for them.  */
870 	    if (gangs == 0)
871 	      gangs = 2 * grids * (blocks / warp_size);
872 
873 	    if (vectors == 0)
874 	      vectors = warp_size;
875 
876 	    if (workers == 0)
877 	      {
878 		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
879 				      ? vectors
880 				      : dims[GOMP_DIM_VECTOR]);
881 		workers = blocks / actual_vectors;
882 		workers = MAX (workers, 1);
883 		/* If we need a per-worker barrier (i.e. the vector length exceeds the warp size) ...  */
884 		if (actual_vectors > 32)
885 		  /* Don't use more barriers than available.  */
886 		  workers = MIN (workers, 15);
887 	      }
888 
889 	    for (i = 0; i != GOMP_DIM_MAX; i++)
890 	      if (default_dim_p[i])
891 		switch (i)
892 		  {
893 		  case GOMP_DIM_GANG: dims[i] = gangs; break;
894 		  case GOMP_DIM_WORKER: dims[i] = workers; break;
895 		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
896 		  default: GOMP_PLUGIN_fatal ("invalid dim");
897 		  }
898 	  }
899       }
900     }
901 
902   /* Check if the accelerator has sufficient hardware resources to
903      launch the offloaded kernel.  */
904   if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
905       > targ_fn->max_threads_per_block)
906     {
907       const char *msg
908 	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
909 	   " with num_workers = %d and vector_length = %d"
910 	   "; "
911 	   "recompile the program with 'num_workers = x and vector_length = y'"
912 	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
913 	   " x * y <= %d"
914 	   ".\n");
915       GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
916 			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
917     }
918 
919   /* Check if the accelerator has sufficient barrier resources to
920      launch the offloaded kernel.  */
921   if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
922     {
923       const char *msg
924 	= ("The Nvidia accelerator has insufficient barrier resources to launch"
925 	   " '%s' with num_workers = %d and vector_length = %d"
926 	   "; "
927 	   "recompile the program with 'num_workers = x' on that offloaded"
928 	   " region or '-fopenacc-dim=:x:' where x <= 15"
929 	   "; "
930 	   "or, recompile the program with 'vector_length = 32' on that"
931 	   " offloaded region or '-fopenacc-dim=::32'"
932 	   ".\n");
933 	GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
934 			   dims[GOMP_DIM_VECTOR]);
935     }
936 
937   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
938 		     " gangs=%u, workers=%u, vectors=%u\n",
939 		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
940 		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
941 
942   // OpenACC		CUDA
943   //
944   // num_gangs		nctaid.x
945   // num_workers	ntid.y
946   // vector length	ntid.x
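  // so the cuLaunchKernel call below passes gridDim == (num_gangs, 1, 1)
  // and blockDim == (vector_length, num_workers, 1).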
947 
948   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
949   acc_prof_info *prof_info = thr->prof_info;
950   acc_event_info enqueue_launch_event_info;
951   acc_api_info *api_info = thr->api_info;
952   bool profiling_p = __builtin_expect (prof_info != NULL, false);
953   if (profiling_p)
954     {
955       prof_info->event_type = acc_ev_enqueue_launch_start;
956 
957       enqueue_launch_event_info.launch_event.event_type
958 	= prof_info->event_type;
959       enqueue_launch_event_info.launch_event.valid_bytes
960 	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
961       enqueue_launch_event_info.launch_event.parent_construct
962 	= acc_construct_parallel;
963       enqueue_launch_event_info.launch_event.implicit = 1;
964       enqueue_launch_event_info.launch_event.tool_info = NULL;
965       enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
966       enqueue_launch_event_info.launch_event.num_gangs
967 	= dims[GOMP_DIM_GANG];
968       enqueue_launch_event_info.launch_event.num_workers
969 	= dims[GOMP_DIM_WORKER];
970       enqueue_launch_event_info.launch_event.vector_length
971 	= dims[GOMP_DIM_VECTOR];
972 
973       api_info->device_api = acc_device_api_cuda;
974 
975       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
976 					    api_info);
977     }
978 
979   kargs[0] = &dp;
980   CUDA_CALL_ASSERT (cuLaunchKernel, function,
981 		    dims[GOMP_DIM_GANG], 1, 1,
982 		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
983 		    0, stream, kargs, 0);
984 
985   if (profiling_p)
986     {
987       prof_info->event_type = acc_ev_enqueue_launch_end;
988       enqueue_launch_event_info.launch_event.event_type
989 	= prof_info->event_type;
990       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
991 					    api_info);
992     }
993 
994   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
995 		     targ_fn->launch->fn);
996 }
997 
998 void * openacc_get_current_cuda_context (void);
999 
1000 static void
1001 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1002 {
1003   acc_prof_info *prof_info = thr->prof_info;
1004   acc_event_info data_event_info;
1005   acc_api_info *api_info = thr->api_info;
1006 
1007   prof_info->event_type = acc_ev_alloc;
1008 
1009   data_event_info.data_event.event_type = prof_info->event_type;
1010   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1011   data_event_info.data_event.parent_construct = acc_construct_parallel;
1012   data_event_info.data_event.implicit = 1;
1013   data_event_info.data_event.tool_info = NULL;
1014   data_event_info.data_event.var_name = NULL;
1015   data_event_info.data_event.bytes = s;
1016   data_event_info.data_event.host_ptr = NULL;
1017   data_event_info.data_event.device_ptr = dp;
1018 
1019   api_info->device_api = acc_device_api_cuda;
1020 
1021   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1022 }
1023 
1024 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1025    size threshold, or if FORCE is true.  */
1026 
1027 static void
1028 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1029 {
1030   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1031   if (ptx_dev->omp_stacks.ptr
1032       && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1033     {
1034       CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1035       if (r != CUDA_SUCCESS)
1036 	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1037       ptx_dev->omp_stacks.ptr = 0;
1038       ptx_dev->omp_stacks.size = 0;
1039     }
1040   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1041 }
1042 
1043 static void *
1044 nvptx_alloc (size_t s, bool suppress_errors)
1045 {
1046   CUdeviceptr d;
1047 
1048   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1049   if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1050     return NULL;
1051   else if (r != CUDA_SUCCESS)
1052     {
1053       GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1054       return NULL;
1055     }
1056 
1057   /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
1058   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1059   bool profiling_p
1060     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1061   if (profiling_p)
1062     goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1063 
1064   return (void *) d;
1065 }
1066 
1067 static void
1068 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1069 {
1070   acc_prof_info *prof_info = thr->prof_info;
1071   acc_event_info data_event_info;
1072   acc_api_info *api_info = thr->api_info;
1073 
1074   prof_info->event_type = acc_ev_free;
1075 
1076   data_event_info.data_event.event_type = prof_info->event_type;
1077   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1078   data_event_info.data_event.parent_construct = acc_construct_parallel;
1079   data_event_info.data_event.implicit = 1;
1080   data_event_info.data_event.tool_info = NULL;
1081   data_event_info.data_event.var_name = NULL;
1082   data_event_info.data_event.bytes = -1;
1083   data_event_info.data_event.host_ptr = NULL;
1084   data_event_info.data_event.device_ptr = p;
1085 
1086   api_info->device_api = acc_device_api_cuda;
1087 
1088   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1089 }
1090 
1091 static bool
1092 nvptx_free (void *p, struct ptx_device *ptx_dev)
1093 {
1094   CUdeviceptr pb;
1095   size_t ps;
1096 
1097   CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1098 				  (CUdeviceptr) p);
1099   if (r == CUDA_ERROR_NOT_PERMITTED)
1100     {
1101       /* We assume that this error indicates we are in a CUDA callback context,
1102 	 where no CUDA calls are allowed (see the cuStreamAddCallback
1103 	 documentation for details).  Arrange to free this piece of device
1104 	 memory later.  */
1105       struct ptx_free_block *n
1106 	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1107       n->ptr = p;
1108       pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1109       n->next = ptx_dev->free_blocks;
1110       ptx_dev->free_blocks = n;
1111       pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1112       return true;
1113     }
1114   else if (r != CUDA_SUCCESS)
1115     {
1116       GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1117       return false;
1118     }
1119   if ((CUdeviceptr) p != pb)
1120     {
1121       GOMP_PLUGIN_error ("invalid device address");
1122       return false;
1123     }
1124 
1125   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1126   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1127   bool profiling_p
1128     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1129   if (profiling_p)
1130     goacc_profiling_acc_ev_free (thr, p);
1131 
1132   return true;
1133 }
1134 
1135 static void *
1136 nvptx_get_current_cuda_device (void)
1137 {
1138   struct nvptx_thread *nvthd = nvptx_thread ();
1139 
1140   if (!nvthd || !nvthd->ptx_dev)
1141     return NULL;
1142 
1143   return &nvthd->ptx_dev->dev;
1144 }
1145 
1146 static void *
1147 nvptx_get_current_cuda_context (void)
1148 {
1149   struct nvptx_thread *nvthd = nvptx_thread ();
1150 
1151   if (!nvthd || !nvthd->ptx_dev)
1152     return NULL;
1153 
1154   return nvthd->ptx_dev->ctx;
1155 }
1156 
1157 /* Plugin entry points.  */
1158 
1159 const char *
1160 GOMP_OFFLOAD_get_name (void)
1161 {
1162   return "nvptx";
1163 }
1164 
1165 unsigned int
1166 GOMP_OFFLOAD_get_caps (void)
1167 {
1168   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1169 }
1170 
1171 int
1172 GOMP_OFFLOAD_get_type (void)
1173 {
1174   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1175 }
1176 
1177 int
1178 GOMP_OFFLOAD_get_num_devices (void)
1179 {
1180   return nvptx_get_num_devices ();
1181 }
1182 
1183 bool
1184 GOMP_OFFLOAD_init_device (int n)
1185 {
1186   struct ptx_device *dev;
1187 
1188   pthread_mutex_lock (&ptx_dev_lock);
1189 
1190   if (!nvptx_init () || ptx_devices[n] != NULL)
1191     {
1192       pthread_mutex_unlock (&ptx_dev_lock);
1193       return false;
1194     }
1195 
1196   dev = nvptx_open_device (n);
1197   if (dev)
1198     {
1199       ptx_devices[n] = dev;
1200       instantiated_devices++;
1201     }
1202 
1203   pthread_mutex_unlock (&ptx_dev_lock);
1204 
1205   return dev != NULL;
1206 }
1207 
1208 bool
1209 GOMP_OFFLOAD_fini_device (int n)
1210 {
1211   pthread_mutex_lock (&ptx_dev_lock);
1212 
1213   if (ptx_devices[n] != NULL)
1214     {
1215       if (!nvptx_attach_host_thread_to_device (n)
1216 	  || !nvptx_close_device (ptx_devices[n]))
1217 	{
1218 	  pthread_mutex_unlock (&ptx_dev_lock);
1219 	  return false;
1220 	}
1221       ptx_devices[n] = NULL;
1222       instantiated_devices--;
1223     }
1224 
1225   if (instantiated_devices == 0)
1226     {
1227       free (ptx_devices);
1228       ptx_devices = NULL;
1229     }
1230 
1231   pthread_mutex_unlock (&ptx_dev_lock);
1232   return true;
1233 }
1234 
1235 /* Return the libgomp version number we're compatible with.  There is
1236    no requirement for cross-version compatibility.  */
1237 
1238 unsigned
1239 GOMP_OFFLOAD_version (void)
1240 {
1241   return GOMP_VERSION;
1242 }
1243 
1244 /* Initialize __nvptx_clocktick, if present in MODULE.  */
1245 
1246 static void
1247 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1248 {
1249   CUdeviceptr dptr;
1250   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1251 				  module, "__nvptx_clocktick");
1252   if (r == CUDA_ERROR_NOT_FOUND)
1253     return;
1254   if (r != CUDA_SUCCESS)
1255     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1256   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1257   r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1258 			 sizeof (__nvptx_clocktick));
1259   if (r != CUDA_SUCCESS)
1260     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1261 }
1262 
1263 /* Load the (partial) program described by TARGET_DATA to device
1264    number ORD.  Allocate and return TARGET_TABLE.  */
1265 
1266 int
1267 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1268 			 struct addr_pair **target_table)
1269 {
1270   CUmodule module;
1271   const char *const *var_names;
1272   const struct targ_fn_launch *fn_descs;
1273   unsigned int fn_entries, var_entries, other_entries, i, j;
1274   struct targ_fn_descriptor *targ_fns;
1275   struct addr_pair *targ_tbl;
1276   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1277   struct ptx_image_data *new_image;
1278   struct ptx_device *dev;
1279 
1280   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1281     {
1282       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1283 			 " (expected %u, received %u)",
1284 			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1285       return -1;
1286     }
1287 
1288   if (!nvptx_attach_host_thread_to_device (ord)
1289       || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1290     return -1;
1291 
1292   dev = ptx_devices[ord];
1293 
1294   /* The mkoffload utility emits a struct of pointers/integers at the
1295      start of each offload image.  The array of kernel names and the
1296      function addresses form a one-to-one correspondence.  */
1297 
1298   var_entries = img_header->var_num;
1299   var_names = img_header->var_names;
1300   fn_entries = img_header->fn_num;
1301   fn_descs = img_header->fn_descs;
1302 
1303   /* Currently, the only other entry kind is 'device number'.  */
1304   other_entries = 1;
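  /* The resulting TARGET_TABLE layout is therefore: fn_entries entries
     pointing at the targ_fn_descriptor structs, then var_entries device
     variable address ranges, then a final entry for GOMP_DEVICE_NUM_VAR
     (zeroed if the image does not define that variable).  */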
1305 
1306   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1307 				 * (fn_entries + var_entries + other_entries));
1308   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1309 				 * fn_entries);
1310 
1311   *target_table = targ_tbl;
1312 
1313   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1314   new_image->target_data = target_data;
1315   new_image->module = module;
1316   new_image->fns = targ_fns;
1317 
1318   pthread_mutex_lock (&dev->image_lock);
1319   new_image->next = dev->images;
1320   dev->images = new_image;
1321   pthread_mutex_unlock (&dev->image_lock);
1322 
1323   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1324     {
1325       CUfunction function;
1326       int nregs, mthrs;
1327 
1328       CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1329 		      fn_descs[i].fn);
1330       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1331 		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1332       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1333 		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1334 
1335       targ_fns->fn = function;
1336       targ_fns->launch = &fn_descs[i];
1337       targ_fns->regs_per_thread = nregs;
1338       targ_fns->max_threads_per_block = mthrs;
1339 
1340       targ_tbl->start = (uintptr_t) targ_fns;
1341       targ_tbl->end = targ_tbl->start + 1;
1342     }
1343 
1344   for (j = 0; j < var_entries; j++, targ_tbl++)
1345     {
1346       CUdeviceptr var;
1347       size_t bytes;
1348 
1349       CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1350 		      &var, &bytes, module, var_names[j]);
1351 
1352       targ_tbl->start = (uintptr_t) var;
1353       targ_tbl->end = targ_tbl->start + bytes;
1354     }
1355 
1356   CUdeviceptr device_num_varptr;
1357   size_t device_num_varsize;
1358   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &device_num_varptr,
1359 				  &device_num_varsize, module,
1360 				  XSTRING (GOMP_DEVICE_NUM_VAR));
1361   if (r == CUDA_SUCCESS)
1362     {
1363       targ_tbl->start = (uintptr_t) device_num_varptr;
1364       targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
1365     }
1366   else
1367     /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
1368     targ_tbl->start = targ_tbl->end = 0;
1369   targ_tbl++;
1370 
1371   nvptx_set_clocktick (module, dev);
1372 
1373   return fn_entries + var_entries + other_entries;
1374 }
1375 
1376 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1377    function descriptors allocated by G_O_load_image.  */
1378 
1379 bool
1380 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1381 {
1382   struct ptx_image_data *image, **prev_p;
1383   struct ptx_device *dev = ptx_devices[ord];
1384 
1385   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1386     {
1387       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1388 			 " (expected %u, received %u)",
1389 			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1390       return false;
1391     }
1392 
1393   bool ret = true;
1394   pthread_mutex_lock (&dev->image_lock);
1395   for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1396     if (image->target_data == target_data)
1397       {
1398 	*prev_p = image->next;
1399 	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1400 	  ret = false;
1401 	free (image->fns);
1402 	free (image);
1403 	break;
1404       }
1405   pthread_mutex_unlock (&dev->image_lock);
1406   return ret;
1407 }
1408 
1409 void *
1410 GOMP_OFFLOAD_alloc (int ord, size_t size)
1411 {
1412   if (!nvptx_attach_host_thread_to_device (ord))
1413     return NULL;
1414 
1415   struct ptx_device *ptx_dev = ptx_devices[ord];
1416   struct ptx_free_block *blocks, *tmp;
1417 
1418   pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1419   blocks = ptx_dev->free_blocks;
1420   ptx_dev->free_blocks = NULL;
1421   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1422 
1423   nvptx_stacks_free (ptx_dev, false);
1424 
1425   while (blocks)
1426     {
1427       tmp = blocks->next;
1428       nvptx_free (blocks->ptr, ptx_dev);
1429       free (blocks);
1430       blocks = tmp;
1431     }
1432 
1433   void *d = nvptx_alloc (size, true);
1434   if (d)
1435     return d;
1436   else
1437     {
1438       /* Memory allocation failed.  Try freeing the stacks block, and
1439 	 retrying.  */
1440       nvptx_stacks_free (ptx_dev, true);
1441       return nvptx_alloc (size, false);
1442     }
1443 }
1444 
1445 bool
1446 GOMP_OFFLOAD_free (int ord, void *ptr)
1447 {
1448   return (nvptx_attach_host_thread_to_device (ord)
1449 	  && nvptx_free (ptr, ptx_devices[ord]));
1450 }
1451 
1452 void
1453 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1454 			   void **hostaddrs, void **devaddrs,
1455 			   unsigned *dims, void *targ_mem_desc)
1456 {
1457   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1458 
1459   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1460   acc_prof_info *prof_info = thr->prof_info;
1461   acc_event_info data_event_info;
1462   acc_api_info *api_info = thr->api_info;
1463   bool profiling_p = __builtin_expect (prof_info != NULL, false);
1464 
1465   void **hp = NULL;
1466   CUdeviceptr dp = 0;
1467 
1468   if (mapnum > 0)
1469     {
1470       size_t s = mapnum * sizeof (void *);
1471       hp = alloca (s);
1472       for (int i = 0; i < mapnum; i++)
1473 	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1474       CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1475       if (profiling_p)
1476 	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1477     }
1478 
1479   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1480      fact have the same value on a unified-memory system).  */
1481   if (mapnum > 0)
1482     {
1483       if (profiling_p)
1484 	{
1485 	  prof_info->event_type = acc_ev_enqueue_upload_start;
1486 
1487 	  data_event_info.data_event.event_type = prof_info->event_type;
1488 	  data_event_info.data_event.valid_bytes
1489 	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1490 	  data_event_info.data_event.parent_construct
1491 	    = acc_construct_parallel;
1492 	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
1493 	  data_event_info.data_event.tool_info = NULL;
1494 	  data_event_info.data_event.var_name = NULL;
1495 	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
1496 	  data_event_info.data_event.host_ptr = hp;
1497 	  data_event_info.data_event.device_ptr = (const void *) dp;
1498 
1499 	  api_info->device_api = acc_device_api_cuda;
1500 
1501 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1502 						api_info);
1503 	}
1504       CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1505 			mapnum * sizeof (void *));
1506       if (profiling_p)
1507 	{
1508 	  prof_info->event_type = acc_ev_enqueue_upload_end;
1509 	  data_event_info.data_event.event_type = prof_info->event_type;
1510 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1511 						api_info);
1512 	}
1513     }
1514 
1515   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1516 	      dp, NULL);
1517 
1518   CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1519   const char *maybe_abort_msg = "(perhaps abort was called)";
1520   if (r == CUDA_ERROR_LAUNCH_FAILED)
1521     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1522 		       maybe_abort_msg);
1523   else if (r != CUDA_SUCCESS)
1524     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1525 
1526   CUDA_CALL_ASSERT (cuMemFree, dp);
1527   if (profiling_p)
1528     goacc_profiling_acc_ev_free (thr, (void *) dp);
1529 }
1530 
1531 static void
1532 cuda_free_argmem (void *ptr)
1533 {
1534   void **block = (void **) ptr;
1535   nvptx_free (block[0], (struct ptx_device *) block[1]);
1536   free (block);
1537 }
1538 
1539 void
1540 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1541 				 void **hostaddrs, void **devaddrs,
1542 				 unsigned *dims, void *targ_mem_desc,
1543 				 struct goacc_asyncqueue *aq)
1544 {
1545   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1546 
1547   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1548   acc_prof_info *prof_info = thr->prof_info;
1549   acc_event_info data_event_info;
1550   acc_api_info *api_info = thr->api_info;
1551   bool profiling_p = __builtin_expect (prof_info != NULL, false);
1552 
1553   void **hp = NULL;
1554   CUdeviceptr dp = 0;
1555   void **block = NULL;
1556 
1557   if (mapnum > 0)
1558     {
1559       size_t s = mapnum * sizeof (void *);
1560       block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1561       hp = block + 2;
1562       for (int i = 0; i < mapnum; i++)
1563 	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1564       CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1565       if (profiling_p)
1566 	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1567     }
1568 
1569   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1570      fact have the same value on a unified-memory system).  */
1571   if (mapnum > 0)
1572     {
1573       if (profiling_p)
1574 	{
1575 	  prof_info->event_type = acc_ev_enqueue_upload_start;
1576 
1577 	  data_event_info.data_event.event_type = prof_info->event_type;
1578 	  data_event_info.data_event.valid_bytes
1579 	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1580 	  data_event_info.data_event.parent_construct
1581 	    = acc_construct_parallel;
1582 	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
1583 	  data_event_info.data_event.tool_info = NULL;
1584 	  data_event_info.data_event.var_name = NULL;
1585 	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
1586 	  data_event_info.data_event.host_ptr = hp;
1587 	  data_event_info.data_event.device_ptr = (const void *) dp;
1588 
1589 	  api_info->device_api = acc_device_api_cuda;
1590 
1591 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1592 						api_info);
1593 	}
1594 
1595       CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1596 			mapnum * sizeof (void *), aq->cuda_stream);
1597       block[0] = (void *) dp;
1598 
1599       struct nvptx_thread *nvthd =
1600 	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1601       block[1] = (void *) nvthd->ptx_dev;
1602 
1603       if (profiling_p)
1604 	{
1605 	  prof_info->event_type = acc_ev_enqueue_upload_end;
1606 	  data_event_info.data_event.event_type = prof_info->event_type;
1607 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1608 						api_info);
1609 	}
1610     }
1611 
1612   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1613 	      dp, aq->cuda_stream);
1614 
1615   if (mapnum > 0)
1616     GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1617 }
1618 
1619 void *
1620 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1621 {
1622   struct ptx_device *ptx_dev;
1623   struct nvptx_thread *nvthd
1624     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1625   CUcontext thd_ctx;
1626 
1627   ptx_dev = ptx_devices[ord];
1628 
1629   assert (ptx_dev);
1630 
1631   CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1632 
1633   assert (ptx_dev->ctx);
1634 
1635   if (!thd_ctx)
1636     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1637 
1638   nvthd->ptx_dev = ptx_dev;
1639 
1640   return (void *) nvthd;
1641 }
1642 
1643 void
1644 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1645 {
1646   free (data);
1647 }
1648 
1649 void *
1650 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1651 {
1652   return nvptx_get_current_cuda_device ();
1653 }
1654 
1655 void *
1656 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1657 {
1658   return nvptx_get_current_cuda_context ();
1659 }
1660 
1661 /* This returns a CUstream.  */
1662 void *
1663 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1664 {
1665   return (void *) aq->cuda_stream;
1666 }
1667 
1668 /* This takes a CUstream.  */
1669 int
1670 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1671 {
1672   if (aq->cuda_stream)
1673     {
1674       CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1675       CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1676     }
1677 
1678   aq->cuda_stream = (CUstream) stream;
1679   return 1;
1680 }
1681 
1682 struct goacc_asyncqueue *
1683 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1684 {
1685   CUstream stream = NULL;
1686   CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1687 
1688   struct goacc_asyncqueue *aq
1689     = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1690   aq->cuda_stream = stream;
1691   return aq;
1692 }
1693 
1694 bool
1695 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1696 {
1697   CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1698   free (aq);
1699   return true;
1700 }
1701 
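/* Poll the CUDA stream of AQ: return 1 if all enqueued work has completed,
   0 if work is still pending, and -1 (after reporting an error) if the
   stream query failed.  */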
1702 int
1703 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1704 {
1705   CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1706   if (r == CUDA_SUCCESS)
1707     return 1;
1708   if (r == CUDA_ERROR_NOT_READY)
1709     return 0;
1710 
1711   GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1712   return -1;
1713 }
1714 
1715 bool
1716 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1717 {
1718   CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1719   return true;
1720 }
1721 
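/* Serialize AQ2 after AQ1: record an event on AQ1's CUDA stream and make
   AQ2's stream wait for it, so that work enqueued on AQ2 afterwards does not
   start until everything currently enqueued on AQ1 has finished.  */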
1722 bool
1723 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1724 				      struct goacc_asyncqueue *aq2)
1725 {
1726   CUevent e;
1727   CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1728   CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1729   CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1730   return true;
1731 }
1732 
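/* Wrapper invoked by the CUDA driver for callbacks registered with
   cuStreamAddCallback: abort if the driver reports an error for the stream,
   otherwise run the user-supplied callback and free the bookkeeping
   record.  */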
1733 static void
1734 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1735 {
1736   if (res != CUDA_SUCCESS)
1737     GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1738   struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1739   cb->fn (cb->ptr);
1740   free (ptr);
1741 }
1742 
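/* Enqueue CALLBACK_FN (USERPTR) on the CUDA stream of AQ; the driver runs it
   via cuda_callback_wrapper once all work previously enqueued on the stream
   has completed.  */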
1743 void
1744 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1745 					   void (*callback_fn)(void *),
1746 					   void *userptr)
1747 {
1748   struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1749   b->fn = callback_fn;
1750   b->ptr = userptr;
1751   b->aq = aq;
1752   CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1753 		    cuda_callback_wrapper, (void *) b, 0);
1754 }
1755 
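/* Sanity-check a copy of S bytes between host address H and device address
   D.  A zero-sized copy is always accepted; otherwise reject a null device
   pointer, a device address unknown to the CUDA driver, a null host pointer,
   identical host and device addresses, and a size that runs past the end of
   the device allocation.  Returns true if the copy looks valid, false (with
   an error reported) otherwise.  */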
1756 static bool
1757 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1758 {
1759   CUdeviceptr pb;
1760   size_t ps;
1761   if (!s)
1762     return true;
1763   if (!d)
1764     {
1765       GOMP_PLUGIN_error ("invalid device address");
1766       return false;
1767     }
1768   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1769   if (!pb)
1770     {
1771       GOMP_PLUGIN_error ("invalid device address");
1772       return false;
1773     }
1774   if (!h)
1775     {
1776       GOMP_PLUGIN_error ("invalid host address");
1777       return false;
1778     }
1779   if (d == h)
1780     {
1781       GOMP_PLUGIN_error ("invalid host or device address");
1782       return false;
1783     }
1784   if ((void *)(d + s) > (void *)(pb + ps))
1785     {
1786       GOMP_PLUGIN_error ("invalid size");
1787       return false;
1788     }
1789   return true;
1790 }
1791 
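/* Synchronously copy N bytes from host SRC to device DST on device ORD.  */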
1792 bool
1793 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1794 {
1795   if (!nvptx_attach_host_thread_to_device (ord)
1796       || !cuda_memcpy_sanity_check (src, dst, n))
1797     return false;
1798   CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1799   return true;
1800 }
1801 
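/* Synchronously copy N bytes from device SRC to host DST on device ORD.  */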
1802 bool
1803 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1804 {
1805   if (!nvptx_attach_host_thread_to_device (ord)
1806       || !cuda_memcpy_sanity_check (dst, src, n))
1807     return false;
1808   CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1809   return true;
1810 }
1811 
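/* Copy N bytes between two addresses on device ORD.  The copy is enqueued on
   the default (NULL) stream and is not explicitly synchronized here.  */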
1812 bool
1813 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1814 {
1815   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1816   return true;
1817 }
1818 
1819 bool
1820 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1821 				     size_t n, struct goacc_asyncqueue *aq)
1822 {
1823   if (!nvptx_attach_host_thread_to_device (ord)
1824       || !cuda_memcpy_sanity_check (src, dst, n))
1825     return false;
1826   CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1827   return true;
1828 }
1829 
1830 bool
1831 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1832 				     size_t n, struct goacc_asyncqueue *aq)
1833 {
1834   if (!nvptx_attach_host_thread_to_device (ord)
1835       || !cuda_memcpy_sanity_check (dst, src, n))
1836     return false;
1837   CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1838   return true;
1839 }
1840 
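/* Implement the OpenACC device property queries for device N: total memory,
   currently free memory, device name, vendor, and driver version.  Returns a
   zero-initialized value if N is out of range or the query fails.  */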
1841 union goacc_property_value
1842 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1843 {
1844   union goacc_property_value propval = { .val = 0 };
1845 
1846   pthread_mutex_lock (&ptx_dev_lock);
1847 
1848   if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1849     {
1850       pthread_mutex_unlock (&ptx_dev_lock);
1851       return propval;
1852     }
1853 
1854   struct ptx_device *ptx_dev = ptx_devices[n];
1855   switch (prop)
1856     {
1857     case GOACC_PROPERTY_MEMORY:
1858       {
1859 	size_t total_mem;
1860 
1861 	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1862 	propval.val = total_mem;
1863       }
1864       break;
1865     case GOACC_PROPERTY_FREE_MEMORY:
1866       {
1867 	size_t total_mem;
1868 	size_t free_mem;
1869 	CUdevice ctxdev;
1870 
1871 	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1872 	if (ptx_dev->dev == ctxdev)
1873 	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1874 	else if (ptx_dev->ctx)
1875 	  {
1876 	    CUcontext old_ctx;
1877 
1878 	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1879 	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1880 	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1881 	  }
1882 	else
1883 	  {
1884 	    CUcontext new_ctx;
1885 
1886 	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1887 			    ptx_dev->dev);
1888 	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1889 	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1890 	  }
1891 	propval.val = free_mem;
1892       }
1893       break;
1894     case GOACC_PROPERTY_NAME:
1895       propval.ptr = ptx_dev->name;
1896       break;
1897     case GOACC_PROPERTY_VENDOR:
1898       propval.ptr = "Nvidia";
1899       break;
1900     case GOACC_PROPERTY_DRIVER:
1901       propval.ptr = cuda_driver_version_s;
1902       break;
1903     default:
1904       break;
1905     }
1906 
1907   pthread_mutex_unlock (&ptx_dev_lock);
1908   return propval;
1909 }
1910 
1911 /* Adjust launch dimensions: pick good values for the number of blocks and
1912    warps, and ensure that the number of warps exceeds neither CUDA's limits
1913    nor GCC's own limits.  */
1914 
1915 static void
1916 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1917 			    struct ptx_device *ptx_dev,
1918 			    int *teams_p, int *threads_p)
1919 {
1920   int max_warps_block = fn->max_threads_per_block / 32;
1921 	  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
1922 	     backend and libgcc; it matches the documented limit of all GPUs as of 2015.  */
1923   if (max_warps_block > 32)
1924     max_warps_block = 32;
1925   if (*threads_p <= 0)
1926     *threads_p = 8;
1927   if (*threads_p > max_warps_block)
1928     *threads_p = max_warps_block;
1929 
1930   int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1931 	  /* This is an estimate of how many blocks the device can host simultaneously.
1932 	     The actual limit, which may be lower, can be queried via the "occupancy
1933 	     control" driver interface (available since CUDA 6.0).  */
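  /* A hypothetical example: with 65536 registers per SM, 80 SMs, a kernel
     using 40 registers per thread, and 8 warps per block, regs_per_block is
     40 * 32 * 8 = 10240, so the estimate is 65536 / 10240 = 6 resident
     blocks per SM, i.e. max_blocks = 6 * 80 = 480.  */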
1934   int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1935   if (*teams_p <= 0 || *teams_p > max_blocks)
1936     *teams_p = max_blocks;
1937 }
1938 
1939 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1940    target regions.  */
1941 
1942 static size_t
1943 nvptx_stacks_size ()
1944 {
1945   return 128 * 1024;
1946 }
1947 
1948 /* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
1949    the storage should be held on entry, and remains held on exit.  */
1950 
1951 static void *
1952 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
1953 {
1954   if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
1955     return (void *) ptx_dev->omp_stacks.ptr;
1956 
1957   /* Free the old, too-small stacks.  */
1958   if (ptx_dev->omp_stacks.ptr)
1959     {
1960       CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1961       if (r != CUDA_SUCCESS)
1962 	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
1963       r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1964       if (r != CUDA_SUCCESS)
1965 	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1966     }
1967 
1968   /* Make new and bigger stacks, and remember where we put them and how big
1969      they are.  */
1970   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
1971 				  size * num);
1972   if (r != CUDA_SUCCESS)
1973     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1974 
1975   ptx_dev->omp_stacks.size = size * num;
1976 
1977   return (void *) ptx_dev->omp_stacks.ptr;
1978 }
1979 
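/* Run the offloaded OpenMP target region TGT_FN on device ORD with argument
   block TGT_VARS.  ARGS carries the num_teams and thread_limit values; the
   launch bounds are adjusted from them, soft-stack storage for all warps is
   acquired, and the kernel is launched as TEAMS blocks of 32 x THREADS
   threads and synchronized before returning.  */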
1980 void
1981 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1982 {
1983   struct targ_fn_descriptor *tgt_fn_desc
1984     = (struct targ_fn_descriptor *) tgt_fn;
1985   CUfunction function = tgt_fn_desc->fn;
1986   const struct targ_fn_launch *launch = tgt_fn_desc->launch;
1987   const char *fn_name = launch->fn;
1988   CUresult r;
1989   struct ptx_device *ptx_dev = ptx_devices[ord];
1990   const char *maybe_abort_msg = "(perhaps abort was called)";
1991   int teams = 0, threads = 0;
1992 
1993   if (!args)
1994     GOMP_PLUGIN_fatal ("No target arguments provided");
1995   while (*args)
1996     {
1997       intptr_t id = (intptr_t) *args++, val;
1998       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1999 	val = (intptr_t) *args++;
2000       else
2001         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2002       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2003 	continue;
2004       val = val > INT_MAX ? INT_MAX : val;
2005       id &= GOMP_TARGET_ARG_ID_MASK;
2006       if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2007 	teams = val;
2008       else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2009 	threads = val;
2010     }
2011   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2012 
2013   size_t stack_size = nvptx_stacks_size ();
2014 
2015   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2016   void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2017   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2018   size_t fn_args_size = sizeof fn_args;
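  /* Pass the kernel arguments as one packed buffer through the "extra"
     parameter of cuLaunchKernel, using the CU_LAUNCH_PARAM_BUFFER_POINTER /
     CU_LAUNCH_PARAM_BUFFER_SIZE protocol.  */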
2019   void *config[] = {
2020     CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2021     CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2022     CU_LAUNCH_PARAM_END
2023   };
2024   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
2025 		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2026 		     __FUNCTION__, fn_name, teams, threads);
2027   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2028 			 32, threads, 1, 0, NULL, NULL, config);
2029   if (r != CUDA_SUCCESS)
2030     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2031 
2032   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2033   if (r == CUDA_ERROR_LAUNCH_FAILED)
2034     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2035 		       maybe_abort_msg);
2036   else if (r != CUDA_SUCCESS)
2037     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2038 
2039   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2040 }
2041 
2042 /* TODO: Implement GOMP_OFFLOAD_async_run. */
2043