xref: /netbsd-src/external/gpl3/gcc.old/dist/libgomp/plugin/plugin-nvptx.c (revision 8feb0f0b7eaff0608f8350bbfa3098827b4bb91b)
1 /* Plugin for NVPTX execution.
2 
3    Copyright (C) 2013-2020 Free Software Foundation, Inc.
4 
5    Contributed by Mentor Embedded.
6 
7    This file is part of the GNU Offloading and Multi Processing Library
8    (libgomp).
9 
10    Libgomp is free software; you can redistribute it and/or modify it
11    under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 3, or (at your option)
13    any later version.
14 
15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
18    more details.
19 
20    Under Section 7 of GPL version 3, you are granted additional
21    permissions described in the GCC Runtime Library Exception, version
22    3.1, as published by the Free Software Foundation.
23 
24    You should have received a copy of the GNU General Public License and
25    a copy of the GCC Runtime Library Exception along with this program;
26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
27    <http://www.gnu.org/licenses/>.  */
28 
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30    library appears to hold some implicit state, but the documentation
31    is not clear as to what that state might be, or how one might
32    propagate it from one thread to another.  */
33 
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
40 #include "oacc-int.h"
41 
42 #include <pthread.h>
43 #include <cuda.h>
44 #include <stdbool.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
51 
52 #if CUDA_VERSION < 6000
53 extern CUresult cuGetErrorString (CUresult, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
55 #endif
56 
57 #if CUDA_VERSION >= 6050
58 #undef cuLinkCreate
59 #undef cuLinkAddData
60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
61 			const char *, unsigned, CUjit_option *, void **);
62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
63 #else
64 typedef size_t (*CUoccupancyB2DSize)(int);
65 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
66 			   const char *, unsigned, CUjit_option *, void **);
67 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
68 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
69 					  CUoccupancyB2DSize, size_t, int);
70 #endif
71 
72 #define DO_PRAGMA(x) _Pragma (#x)
73 
74 #if PLUGIN_NVPTX_DYNAMIC
75 # include <dlfcn.h>
76 
77 struct cuda_lib_s {
78 
79 # define CUDA_ONE_CALL(call)			\
80   __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call)		\
82   CUDA_ONE_CALL (call)
83 #include "cuda-lib.def"
84 # undef CUDA_ONE_CALL
85 # undef CUDA_ONE_CALL_MAYBE_NULL
86 
87 } cuda_lib;
88 
89 /* -1 if init_cuda_lib has not been called yet, false
90    if it has been and failed, true if it has been and succeeded.  */
91 static signed char cuda_lib_inited = -1;
92 
93 /* Dynamically load the CUDA runtime library and initialize function
94    pointers, return false if unsuccessful, true if successful.  */
95 static bool
96 init_cuda_lib (void)
97 {
98   if (cuda_lib_inited != -1)
99     return cuda_lib_inited;
100   const char *cuda_runtime_lib = "libcuda.so.1";
101   void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
102   cuda_lib_inited = false;
103   if (h == NULL)
104     return false;
105 
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null)		\
109   cuda_lib.call = dlsym (h, #call);	\
110   if (!allow_null && cuda_lib.call == NULL)		\
111     return false;
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
116 
117   cuda_lib_inited = true;
118   return true;
119 }
120 # define CUDA_CALL_PREFIX cuda_lib.
121 #else
122 
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
127 #undef CUDA_ONE_CALL
128 
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
131 #endif
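/* Illustrative sketch (not in the upstream file): given a hypothetical
   cuda-lib.def containing just

       CUDA_ONE_CALL (cuMemAlloc)
       CUDA_ONE_CALL_MAYBE_NULL (cuGetErrorString)

   the PLUGIN_NVPTX_DYNAMIC case above expands to roughly

       struct cuda_lib_s {
	 __typeof (cuMemAlloc) *cuMemAlloc;
	 __typeof (cuGetErrorString) *cuGetErrorString;
       } cuda_lib;

   with init_cuda_lib resolving each member via dlsym and failing only when
   an entry not marked MAYBE_NULL is missing.  In the static case the same
   .def file merely emits '#pragma weak' for the MAYBE_NULL entries, so
   their availability can still be tested at run time.  */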
132 
133 #include "secure_getenv.h"
134 
135 #undef MIN
136 #undef MAX
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
139 
140 /* Convenience macros for the frequently used CUDA library call and
141    error handling sequence as well as CUDA library calls that
142    do the error checking themselves or don't do it at all.  */
143 
144 #define CUDA_CALL_ERET(ERET, FN, ...)		\
145   do {						\
146     unsigned __r				\
147       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
148     if (__r != CUDA_SUCCESS)			\
149       {						\
150 	GOMP_PLUGIN_error (#FN " error: %s",	\
151 			   cuda_error (__r));	\
152 	return ERET;				\
153       }						\
154   } while (0)
155 
156 #define CUDA_CALL(FN, ...)			\
157   CUDA_CALL_ERET (false, FN, __VA_ARGS__)
158 
159 #define CUDA_CALL_ASSERT(FN, ...)		\
160   do {						\
161     unsigned __r				\
162       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
163     if (__r != CUDA_SUCCESS)			\
164       {						\
165 	GOMP_PLUGIN_fatal (#FN " error: %s",	\
166 			   cuda_error (__r));	\
167       }						\
168   } while (0)
169 
170 #define CUDA_CALL_NOCHECK(FN, ...)		\
171   CUDA_CALL_PREFIX FN (__VA_ARGS__)
172 
173 #define CUDA_CALL_EXISTS(FN)			\
174   CUDA_CALL_PREFIX FN
175 
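/* Usage sketch for the macros above (illustrative only, not part of the
   upstream file): CUDA_CALL reports the error and returns false from the
   enclosing function, CUDA_CALL_ASSERT aborts via GOMP_PLUGIN_fatal, and
   CUDA_CALL_NOCHECK leaves the CUresult for the caller to inspect.  */
#if 0
static bool
example_alloc_and_free (CUdeviceptr *d, size_t s)
{
  CUDA_CALL (cuMemAlloc, d, s);		/* On failure: report + return false.  */
  CUDA_CALL_ASSERT (cuStreamSynchronize, NULL);	/* On failure: fatal.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, *d);  /* Caller checks R.  */
  return r == CUDA_SUCCESS;
}
#endif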
176 static const char *
177 cuda_error (CUresult r)
178 {
179   const char *fallback = "unknown cuda error";
180   const char *desc;
181 
182   if (!CUDA_CALL_EXISTS (cuGetErrorString))
183     return fallback;
184 
185   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
186   if (r == CUDA_SUCCESS)
187     return desc;
188 
189   return fallback;
190 }
191 
192 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
193    Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
194 static char cuda_driver_version_s[30];
195 
196 static unsigned int instantiated_devices = 0;
197 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
198 
199 /* NVPTX/CUDA specific definition of asynchronous queues.  */
200 struct goacc_asyncqueue
201 {
202   CUstream cuda_stream;
203 };
204 
205 struct nvptx_callback
206 {
207   void (*fn) (void *);
208   void *ptr;
209   struct goacc_asyncqueue *aq;
210   struct nvptx_callback *next;
211 };
212 
213 /* Thread-specific data for PTX.  */
214 
215 struct nvptx_thread
216 {
217   /* We currently have this embedded inside the plugin because libgomp manages
218      devices through integer target_ids.  This might be better if using an
219      opaque target-specific pointer directly from gomp_device_descr.  */
220   struct ptx_device *ptx_dev;
221 };
222 
223 /* Target data function launch information.  */
224 
225 struct targ_fn_launch
226 {
227   const char *fn;
228   unsigned short dim[GOMP_DIM_MAX];
229 };
230 
231 /* Target PTX object information.  */
232 
233 struct targ_ptx_obj
234 {
235   const char *code;
236   size_t size;
237 };
238 
239 /* Target data image information.  */
240 
241 typedef struct nvptx_tdata
242 {
243   const struct targ_ptx_obj *ptx_objs;
244   unsigned ptx_num;
245 
246   const char *const *var_names;
247   unsigned var_num;
248 
249   const struct targ_fn_launch *fn_descs;
250   unsigned fn_num;
251 } nvptx_tdata_t;
252 
253 /* Descriptor of a loaded function.  */
254 
255 struct targ_fn_descriptor
256 {
257   CUfunction fn;
258   const struct targ_fn_launch *launch;
259   int regs_per_thread;
260   int max_threads_per_block;
261 };
262 
263 /* A loaded PTX image.  */
264 struct ptx_image_data
265 {
266   const void *target_data;
267   CUmodule module;
268 
269   struct targ_fn_descriptor *fns;  /* Array of functions.  */
270 
271   struct ptx_image_data *next;
272 };
273 
274 struct ptx_free_block
275 {
276   void *ptr;
277   struct ptx_free_block *next;
278 };
279 
280 struct ptx_device
281 {
282   CUcontext ctx;
283   bool ctx_shared;
284   CUdevice dev;
285 
286   int ord;
287   bool overlap;
288   bool map;
289   bool concur;
290   bool mkern;
291   int mode;
292   int clock_khz;
293   int num_sms;
294   int regs_per_block;
295   int regs_per_sm;
296   int warp_size;
297   int max_threads_per_block;
298   int max_threads_per_multiprocessor;
299   int default_dims[GOMP_DIM_MAX];
300 
301   /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
302   char name[256];
303 
304   struct ptx_image_data *images;  /* Images loaded on device.  */
305   pthread_mutex_t image_lock;     /* Lock for above list.  */
306 
307   struct ptx_free_block *free_blocks;
308   pthread_mutex_t free_blocks_lock;
309 
310   struct ptx_device *next;
311 };
312 
313 static struct ptx_device **ptx_devices;
314 
315 static inline struct nvptx_thread *
316 nvptx_thread (void)
317 {
318   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
319 }
320 
321 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
322    should be locked on entry and remains locked on exit.  */
323 
324 static bool
325 nvptx_init (void)
326 {
327   int ndevs;
328 
329   if (instantiated_devices != 0)
330     return true;
331 
332   if (!init_cuda_lib ())
333     return false;
334 
335   CUDA_CALL (cuInit, 0);
336 
337   int cuda_driver_version;
338   CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
339   snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
340 	    "CUDA Driver %u.%u",
341 	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
342 
343   CUDA_CALL (cuDeviceGetCount, &ndevs);
344   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
345 					    * ndevs);
346 
347   return true;
348 }
349 
350 /* Select the N'th PTX device for the current host thread.  The device must
351    have been opened before calling this function.  */
352 
353 static bool
354 nvptx_attach_host_thread_to_device (int n)
355 {
356   CUdevice dev;
357   CUresult r;
358   struct ptx_device *ptx_dev;
359   CUcontext thd_ctx;
360 
361   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
362   if (r == CUDA_ERROR_NOT_PERMITTED)
363     {
364       /* Assume we're in a CUDA callback, just return true.  */
365       return true;
366     }
367   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
368     {
369       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
370       return false;
371     }
372 
373   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
374     return true;
375   else
376     {
377       CUcontext old_ctx;
378 
379       ptx_dev = ptx_devices[n];
380       if (!ptx_dev)
381 	{
382 	  GOMP_PLUGIN_error ("device %d not found", n);
383 	  return false;
384 	}
385 
386       CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
387 
388       /* We don't necessarily have a current context (e.g. if it has been
389          destroyed).  Pop it if we do, though.  */
390       if (thd_ctx != NULL)
391 	CUDA_CALL (cuCtxPopCurrent, &old_ctx);
392 
393       CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
394     }
395   return true;
396 }
397 
398 static struct ptx_device *
399 nvptx_open_device (int n)
400 {
401   struct ptx_device *ptx_dev;
402   CUdevice dev, ctx_dev;
403   CUresult r;
404   int async_engines, pi;
405 
406   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
407 
408   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
409 
410   ptx_dev->ord = n;
411   ptx_dev->dev = dev;
412   ptx_dev->ctx_shared = false;
413 
414   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
415   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
416     {
417       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
418       return NULL;
419     }
420 
421   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
422     {
423       /* The current host thread has an active context for a different device.
424          Detach it.  */
425       CUcontext old_ctx;
426       CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
427     }
428 
429   CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
430 
431   if (!ptx_dev->ctx)
432     CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
433   else
434     ptx_dev->ctx_shared = true;
435 
436   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
437 		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
438   ptx_dev->overlap = pi;
439 
440   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
441 		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
442   ptx_dev->map = pi;
443 
444   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
445 		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
446   ptx_dev->concur = pi;
447 
448   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
449 		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
450   ptx_dev->mode = pi;
451 
452   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
453 		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
454   ptx_dev->mkern = pi;
455 
456   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
457 		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
458   ptx_dev->clock_khz = pi;
459 
460   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
461 		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
462   ptx_dev->num_sms = pi;
463 
464   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
465 		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
466   ptx_dev->regs_per_block = pi;
467 
468   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
469      in CUDA 6.0 and newer.  */
470   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
471 			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
472 			 dev);
473   /* Fallback: use limit of registers per block, which is usually equal.  */
474   if (r == CUDA_ERROR_INVALID_VALUE)
475     pi = ptx_dev->regs_per_block;
476   else if (r != CUDA_SUCCESS)
477     {
478       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
479       return NULL;
480     }
481   ptx_dev->regs_per_sm = pi;
482 
483   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
484 		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
485   if (pi != 32)
486     {
487       GOMP_PLUGIN_error ("Only warp size 32 is supported");
488       return NULL;
489     }
490   ptx_dev->warp_size = pi;
491 
492   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
493 		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
494   ptx_dev->max_threads_per_block = pi;
495 
496   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
497 		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
498   ptx_dev->max_threads_per_multiprocessor = pi;
499 
500   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
501 			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
502   if (r != CUDA_SUCCESS)
503     async_engines = 1;
504 
505   for (int i = 0; i != GOMP_DIM_MAX; i++)
506     ptx_dev->default_dims[i] = 0;
507 
508   CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
509 		  dev);
510 
511   ptx_dev->images = NULL;
512   pthread_mutex_init (&ptx_dev->image_lock, NULL);
513 
514   ptx_dev->free_blocks = NULL;
515   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
516 
517   return ptx_dev;
518 }
519 
520 static bool
521 nvptx_close_device (struct ptx_device *ptx_dev)
522 {
523   if (!ptx_dev)
524     return true;
525 
526   for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
527     {
528       struct ptx_free_block *b_next = b->next;
529       CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
530       free (b);
531       b = b_next;
532     }
533 
534   pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
535   pthread_mutex_destroy (&ptx_dev->image_lock);
536 
537   if (!ptx_dev->ctx_shared)
538     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
539 
540   free (ptx_dev);
541   return true;
542 }
543 
544 static int
545 nvptx_get_num_devices (void)
546 {
547   int n;
548 
549   /* This function will be called before the plugin has been initialized in
550      order to enumerate available devices, but CUDA API routines can't be used
551      until cuInit has been called.  Just call it now (but don't yet do any
552      further initialization).  */
553   if (instantiated_devices == 0)
554     {
555       if (!init_cuda_lib ())
556 	return 0;
557       CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
558       /* This is not an error: e.g. we may have CUDA libraries installed but
559          no devices available.  */
560       if (r != CUDA_SUCCESS)
561 	{
562 	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
563 			     cuda_error (r));
564 	  return 0;
565 	}
566     }
567 
568   CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
569   return n;
570 }
571 
572 static void
573 notify_var (const char *var_name, const char *env_var)
574 {
575   if (env_var == NULL)
576     GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
577   else
578     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
579 }
580 
581 static void
582 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
583 {
584   const char *var_name = "GOMP_NVPTX_JIT";
585   const char *env_var = secure_getenv (var_name);
586   notify_var (var_name, env_var);
587 
588   if (env_var == NULL)
589     return;
590 
591   const char *c = env_var;
592   while (*c != '\0')
593     {
594       while (*c == ' ')
595 	c++;
596 
597       if (c[0] == '-' && c[1] == 'O'
598 	  && '0' <= c[2] && c[2] <= '4'
599 	  && (c[3] == '\0' || c[3] == ' '))
600 	{
601 	  *gomp_nvptx_o = c[2] - '0';
602 	  c += 3;
603 	  continue;
604 	}
605 
606       GOMP_PLUGIN_error ("Error parsing %s", var_name);
607       break;
608     }
609 }
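/* For example (illustrative, not in the upstream file): running with
   GOMP_NVPTX_JIT=-O2 in the environment makes link_ptx below pass PTX JIT
   optimization level 2 (CU_JIT_OPTIMIZATION_LEVEL) to cuLinkCreate; the
   accepted tokens are -O0 through -O4, separated by spaces.  */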
610 
611 static bool
612 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
613 	  unsigned num_objs)
614 {
615   CUjit_option opts[7];
616   void *optvals[7];
617   float elapsed = 0.0;
618   char elog[1024];
619   char ilog[16384];
620   CUlinkState linkstate;
621   CUresult r;
622   void *linkout;
623   size_t linkoutsize __attribute__ ((unused));
624 
625   opts[0] = CU_JIT_WALL_TIME;
626   optvals[0] = &elapsed;
627 
628   opts[1] = CU_JIT_INFO_LOG_BUFFER;
629   optvals[1] = &ilog[0];
630 
631   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
632   optvals[2] = (void *) sizeof ilog;
633 
634   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
635   optvals[3] = &elog[0];
636 
637   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
638   optvals[4] = (void *) sizeof elog;
639 
640   opts[5] = CU_JIT_LOG_VERBOSE;
641   optvals[5] = (void *) 1;
642 
643   static intptr_t gomp_nvptx_o = -1;
644 
645   static bool init_done = false;
646   if (!init_done)
647     {
648       process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
649       init_done = true;
650   }
651 
652   int nopts = 6;
653   if (gomp_nvptx_o != -1)
654     {
655       opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
656       optvals[nopts] = (void *) gomp_nvptx_o;
657       nopts++;
658     }
659 
660   if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
661     CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
662   else
663     CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
664 
665   for (; num_objs--; ptx_objs++)
666     {
667       /* cuLinkAddData's 'data' argument erroneously omits the const
668 	 qualifier.  */
669       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
670       if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
671 	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
672 			       (char *) ptx_objs->code, ptx_objs->size,
673 			       0, 0, 0, 0);
674       else
675 	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
676 			       (char *) ptx_objs->code, ptx_objs->size,
677 			       0, 0, 0, 0);
678       if (r != CUDA_SUCCESS)
679 	{
680 	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
681 	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
682 			     cuda_error (r));
683 	  return false;
684 	}
685     }
686 
687   GOMP_PLUGIN_debug (0, "Linking\n");
688   r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
689 
690   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
691   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
692 
693   if (r != CUDA_SUCCESS)
694     {
695       GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
696       return false;
697     }
698 
699   CUDA_CALL (cuModuleLoadData, module, linkout);
700   CUDA_CALL (cuLinkDestroy, linkstate);
701   return true;
702 }
703 
704 static void
705 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
706 	    unsigned *dims, void *targ_mem_desc,
707 	    CUdeviceptr dp, CUstream stream)
708 {
709   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
710   CUfunction function;
711   int i;
712   void *kargs[1];
713   struct nvptx_thread *nvthd = nvptx_thread ();
714   int warp_size = nvthd->ptx_dev->warp_size;
715 
716   function = targ_fn->fn;
717 
718   /* Initialize the launch dimensions.  Typically this is constant,
719      provided by the device compiler, but we must permit runtime
720      values.  */
721   int seen_zero = 0;
722   for (i = 0; i != GOMP_DIM_MAX; i++)
723     {
724       if (targ_fn->launch->dim[i])
725        dims[i] = targ_fn->launch->dim[i];
726       if (!dims[i])
727        seen_zero = 1;
728     }
729 
730   if (seen_zero)
731     {
732       pthread_mutex_lock (&ptx_dev_lock);
733 
734       static int gomp_openacc_dims[GOMP_DIM_MAX];
735       if (!gomp_openacc_dims[0])
736 	{
737 	  /* See if the user provided GOMP_OPENACC_DIM environment
738 	     variable to specify runtime defaults.  */
739 	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
740 	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
741 	}
742 
743       if (!nvthd->ptx_dev->default_dims[0])
744 	{
745 	  int default_dims[GOMP_DIM_MAX];
746 	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
747 	    default_dims[i] = gomp_openacc_dims[i];
748 
749 	  int gang, worker, vector;
750 	  {
751 	    int block_size = nvthd->ptx_dev->max_threads_per_block;
752 	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
753 	    int dev_size = nvthd->ptx_dev->num_sms;
754 	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
755 			       " dev_size=%d, cpu_size=%d\n",
756 			       warp_size, block_size, dev_size, cpu_size);
757 
758 	    gang = (cpu_size / block_size) * dev_size;
759 	    worker = block_size / warp_size;
760 	    vector = warp_size;
761 	  }
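	  /* Worked example with hypothetical hardware numbers (not from the
	     upstream file): with block_size = 1024, cpu_size = 2048,
	     dev_size = 80 and warp_size = 32 this computes
	     gang = (2048 / 1024) * 80 = 160, worker = 1024 / 32 = 32 and
	     vector = 32.  */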
762 
763 	  /* There is no upper bound on the gang size.  The best size
764 	     matches the hardware configuration.  Logical gangs are
765 	     scheduled onto physical hardware.  To maximize usage, we
766 	     should guess a large number.  */
767 	  if (default_dims[GOMP_DIM_GANG] < 1)
768 	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
769 	  /* The worker size must not exceed the hardware.  */
770 	  if (default_dims[GOMP_DIM_WORKER] < 1
771 	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
772 	    default_dims[GOMP_DIM_WORKER] = worker;
773 	  /* The vector size must exactly match the hardware.  */
774 	  if (default_dims[GOMP_DIM_VECTOR] < 1
775 	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
776 	    default_dims[GOMP_DIM_VECTOR] = vector;
777 
778 	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
779 			     default_dims[GOMP_DIM_GANG],
780 			     default_dims[GOMP_DIM_WORKER],
781 			     default_dims[GOMP_DIM_VECTOR]);
782 
783 	  for (i = 0; i != GOMP_DIM_MAX; i++)
784 	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
785 	}
786       pthread_mutex_unlock (&ptx_dev_lock);
787 
788       {
789 	bool default_dim_p[GOMP_DIM_MAX];
790 	for (i = 0; i != GOMP_DIM_MAX; i++)
791 	  default_dim_p[i] = !dims[i];
792 
793 	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
794 	  {
795 	    for (i = 0; i != GOMP_DIM_MAX; i++)
796 	      if (default_dim_p[i])
797 		dims[i] = nvthd->ptx_dev->default_dims[i];
798 
799 	    if (default_dim_p[GOMP_DIM_VECTOR])
800 	      dims[GOMP_DIM_VECTOR]
801 		= MIN (dims[GOMP_DIM_VECTOR],
802 		       (targ_fn->max_threads_per_block / warp_size
803 			* warp_size));
804 
805 	    if (default_dim_p[GOMP_DIM_WORKER])
806 	      dims[GOMP_DIM_WORKER]
807 		= MIN (dims[GOMP_DIM_WORKER],
808 		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
809 	  }
810 	else
811 	  {
812 	    /* Handle the case that the compiler allows the runtime to choose
813 	       the vector-length conservatively, by ignoring
814 	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
815 	       it.  */
816 	    int vectors = 0;
817 	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
818 	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
819 	       exceed targ_fn->max_threads_per_block. */
820 	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
821 	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
822 	    int grids, blocks;
823 
824 	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
825 			      &blocks, function, NULL, 0,
826 			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
827 	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
828 			       "grid = %d, block = %d\n", grids, blocks);
829 
830 	    /* Keep the num_gangs proportional to the block size.  In
831 	       the case where a block size is limited by shared memory
832 	       or the register file capacity, the runtime will not
833 	       excessively over-assign gangs to the multiprocessor
834 	       units if their state is going to be swapped out even
835 	       more than necessary. The constant factor 2 is there to
836 	       prevent threads from idling when there is insufficient
837 	       work for them.  */
838 	    if (gangs == 0)
839 	      gangs = 2 * grids * (blocks / warp_size);
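	    /* Illustrative example (not from the upstream file): if the
	       occupancy calculator suggests grids = 40 and blocks = 1024 on
	       a warp-size-32 device, this picks
	       gangs = 2 * 40 * (1024 / 32) = 2560.  */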
840 
841 	    if (vectors == 0)
842 	      vectors = warp_size;
843 
844 	    if (workers == 0)
845 	      {
846 		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
847 				      ? vectors
848 				      : dims[GOMP_DIM_VECTOR]);
849 		workers = blocks / actual_vectors;
850 		workers = MAX (workers, 1);
851 		/* If we need a per-worker barrier ... .  */
852 		if (actual_vectors > 32)
853 		  /* Don't use more barriers than available.  */
854 		  workers = MIN (workers, 15);
855 	      }
856 
857 	    for (i = 0; i != GOMP_DIM_MAX; i++)
858 	      if (default_dim_p[i])
859 		switch (i)
860 		  {
861 		  case GOMP_DIM_GANG: dims[i] = gangs; break;
862 		  case GOMP_DIM_WORKER: dims[i] = workers; break;
863 		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
864 		  default: GOMP_PLUGIN_fatal ("invalid dim");
865 		  }
866 	  }
867       }
868     }
869 
870   /* Check if the accelerator has sufficient hardware resources to
871      launch the offloaded kernel.  */
872   if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
873       > targ_fn->max_threads_per_block)
874     {
875       const char *msg
876 	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
877 	   " with num_workers = %d and vector_length = %d"
878 	   "; "
879 	   "recompile the program with 'num_workers = x and vector_length = y'"
880 	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
881 	   " x * y <= %d"
882 	   ".\n");
883       GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
884 			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
885     }
886 
887   /* Check if the accelerator has sufficient barrier resources to
888      launch the offloaded kernel.  */
889   if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
890     {
891       const char *msg
892 	= ("The Nvidia accelerator has insufficient barrier resources to launch"
893 	   " '%s' with num_workers = %d and vector_length = %d"
894 	   "; "
895 	   "recompile the program with 'num_workers = x' on that offloaded"
896 	   " region or '-fopenacc-dim=:x:' where x <= 15"
897 	   "; "
898 	   "or, recompile the program with 'vector_length = 32' on that"
899 	   " offloaded region or '-fopenacc-dim=::32'"
900 	   ".\n");
901 	GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
902 			   dims[GOMP_DIM_VECTOR]);
903     }
904 
905   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
906 		     " gangs=%u, workers=%u, vectors=%u\n",
907 		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
908 		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
909 
910   // OpenACC		CUDA
911   //
912   // num_gangs		nctaid.x
913   // num_workers	ntid.y
914   // vector length	ntid.x
915 
916   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
917   acc_prof_info *prof_info = thr->prof_info;
918   acc_event_info enqueue_launch_event_info;
919   acc_api_info *api_info = thr->api_info;
920   bool profiling_p = __builtin_expect (prof_info != NULL, false);
921   if (profiling_p)
922     {
923       prof_info->event_type = acc_ev_enqueue_launch_start;
924 
925       enqueue_launch_event_info.launch_event.event_type
926 	= prof_info->event_type;
927       enqueue_launch_event_info.launch_event.valid_bytes
928 	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
929       enqueue_launch_event_info.launch_event.parent_construct
930 	= acc_construct_parallel;
931       enqueue_launch_event_info.launch_event.implicit = 1;
932       enqueue_launch_event_info.launch_event.tool_info = NULL;
933       enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
934       enqueue_launch_event_info.launch_event.num_gangs
935 	= dims[GOMP_DIM_GANG];
936       enqueue_launch_event_info.launch_event.num_workers
937 	= dims[GOMP_DIM_WORKER];
938       enqueue_launch_event_info.launch_event.vector_length
939 	= dims[GOMP_DIM_VECTOR];
940 
941       api_info->device_api = acc_device_api_cuda;
942 
943       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
944 					    api_info);
945     }
946 
947   kargs[0] = &dp;
948   CUDA_CALL_ASSERT (cuLaunchKernel, function,
949 		    dims[GOMP_DIM_GANG], 1, 1,
950 		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
951 		    0, stream, kargs, 0);
952 
953   if (profiling_p)
954     {
955       prof_info->event_type = acc_ev_enqueue_launch_end;
956       enqueue_launch_event_info.launch_event.event_type
957 	= prof_info->event_type;
958       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
959 					    api_info);
960     }
961 
962   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
963 		     targ_fn->launch->fn);
964 }
965 
966 void * openacc_get_current_cuda_context (void);
967 
968 static void
969 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
970 {
971   acc_prof_info *prof_info = thr->prof_info;
972   acc_event_info data_event_info;
973   acc_api_info *api_info = thr->api_info;
974 
975   prof_info->event_type = acc_ev_alloc;
976 
977   data_event_info.data_event.event_type = prof_info->event_type;
978   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
979   data_event_info.data_event.parent_construct = acc_construct_parallel;
980   data_event_info.data_event.implicit = 1;
981   data_event_info.data_event.tool_info = NULL;
982   data_event_info.data_event.var_name = NULL;
983   data_event_info.data_event.bytes = s;
984   data_event_info.data_event.host_ptr = NULL;
985   data_event_info.data_event.device_ptr = dp;
986 
987   api_info->device_api = acc_device_api_cuda;
988 
989   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
990 }
991 
992 static void *
993 nvptx_alloc (size_t s)
994 {
995   CUdeviceptr d;
996 
997   CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
998   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
999   bool profiling_p
1000     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1001   if (profiling_p)
1002     goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1003 
1004   return (void *) d;
1005 }
1006 
1007 static void
1008 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1009 {
1010   acc_prof_info *prof_info = thr->prof_info;
1011   acc_event_info data_event_info;
1012   acc_api_info *api_info = thr->api_info;
1013 
1014   prof_info->event_type = acc_ev_free;
1015 
1016   data_event_info.data_event.event_type = prof_info->event_type;
1017   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1018   data_event_info.data_event.parent_construct = acc_construct_parallel;
1019   data_event_info.data_event.implicit = 1;
1020   data_event_info.data_event.tool_info = NULL;
1021   data_event_info.data_event.var_name = NULL;
1022   data_event_info.data_event.bytes = -1;
1023   data_event_info.data_event.host_ptr = NULL;
1024   data_event_info.data_event.device_ptr = p;
1025 
1026   api_info->device_api = acc_device_api_cuda;
1027 
1028   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1029 }
1030 
1031 static bool
1032 nvptx_free (void *p, struct ptx_device *ptx_dev)
1033 {
1034   CUdeviceptr pb;
1035   size_t ps;
1036 
1037   CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1038 				  (CUdeviceptr) p);
1039   if (r == CUDA_ERROR_NOT_PERMITTED)
1040     {
1041       /* We assume that this error indicates we are in a CUDA callback context,
1042 	 where no CUDA calls are allowed (see the cuStreamAddCallback
1043 	 documentation for details).  Arrange to free this piece of device
1044 	 memory later.  */
1045       struct ptx_free_block *n
1046 	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1047       n->ptr = p;
1048       pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1049       n->next = ptx_dev->free_blocks;
1050       ptx_dev->free_blocks = n;
1051       pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1052       return true;
1053     }
1054   else if (r != CUDA_SUCCESS)
1055     {
1056       GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1057       return false;
1058     }
1059   if ((CUdeviceptr) p != pb)
1060     {
1061       GOMP_PLUGIN_error ("invalid device address");
1062       return false;
1063     }
1064 
1065   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1066   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1067   bool profiling_p
1068     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1069   if (profiling_p)
1070     goacc_profiling_acc_ev_free (thr, p);
1071 
1072   return true;
1073 }
1074 
1075 static void *
1076 nvptx_get_current_cuda_device (void)
1077 {
1078   struct nvptx_thread *nvthd = nvptx_thread ();
1079 
1080   if (!nvthd || !nvthd->ptx_dev)
1081     return NULL;
1082 
1083   return &nvthd->ptx_dev->dev;
1084 }
1085 
1086 static void *
1087 nvptx_get_current_cuda_context (void)
1088 {
1089   struct nvptx_thread *nvthd = nvptx_thread ();
1090 
1091   if (!nvthd || !nvthd->ptx_dev)
1092     return NULL;
1093 
1094   return nvthd->ptx_dev->ctx;
1095 }
1096 
1097 /* Plugin entry points.  */
1098 
1099 const char *
1100 GOMP_OFFLOAD_get_name (void)
1101 {
1102   return "nvptx";
1103 }
1104 
1105 unsigned int
1106 GOMP_OFFLOAD_get_caps (void)
1107 {
1108   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1109 }
1110 
1111 int
1112 GOMP_OFFLOAD_get_type (void)
1113 {
1114   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1115 }
1116 
1117 int
1118 GOMP_OFFLOAD_get_num_devices (void)
1119 {
1120   return nvptx_get_num_devices ();
1121 }
1122 
1123 bool
1124 GOMP_OFFLOAD_init_device (int n)
1125 {
1126   struct ptx_device *dev;
1127 
1128   pthread_mutex_lock (&ptx_dev_lock);
1129 
1130   if (!nvptx_init () || ptx_devices[n] != NULL)
1131     {
1132       pthread_mutex_unlock (&ptx_dev_lock);
1133       return false;
1134     }
1135 
1136   dev = nvptx_open_device (n);
1137   if (dev)
1138     {
1139       ptx_devices[n] = dev;
1140       instantiated_devices++;
1141     }
1142 
1143   pthread_mutex_unlock (&ptx_dev_lock);
1144 
1145   return dev != NULL;
1146 }
1147 
1148 bool
1149 GOMP_OFFLOAD_fini_device (int n)
1150 {
1151   pthread_mutex_lock (&ptx_dev_lock);
1152 
1153   if (ptx_devices[n] != NULL)
1154     {
1155       if (!nvptx_attach_host_thread_to_device (n)
1156 	  || !nvptx_close_device (ptx_devices[n]))
1157 	{
1158 	  pthread_mutex_unlock (&ptx_dev_lock);
1159 	  return false;
1160 	}
1161       ptx_devices[n] = NULL;
1162       instantiated_devices--;
1163     }
1164 
1165   if (instantiated_devices == 0)
1166     {
1167       free (ptx_devices);
1168       ptx_devices = NULL;
1169     }
1170 
1171   pthread_mutex_unlock (&ptx_dev_lock);
1172   return true;
1173 }
1174 
1175 /* Return the libgomp version number we're compatible with.  There is
1176    no requirement for cross-version compatibility.  */
1177 
1178 unsigned
1179 GOMP_OFFLOAD_version (void)
1180 {
1181   return GOMP_VERSION;
1182 }
1183 
1184 /* Initialize __nvptx_clocktick, if present in MODULE.  */
1185 
1186 static void
1187 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1188 {
1189   CUdeviceptr dptr;
1190   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1191 				  module, "__nvptx_clocktick");
1192   if (r == CUDA_ERROR_NOT_FOUND)
1193     return;
1194   if (r != CUDA_SUCCESS)
1195     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1196   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1197   r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1198 			 sizeof (__nvptx_clocktick));
1199   if (r != CUDA_SUCCESS)
1200     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1201 }
1202 
1203 /* Load the (partial) program described by TARGET_DATA to device
1204    number ORD.  Allocate and return TARGET_TABLE.  */
1205 
1206 int
1207 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1208 			 struct addr_pair **target_table)
1209 {
1210   CUmodule module;
1211   const char *const *var_names;
1212   const struct targ_fn_launch *fn_descs;
1213   unsigned int fn_entries, var_entries, i, j;
1214   struct targ_fn_descriptor *targ_fns;
1215   struct addr_pair *targ_tbl;
1216   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1217   struct ptx_image_data *new_image;
1218   struct ptx_device *dev;
1219 
1220   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1221     {
1222       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1223 			 " (expected %u, received %u)",
1224 			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1225       return -1;
1226     }
1227 
1228   if (!nvptx_attach_host_thread_to_device (ord)
1229       || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1230     return -1;
1231 
1232   dev = ptx_devices[ord];
1233 
1234   /* The mkoffload utility emits a struct of pointers/integers at the
1235      start of each offload image.  The array of kernel names and the
1236      function addresses form a one-to-one correspondence.  */
1237 
1238   var_entries = img_header->var_num;
1239   var_names = img_header->var_names;
1240   fn_entries = img_header->fn_num;
1241   fn_descs = img_header->fn_descs;
1242 
1243   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1244 				 * (fn_entries + var_entries));
1245   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1246 				 * fn_entries);
1247 
1248   *target_table = targ_tbl;
1249 
1250   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1251   new_image->target_data = target_data;
1252   new_image->module = module;
1253   new_image->fns = targ_fns;
1254 
1255   pthread_mutex_lock (&dev->image_lock);
1256   new_image->next = dev->images;
1257   dev->images = new_image;
1258   pthread_mutex_unlock (&dev->image_lock);
1259 
1260   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1261     {
1262       CUfunction function;
1263       int nregs, mthrs;
1264 
1265       CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1266 		      fn_descs[i].fn);
1267       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1268 		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1269       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1270 		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1271 
1272       targ_fns->fn = function;
1273       targ_fns->launch = &fn_descs[i];
1274       targ_fns->regs_per_thread = nregs;
1275       targ_fns->max_threads_per_block = mthrs;
1276 
1277       targ_tbl->start = (uintptr_t) targ_fns;
1278       targ_tbl->end = targ_tbl->start + 1;
1279     }
1280 
1281   for (j = 0; j < var_entries; j++, targ_tbl++)
1282     {
1283       CUdeviceptr var;
1284       size_t bytes;
1285 
1286       CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1287 		      &var, &bytes, module, var_names[j]);
1288 
1289       targ_tbl->start = (uintptr_t) var;
1290       targ_tbl->end = targ_tbl->start + bytes;
1291     }
1292 
1293   nvptx_set_clocktick (module, dev);
1294 
1295   return fn_entries + var_entries;
1296 }
1297 
1298 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1299    function descriptors allocated by G_O_load_image.  */
1300 
1301 bool
1302 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1303 {
1304   struct ptx_image_data *image, **prev_p;
1305   struct ptx_device *dev = ptx_devices[ord];
1306 
1307   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1308     {
1309       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1310 			 " (expected %u, received %u)",
1311 			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1312       return false;
1313     }
1314 
1315   bool ret = true;
1316   pthread_mutex_lock (&dev->image_lock);
1317   for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1318     if (image->target_data == target_data)
1319       {
1320 	*prev_p = image->next;
1321 	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1322 	  ret = false;
1323 	free (image->fns);
1324 	free (image);
1325 	break;
1326       }
1327   pthread_mutex_unlock (&dev->image_lock);
1328   return ret;
1329 }
1330 
1331 void *
1332 GOMP_OFFLOAD_alloc (int ord, size_t size)
1333 {
1334   if (!nvptx_attach_host_thread_to_device (ord))
1335     return NULL;
1336 
1337   struct ptx_device *ptx_dev = ptx_devices[ord];
1338   struct ptx_free_block *blocks, *tmp;
1339 
1340   pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1341   blocks = ptx_dev->free_blocks;
1342   ptx_dev->free_blocks = NULL;
1343   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1344 
1345   while (blocks)
1346     {
1347       tmp = blocks->next;
1348       nvptx_free (blocks->ptr, ptx_dev);
1349       free (blocks);
1350       blocks = tmp;
1351     }
1352 
1353   return nvptx_alloc (size);
1354 }
1355 
1356 bool
1357 GOMP_OFFLOAD_free (int ord, void *ptr)
1358 {
1359   return (nvptx_attach_host_thread_to_device (ord)
1360 	  && nvptx_free (ptr, ptx_devices[ord]));
1361 }
1362 
1363 void
1364 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1365 			   void **hostaddrs, void **devaddrs,
1366 			   unsigned *dims, void *targ_mem_desc)
1367 {
1368   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1369 
1370   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1371   acc_prof_info *prof_info = thr->prof_info;
1372   acc_event_info data_event_info;
1373   acc_api_info *api_info = thr->api_info;
1374   bool profiling_p = __builtin_expect (prof_info != NULL, false);
1375 
1376   void **hp = NULL;
1377   CUdeviceptr dp = 0;
1378 
1379   if (mapnum > 0)
1380     {
1381       size_t s = mapnum * sizeof (void *);
1382       hp = alloca (s);
1383       for (int i = 0; i < mapnum; i++)
1384 	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1385       CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1386       if (profiling_p)
1387 	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1388     }
1389 
1390   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1391      fact have the same value on a unified-memory system).  */
1392   if (mapnum > 0)
1393     {
1394       if (profiling_p)
1395 	{
1396 	  prof_info->event_type = acc_ev_enqueue_upload_start;
1397 
1398 	  data_event_info.data_event.event_type = prof_info->event_type;
1399 	  data_event_info.data_event.valid_bytes
1400 	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1401 	  data_event_info.data_event.parent_construct
1402 	    = acc_construct_parallel;
1403 	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
1404 	  data_event_info.data_event.tool_info = NULL;
1405 	  data_event_info.data_event.var_name = NULL;
1406 	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
1407 	  data_event_info.data_event.host_ptr = hp;
1408 	  data_event_info.data_event.device_ptr = (const void *) dp;
1409 
1410 	  api_info->device_api = acc_device_api_cuda;
1411 
1412 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1413 						api_info);
1414 	}
1415       CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1416 			mapnum * sizeof (void *));
1417       if (profiling_p)
1418 	{
1419 	  prof_info->event_type = acc_ev_enqueue_upload_end;
1420 	  data_event_info.data_event.event_type = prof_info->event_type;
1421 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1422 						api_info);
1423 	}
1424     }
1425 
1426   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1427 	      dp, NULL);
1428 
1429   CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1430   const char *maybe_abort_msg = "(perhaps abort was called)";
1431   if (r == CUDA_ERROR_LAUNCH_FAILED)
1432     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1433 		       maybe_abort_msg);
1434   else if (r != CUDA_SUCCESS)
1435     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1436 
1437   CUDA_CALL_ASSERT (cuMemFree, dp);
1438   if (profiling_p)
1439     goacc_profiling_acc_ev_free (thr, (void *) dp);
1440 }
1441 
1442 static void
1443 cuda_free_argmem (void *ptr)
1444 {
1445   void **block = (void **) ptr;
1446   nvptx_free (block[0], (struct ptx_device *) block[1]);
1447   free (block);
1448 }
1449 
1450 void
1451 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1452 				 void **hostaddrs, void **devaddrs,
1453 				 unsigned *dims, void *targ_mem_desc,
1454 				 struct goacc_asyncqueue *aq)
1455 {
1456   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1457 
1458   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1459   acc_prof_info *prof_info = thr->prof_info;
1460   acc_event_info data_event_info;
1461   acc_api_info *api_info = thr->api_info;
1462   bool profiling_p = __builtin_expect (prof_info != NULL, false);
1463 
1464   void **hp = NULL;
1465   CUdeviceptr dp = 0;
1466   void **block = NULL;
1467 
1468   if (mapnum > 0)
1469     {
1470       size_t s = mapnum * sizeof (void *);
1471       block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1472       hp = block + 2;
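      /* Layout of BLOCK, which cuda_free_argmem releases once the kernel has
	 run: block[0] holds the device argument buffer DP and block[1] the
	 owning ptx_device (both filled in below); the host copy of the
	 argument pointers starts at block + 2.  */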
1473       for (int i = 0; i < mapnum; i++)
1474 	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1475       CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1476       if (profiling_p)
1477 	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1478     }
1479 
1480   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1481      fact have the same value on a unified-memory system).  */
1482   if (mapnum > 0)
1483     {
1484       if (profiling_p)
1485 	{
1486 	  prof_info->event_type = acc_ev_enqueue_upload_start;
1487 
1488 	  data_event_info.data_event.event_type = prof_info->event_type;
1489 	  data_event_info.data_event.valid_bytes
1490 	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1491 	  data_event_info.data_event.parent_construct
1492 	    = acc_construct_parallel;
1493 	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
1494 	  data_event_info.data_event.tool_info = NULL;
1495 	  data_event_info.data_event.var_name = NULL;
1496 	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
1497 	  data_event_info.data_event.host_ptr = hp;
1498 	  data_event_info.data_event.device_ptr = (const void *) dp;
1499 
1500 	  api_info->device_api = acc_device_api_cuda;
1501 
1502 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1503 						api_info);
1504 	}
1505 
1506       CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1507 			mapnum * sizeof (void *), aq->cuda_stream);
1508       block[0] = (void *) dp;
1509 
1510       struct nvptx_thread *nvthd =
1511 	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1512       block[1] = (void *) nvthd->ptx_dev;
1513 
1514       if (profiling_p)
1515 	{
1516 	  prof_info->event_type = acc_ev_enqueue_upload_end;
1517 	  data_event_info.data_event.event_type = prof_info->event_type;
1518 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1519 						api_info);
1520 	}
1521     }
1522 
1523   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1524 	      dp, aq->cuda_stream);
1525 
1526   if (mapnum > 0)
1527     GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1528 }
1529 
1530 void *
1531 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1532 {
1533   struct ptx_device *ptx_dev;
1534   struct nvptx_thread *nvthd
1535     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1536   CUcontext thd_ctx;
1537 
1538   ptx_dev = ptx_devices[ord];
1539 
1540   assert (ptx_dev);
1541 
1542   CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1543 
1544   assert (ptx_dev->ctx);
1545 
1546   if (!thd_ctx)
1547     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1548 
1549   nvthd->ptx_dev = ptx_dev;
1550 
1551   return (void *) nvthd;
1552 }
1553 
1554 void
1555 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1556 {
1557   free (data);
1558 }
1559 
1560 void *
1561 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1562 {
1563   return nvptx_get_current_cuda_device ();
1564 }
1565 
1566 void *
1567 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1568 {
1569   return nvptx_get_current_cuda_context ();
1570 }
1571 
1572 /* This returns a CUstream.  */
1573 void *
1574 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1575 {
1576   return (void *) aq->cuda_stream;
1577 }
1578 
1579 /* This takes a CUstream.  */
1580 int
1581 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1582 {
1583   if (aq->cuda_stream)
1584     {
1585       CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1586       CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1587     }
1588 
1589   aq->cuda_stream = (CUstream) stream;
1590   return 1;
1591 }
1592 
1593 struct goacc_asyncqueue *
1594 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1595 {
1596   CUstream stream = NULL;
1597   CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1598 
1599   struct goacc_asyncqueue *aq
1600     = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1601   aq->cuda_stream = stream;
1602   return aq;
1603 }
1604 
1605 bool
1606 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1607 {
1608   CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1609   free (aq);
1610   return true;
1611 }
1612 
1613 int
1614 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1615 {
1616   CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1617   if (r == CUDA_SUCCESS)
1618     return 1;
1619   if (r == CUDA_ERROR_NOT_READY)
1620     return 0;
1621 
1622   GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1623   return -1;
1624 }
1625 
1626 bool
1627 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1628 {
1629   CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1630   return true;
1631 }
1632 
1633 bool
1634 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1635 				      struct goacc_asyncqueue *aq2)
1636 {
1637   CUevent e;
1638   CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1639   CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1640   CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1641   return true;
1642 }
1643 
1644 static void
1645 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1646 {
1647   if (res != CUDA_SUCCESS)
1648     GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1649   struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1650   cb->fn (cb->ptr);
1651   free (ptr);
1652 }
1653 
1654 void
1655 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1656 					   void (*callback_fn)(void *),
1657 					   void *userptr)
1658 {
1659   struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1660   b->fn = callback_fn;
1661   b->ptr = userptr;
1662   b->aq = aq;
1663   CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1664 		    cuda_callback_wrapper, (void *) b, 0);
1665 }
1666 
1667 static bool
1668 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1669 {
1670   CUdeviceptr pb;
1671   size_t ps;
1672   if (!s)
1673     return true;
1674   if (!d)
1675     {
1676       GOMP_PLUGIN_error ("invalid device address");
1677       return false;
1678     }
1679   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1680   if (!pb)
1681     {
1682       GOMP_PLUGIN_error ("invalid device address");
1683       return false;
1684     }
1685   if (!h)
1686     {
1687       GOMP_PLUGIN_error ("invalid host address");
1688       return false;
1689     }
1690   if (d == h)
1691     {
1692       GOMP_PLUGIN_error ("invalid host or device address");
1693       return false;
1694     }
1695   if ((void *)(d + s) > (void *)(pb + ps))
1696     {
1697       GOMP_PLUGIN_error ("invalid size");
1698       return false;
1699     }
1700   return true;
1701 }
1702 
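/* Synchronous host-to-device copy: bind the calling host thread to device
   ORD, sanity-check the addresses, then issue a blocking cuMemcpyHtoD.
   GOMP_OFFLOAD_dev2host below is the mirror image.  */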
1703 bool
1704 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1705 {
1706   if (!nvptx_attach_host_thread_to_device (ord)
1707       || !cuda_memcpy_sanity_check (src, dst, n))
1708     return false;
1709   CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1710   return true;
1711 }
1712 
1713 bool
1714 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1715 {
1716   if (!nvptx_attach_host_thread_to_device (ord)
1717       || !cuda_memcpy_sanity_check (dst, src, n))
1718     return false;
1719   CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1720   return true;
1721 }
1722 
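/* Device-to-device copy: issue an asynchronous cuMemcpyDtoDAsync on the
   default (NULL) stream and return without waiting for it to complete.  */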
1723 bool
1724 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1725 {
1726   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1727   return true;
1728 }
1729 
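/* Asynchronous variants of the copy entry points: the same address checks
   are performed, but the copy is queued on AQ's CUDA stream and these
   functions do not wait for it to finish.  */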
1730 bool
1731 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1732 				     size_t n, struct goacc_asyncqueue *aq)
1733 {
1734   if (!nvptx_attach_host_thread_to_device (ord)
1735       || !cuda_memcpy_sanity_check (src, dst, n))
1736     return false;
1737   CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1738   return true;
1739 }
1740 
1741 bool
1742 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1743 				     size_t n, struct goacc_asyncqueue *aq)
1744 {
1745   if (!nvptx_attach_host_thread_to_device (ord)
1746       || !cuda_memcpy_sanity_check (dst, src, n))
1747     return false;
1748   CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1749   return true;
1750 }
1751 
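/* Back an OpenACC device-property query for device N.  The FREE_MEMORY case
   needs a current context for that device: reuse the current context if it
   already belongs to the device, temporarily push the device's own context
   if one exists, or otherwise create and destroy a throwaway context around
   the cuMemGetInfo call.  */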
1752 union goacc_property_value
1753 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1754 {
1755   union goacc_property_value propval = { .val = 0 };
1756 
1757   pthread_mutex_lock (&ptx_dev_lock);
1758 
1759   if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1760     {
1761       pthread_mutex_unlock (&ptx_dev_lock);
1762       return propval;
1763     }
1764 
1765   struct ptx_device *ptx_dev = ptx_devices[n];
1766   switch (prop)
1767     {
1768     case GOACC_PROPERTY_MEMORY:
1769       {
1770 	size_t total_mem;
1771 
1772 	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1773 	propval.val = total_mem;
1774       }
1775       break;
1776     case GOACC_PROPERTY_FREE_MEMORY:
1777       {
1778 	size_t total_mem;
1779 	size_t free_mem;
1780 	CUdevice ctxdev;
1781 
1782 	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1783 	if (ptx_dev->dev == ctxdev)
1784 	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1785 	else if (ptx_dev->ctx)
1786 	  {
1787 	    CUcontext old_ctx;
1788 
1789 	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1790 	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1791 	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1792 	  }
1793 	else
1794 	  {
1795 	    CUcontext new_ctx;
1796 
1797 	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1798 			    ptx_dev->dev);
1799 	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1800 	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1801 	  }
1802 	propval.val = free_mem;
1803       }
1804       break;
1805     case GOACC_PROPERTY_NAME:
1806       propval.ptr = ptx_dev->name;
1807       break;
1808     case GOACC_PROPERTY_VENDOR:
1809       propval.ptr = "Nvidia";
1810       break;
1811     case GOACC_PROPERTY_DRIVER:
1812       propval.ptr = cuda_driver_version_s;
1813       break;
1814     default:
1815       break;
1816     }
1817 
1818   pthread_mutex_unlock (&ptx_dev_lock);
1819   return propval;
1820 }
1821 
1822 /* Adjust launch dimensions: pick good values for the number of blocks and
1823    warps, and ensure that the number of warps does not exceed either the CUDA
1824    limits or GCC's own limits.  */
1825 
1826 static void
1827 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1828 			    struct ptx_device *ptx_dev,
1829 			    int *teams_p, int *threads_p)
1830 {
1831   int max_warps_block = fn->max_threads_per_block / 32;
1832   /* A maximum of 32 warps per block is an implementation limit in the NVPTX
1833      backend and libgcc; it matches the documented limit of all GPUs as of 2015.  */
1834   if (max_warps_block > 32)
1835     max_warps_block = 32;
1836   if (*threads_p <= 0)
1837     *threads_p = 8;
1838   if (*threads_p > max_warps_block)
1839     *threads_p = max_warps_block;
1840 
1841   int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1842   /* This is an estimate of how many blocks the device can host simultaneously.
1843      The actual limit, which may be lower, can be queried through the "occupancy
1844      control" driver interface (available since CUDA 6.0).  */
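  /* For illustration only, with made-up numbers rather than values queried
     from a real device: if fn->regs_per_thread is 32 and *threads_p has been
     clamped to 8 warps, regs_per_block is 32 * 32 * 8 == 8192 registers; with
     regs_per_sm == 65536 and num_sms == 80, max_blocks is 65536 / 8192 * 80
     == 640, and a non-positive or larger *teams_p is clamped to 640 below.  */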
1845   int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1846   if (*teams_p <= 0 || *teams_p > max_blocks)
1847     *teams_p = max_blocks;
1848 }
1849 
1850 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1851    target regions.  */
1852 
1853 static size_t
1854 nvptx_stacks_size ()
1855 {
1856   return 128 * 1024;
1857 }
1858 
1859 /* Return contiguous storage for NUM stacks, each SIZE bytes.  */
1860 
1861 static void *
1862 nvptx_stacks_alloc (size_t size, int num)
1863 {
1864   CUdeviceptr stacks;
1865   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
1866   if (r != CUDA_SUCCESS)
1867     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1868   return (void *) stacks;
1869 }
1870 
1871 /* Release storage previously allocated by nvptx_stacks_alloc.  */
1872 
1873 static void
1874 nvptx_stacks_free (void *p, int num)
1875 {
1876   CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
1877   if (r != CUDA_SUCCESS)
1878     GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1879 }
1880 
1881 void
1882 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1883 {
1884   CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
1885   CUresult r;
1886   struct ptx_device *ptx_dev = ptx_devices[ord];
1887   const char *maybe_abort_msg = "(perhaps abort was called)";
1888   int teams = 0, threads = 0;
1889 
1890   if (!args)
1891     GOMP_PLUGIN_fatal ("No target arguments provided");
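  /* Walk the NULL-terminated args array: each entry packs an identifier with
     either an immediate value (shifted into the same word) or a value held in
     the following entry when GOMP_TARGET_ARG_SUBSEQUENT_PARAM is set.  Only
     device-wide GOMP_TARGET_ARG_NUM_TEAMS and GOMP_TARGET_ARG_THREAD_LIMIT
     are honored here; everything else is skipped.  */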
1892   while (*args)
1893     {
1894       intptr_t id = (intptr_t) *args++, val;
1895       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1896 	val = (intptr_t) *args++;
1897       else
1898         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1899       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1900 	continue;
1901       val = val > INT_MAX ? INT_MAX : val;
1902       id &= GOMP_TARGET_ARG_ID_MASK;
1903       if (id == GOMP_TARGET_ARG_NUM_TEAMS)
1904 	teams = val;
1905       else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
1906 	threads = val;
1907     }
1908   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
1909 
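  /* The kernel is launched with blocks of 32 x THREADS threads, i.e. THREADS
     warps of 32 lanes each, so TEAMS * THREADS per-warp soft stacks are
     allocated and their base address and per-stack size are passed to the
     kernel alongside the target variables block.  */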
1910   size_t stack_size = nvptx_stacks_size ();
1911   void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
1912   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
1913   size_t fn_args_size = sizeof fn_args;
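  /* Kernel arguments are passed as a single packed parameter buffer through
     cuLaunchKernel's CU_LAUNCH_PARAM_BUFFER_POINTER / _BUFFER_SIZE "extra"
     mechanism rather than the kernelParams array, which is why the
     kernelParams argument in the launch below is NULL.  */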
1914   void *config[] = {
1915     CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
1916     CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
1917     CU_LAUNCH_PARAM_END
1918   };
1919   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
1920 			 32, threads, 1, 0, NULL, NULL, config);
1921   if (r != CUDA_SUCCESS)
1922     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
1923 
1924   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1925   if (r == CUDA_ERROR_LAUNCH_FAILED)
1926     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1927 		       maybe_abort_msg);
1928   else if (r != CUDA_SUCCESS)
1929     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1930   nvptx_stacks_free (stacks, teams * threads);
1931 }
1932 
1933 /* TODO: Implement GOMP_OFFLOAD_async_run. */
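/* One possible shape for it, sketched here only as a hypothetical outline
   and not something this version implements: mirror GOMP_OFFLOAD_run, but
   launch the kernel on a CUDA stream instead of synchronizing the whole
   context, and have a stream callback (e.g. registered with
   cuStreamAddCallback) call GOMP_PLUGIN_target_task_completion on the
   provided completion data once the kernel and the release of its soft
   stacks have finished.  */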
1934