/* Plugin for NVPTX execution.

   Copyright (C) 2013-2017 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>

#if PLUGIN_NVPTX_DYNAMIC
# include <dlfcn.h>

# define CUDA_CALLS \
CUDA_ONE_CALL (cuCtxCreate)		\
CUDA_ONE_CALL (cuCtxDestroy)		\
CUDA_ONE_CALL (cuCtxGetCurrent)		\
CUDA_ONE_CALL (cuCtxGetDevice)		\
CUDA_ONE_CALL (cuCtxPopCurrent)		\
CUDA_ONE_CALL (cuCtxPushCurrent)	\
CUDA_ONE_CALL (cuCtxSynchronize)	\
CUDA_ONE_CALL (cuDeviceGet)		\
CUDA_ONE_CALL (cuDeviceGetAttribute)	\
CUDA_ONE_CALL (cuDeviceGetCount)	\
CUDA_ONE_CALL (cuEventCreate)		\
CUDA_ONE_CALL (cuEventDestroy)		\
CUDA_ONE_CALL (cuEventElapsedTime)	\
CUDA_ONE_CALL (cuEventQuery)		\
CUDA_ONE_CALL (cuEventRecord)		\
CUDA_ONE_CALL (cuEventSynchronize)	\
CUDA_ONE_CALL (cuFuncGetAttribute)	\
CUDA_ONE_CALL (cuGetErrorString)	\
CUDA_ONE_CALL (cuInit)			\
CUDA_ONE_CALL (cuLaunchKernel)		\
CUDA_ONE_CALL (cuLinkAddData)		\
CUDA_ONE_CALL (cuLinkComplete)		\
CUDA_ONE_CALL (cuLinkCreate)		\
CUDA_ONE_CALL (cuLinkDestroy)		\
CUDA_ONE_CALL (cuMemAlloc)		\
CUDA_ONE_CALL (cuMemAllocHost)		\
CUDA_ONE_CALL (cuMemcpy)		\
CUDA_ONE_CALL (cuMemcpyDtoDAsync)	\
CUDA_ONE_CALL (cuMemcpyDtoH)		\
CUDA_ONE_CALL (cuMemcpyDtoHAsync)	\
CUDA_ONE_CALL (cuMemcpyHtoD)		\
CUDA_ONE_CALL (cuMemcpyHtoDAsync)	\
CUDA_ONE_CALL (cuMemFree)		\
CUDA_ONE_CALL (cuMemFreeHost)		\
CUDA_ONE_CALL (cuMemGetAddressRange)	\
CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
CUDA_ONE_CALL (cuModuleGetFunction)	\
CUDA_ONE_CALL (cuModuleGetGlobal)	\
CUDA_ONE_CALL (cuModuleLoad)		\
CUDA_ONE_CALL (cuModuleLoadData)	\
CUDA_ONE_CALL (cuModuleUnload)		\
CUDA_ONE_CALL (cuStreamCreate)		\
CUDA_ONE_CALL (cuStreamDestroy)		\
CUDA_ONE_CALL (cuStreamQuery)		\
CUDA_ONE_CALL (cuStreamSynchronize)	\
CUDA_ONE_CALL (cuStreamWaitEvent)
# define CUDA_ONE_CALL(call) \
  __typeof (call) *call;
struct cuda_lib_s {
  CUDA_CALLS
} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA driver library (libcuda) and initialize the
   function pointers; return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;
# undef CUDA_ONE_CALL
# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
# define CUDA_ONE_CALL_1(call) \
  cuda_lib.call = dlsym (h, #call);	\
  if (cuda_lib.call == NULL)		\
    return false;
  CUDA_CALLS
  cuda_lib_inited = true;
  return true;
}
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# define CUDA_CALL_PREFIX cuda_lib.
#else
# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif
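
/* A note on the X-macro above: CUDA_CALLS expands CUDA_ONE_CALL once per
   driver entry point, and CUDA_ONE_CALL is redefined for each use.  In
   struct cuda_lib_s a single entry expands as, for example,

     CUDA_ONE_CALL (cuInit)  =>  __typeof (cuInit) *cuInit;

   and inside init_cuda_lib the same entry expands to the dlsym lookup that
   fills in that field.  With PLUGIN_NVPTX_DYNAMIC set, every CUDA call
   below is routed through these pointers via CUDA_CALL_PREFIX.  */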

/* Convenience macros for the frequently used CUDA library call and
   error-handling sequence, as well as for CUDA library calls whose
   callers do the error checking themselves or skip it entirely.  */

#define CUDA_CALL_ERET(ERET, FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_error (#FN " error: %s",	\
			   cuda_error (__r));	\
	return ERET;				\
      }						\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_fatal (#FN " error: %s",	\
			   cuda_error (__r));	\
      }						\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)
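
/* As an illustration, a use such as

     CUDA_CALL (cuMemAlloc, &d, s);

   performs the cuMemAlloc call (through cuda_lib.cuMemAlloc when the
   library is loaded dynamically), and on failure reports
   "cuMemAlloc error: <description>" via GOMP_PLUGIN_error and returns
   false from the containing function.  CUDA_CALL_ASSERT instead calls
   GOMP_PLUGIN_fatal, and CUDA_CALL_NOCHECK leaves all error handling to
   the caller.  */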

static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int     async;
  size_t  size;
  char    mappings[0];
};
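
/* The h_* fields of struct ptx_stream, together with struct map above and
   the map_* functions below, implement a circular buffer carved out of one
   page of page-locked host memory (also visible from the device, see
   cuMemHostGetDevicePointer in map_init).  The buffer spans h_begin to
   h_end; map_push allocates a struct map header plus SIZE bytes at h_next,
   wrapping back to h_begin when the remaining space at the end of the page
   is too small (the slack is then accounted to the previously pushed
   entry), and map_pop retires the oldest entry at h_tail once the
   asynchronous operation using it has completed.  */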

static bool
map_init (struct ptx_stream *s)
{
  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  CUDA_CALL (cuMemAllocHost, &s->h, size);
  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
  return true;
}

static bool
map_fini (struct ptx_stream *s)
{
  CUDA_CALL (cuMemFreeHost, s->h);
  return true;
}

static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
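
/* A sketch of the buffer's typical use, as in nvptx_exec below: map_push
   reserves room for a kernel's argument array and returns matching host
   (*H) and device (*D) pointers into the shared page; the arguments are
   written through *H, the kernel reads them through *D, and once the
   kernel's completion event has been observed, event_gc calls map_pop on
   the stream to retire the entry.  */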

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

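/* Per-device state.  The capability fields below are filled in by
   nvptx_open_device from CUDA device attributes: OVERLAP from
   CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, MAP from
   CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, CONCUR from
   CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, MODE from
   CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, and MKERN from
   CU_DEVICE_ATTRIBUTE_INTEGRATED.  */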
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  bool mkern;
  int  mode;
  int clock_khz;
  int num_sms;
  int regs_per_block;
  int regs_per_sm;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;
  int val;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

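/* Set up the stream machinery for PTX_DEV: create the wrapper for the null
   (default) stream and size the async_streams array for CONCURRENCY entries
   (at least one).  The numbered async streams themselves are created lazily
   by select_stream_for_async.  Return false on failure.  */
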
static bool
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  if (!map_init (null_stream))
    return false;

  ptx_dev->null_stream = null_stream;
  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;

  return true;
}

static bool
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  bool ret = true;
  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      ret &= map_fini (s);

      CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
	  ret = false;
	}
      free (s);
    }

  ret &= map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
  return ret;
}

/* Select a stream for the (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also the current device/context).  If CREATE is true,
   create the stream if it does not exist (or use EXISTING if it is non-NULL),
   and associate the stream with THREAD.  Return the stream to use.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
				     CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		{
		  pthread_mutex_unlock (&ptx_dev->stream_lock);
		  GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
				     cuda_error (r));
		}
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  if (!map_init (s))
	    {
	      pthread_mutex_unlock (&ptx_dev->stream_lock);
	      GOMP_PLUGIN_fatal ("map_init fail");
	    }

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    {
      if (create)
	pthread_mutex_unlock (&ptx_dev->stream_lock);
      GOMP_PLUGIN_fatal ("bad async %d", async);
    }

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  ptx_events = NULL;
  pthread_mutex_init (&ptx_event_lock, NULL);

  if (!init_cuda_lib ())
    return false;

  CUDA_CALL (cuInit, 0);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);
  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
	{
	  GOMP_PLUGIN_error ("device %d not found", n);
	  return false;
	}

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do, though.  */
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }

  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  if (!init_streams_for_device (ptx_dev, async_engines))
    return NULL;

  return ptx_dev;
}

static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  if (!fini_streams_for_device (ptx_dev))
    return false;

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}

static int
nvptx_get_num_devices (void)
{
  int n;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	return 0;
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}


static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[6];
  void *optvals[6];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			     (char *) ptx_objs->code, ptx_objs->size,
			     0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

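/* Each asynchronous operation registers a CUDA event via event_add below.
   event_gc walks the global ptx_events list and, for every event that
   belongs to the current device and has completed, destroys the CUevent,
   unlinks the entry and performs the type-specific cleanup: retiring the
   argument-buffer entry for kernel launches (PTX_EVT_KNL), or unmapping
   variables for PTX_EVT_ASYNC_CLEANUP (deferred when MEMMAP_LOCKABLE says
   the memory-map lock is already held by a caller).  */
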
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct ptx_event *async_cleanups = NULL;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
      if (r == CUDA_SUCCESS)
	{
	  bool append_async = false;
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* GOMP_PLUGIN_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		append_async = true;
	      }
	      break;
	    }

	  CUDA_CALL_NOCHECK (cuEventDestroy, *te);
	  free ((void *)te);

	  /* Unlink 'e' from ptx_events list.  */
	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  if (append_async)
	    {
	      e->next = async_cleanups;
	      async_cleanups = e;
	    }
	  else
	    free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);

  /* We have to do these here, after ptx_event_lock is released.  */
  while (async_cleanups)
    {
      struct ptx_event *e = async_cleanups;
      async_cleanups = async_cleanups->next;

      GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
      free (e);
    }
}

static void
event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;
  ptx_event->val = val;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}

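/* Launch the kernel described by the targ_fn_descriptor FN on the stream
   selected by ASYNC.  The MAPNUM device addresses in DEVADDRS are staged
   through the stream's mapping buffer and handed to the kernel as its
   single pointer argument; DIMS supplies the gang/worker/vector launch
   geometry, with zero entries filled in from compiled-in or
   GOMP_OPENACC_DIM defaults.  */
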
static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      /* See if the user provided the GOMP_OPENACC_DIM environment
	 variable to specify runtime defaults.  */
      static int default_dims[GOMP_DIM_MAX];

      pthread_mutex_lock (&ptx_dev_lock);
      if (!default_dims[0])
	{
	  /* We only read the environment variable once.  You can't
	     change it in the middle of execution.  The syntax is
	     the same as for the -fopenacc-dim compilation option.  */
	  const char *env_var = getenv ("GOMP_OPENACC_DIM");
	  if (env_var)
	    {
	      const char *pos = env_var;

	      for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
		{
		  if (i && *pos++ != ':')
		    break;
		  if (*pos != ':')
		    {
		      const char *eptr;

		      errno = 0;
		      long val = strtol (pos, (char **)&eptr, 10);
		      if (errno || val < 0 || (unsigned)val != val)
			break;
		      default_dims[i] = (int)val;
		      pos = eptr;
		    }
		}
	    }
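
	  /* For example (an illustration, not from the sources),
	     GOMP_OPENACC_DIM=5120:32:32 sets all three defaults, while
	     GOMP_OPENACC_DIM=::32 leaves gang and worker at zero so that
	     the device-derived values computed below are used for them.  */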

	  int warp_size, block_size, dev_size, cpu_size;
	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
	  /* 32 is the default for known hardware.  */
	  int gang = 0, worker = 32, vector = 32;
	  CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;

	  cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
	  cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
	  cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
	  cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;

	  if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
				 dev) == CUDA_SUCCESS
	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
				    dev) == CUDA_SUCCESS
	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
				    dev) == CUDA_SUCCESS
	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
				    dev) == CUDA_SUCCESS)
	    {
	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
				 " dev_size=%d, cpu_size=%d\n",
				 warp_size, block_size, dev_size, cpu_size);
	      gang = (cpu_size / block_size) * dev_size;
	      worker = block_size / warp_size;
	      vector = warp_size;
	    }

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);
	}
      pthread_mutex_unlock (&ptx_dev_lock);

      for (i = 0; i != GOMP_DIM_MAX; i++)
	if (!dims[i])
	  dims[i] = default_dims[i];
    }

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device. HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
		    mapnum * sizeof (void *));
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, dev_str->stream, kargs, 0);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);

      event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
    }
#else
  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;

  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
  return (void *) d;
}

static bool
nvptx_free (void *p)
{
  CUdeviceptr pb;
  size_t ps;

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  return true;
}


static bool
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);

  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }

#ifndef DISABLE_ASYNC
  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      event_gc (false);
      CUDA_CALL (cuMemcpyHtoDAsync,
		 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
      event_add (PTX_EVT_MEM, e, (void *)h, 0);
    }
  else
#endif
    CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);

  return true;
}

static bool
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);

  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }

#ifndef DISABLE_ASYNC
  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      event_gc (false);
      CUDA_CALL (cuMemcpyDtoHAsync,
		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
      event_add (PTX_EVT_MEM, e, (void *)h, 0);
    }
  else
#endif
    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);

  return true;
}

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}

static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && CUDA_CALL_NOCHECK (cuStreamQuery,
				s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

static void
nvptx_wait (int async)
{
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);
  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);

  event_gc (true);
}

static void
nvptx_wait_async (int async1, int async2)
{
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

  event_gc (true);

  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);

  event_add (PTX_EVT_SYNC, e, NULL, 0);

  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
}

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}

static void
nvptx_wait_all_async (int async)
{
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

      /* Record an event on the waited-for stream.  */
      CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);

      event_add (PTX_EVT_SYNC, e, NULL, 0);

      CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}

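/* Replace the stream associated with ASYNC by the user-supplied CUDA
   STREAM, destroying any stream previously created for that async value.
   Together with nvptx_get_cuda_stream above, this backs the OpenACC
   acc_get_cuda_stream/acc_set_cuda_stream interoperability API.  */
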
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);

      if (!map_fini (oldstream))
	GOMP_PLUGIN_fatal ("error when freeing host memory");

      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}

/* Plugin entry points.  */

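/* The GOMP_OFFLOAD_* functions below form the interface that libgomp
   resolves from this plugin when it is loaded (see libgomp-plugin.h); most
   are thin wrappers around the corresponding nvptx_* workers above.  */
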
const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}

bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     array of function addresses are in one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries;
}

/* Unload the program described by TARGET_DATA.  DEV_DATA holds the
   function descriptors allocated by GOMP_OFFLOAD_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}
1883 
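/* Allocate SIZE bytes of device memory on device ORD, returning NULL on
   failure.  */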
1884 void *
1885 GOMP_OFFLOAD_alloc (int ord, size_t size)
1886 {
1887   if (!nvptx_attach_host_thread_to_device (ord))
1888     return NULL;
1889   return nvptx_alloc (size);
1890 }
1891 
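/* Release the device memory at PTR on device ORD.  */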
1892 bool
1893 GOMP_OFFLOAD_free (int ord, void *ptr)
1894 {
1895   return (nvptx_attach_host_thread_to_device (ord)
1896 	  && nvptx_free (ptr));
1897 }
1898 
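/* Copy N bytes from device address SRC to host address DST (device ORD).  */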
1899 bool
1900 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1901 {
1902   return (nvptx_attach_host_thread_to_device (ord)
1903 	  && nvptx_dev2host (dst, src, n));
1904 }
1905 
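/* Copy N bytes from host address SRC to device address DST (device ORD).  */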
1906 bool
1907 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1908 {
1909   return (nvptx_attach_host_thread_to_device (ord)
1910 	  && nvptx_host2dev (dst, src, n));
1911 }
1912 
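/* Copy N bytes between two addresses on device ORD.  The copy is merely
   enqueued on the device's null stream; it is not synchronized here.  */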
1913 bool
1914 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1915 {
1916   struct ptx_device *ptx_dev = ptx_devices[ord];
1917   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1918 				ptx_dev->null_stream->stream);
1919   return true;
1920 }
1921 
1922 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1923 
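/* Launch the OpenACC region FN with MAPNUM host/device address pairs
   (HOSTADDRS/DEVADDRS) on asynchronous queue ASYNC, using the launch
   geometry in DIMS.  */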
1924 void
1925 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1926 			   void **hostaddrs, void **devaddrs,
1927 			   int async, unsigned *dims, void *targ_mem_desc)
1928 {
1929   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1930 }
1931 
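/* Schedule TARG_MEM_DESC for cleanup once all work currently queued on
   ASYNC has finished: record an event on the current stream and register
   a PTX_EVT_ASYNC_CLEANUP event for it.  */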
1932 void
1933 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
1934 {
1935   struct nvptx_thread *nvthd = nvptx_thread ();
1936   CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1937 
1938   CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1939   CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
1940   event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
1941 }
1942 
1943 int
1944 GOMP_OFFLOAD_openacc_async_test (int async)
1945 {
1946   return nvptx_async_test (async);
1947 }
1948 
1949 int
1950 GOMP_OFFLOAD_openacc_async_test_all (void)
1951 {
1952   return nvptx_async_test_all ();
1953 }
1954 
1955 void
1956 GOMP_OFFLOAD_openacc_async_wait (int async)
1957 {
1958   nvptx_wait (async);
1959 }
1960 
1961 void
1962 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1963 {
1964   nvptx_wait_async (async1, async2);
1965 }
1966 
1967 void
1968 GOMP_OFFLOAD_openacc_async_wait_all (void)
1969 {
1970   nvptx_wait_all ();
1971 }
1972 
1973 void
1974 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1975 {
1976   nvptx_wait_all_async (async);
1977 }
1978 
1979 void
1980 GOMP_OFFLOAD_openacc_async_set_async (int async)
1981 {
1982   nvptx_set_async (async);
1983 }
1984 
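/* Create per-thread state for device ORD: ensure the device's CUDA context
   is current on this host thread, and return a new nvptx_thread whose
   current stream is the device's null stream.  */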
1985 void *
1986 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1987 {
1988   struct ptx_device *ptx_dev;
1989   struct nvptx_thread *nvthd
1990     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1991   CUcontext thd_ctx;
1992 
1993   ptx_dev = ptx_devices[ord];
1994 
1995   assert (ptx_dev);
1996 
1997   CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1998 
1999   assert (ptx_dev->ctx);
2000 
2001   if (!thd_ctx)
2002     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2003 
2004   nvthd->current_stream = ptx_dev->null_stream;
2005   nvthd->ptx_dev = ptx_dev;
2006 
2007   return (void *) nvthd;
2008 }
2009 
2010 void
2011 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2012 {
2013   free (data);
2014 }
2015 
2016 void *
2017 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2018 {
2019   return nvptx_get_current_cuda_device ();
2020 }
2021 
2022 void *
2023 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2024 {
2025   return nvptx_get_current_cuda_context ();
2026 }
2027 
2028 /* NOTE: This returns a CUstream, not a ptx_stream pointer.  */
2029 
2030 void *
2031 GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
2032 {
2033   return nvptx_get_cuda_stream (async);
2034 }
2035 
2036 /* NOTE: This takes a CUstream, not a ptx_stream pointer.  */
2037 
2038 int
2039 GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
2040 {
2041   return nvptx_set_cuda_stream (async, stream);
2042 }
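
/* A minimal interoperability sketch, assuming the user-level OpenACC entry
   points acc_get_cuda_stream / acc_set_cuda_stream declared in openacc.h
   (they resolve to the two plugin hooks above); my_cuda_library_call stands
   in for arbitrary native CUDA work:

     CUstream s = (CUstream) acc_get_cuda_stream (async);
     my_cuda_library_call (s);
     acc_set_cuda_stream (async, s);
*/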
2043 
2044 /* Adjust launch dimensions: pick good values for the number of blocks and
2045    warps, and ensure that the number of warps exceeds neither the CUDA limit
2046    nor GCC's own limit.  */
2047 
2048 static void
2049 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2050 			    struct ptx_device *ptx_dev,
2051 			    int *teams_p, int *threads_p)
2052 {
2053   int max_warps_block = fn->max_threads_per_block / 32;
2054   /* A maximum of 32 warps per block is an implementation limit in the NVPTX
2055      backend and libgcc; it matches the documented limit of all GPUs as of 2015.  */
2056   if (max_warps_block > 32)
2057     max_warps_block = 32;
2058   if (*threads_p <= 0)
2059     *threads_p = 8;
2060   if (*threads_p > max_warps_block)
2061     *threads_p = max_warps_block;
2062 
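  /* Worked example for the computation below (illustrative figures): at 32
     registers per thread and 8 warps per block, one block needs
     32 * 32 * 8 == 8192 registers, so an SM with a 65536-register file can
     host 8 such blocks.  */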
2063   int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
       /* Defensive: a kernel reported as using zero registers would otherwise
	  cause a division by zero below.  */
       if (regs_per_block == 0)
	 regs_per_block = 1;
2064   /* This is an estimate of how many blocks the device can host
2065      simultaneously.  The actual limit, which may be lower, can be queried
2066      via the "occupancy control" driver interface (since CUDA 6.0).  */
2067   int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2068   if (*teams_p <= 0 || *teams_p > max_blocks)
2069     *teams_p = max_blocks;
2070 }
2071 
2072 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2073    target regions.  */
2074 
2075 static size_t
2076 nvptx_stacks_size ()
2077 {
2078   return 128 * 1024;
2079 }
2080 
2081 /* Return contiguous storage for NUM stacks, each SIZE bytes.  */
2082 
2083 static void *
2084 nvptx_stacks_alloc (size_t size, int num)
2085 {
2086   CUdeviceptr stacks;
2087   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2088   if (r != CUDA_SUCCESS)
2089     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2090   return (void *) stacks;
2091 }
2092 
2093 /* Release storage previously allocated by nvptx_stacks_alloc.  */
2094 
2095 static void
2096 nvptx_stacks_free (void *p, int num)
2097 {
2098   CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2099   if (r != CUDA_SUCCESS)
2100     GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2101 }
2102 
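/* Run the target region TGT_FN with argument record TGT_VARS on device ORD,
   synchronously: decode launch parameters from ARGS, allocate per-warp soft
   stacks, launch the kernel and wait for completion.  */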
2103 void
2104 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2105 {
2106   CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2107   CUresult r;
2108   struct ptx_device *ptx_dev = ptx_devices[ord];
2109   const char *maybe_abort_msg = "(perhaps abort was called)";
2110   int teams = 0, threads = 0;
2111 
2112   if (!args)
2113     GOMP_PLUGIN_fatal ("No target arguments provided");
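  /* Decode the argument records: each entry is either an id with its value
     shifted into the high bits, or, if GOMP_TARGET_ARG_SUBSEQUENT_PARAM is
     set, an id followed by a separate value entry.  Only device-agnostic
     (DEVICE_ALL) entries are honoured here.  */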
2114   while (*args)
2115     {
2116       intptr_t id = (intptr_t) *args++, val;
2117       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2118 	val = (intptr_t) *args++;
2119       else
2120         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2121       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2122 	continue;
2123       val = val > INT_MAX ? INT_MAX : val;
2124       id &= GOMP_TARGET_ARG_ID_MASK;
2125       if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2126 	teams = val;
2127       else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2128 	threads = val;
2129     }
2130   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2131 
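  /* Allocate one soft stack per warp and launch via the
     CU_LAUNCH_PARAM_BUFFER_POINTER protocol: a single parameter buffer
     carries the target vars pointer, the stack block and the per-stack size.
     The grid is TEAMS blocks of THREADS warps, i.e. 32 * THREADS threads per
     block.  */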
2132   size_t stack_size = nvptx_stacks_size ();
2133   void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2134   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2135   size_t fn_args_size = sizeof fn_args;
2136   void *config[] = {
2137     CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2138     CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2139     CU_LAUNCH_PARAM_END
2140   };
2141   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2142 			 32, threads, 1, 0, ptx_dev->null_stream->stream,
2143 			 NULL, config);
2144   if (r != CUDA_SUCCESS)
2145     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2146 
2147   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2148   if (r == CUDA_ERROR_LAUNCH_FAILED)
2149     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s", cuda_error (r),
2150 		       maybe_abort_msg);
2151   else if (r != CUDA_SUCCESS)
2152     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2153   nvptx_stacks_free (stacks, teams * threads);
2154 }
2155 
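/* Asynchronous launching of target regions is not implemented in this
   plugin; reaching this entry point is a fatal error.  */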
2156 void
2157 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2158 			void *async_data)
2159 {
2160   GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
2161 }
2162