/* Plugin for NVPTX execution.

   Copyright (C) 2013-2016 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>

static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = cuGetErrorString (r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int     async;
  size_t  size;
  char    mappings[0];
};
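
/* Each stream carries a page-sized circular buffer of page-locked host
   memory that is also mapped into the device address space (see map_init
   below); H and D are the host and device views of the same page.
   map_push carves a chunk off the head of the buffer, prefixed by a
   struct map header, and map_pop retires the oldest chunk from the tail.
   A rough picture of the layout (illustrative only):

       h_begin                                            h_end
       |  [map|data] [map|data] [map|data]  ...free...    |
          ^h_tail                ^h_prev    ^h_next

   Chunks between h_tail and h_next are still in use by in-flight device
   operations; map_pop must only run once the work using the oldest chunk
   has completed.  */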

static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}

static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_error ("cuMemFreeHost error: %s", cuda_error (r));
}

static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *) &m->mappings[0] - s->h;

  *d = (void *) (s->d + offset);
  *h = (void *) (s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
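
/* An illustrative sketch of how the map buffer is used around a kernel
   launch (cf. nvptx_exec below; STREAM, ASYNC, MAPNUM and DEVADDRS stand
   in for the real arguments):

       void *hp, *dp;
       map_push (stream, async, mapnum * sizeof (void *), &hp, &dp);
       for (i = 0; i < mapnum; i++)
         ((void **) hp)[i] = devaddrs[i];
       cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp,
                 mapnum * sizeof (void *));
       ...launch the kernel with &dp as its single argument...
       map_pop (stream);

   For asynchronous launches the map_pop is deferred: a PTX_EVT_KNL event
   is recorded instead, and event_gc pops the chunk once the kernel has
   completed.  */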

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int  mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};
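
/* How the event types above are used (see event_add and event_gc below):
   PTX_EVT_MEM tracks asynchronous host<->device copies; PTX_EVT_KNL tracks
   kernel launches and carries the stream whose map chunk must be popped on
   completion; PTX_EVT_SYNC tracks cross-stream wait events; and
   PTX_EVT_ASYNC_CLEANUP defers GOMP_PLUGIN_async_unmap_vars until the
   associated device work has finished.  */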

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}

static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns the stream to
   use as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
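
/* A sketch of the resulting mapping from OpenACC async values to stream
   slots, assuming the usual constants acc_async_sync == -2 and
   acc_async_noval == -1 (the "async++" above shifts everything following
   acc_async_sync up by one):

       acc_async_sync  (-2)  ->  ptx_dev->null_stream
       acc_async_noval (-1)  ->  async_streams.arr[0]
       async N >= 0          ->  async_streams.arr[N + 1]  */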

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
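
/* Note that CUDA driver contexts form a per-thread stack: cuCtxPushCurrent
   makes a context current for the calling thread and cuCtxPopCurrent
   removes the top entry, so the function above effectively replaces
   whatever context the thread had with the one owned by device N.  */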

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}

static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}

static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}


static void
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
          unsigned num_objs)
{
  CUjit_option opts[6];
  void *optvals[6];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  r = cuLinkCreate (6, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
         qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_objs->code,
                         ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
          GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
                             cuda_error (r));
        }
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));

  r = cuLinkDestroy (linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkDestroy error: %s", cuda_error (r));
}

static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* GOMP_PLUGIN_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
              }
              break;
            }

          cuEventDestroy (*te);
          free ((void *) te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}

static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
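
/* A condensed sketch of the event lifecycle as used by the callers below
   (error checking elided):

       CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
       cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
       ...enqueue asynchronous work on a stream...
       cuEventRecord (*e, stream->stream);
       event_add (PTX_EVT_MEM, e, (void *) host_addr);

   Subsequent event_gc calls poll each recorded event with cuEventQuery
   and, once it has completed, perform the type-specific cleanup, then
   destroy the event and free both it and its list node.  */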

void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
        dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
        seen_zero = 1;
    }

  if (seen_zero)
    {
      for (i = 0; i != GOMP_DIM_MAX; i++)
        if (!dims[i])
          dims[i] = /* TODO */ 32;
    }

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn,
                     dims[0], dims[1], dims[2]);

  // OpenACC        CUDA
  //
  // num_gangs      nctaid.x
  // num_workers    ntid.y
  // vector length  ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      dims[GOMP_DIM_GANG], 1, 1,
                      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *) dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) d;
}

static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr) p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr) d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr) d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}

static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr) d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr) d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}

static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the call
         to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}

static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}

static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}

static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
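
/* An illustrative sketch of how user code reaches the stream hooks above
   through the OpenACC library API (error checking elided; assumes a live
   CUDA context):

       CUstream stream;
       cuStreamCreate (&stream, CU_STREAM_DEFAULT);
       acc_set_cuda_stream (5, stream);
       #pragma acc parallel async (5)
         ...
       acc_wait (5);

   acc_get_cuda_stream and acc_set_cuda_stream arrive here via the
   GOMP_OFFLOAD_openacc_{get,set}_cuda_stream entry points below.  */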

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
                       " (expected %u, received %u)",
                       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_objs, img_header->ptx_num);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
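
/* The table returned above contains FN_ENTRIES function records followed
   by VAR_ENTRIES variable records.  Schematically:

       target_table[i], i < fn_entries:
         start = (uintptr_t) &targ_fns[i], a host-side descriptor
         end   = start + 1, i.e. not a real device address range
       target_table[fn_entries + j], j < var_entries:
         start = device address of variable j
         end   = start + its size in bytes

   libgomp uses these address ranges when building its lookup tables for
   the device.  */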

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
        *prev_p = image->next;
        cuModuleUnload (image->module);
        free (image->fns);
        free (image);
        break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs,
                               int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}