1 /* Plugin for NVPTX execution.
2 
3    Copyright (C) 2013-2015 Free Software Foundation, Inc.
4 
5    Contributed by Mentor Embedded.
6 
7    This file is part of the GNU Offloading and Multi Processing Library
8    (libgomp).
9 
10    Libgomp is free software; you can redistribute it and/or modify it
11    under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 3, or (at your option)
13    any later version.
14 
15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
18    more details.
19 
20    Under Section 7 of GPL version 3, you are granted additional
21    permissions described in the GCC Runtime Library Exception, version
22    3.1, as published by the Free Software Foundation.
23 
24    You should have received a copy of the GNU General Public License and
25    a copy of the GCC Runtime Library Exception along with this program;
26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
27    <http://www.gnu.org/licenses/>.  */
28 
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30    library appears to hold some implicit state, but the documentation
31    is not clear as to what that state might be, or how one might
32    propagate it from one thread to another.  */
33 
34 #include "openacc.h"
35 #include "config.h"
36 #include "libgomp-plugin.h"
37 #include "oacc-ptx.h"
38 #include "oacc-plugin.h"
39 
40 #include <pthread.h>
41 #include <cuda.h>
42 #include <stdbool.h>
43 #include <stdint.h>
44 #include <string.h>
45 #include <stdio.h>
46 #include <dlfcn.h>
47 #include <unistd.h>
48 #include <assert.h>
49 
50 #define	ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
51 
52 static struct
53 {
54   CUresult r;
55   char *m;
56 } cuda_errlist[]=
57 {
58   { CUDA_ERROR_INVALID_VALUE, "invalid value" },
59   { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
60   { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
61   { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
62   { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
63   { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
64   { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
65   { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
66   { CUDA_ERROR_NO_DEVICE, "no device" },
67   { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
68   { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
69   { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
70   { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
71   { CUDA_ERROR_MAP_FAILED, "map error" },
72   { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
73   { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
74   { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
75   { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
76   { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
77   { CUDA_ERROR_NOT_MAPPED, "not mapped" },
78   { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
79   { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
80   { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
81   { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
82   { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
83   { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
84   { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
85   { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
86   { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
87                                            "shared object symbol not found" },
88   { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
89   { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
90   { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
91   { CUDA_ERROR_NOT_FOUND, "not found" },
92   { CUDA_ERROR_NOT_READY, "not ready" },
93   { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
94   { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
95   { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
96   { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
97                                              "launch incompatible texturing" },
98   { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
99   { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
100   { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
101   { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
102   { CUDA_ERROR_ASSERT, "assert" },
103   { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
104   { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
105                                            "host memory already registered" },
106   { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
107   { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
108   { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
109   { CUDA_ERROR_UNKNOWN, "unknown" }
110 };
111 
112 static char errmsg[128];
113 
114 static char *
115 cuda_error (CUresult r)
116 {
117   int i;
118 
119   for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
120     {
121       if (cuda_errlist[i].r == r)
122 	return &cuda_errlist[i].m[0];
123     }
124 
125   sprintf (&errmsg[0], "unknown result code: %5d", r);
126 
127   return &errmsg[0];
128 }
129 
130 struct targ_fn_descriptor
131 {
132   CUfunction fn;
133   const char *name;
134 };
135 
136 static unsigned int instantiated_devices = 0;
137 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
138 
139 struct ptx_stream
140 {
141   CUstream stream;
142   pthread_t host_thread;
143   bool multithreaded;
144 
145   CUdeviceptr d;
146   void *h;
147   void *h_begin;
148   void *h_end;
149   void *h_next;
150   void *h_prev;
151   void *h_tail;
152 
153   struct ptx_stream *next;
154 };
155 
156 /* Thread-specific data for PTX.  */
157 
158 struct nvptx_thread
159 {
160   struct ptx_stream *current_stream;
161   struct ptx_device *ptx_dev;
162 };
163 
164 struct map
165 {
166   int     async;
167   size_t  size;
168   char    mappings[0];
169 };
170 
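/* A sketch (not code from this file) of how the h_* cursors above carve
   the single host-mapped page into struct map chunks.  After two pushes
   the page looks like:

     h_begin                                            h_end
     |<--- map #1 --->|<--- map #2 --->|  free space  |
     ^ h_tail          ^ h_prev         ^ h_next

   map_push below advances h_prev/h_next; map_pop advances h_tail once
   the operation using the oldest chunk has completed, and both wrap
   back to h_begin when they run off the end of the page.  */
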
171 static void
172 map_init (struct ptx_stream *s)
173 {
174   CUresult r;
175 
176   int size = getpagesize ();
177 
178   assert (s);
179   assert (!s->d);
180   assert (!s->h);
181 
182   r = cuMemAllocHost (&s->h, size);
183   if (r != CUDA_SUCCESS)
184     GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
185 
186   r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
187   if (r != CUDA_SUCCESS)
188     GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
189 
190   assert (s->h);
191 
192   s->h_begin = s->h;
193   s->h_end = s->h_begin + size;
194   s->h_next = s->h_prev = s->h_tail = s->h_begin;
195 
196   assert (s->h_next);
197   assert (s->h_end);
198 }
199 
200 static void
201 map_fini (struct ptx_stream *s)
202 {
203   CUresult r;
204 
205   r = cuMemFreeHost (s->h);
206   if (r != CUDA_SUCCESS)
207     GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
208 }
209 
210 static void
211 map_pop (struct ptx_stream *s)
212 {
213   struct map *m;
214 
215   assert (s != NULL);
216   assert (s->h_next);
217   assert (s->h_prev);
218   assert (s->h_tail);
219 
220   m = s->h_tail;
221 
222   s->h_tail += m->size;
223 
224   if (s->h_tail >= s->h_end)
225     s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
226 
227   if (s->h_next == s->h_tail)
228     s->h_prev = s->h_next;
229 
230   assert (s->h_next >= s->h_begin);
231   assert (s->h_tail >= s->h_begin);
232   assert (s->h_prev >= s->h_begin);
233 
234   assert (s->h_next <= s->h_end);
235   assert (s->h_tail <= s->h_end);
236   assert (s->h_prev <= s->h_end);
237 }
238 
239 static void
240 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
241 {
242   int left;
243   int offset;
244   struct map *m;
245 
246   assert (s != NULL);
247 
248   left = s->h_end - s->h_next;
249   size += sizeof (struct map);
250 
251   assert (s->h_prev);
252   assert (s->h_next);
253 
254   if (size >= left)
255     {
256       m = s->h_prev;
257       m->size += left;
258       s->h_next = s->h_begin;
259 
260       if (s->h_next + size > s->h_end)
261 	GOMP_PLUGIN_fatal ("unable to push map");
262     }
263 
264   assert (s->h_next);
265 
266   m = s->h_next;
267   m->async = async;
268   m->size = size;
269 
270   offset = (void *)&m->mappings[0] - s->h;
271 
272   *d = (void *)(s->d + offset);
273   *h = (void *)(s->h + offset);
274 
275   s->h_prev = s->h_next;
276   s->h_next += size;
277 
278   assert (s->h_prev);
279   assert (s->h_next);
280 
281   assert (s->h_next >= s->h_begin);
282   assert (s->h_tail >= s->h_begin);
283   assert (s->h_prev >= s->h_begin);
284   assert (s->h_next <= s->h_end);
285   assert (s->h_tail <= s->h_end);
286   assert (s->h_prev <= s->h_end);
287 
288   return;
289 }
290 
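/* Typical pairing of map_push/map_pop, condensed from nvptx_exec later
   in this file (error checking elided; a sketch, not a verbatim quote):

     void *hp, *dp;
     map_push (stream, async, nargs * sizeof (void *), &hp, &dp);
     memcpy (hp, args, nargs * sizeof (void *));
     cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp,
	       nargs * sizeof (void *));
     ...launch a kernel that reads DP...
     map_pop (stream);   (or deferred to event_gc for async launches)  */
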
291 struct ptx_device
292 {
293   CUcontext ctx;
294   bool ctx_shared;
295   CUdevice dev;
296   struct ptx_stream *null_stream;
297   /* All non-null streams associated with this device (actually context),
298      either created implicitly or passed in from the user (via
299      acc_set_cuda_stream).  */
300   struct ptx_stream *active_streams;
301   struct {
302     struct ptx_stream **arr;
303     int size;
304   } async_streams;
305   /* A lock for use when manipulating the above stream list and array.  */
306   pthread_mutex_t stream_lock;
307   int ord;
308   bool overlap;
309   bool map;
310   bool concur;
311   int  mode;
312   bool mkern;
313 
314   struct ptx_device *next;
315 };
316 
317 enum ptx_event_type
318 {
319   PTX_EVT_MEM,
320   PTX_EVT_KNL,
321   PTX_EVT_SYNC,
322   PTX_EVT_ASYNC_CLEANUP
323 };
324 
325 struct ptx_event
326 {
327   CUevent *evt;
328   int type;
329   void *addr;
330   int ord;
331 
332   struct ptx_event *next;
333 };
334 
335 struct ptx_image_data
336 {
337   void *target_data;
338   CUmodule module;
339   struct ptx_image_data *next;
340 };
341 
342 static pthread_mutex_t ptx_event_lock;
343 static struct ptx_event *ptx_events;
344 
345 static struct ptx_device **ptx_devices;
346 
347 static struct ptx_image_data *ptx_images = NULL;
348 static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER;
349 
350 #define _XSTR(s) _STR(s)
351 #define _STR(s) #s
352 
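/* _XSTR stringizes its (macro-expanded) argument, so _XSTR (cuInit)
   yields "cuInit".  The table below is thus simply the list of CUDA
   driver entry points this plugin requires to be resolvable.  */
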
353 static struct _synames
354 {
355   char *n;
356 } cuda_symnames[] =
357 {
358   { _XSTR (cuCtxCreate) },
359   { _XSTR (cuCtxDestroy) },
360   { _XSTR (cuCtxGetCurrent) },
361   { _XSTR (cuCtxPushCurrent) },
362   { _XSTR (cuCtxSynchronize) },
363   { _XSTR (cuDeviceGet) },
364   { _XSTR (cuDeviceGetAttribute) },
365   { _XSTR (cuDeviceGetCount) },
366   { _XSTR (cuEventCreate) },
367   { _XSTR (cuEventDestroy) },
368   { _XSTR (cuEventQuery) },
369   { _XSTR (cuEventRecord) },
370   { _XSTR (cuInit) },
371   { _XSTR (cuLaunchKernel) },
372   { _XSTR (cuLinkAddData) },
373   { _XSTR (cuLinkComplete) },
374   { _XSTR (cuLinkCreate) },
375   { _XSTR (cuMemAlloc) },
376   { _XSTR (cuMemAllocHost) },
377   { _XSTR (cuMemcpy) },
378   { _XSTR (cuMemcpyDtoH) },
379   { _XSTR (cuMemcpyDtoHAsync) },
380   { _XSTR (cuMemcpyHtoD) },
381   { _XSTR (cuMemcpyHtoDAsync) },
382   { _XSTR (cuMemFree) },
383   { _XSTR (cuMemFreeHost) },
384   { _XSTR (cuMemGetAddressRange) },
385   { _XSTR (cuMemHostGetDevicePointer) },
386   { _XSTR (cuMemHostRegister) },
387   { _XSTR (cuMemHostUnregister) },
388   { _XSTR (cuModuleGetFunction) },
389   { _XSTR (cuModuleLoadData) },
390   { _XSTR (cuStreamDestroy) },
391   { _XSTR (cuStreamQuery) },
392   { _XSTR (cuStreamSynchronize) },
393   { _XSTR (cuStreamWaitEvent) }
394 };
395 
396 static int
397 verify_device_library (void)
398 {
399   int i;
400   void *dh, *ds;
401 
402   dh = dlopen ("libcuda.so", RTLD_LAZY);
403   if (!dh)
404     return -1;
405 
406   for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
407     {
408       ds = dlsym (dh, cuda_symnames[i].n);
409       if (!ds)
410         { dlclose (dh); return -1; }
411     }
412 
413   dlclose (dh);
414 
415   return 0;
416 }
417 
418 static inline struct nvptx_thread *
419 nvptx_thread (void)
420 {
421   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
422 }
423 
424 static void
425 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
426 {
427   int i;
428   struct ptx_stream *null_stream
429     = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
430 
431   null_stream->stream = NULL;
432   null_stream->host_thread = pthread_self ();
433   null_stream->multithreaded = true;
434   null_stream->d = (CUdeviceptr) NULL;
435   null_stream->h = NULL;
436   map_init (null_stream);
437   ptx_dev->null_stream = null_stream;
438 
439   ptx_dev->active_streams = NULL;
440   pthread_mutex_init (&ptx_dev->stream_lock, NULL);
441 
442   if (concurrency < 1)
443     concurrency = 1;
444 
445   /* This is just a guess -- make space for as many async streams as the
446      current device is capable of concurrently executing.  This can grow
447      later as necessary.  No streams are created yet.  */
448   ptx_dev->async_streams.arr
449     = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
450   ptx_dev->async_streams.size = concurrency;
451 
452   for (i = 0; i < concurrency; i++)
453     ptx_dev->async_streams.arr[i] = NULL;
454 }
455 
456 static void
457 fini_streams_for_device (struct ptx_device *ptx_dev)
458 {
459   free (ptx_dev->async_streams.arr);
460 
461   while (ptx_dev->active_streams != NULL)
462     {
463       struct ptx_stream *s = ptx_dev->active_streams;
464       ptx_dev->active_streams = ptx_dev->active_streams->next;
465 
466       map_fini (s);
467       cuStreamDestroy (s->stream);
468       free (s);
469     }
470 
471   map_fini (ptx_dev->null_stream);
472   free (ptx_dev->null_stream);
473 }
474 
475 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
476    thread THREAD (and also current device/context).  If CREATE is true, create
477    the stream if it does not exist (or use EXISTING if it is non-NULL), and
478    associate the stream with the same thread argument.  Returns stream to use
479    as result.  */
480 
481 static struct ptx_stream *
482 select_stream_for_async (int async, pthread_t thread, bool create,
483 			 CUstream existing)
484 {
485   struct nvptx_thread *nvthd = nvptx_thread ();
486   /* Local copy of TLS variable.  */
487   struct ptx_device *ptx_dev = nvthd->ptx_dev;
488   struct ptx_stream *stream = NULL;
489   int orig_async = async;
490 
491   /* The special value acc_async_noval (-1) maps (for now) to an
492      implicitly-created stream, which is then handled the same as any other
493      numbered async stream.  Other options are available, e.g. using the null
494      stream for anonymous async operations, or choosing an idle stream from an
495      active set.  But, stick with this for now.  */
496   if (async > acc_async_sync)
497     async++;
498 
499   if (create)
500     pthread_mutex_lock (&ptx_dev->stream_lock);
501 
502   /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
503      null stream, and in fact better performance may be obtainable if it doesn't
504      (because the null stream enforces overly-strict synchronisation with
505      respect to other streams for legacy reasons, and that's probably not
506      needed with OpenACC).  Maybe investigate later.  */
507   if (async == acc_async_sync)
508     stream = ptx_dev->null_stream;
509   else if (async >= 0 && async < ptx_dev->async_streams.size
510 	   && ptx_dev->async_streams.arr[async] && !(create && existing))
511     stream = ptx_dev->async_streams.arr[async];
512   else if (async >= 0 && create)
513     {
514       if (async >= ptx_dev->async_streams.size)
515 	{
516 	  int i, newsize = ptx_dev->async_streams.size * 2;
517 
518 	  if (async >= newsize)
519 	    newsize = async + 1;
520 
521 	  ptx_dev->async_streams.arr
522 	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
523 				   newsize * sizeof (struct ptx_stream *));
524 
525 	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
526 	    ptx_dev->async_streams.arr[i] = NULL;
527 
528 	  ptx_dev->async_streams.size = newsize;
529 	}
530 
531       /* Create a new stream on-demand if there isn't one already, or if we're
532 	 setting a particular async value to an existing (externally-provided)
533 	 stream.  */
534       if (!ptx_dev->async_streams.arr[async] || existing)
535         {
536 	  CUresult r;
537 	  struct ptx_stream *s
538 	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
539 
540 	  if (existing)
541 	    s->stream = existing;
542 	  else
543 	    {
544 	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
545 	      if (r != CUDA_SUCCESS)
546 		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
547 	    }
548 
549 	  /* If CREATE is true, we're going to be queueing some work on this
550 	     stream.  Associate it with the current host thread.  */
551 	  s->host_thread = thread;
552 	  s->multithreaded = false;
553 
554 	  s->d = (CUdeviceptr) NULL;
555 	  s->h = NULL;
556 	  map_init (s);
557 
558 	  s->next = ptx_dev->active_streams;
559 	  ptx_dev->active_streams = s;
560 	  ptx_dev->async_streams.arr[async] = s;
561 	}
562 
563       stream = ptx_dev->async_streams.arr[async];
564     }
565   else if (async < 0)
566     GOMP_PLUGIN_fatal ("bad async %d", async);
567 
568   if (create)
569     {
570       assert (stream != NULL);
571 
572       /* If we're trying to use the same stream from different threads
573 	 simultaneously, set stream->multithreaded to true.  This affects the
574 	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
575 	 only wait for asynchronous launches from the same host thread they are
576 	 invoked on.  If multiple threads use the same async value, we make note
577 	 of that here and fall back to testing/waiting for all threads in those
578 	 functions.  */
579       if (thread != stream->host_thread)
580         stream->multithreaded = true;
581 
582       pthread_mutex_unlock (&ptx_dev->stream_lock);
583     }
584   else if (stream && !stream->multithreaded
585 	   && !pthread_equal (stream->host_thread, thread))
586     GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
587 
588   return stream;
589 }
590 
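/* For reference, the net effect of the mapping in
   select_stream_for_async (a sketch; the acc_async_* values are those
   declared in openacc.h):

     acc_async_sync  (-2)  ->  the null stream
     acc_async_noval (-1)  ->  async_streams.arr[0]  (after the ++)
     async N (N >= 0)      ->  async_streams.arr[N + 1]  */
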
591 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
592    should be locked on entry and remains locked on exit.  */
593 static bool
594 nvptx_init (void)
595 {
596   CUresult r;
597   int rc;
598   int ndevs;
599 
600   if (instantiated_devices != 0)
601     return true;
602 
603   rc = verify_device_library ();
604   if (rc < 0)
605     return false;
606 
607   r = cuInit (0);
608   if (r != CUDA_SUCCESS)
609     GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
610 
611   ptx_events = NULL;
612 
613   pthread_mutex_init (&ptx_event_lock, NULL);
614 
615   r = cuDeviceGetCount (&ndevs);
616   if (r != CUDA_SUCCESS)
617     GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
618 
619   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
620 					    * ndevs);
621 
622   return true;
623 }
624 
625 /* Select the N'th PTX device for the current host thread.  The device
626    must have been opened before calling this function.  */
627 
628 static void
629 nvptx_attach_host_thread_to_device (int n)
630 {
631   CUdevice dev;
632   CUresult r;
633   struct ptx_device *ptx_dev;
634   CUcontext thd_ctx;
635 
636   r = cuCtxGetDevice (&dev);
637   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
638     GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
639 
640   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
641     return;
642   else
643     {
644       CUcontext old_ctx;
645 
646       ptx_dev = ptx_devices[n];
647       assert (ptx_dev);
648 
649       r = cuCtxGetCurrent (&thd_ctx);
650       if (r != CUDA_SUCCESS)
651         GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
652 
653       /* We don't necessarily have a current context (e.g. if it has been
654          destroyed).  Pop it if we do, though.  */
655       if (thd_ctx != NULL)
656 	{
657 	  r = cuCtxPopCurrent (&old_ctx);
658 	  if (r != CUDA_SUCCESS)
659             GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
660 	}
661 
662       r = cuCtxPushCurrent (ptx_dev->ctx);
663       if (r != CUDA_SUCCESS)
664         GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
665     }
666 }
667 
668 static struct ptx_device *
669 nvptx_open_device (int n)
670 {
671   struct ptx_device *ptx_dev;
672   CUdevice dev, ctx_dev;
673   CUresult r;
674   int async_engines, pi;
675 
676   r = cuDeviceGet (&dev, n);
677   if (r != CUDA_SUCCESS)
678     GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
679 
680   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
681 
682   ptx_dev->ord = n;
683   ptx_dev->dev = dev;
684   ptx_dev->ctx_shared = false;
685 
686   r = cuCtxGetDevice (&ctx_dev);
687   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
688     GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
689 
690   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
691     {
692       /* The current host thread has an active context for a different device.
693          Detach it.  */
694       CUcontext old_ctx;
695 
696       r = cuCtxPopCurrent (&old_ctx);
697       if (r != CUDA_SUCCESS)
698 	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
699     }
700 
701   r = cuCtxGetCurrent (&ptx_dev->ctx);
702   if (r != CUDA_SUCCESS)
703     GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
704 
705   if (!ptx_dev->ctx)
706     {
707       r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
708       if (r != CUDA_SUCCESS)
709 	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
710     }
711   else
712     ptx_dev->ctx_shared = true;
713 
714   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
715   if (r != CUDA_SUCCESS)
716     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
717 
718   ptx_dev->overlap = pi;
719 
720   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
721   if (r != CUDA_SUCCESS)
722     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
723 
724   ptx_dev->map = pi;
725 
726   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
727   if (r != CUDA_SUCCESS)
728     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
729 
730   ptx_dev->concur = pi;
731 
732   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
733   if (r != CUDA_SUCCESS)
734     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
735 
736   ptx_dev->mode = pi;
737 
738   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
739   if (r != CUDA_SUCCESS)
740     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
741 
742   ptx_dev->mkern = pi;
743 
744   r = cuDeviceGetAttribute (&async_engines,
745 			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
746   if (r != CUDA_SUCCESS)
747     async_engines = 1;
748 
749   init_streams_for_device (ptx_dev, async_engines);
750 
751   return ptx_dev;
752 }
753 
754 static void
755 nvptx_close_device (struct ptx_device *ptx_dev)
756 {
757   CUresult r;
758 
759   if (!ptx_dev)
760     return;
761 
762   fini_streams_for_device (ptx_dev);
763 
764   if (!ptx_dev->ctx_shared)
765     {
766       r = cuCtxDestroy (ptx_dev->ctx);
767       if (r != CUDA_SUCCESS)
768 	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
769     }
770 
771   free (ptx_dev);
772 }
773 
774 static int
775 nvptx_get_num_devices (void)
776 {
777   int n;
778   CUresult r;
779 
780   /* PR libgomp/65099: Currently, we only support offloading in 64-bit
781      configurations.  */
782   if (sizeof (void *) != 8)
783     return 0;
784 
785   /* This function will be called before the plugin has been initialized in
786      order to enumerate available devices, but CUDA API routines can't be used
787      until cuInit has been called.  Just call it now (but don't yet do any
788      further initialization).  */
789   if (instantiated_devices == 0)
790     cuInit (0);
791 
792   r = cuDeviceGetCount (&n);
793   if (r != CUDA_SUCCESS)
794     GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
795 
796   return n;
797 }
798 
799 
800 static void
801 link_ptx (CUmodule *module, char *ptx_code)
802 {
803   CUjit_option opts[7];
804   void *optvals[7];
805   float elapsed = 0.0;
806 #define LOGSIZE 8192
807   char elog[LOGSIZE];
808   char ilog[LOGSIZE];
809   unsigned long logsize = LOGSIZE;
810   CUlinkState linkstate;
811   CUresult r;
812   void *linkout;
813   size_t linkoutsize __attribute__ ((unused));
814 
815   GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);
816 
817   opts[0] = CU_JIT_WALL_TIME;
818   optvals[0] = &elapsed;
819 
820   opts[1] = CU_JIT_INFO_LOG_BUFFER;
821   optvals[1] = &ilog[0];
822 
823   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
824   optvals[2] = (void *) logsize;
825 
826   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
827   optvals[3] = &elog[0];
828 
829   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
830   optvals[4] = (void *) logsize;
831 
832   opts[5] = CU_JIT_LOG_VERBOSE;
833   optvals[5] = (void *) 1;
834 
835   opts[6] = CU_JIT_TARGET;
836   optvals[6] = (void *) CU_TARGET_COMPUTE_30;
837 
838   r = cuLinkCreate (7, opts, optvals, &linkstate);
839   if (r != CUDA_SUCCESS)
840     GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
841 
842   char *abort_ptx = ABORT_PTX;
843   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
844 		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
845   if (r != CUDA_SUCCESS)
846     {
847       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
848       GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
849     }
850 
851   char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
852   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
853 		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
854   if (r != CUDA_SUCCESS)
855     {
856       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
857       GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
858 			 cuda_error (r));
859     }
860 
861   char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
862   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
863 		     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
864   if (r != CUDA_SUCCESS)
865     {
866       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
867       GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
868 			 cuda_error (r));
869     }
870 
871   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
872               strlen (ptx_code) + 1, 0, 0, 0, 0);
873   if (r != CUDA_SUCCESS)
874     {
875       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
876       GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
877     }
878 
879   r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
880   if (r != CUDA_SUCCESS)
881     GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
882 
883   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
884   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
885 
886   r = cuModuleLoadData (module, linkout);
887   if (r != CUDA_SUCCESS)
888     GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
889 }
890 
891 static void
892 event_gc (bool memmap_lockable)
893 {
894   struct ptx_event *ptx_event = ptx_events;
895   struct nvptx_thread *nvthd = nvptx_thread ();
896 
897   pthread_mutex_lock (&ptx_event_lock);
898 
899   while (ptx_event != NULL)
900     {
901       CUresult r;
902       struct ptx_event *e = ptx_event;
903 
904       ptx_event = ptx_event->next;
905 
906       if (e->ord != nvthd->ptx_dev->ord)
907 	continue;
908 
909       r = cuEventQuery (*e->evt);
910       if (r == CUDA_SUCCESS)
911 	{
912 	  CUevent *te;
913 
914 	  te = e->evt;
915 
916 	  switch (e->type)
917 	    {
918 	    case PTX_EVT_MEM:
919 	    case PTX_EVT_SYNC:
920 	      break;
921 
922 	    case PTX_EVT_KNL:
923 	      map_pop (e->addr);
924 	      break;
925 
926 	    case PTX_EVT_ASYNC_CLEANUP:
927 	      {
928 		/* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
929 		   memory-map splay tree lock for the current device, so we
930 		   can't call it when one of our callers has already claimed
931 		   the lock.  In that case, just delay the GC for this event
932 		   until later.  */
933 		if (!memmap_lockable)
934 		  continue;
935 
936 		GOMP_PLUGIN_async_unmap_vars (e->addr);
937 	      }
938 	      break;
939 	    }
940 
941 	  cuEventDestroy (*te);
942 	  free ((void *)te);
943 
944 	  if (ptx_events == e)
945 	    ptx_events = ptx_events->next;
946 	  else
947 	    {
948 	      struct ptx_event *e_ = ptx_events;
949 	      while (e_->next != e)
950 		e_ = e_->next;
951 	      e_->next = e_->next->next;
952 	    }
953 
954 	  free (e);
955 	}
956     }
957 
958   pthread_mutex_unlock (&ptx_event_lock);
959 }
960 
961 static void
962 event_add (enum ptx_event_type type, CUevent *e, void *h)
963 {
964   struct ptx_event *ptx_event;
965   struct nvptx_thread *nvthd = nvptx_thread ();
966 
967   assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
968 	  || type == PTX_EVT_ASYNC_CLEANUP);
969 
970   ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
971   ptx_event->type = type;
972   ptx_event->evt = e;
973   ptx_event->addr = h;
974   ptx_event->ord = nvthd->ptx_dev->ord;
975 
976   pthread_mutex_lock (&ptx_event_lock);
977 
978   ptx_event->next = ptx_events;
979   ptx_events = ptx_event;
980 
981   pthread_mutex_unlock (&ptx_event_lock);
982 }
983 
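/* event_add and event_gc together implement deferred cleanup: every
   asynchronous operation records a CUevent and queues it; event_gc
   later polls with cuEventQuery and, for completed events, performs the
   type-specific cleanup (map_pop for kernels, unmapping variables for
   async regions).  For instance, an asynchronous host-to-device copy is
   registered as in this sketch:

     cuMemcpyHtoDAsync ((CUdeviceptr) d, h, s, stream);
     cuEventRecord (*e, stream);
     event_add (PTX_EVT_MEM, e, (void *) h);   (reaped by event_gc)  */
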
984 void
985 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
986 	  size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
987 	  int vector_length, int async, void *targ_mem_desc)
988 {
989   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
990   CUfunction function;
991   CUresult r;
992   int i;
993   struct ptx_stream *dev_str;
994   void *kargs[1];
995   void *hp, *dp;
996   unsigned int nthreads_in_block;
997   struct nvptx_thread *nvthd = nvptx_thread ();
998   const char *maybe_abort_msg = "(perhaps abort was called)";
999 
1000   function = targ_fn->fn;
1001 
1002   dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1003   assert (dev_str == nvthd->current_stream);
1004 
1005   /* This reserves a chunk of a pre-allocated page of memory mapped on both
1006      the host and the device. HP is a host pointer to the new chunk, and DP is
1007      the corresponding device pointer.  */
1008   map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1009 
1010   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1011 
1012   /* Copy the array of arguments to the mapped page.  */
1013   for (i = 0; i < mapnum; i++)
1014     ((void **) hp)[i] = devaddrs[i];
1015 
1016   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1017      fact have the same value on a unified-memory system).  */
1018   r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
1019   if (r != CUDA_SUCCESS)
1020     GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
1021 
1022   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
1023 
1024   // OpenACC		CUDA
1025   //
1026   // num_gangs		blocks
1027   // num_workers	warps (where a warp is equivalent to 32 threads)
1028   // vector length	threads
1029   //
1030 
1031   /* The OpenACC vector_length clause 'determines the vector length to use for
1032      vector or SIMD operations'.  The question is how to map this to CUDA.
1033 
1034      In CUDA, the warp size is the vector length of a CUDA device.  However, the
1035      CUDA interface abstracts away from that, and only shows us warp size
1036      indirectly in the maximum number of threads per block, which is a product of
1037      warp size and the number of hyperthreads of a multiprocessor.
1038 
1039      We choose to map OpenACC vector_length directly onto the number of threads
1040      in a block, in the x dimension.  This is reflected in GCC code generation
1041      that uses threadIdx.x to access vector elements.
1042 
1043      Attempting to use an OpenACC vector_length of more than the maximum number
1044      of threads per block will result in a CUDA error.  */
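  /* For example (illustrative numbers, not taken from elsewhere in this
     file), an OpenACC construct such as

       #pragma acc parallel num_gangs(32) vector_length(128)

     reaches the cuLaunchKernel call below as

       cuLaunchKernel (fn, 32, 1, 1,      (gridDim.x  = num_gangs)
		       128, 1, 1,         (blockDim.x = vector_length)
		       0, stream, kargs, 0);  */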
1045   nthreads_in_block = vector_length;
1046 
1047   kargs[0] = &dp;
1048   r = cuLaunchKernel (function,
1049 		      num_gangs, 1, 1,
1050 		      nthreads_in_block, 1, 1,
1051 		      0, dev_str->stream, kargs, 0);
1052   if (r != CUDA_SUCCESS)
1053     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
1054 
1055 #ifndef DISABLE_ASYNC
1056   if (async < acc_async_noval)
1057     {
1058       r = cuStreamSynchronize (dev_str->stream);
1059       if (r == CUDA_ERROR_LAUNCH_FAILED)
1060 	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1061 			   maybe_abort_msg);
1062       else if (r != CUDA_SUCCESS)
1063         GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1064     }
1065   else
1066     {
1067       CUevent *e;
1068 
1069       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1070 
1071       r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1072       if (r == CUDA_ERROR_LAUNCH_FAILED)
1073 	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1074 			   maybe_abort_msg);
1075       else if (r != CUDA_SUCCESS)
1076         GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1077 
1078       event_gc (true);
1079 
1080       r = cuEventRecord (*e, dev_str->stream);
1081       if (r != CUDA_SUCCESS)
1082         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1083 
1084       event_add (PTX_EVT_KNL, e, (void *)dev_str);
1085     }
1086 #else
1087   r = cuCtxSynchronize ();
1088   if (r == CUDA_ERROR_LAUNCH_FAILED)
1089     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1090 		       maybe_abort_msg);
1091   else if (r != CUDA_SUCCESS)
1092     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1093 #endif
1094 
1095   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
1096 		     targ_fn->name);
1097 
1098 #ifndef DISABLE_ASYNC
1099   if (async < acc_async_noval)
1100 #endif
1101     map_pop (dev_str);
1102 }
1103 
1104 void *openacc_get_current_cuda_context (void);
1105 
1106 static void *
1107 nvptx_alloc (size_t s)
1108 {
1109   CUdeviceptr d;
1110   CUresult r;
1111 
1112   r = cuMemAlloc (&d, s);
1113   if (r == CUDA_ERROR_OUT_OF_MEMORY)
1114     return 0;
1115   if (r != CUDA_SUCCESS)
1116     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1117   return (void *)d;
1118 }
1119 
1120 static void
1121 nvptx_free (void *p)
1122 {
1123   CUresult r;
1124   CUdeviceptr pb;
1125   size_t ps;
1126 
1127   r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
1128   if (r != CUDA_SUCCESS)
1129     GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1130 
1131   if ((CUdeviceptr)p != pb)
1132     GOMP_PLUGIN_fatal ("invalid device address");
1133 
1134   r = cuMemFree ((CUdeviceptr)p);
1135   if (r != CUDA_SUCCESS)
1136     GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1137 }
1138 
1139 static void *
1140 nvptx_host2dev (void *d, const void *h, size_t s)
1141 {
1142   CUresult r;
1143   CUdeviceptr pb;
1144   size_t ps;
1145   struct nvptx_thread *nvthd = nvptx_thread ();
1146 
1147   if (!s)
1148     return 0;
1149 
1150   if (!d)
1151     GOMP_PLUGIN_fatal ("invalid device address");
1152 
1153   r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1154   if (r != CUDA_SUCCESS)
1155     GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1156 
1157   if (!pb)
1158     GOMP_PLUGIN_fatal ("invalid device address");
1159 
1160   if (!h)
1161     GOMP_PLUGIN_fatal ("invalid host address");
1162 
1163   if (d == h)
1164     GOMP_PLUGIN_fatal ("invalid host or device address");
1165 
1166   if ((void *)(d + s) > (void *)(pb + ps))
1167     GOMP_PLUGIN_fatal ("invalid size");
1168 
1169 #ifndef DISABLE_ASYNC
1170   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
1171     {
1172       CUevent *e;
1173 
1174       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1175 
1176       r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1177       if (r != CUDA_SUCCESS)
1178         GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1179 
1180       event_gc (false);
1181 
1182       r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
1183 			     nvthd->current_stream->stream);
1184       if (r != CUDA_SUCCESS)
1185         GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
1186 
1187       r = cuEventRecord (*e, nvthd->current_stream->stream);
1188       if (r != CUDA_SUCCESS)
1189         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1190 
1191       event_add (PTX_EVT_MEM, e, (void *)h);
1192     }
1193   else
1194 #endif
1195     {
1196       r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
1197       if (r != CUDA_SUCCESS)
1198         GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1199     }
1200 
1201   return 0;
1202 }
1203 
1204 static void *
1205 nvptx_dev2host (void *h, const void *d, size_t s)
1206 {
1207   CUresult r;
1208   CUdeviceptr pb;
1209   size_t ps;
1210   struct nvptx_thread *nvthd = nvptx_thread ();
1211 
1212   if (!s)
1213     return 0;
1214 
1215   if (!d)
1216     GOMP_PLUGIN_fatal ("invalid device address");
1217 
1218   r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1219   if (r != CUDA_SUCCESS)
1220     GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1221 
1222   if (!pb)
1223     GOMP_PLUGIN_fatal ("invalid device address");
1224 
1225   if (!h)
1226     GOMP_PLUGIN_fatal ("invalid host address");
1227 
1228   if (d == h)
1229     GOMP_PLUGIN_fatal ("invalid host or device address");
1230 
1231   if ((void *)(d + s) > (void *)(pb + ps))
1232     GOMP_PLUGIN_fatal ("invalid size");
1233 
1234 #ifndef DISABLE_ASYNC
1235   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
1236     {
1237       CUevent *e;
1238 
1239       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1240 
1241       r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1242       if (r != CUDA_SUCCESS)
1243         GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1244 
1245       event_gc (false);
1246 
1247       r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
1248 			     nvthd->current_stream->stream);
1249       if (r != CUDA_SUCCESS)
1250         GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
1251 
1252       r = cuEventRecord (*e, nvthd->current_stream->stream);
1253       if (r != CUDA_SUCCESS)
1254         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1255 
1256       event_add (PTX_EVT_MEM, e, (void *)h);
1257     }
1258   else
1259 #endif
1260     {
1261       r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
1262       if (r != CUDA_SUCCESS)
1263 	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1264     }
1265 
1266   return 0;
1267 }
1268 
1269 static void
1270 nvptx_set_async (int async)
1271 {
1272   struct nvptx_thread *nvthd = nvptx_thread ();
1273   nvthd->current_stream
1274     = select_stream_for_async (async, pthread_self (), true, NULL);
1275 }
1276 
1277 static int
1278 nvptx_async_test (int async)
1279 {
1280   CUresult r;
1281   struct ptx_stream *s;
1282 
1283   s = select_stream_for_async (async, pthread_self (), false, NULL);
1284 
1285   if (!s)
1286     GOMP_PLUGIN_fatal ("unknown async %d", async);
1287 
1288   r = cuStreamQuery (s->stream);
1289   if (r == CUDA_SUCCESS)
1290     {
1291       /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1292 	 whether all work has completed on this stream, and if so omits the call
1293 	 to the wait hook.  If that happens, event_gc might not get called
1294 	 (which prevents variables from getting unmapped and their associated
1295 	 device storage freed), so call it here.  */
1296       event_gc (true);
1297       return 1;
1298     }
1299   else if (r == CUDA_ERROR_NOT_READY)
1300     return 0;
1301 
1302   GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1303 
1304   return 0;
1305 }
1306 
1307 static int
1308 nvptx_async_test_all (void)
1309 {
1310   struct ptx_stream *s;
1311   pthread_t self = pthread_self ();
1312   struct nvptx_thread *nvthd = nvptx_thread ();
1313 
1314   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1315 
1316   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1317     {
1318       if ((s->multithreaded || pthread_equal (s->host_thread, self))
1319 	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
1320 	{
1321 	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1322 	  return 0;
1323 	}
1324     }
1325 
1326   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1327 
1328   event_gc (true);
1329 
1330   return 1;
1331 }
1332 
1333 static void
1334 nvptx_wait (int async)
1335 {
1336   CUresult r;
1337   struct ptx_stream *s;
1338 
1339   s = select_stream_for_async (async, pthread_self (), false, NULL);
1340 
1341   if (!s)
1342     GOMP_PLUGIN_fatal ("unknown async %d", async);
1343 
1344   r = cuStreamSynchronize (s->stream);
1345   if (r != CUDA_SUCCESS)
1346     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1347 
1348   event_gc (true);
1349 }
1350 
1351 static void
1352 nvptx_wait_async (int async1, int async2)
1353 {
1354   CUresult r;
1355   CUevent *e;
1356   struct ptx_stream *s1, *s2;
1357   pthread_t self = pthread_self ();
1358 
1359   /* The stream that is waiting (rather than being waited for) doesn't
1360      necessarily have to exist already.  */
1361   s2 = select_stream_for_async (async2, self, true, NULL);
1362 
1363   s1 = select_stream_for_async (async1, self, false, NULL);
1364   if (!s1)
1365     GOMP_PLUGIN_fatal ("bad async %d", async1);
1366 
1367   if (s1 == s2)
1368     GOMP_PLUGIN_fatal ("identical parameters");
1369 
1370   e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1371 
1372   r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1373   if (r != CUDA_SUCCESS)
1374     GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1375 
1376   event_gc (true);
1377 
1378   r = cuEventRecord (*e, s1->stream);
1379   if (r != CUDA_SUCCESS)
1380     GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1381 
1382   event_add (PTX_EVT_SYNC, e, NULL);
1383 
1384   r = cuStreamWaitEvent (s2->stream, *e, 0);
1385   if (r != CUDA_SUCCESS)
1386     GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
1387 }
1388 
1389 static void
1390 nvptx_wait_all (void)
1391 {
1392   CUresult r;
1393   struct ptx_stream *s;
1394   pthread_t self = pthread_self ();
1395   struct nvptx_thread *nvthd = nvptx_thread ();
1396 
1397   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1398 
1399   /* Wait for active streams initiated by this thread (or by multiple threads)
1400      to complete.  */
1401   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1402     {
1403       if (s->multithreaded || pthread_equal (s->host_thread, self))
1404 	{
1405 	  r = cuStreamQuery (s->stream);
1406 	  if (r == CUDA_SUCCESS)
1407 	    continue;
1408 	  else if (r != CUDA_ERROR_NOT_READY)
1409 	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1410 
1411 	  r = cuStreamSynchronize (s->stream);
1412 	  if (r != CUDA_SUCCESS)
1413 	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1414 	}
1415     }
1416 
1417   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1418 
1419   event_gc (true);
1420 }
1421 
1422 static void
1423 nvptx_wait_all_async (int async)
1424 {
1425   CUresult r;
1426   struct ptx_stream *waiting_stream, *other_stream;
1427   CUevent *e;
1428   struct nvptx_thread *nvthd = nvptx_thread ();
1429   pthread_t self = pthread_self ();
1430 
1431   /* The stream doing the waiting.  This could be the first mention of the
1432      stream, so create it if necessary.  */
1433   waiting_stream
1434     = select_stream_for_async (async, pthread_self (), true, NULL);
1435 
1436   /* Launches on the null stream already block on other streams in the
1437      context.  */
1438   if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1439     return;
1440 
1441   event_gc (true);
1442 
1443   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1444 
1445   for (other_stream = nvthd->ptx_dev->active_streams;
1446        other_stream != NULL;
1447        other_stream = other_stream->next)
1448     {
1449       if (!other_stream->multithreaded
1450 	  && !pthread_equal (other_stream->host_thread, self))
1451 	continue;
1452 
1453       e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1454 
1455       r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1456       if (r != CUDA_SUCCESS)
1457 	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1458 
1459       /* Record an event on the waited-for stream.  */
1460       r = cuEventRecord (*e, other_stream->stream);
1461       if (r != CUDA_SUCCESS)
1462 	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1463 
1464       event_add (PTX_EVT_SYNC, e, NULL);
1465 
1466       r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
1467       if (r != CUDA_SUCCESS)
1468 	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
1469     }
1470 
1471   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1472 }
1473 
1474 static void *
1475 nvptx_get_current_cuda_device (void)
1476 {
1477   struct nvptx_thread *nvthd = nvptx_thread ();
1478 
1479   if (!nvthd || !nvthd->ptx_dev)
1480     return NULL;
1481 
1482   return &nvthd->ptx_dev->dev;
1483 }
1484 
1485 static void *
1486 nvptx_get_current_cuda_context (void)
1487 {
1488   struct nvptx_thread *nvthd = nvptx_thread ();
1489 
1490   if (!nvthd || !nvthd->ptx_dev)
1491     return NULL;
1492 
1493   return nvthd->ptx_dev->ctx;
1494 }
1495 
1496 static void *
1497 nvptx_get_cuda_stream (int async)
1498 {
1499   struct ptx_stream *s;
1500   struct nvptx_thread *nvthd = nvptx_thread ();
1501 
1502   if (!nvthd || !nvthd->ptx_dev)
1503     return NULL;
1504 
1505   s = select_stream_for_async (async, pthread_self (), false, NULL);
1506 
1507   return s ? s->stream : NULL;
1508 }
1509 
1510 static int
1511 nvptx_set_cuda_stream (int async, void *stream)
1512 {
1513   struct ptx_stream *oldstream;
1514   pthread_t self = pthread_self ();
1515   struct nvptx_thread *nvthd = nvptx_thread ();
1516 
1517   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1518 
1519   if (async < 0)
1520     GOMP_PLUGIN_fatal ("bad async %d", async);
1521 
1522   /* We have a list of active streams and an array mapping async values to
1523      entries of that list.  We need to take "ownership" of the passed-in stream,
1524      and add it to our list, removing the previous entry also (if there was one)
1525      in order to prevent resource leaks.  Note the potential for surprise
1526      here: maybe we should keep track of passed-in streams and leave it up to
1527      the user to tidy those up, but that doesn't work for stream handles
1528      returned from acc_get_cuda_stream above...  */
1529 
1530   oldstream = select_stream_for_async (async, self, false, NULL);
1531 
1532   if (oldstream)
1533     {
1534       if (nvthd->ptx_dev->active_streams == oldstream)
1535 	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1536       else
1537 	{
1538 	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1539 	  while (s->next != oldstream)
1540 	    s = s->next;
1541 	  s->next = s->next->next;
1542 	}
1543 
1544       cuStreamDestroy (oldstream->stream);
1545       map_fini (oldstream);
1546       free (oldstream);
1547     }
1548 
1549   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1550 
1551   (void) select_stream_for_async (async, self, true, (CUstream) stream);
1552 
1553   return 1;
1554 }
1555 
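/* A minimal user-level sketch of the stream take-over implemented
   above, assuming the CUDA runtime API alongside OpenACC (the stream
   variable and async value are illustrative):

     cudaStream_t st;
     cudaStreamCreate (&st);
     acc_set_cuda_stream (5, st);    (async queue 5 now runs on ST)
     #pragma acc parallel async(5)
     ...
     acc_wait (5);  */
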
1556 /* Plugin entry points.  */
1557 
1558 const char *
1559 GOMP_OFFLOAD_get_name (void)
1560 {
1561   return "nvptx";
1562 }
1563 
1564 unsigned int
1565 GOMP_OFFLOAD_get_caps (void)
1566 {
1567   return GOMP_OFFLOAD_CAP_OPENACC_200;
1568 }
1569 
1570 int
1571 GOMP_OFFLOAD_get_type (void)
1572 {
1573   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1574 }
1575 
1576 int
1577 GOMP_OFFLOAD_get_num_devices (void)
1578 {
1579   return nvptx_get_num_devices ();
1580 }
1581 
1582 void
1583 GOMP_OFFLOAD_init_device (int n)
1584 {
1585   pthread_mutex_lock (&ptx_dev_lock);
1586 
1587   if (!nvptx_init () || ptx_devices[n] != NULL)
1588     {
1589       pthread_mutex_unlock (&ptx_dev_lock);
1590       return;
1591     }
1592 
1593   ptx_devices[n] = nvptx_open_device (n);
1594   instantiated_devices++;
1595 
1596   pthread_mutex_unlock (&ptx_dev_lock);
1597 }
1598 
1599 void
1600 GOMP_OFFLOAD_fini_device (int n)
1601 {
1602   pthread_mutex_lock (&ptx_dev_lock);
1603 
1604   if (ptx_devices[n] != NULL)
1605     {
1606       nvptx_attach_host_thread_to_device (n);
1607       nvptx_close_device (ptx_devices[n]);
1608       ptx_devices[n] = NULL;
1609       instantiated_devices--;
1610     }
1611 
1612   pthread_mutex_unlock (&ptx_dev_lock);
1613 }
1614 
1615 int
1616 GOMP_OFFLOAD_load_image (int ord, void *target_data,
1617 			 struct addr_pair **target_table)
1618 {
1619   CUmodule module;
1620   char **fn_names, **var_names;
1621   unsigned int fn_entries, var_entries, i, j;
1622   CUresult r;
1623   struct targ_fn_descriptor *targ_fns;
1624   void **img_header = (void **) target_data;
1625   struct ptx_image_data *new_image;
1626 
1627   GOMP_OFFLOAD_init_device (ord);
1628 
1629   nvptx_attach_host_thread_to_device (ord);
1630 
1631   link_ptx (&module, img_header[0]);
1632 
1633   pthread_mutex_lock (&ptx_image_lock);
1634   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1635   new_image->target_data = target_data;
1636   new_image->module = module;
1637   new_image->next = ptx_images;
1638   ptx_images = new_image;
1639   pthread_mutex_unlock (&ptx_image_lock);
1640 
1641   /* The mkoffload utility emits a table of pointers/integers at the start of
1642      each offload image:
1643 
1644      img_header[0] -> ptx code
1645      img_header[1] -> number of variables
1646      img_header[2] -> array of variable names (pointers to strings)
1647      img_header[3] -> number of kernels
1648      img_header[4] -> array of kernel names (pointers to strings)
1649 
1650      The array of kernel names and the function addresses form a
1651      one-to-one correspondence.  */
1652 
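  /* As a concrete (hypothetical) illustration of that layout, an image
     with one variable and two kernels would be described by a table
     equivalent to:

       void *img_header[] = {
	 ptx_code,                 (NUL-terminated PTX text)
	 (void *) (uintptr_t) 1,   (number of variables)
	 var_names,                ({ "a_var" })
	 (void *) (uintptr_t) 2,   (number of kernels)
	 fn_names                  ({ "k1", "k2" })
       };

     Only the five-slot shape is implied by the code below; the names
     here are made up.  */
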
1653   var_entries = (uintptr_t) img_header[1];
1654   var_names = (char **) img_header[2];
1655   fn_entries = (uintptr_t) img_header[3];
1656   fn_names = (char **) img_header[4];
1657 
1658   *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1659 				      * (fn_entries + var_entries));
1660   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1661 				 * fn_entries);
1662 
1663   for (i = 0; i < fn_entries; i++)
1664     {
1665       CUfunction function;
1666 
1667       r = cuModuleGetFunction (&function, module, fn_names[i]);
1668       if (r != CUDA_SUCCESS)
1669 	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
1670 
1671       targ_fns[i].fn = function;
1672       targ_fns[i].name = (const char *) fn_names[i];
1673 
1674       (*target_table)[i].start = (uintptr_t) &targ_fns[i];
1675       (*target_table)[i].end = (*target_table)[i].start + 1;
1676     }
1677 
1678   for (j = 0; j < var_entries; j++, i++)
1679     {
1680       CUdeviceptr var;
1681       size_t bytes;
1682 
1683       r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
1684       if (r != CUDA_SUCCESS)
1685         GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1686 
1687       (*target_table)[i].start = (uintptr_t) var;
1688       (*target_table)[i].end = (*target_table)[i].start + bytes;
1689     }
1690 
1691   return i;
1692 }
1693 
1694 void
1695 GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
1696 {
1697   void **img_header = (void **) target_data;
1698   struct targ_fn_descriptor *targ_fns
1699     = (struct targ_fn_descriptor *) img_header[0];
1700   struct ptx_image_data *image, *prev = NULL, *newhd = NULL;
1701 
1702   free (targ_fns);
1703 
1704   pthread_mutex_lock (&ptx_image_lock);
1705   for (image = ptx_images; image != NULL;)
1706     {
1707       struct ptx_image_data *next = image->next;
1708 
1709       if (image->target_data == target_data)
1710 	{
1711 	  cuModuleUnload (image->module);
1712 	  free (image);
1713 	  if (prev)
1714 	    prev->next = next;
1715 	}
1716       else
1717 	{
1718 	  prev = image;
1719 	  if (!newhd)
1720 	    newhd = image;
1721 	}
1722 
1723       image = next;
1724     }
1725   ptx_images = newhd;
1726   pthread_mutex_unlock (&ptx_image_lock);
1727 }
1728 
1729 void *
1730 GOMP_OFFLOAD_alloc (int ord, size_t size)
1731 {
1732   nvptx_attach_host_thread_to_device (ord);
1733   return nvptx_alloc (size);
1734 }
1735 
1736 void
1737 GOMP_OFFLOAD_free (int ord, void *ptr)
1738 {
1739   nvptx_attach_host_thread_to_device (ord);
1740   nvptx_free (ptr);
1741 }
1742 
1743 void *
1744 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1745 {
1746   nvptx_attach_host_thread_to_device (ord);
1747   return nvptx_dev2host (dst, src, n);
1748 }
1749 
1750 void *
1751 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1752 {
1753   nvptx_attach_host_thread_to_device (ord);
1754   return nvptx_host2dev (dst, src, n);
1755 }
1756 
1757 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1758 
1759 void
1760 GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
1761 			       void **hostaddrs, void **devaddrs, size_t *sizes,
1762 			       unsigned short *kinds, int num_gangs,
1763 			       int num_workers, int vector_length, int async,
1764 			       void *targ_mem_desc)
1765 {
1766   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
1767 	    num_workers, vector_length, async, targ_mem_desc);
1768 }
1769 
1770 void
1771 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
1772 {
1773   CUevent *e;
1774   CUresult r;
1775   struct nvptx_thread *nvthd = nvptx_thread ();
1776 
1777   e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1778 
1779   r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1780   if (r != CUDA_SUCCESS)
1781     GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1782 
1783   r = cuEventRecord (*e, nvthd->current_stream->stream);
1784   if (r != CUDA_SUCCESS)
1785     GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1786 
1787   event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
1788 }
1789 
1790 int
1791 GOMP_OFFLOAD_openacc_async_test (int async)
1792 {
1793   return nvptx_async_test (async);
1794 }
1795 
1796 int
1797 GOMP_OFFLOAD_openacc_async_test_all (void)
1798 {
1799   return nvptx_async_test_all ();
1800 }
1801 
1802 void
1803 GOMP_OFFLOAD_openacc_async_wait (int async)
1804 {
1805   nvptx_wait (async);
1806 }
1807 
1808 void
1809 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1810 {
1811   nvptx_wait_async (async1, async2);
1812 }
1813 
1814 void
1815 GOMP_OFFLOAD_openacc_async_wait_all (void)
1816 {
1817   nvptx_wait_all ();
1818 }
1819 
1820 void
1821 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1822 {
1823   nvptx_wait_all_async (async);
1824 }
1825 
1826 void
1827 GOMP_OFFLOAD_openacc_async_set_async (int async)
1828 {
1829   nvptx_set_async (async);
1830 }
1831 
1832 void *
1833 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1834 {
1835   struct ptx_device *ptx_dev;
1836   struct nvptx_thread *nvthd
1837     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1838   CUresult r;
1839   CUcontext thd_ctx;
1840 
1841   ptx_dev = ptx_devices[ord];
1842 
1843   assert (ptx_dev);
1844 
1845   r = cuCtxGetCurrent (&thd_ctx);
1846   if (r != CUDA_SUCCESS)
1847     GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
1848 
1849   assert (ptx_dev->ctx);
1850 
1851   if (!thd_ctx)
1852     {
1853       r = cuCtxPushCurrent (ptx_dev->ctx);
1854       if (r != CUDA_SUCCESS)
1855 	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
1856     }
1857 
1858   nvthd->current_stream = ptx_dev->null_stream;
1859   nvthd->ptx_dev = ptx_dev;
1860 
1861   return (void *) nvthd;
1862 }
1863 
1864 void
1865 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1866 {
1867   free (data);
1868 }
1869 
1870 void *
1871 GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
1872 {
1873   return nvptx_get_current_cuda_device ();
1874 }
1875 
1876 void *
1877 GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
1878 {
1879   return nvptx_get_current_cuda_context ();
1880 }
1881 
1882 /* NOTE: This returns a CUstream, not a ptx_stream pointer.  */
1883 
1884 void *
1885 GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
1886 {
1887   return nvptx_get_cuda_stream (async);
1888 }
1889 
1890 /* NOTE: This takes a CUstream, not a ptx_stream pointer.  */
1891 
1892 int
1893 GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
1894 {
1895   return nvptx_set_cuda_stream (async, stream);
1896 }
1897