/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <dlfcn.h>
#include <unistd.h>
#include <assert.h>

#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))

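/* Table mapping CUDA driver result codes to short human-readable
   messages, used by cuda_error below.  */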
static struct
{
  CUresult r;
  char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};

static char errmsg[128];

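/* Return a descriptive string for CUDA result code R.  Unknown codes are
   formatted into the static ERRMSG buffer, so the pointer returned for
   them is only valid until the next call.  */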
static char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    {
      if (cuda_errlist[i].r == r)
	return &cuda_errlist[i].m[0];
    }

  sprintf (&errmsg[0], "unknown result code: %5d", r);

  return &errmsg[0];
}
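/* Descriptor of an offloaded function: the CUDA function handle together
   with its name, the latter used in debug output.  */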
struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

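/* The number of devices currently open, and the lock serialising device
   initialisation and finalisation.  */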
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

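/* A CUDA stream together with a small page-locked buffer, mapped into
   both host and device address spaces, that is used as a ring buffer for
   passing kernel argument blocks to the device (see map_push/map_pop
   below).  */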
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

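/* One entry in a stream's argument ring buffer: the async value and total
   size of the entry, followed by the argument block itself.  */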
struct map
{
  int     async;
  size_t  size;
  char    mappings[0];
};

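/* Allocate a one-page page-locked buffer for stream S and obtain the
   corresponding device pointer.  */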
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}

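/* Release the page-locked buffer of stream S.  */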
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}

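/* Retire the oldest entry in stream S's argument buffer, advancing the
   tail pointer and wrapping around at the end of the buffer.  */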
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

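/* Reserve SIZE bytes (plus an entry header) in stream S's argument
   buffer, wrapping around to the beginning of the buffer if necessary.
   Return host and device pointers to the new entry's payload in *H and
   *D respectively.  */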
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

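/* Per-device state: the CUDA context and device handles, the streams
   associated with the device, and a number of cached device
   attributes.  */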
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int  mode;
  bool mkern;

  struct ptx_device *next;
};

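/* The kinds of asynchronous operation tracked by the pending-event list
   below.  */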
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

struct ptx_image_data
{
  void *target_data;
  CUmodule module;
  struct ptx_image_data *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static struct ptx_image_data *ptx_images = NULL;
static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER;

#define _XSTR(s) _STR(s)
#define _STR(s) #s

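/* The CUDA driver entry points used by this plugin.
   verify_device_library checks that each of these can be resolved from
   libcuda.so before the plugin commits to using the library.  */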
static struct _synames
{
  char *n;
} cuda_symnames[] =
{
  { _XSTR (cuCtxCreate) },
  { _XSTR (cuCtxDestroy) },
  { _XSTR (cuCtxGetCurrent) },
  { _XSTR (cuCtxGetDevice) },
  { _XSTR (cuCtxPopCurrent) },
  { _XSTR (cuCtxPushCurrent) },
  { _XSTR (cuCtxSynchronize) },
  { _XSTR (cuDeviceGet) },
  { _XSTR (cuDeviceGetAttribute) },
  { _XSTR (cuDeviceGetCount) },
  { _XSTR (cuEventCreate) },
  { _XSTR (cuEventDestroy) },
  { _XSTR (cuEventQuery) },
  { _XSTR (cuEventRecord) },
  { _XSTR (cuInit) },
  { _XSTR (cuLaunchKernel) },
  { _XSTR (cuLinkAddData) },
  { _XSTR (cuLinkComplete) },
  { _XSTR (cuLinkCreate) },
  { _XSTR (cuMemAlloc) },
  { _XSTR (cuMemAllocHost) },
  { _XSTR (cuMemcpy) },
  { _XSTR (cuMemcpyDtoH) },
  { _XSTR (cuMemcpyDtoHAsync) },
  { _XSTR (cuMemcpyHtoD) },
  { _XSTR (cuMemcpyHtoDAsync) },
  { _XSTR (cuMemFree) },
  { _XSTR (cuMemFreeHost) },
  { _XSTR (cuMemGetAddressRange) },
  { _XSTR (cuMemHostGetDevicePointer) },
  { _XSTR (cuMemHostRegister) },
  { _XSTR (cuMemHostUnregister) },
  { _XSTR (cuModuleGetFunction) },
  { _XSTR (cuModuleGetGlobal) },
  { _XSTR (cuModuleLoadData) },
  { _XSTR (cuModuleUnload) },
  { _XSTR (cuStreamCreate) },
  { _XSTR (cuStreamDestroy) },
  { _XSTR (cuStreamQuery) },
  { _XSTR (cuStreamSynchronize) },
  { _XSTR (cuStreamWaitEvent) }
};

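/* Check that libcuda.so is present and provides all of the entry points
   listed in CUDA_SYMNAMES.  Return 0 on success, -1 on failure.  */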
static int
verify_device_library (void)
{
  int i;
  void *dh, *ds;

  dh = dlopen ("libcuda.so", RTLD_LAZY);
  if (!dh)
    return -1;

  for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
    {
      ds = dlsym (dh, cuda_symnames[i].n);
      if (!ds)
	{
	  dlclose (dh);
	  return -1;
	}
    }

  dlclose (dh);

  return 0;
}

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

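/* Set up the null stream and the async-stream array for PTX_DEV.
   CONCURRENCY is a hint for the initial size of the array; it grows on
   demand later.  */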
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}

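/* Destroy all streams associated with PTX_DEV, including the null
   stream.  */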
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}

/* Select a stream for the (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also the current device/context).  If CREATE is true,
   create the stream if it does not exist (or use EXISTING if it is non-NULL),
   and associate the stream with THREAD.  Return the stream to use.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}

/* Initialize the plugin's CUDA driver state.  Return TRUE on success, else
   FALSE.  PTX_DEV_LOCK should be locked on entry and remains locked on
   exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int rc;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  rc = verify_device_library ();
  if (rc < 0)
    return false;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do, though.  */
      if (thd_ctx != NULL)
	{
	  r = cuCtxPopCurrent (&old_ctx);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
	}

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}

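/* Open the N'th PTX device: create a CUDA context for it (or adopt the
   host thread's current context if there is one) and cache the device
   attributes that the plugin uses.  */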
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}

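/* Close PTX_DEV, destroying its streams and, unless the context was
   adopted from elsewhere, its context.  */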
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}

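/* Return the number of PTX devices available, or 0 if offloading is not
   supported in this configuration.  */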
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    cuInit (0);

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}

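/* JIT-link PTX_CODE together with the plugin's built-in PTX fragments
   (abort, acc_on_device and the GOACC-internal routines) and load the
   result into *MODULE.  */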
static void
link_ptx (CUmodule *module, char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
			 cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
		     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
			 cuda_error (r));
    }

  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
		     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}

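/* Scan the pending-event list and retire any event that has completed,
   performing the cleanup action associated with it.  If MEMMAP_LOCKABLE
   is false, skip events whose cleanup would need to take the memory-map
   lock, because a caller already holds it.  */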
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event = ptx_events;

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		GOMP_PLUGIN_async_unmap_vars (e->addr);
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *)te);

	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);
}

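/* Record an event of TYPE with CUDA event handle E and associated
   address H on the pending-event list for the current device.  */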
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}

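/* Launch the kernel described by FN (really a targ_fn_descriptor) on the
   stream selected by ASYNC, passing the device pointers DEVADDRS as the
   kernel's argument block.  */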
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    size_t *sizes, unsigned short *kinds, int num_gangs,
	    int num_workers, int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__,
		     targ_fn->name);

  /* OpenACC		CUDA

     num_gangs		blocks
     num_workers	warps (where a warp is equivalent to 32 threads)
     vector length	threads  */

  /* The OpenACC vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However,
     the CUDA interface abstracts away from that, and only shows us warp size
     indirectly in the maximum number of threads per block, which is a product
     of warp size and the number of hyperthreads of a multiprocessor.

     We choose to map OpenACC vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in GCC code generation
     that uses threadIdx.x to access vector elements.

     Attempting to use an OpenACC vector_length of more than the maximum number
     of threads per block will result in a CUDA error.  */
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      num_gangs, 1, 1,
		      nthreads_in_block, 1, 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void *openacc_get_current_cuda_context (void);

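/* Allocate S bytes of device memory.  Return NULL if the device is out
   of memory.  */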
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return NULL;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}

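/* Free the device memory at P, checking first that P is the base address
   of an allocation.  */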
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

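/* Copy S bytes from host address H to device address D.  The copy is
   asynchronous if the current thread has selected a non-null stream, in
   which case an event is recorded to track its completion.  */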
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}

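/* Copy S bytes from device address D to host address H, asynchronously
   if the current thread has selected a non-null stream.  */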
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}

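/* Select (creating if necessary) the stream for ASYNC as the current
   thread's active stream.  */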
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}

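/* Return 1 if all work on the stream for ASYNC has completed, 0
   otherwise.  */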
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

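/* Return 1 if all work on all streams relevant to the current thread has
   completed, 0 otherwise.  */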
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

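/* Wait for all work on the stream for ASYNC to complete.  */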
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}

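/* Make the stream for ASYNC2 wait for completion of all work currently
   queued on the stream for ASYNC1.  */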
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async %d", async1);

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}

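/* Wait for all work on all streams initiated by the current thread (or
   shared between threads) to complete.  */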
1390nvptx_wait_all (void)
1391{
1392  CUresult r;
1393  struct ptx_stream *s;
1394  pthread_t self = pthread_self ();
1395  struct nvptx_thread *nvthd = nvptx_thread ();
1396
1397  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1398
1399  /* Wait for active streams initiated by this thread (or by multiple threads)
1400     to complete.  */
1401  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1402    {
1403      if (s->multithreaded || pthread_equal (s->host_thread, self))
1404	{
1405	  r = cuStreamQuery (s->stream);
1406	  if (r == CUDA_SUCCESS)
1407	    continue;
1408	  else if (r != CUDA_ERROR_NOT_READY)
1409	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1410
1411	  r = cuStreamSynchronize (s->stream);
1412	  if (r != CUDA_SUCCESS)
1413	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1414	}
1415    }
1416
1417  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1418
1419  event_gc (true);
1420}
1421
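/* Make the stream for ASYNC wait for completion of all work currently
   queued on the other streams relevant to the current thread.  */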
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}

static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}

int
GOMP_OFFLOAD_load_image (int ord, void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  char **fn_names, **var_names;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  void **img_header = (void **) target_data;
  struct ptx_image_data *new_image;

  GOMP_OFFLOAD_init_device (ord);

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header[0]);

  pthread_mutex_lock (&ptx_image_lock);
  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->next = ptx_images;
  ptx_images = new_image;
  pthread_mutex_unlock (&ptx_image_lock);

  /* The mkoffload utility emits a table of pointers/integers at the start of
     each offload image:

     img_header[0] -> ptx code
     img_header[1] -> number of variables
     img_header[2] -> array of variable names (pointers to strings)
     img_header[3] -> number of kernels
     img_header[4] -> array of kernel names (pointers to strings)

     The array of kernel names and the function addresses form a
     one-to-one correspondence.  */

  var_entries = (uintptr_t) img_header[1];
  var_names = (char **) img_header[2];
  fn_entries = (uintptr_t) img_header[3];
  fn_names = (char **) img_header[4];

  *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				      * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  for (i = 0; i < fn_entries; i++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns[i].fn = function;
      targ_fns[i].name = (const char *) fn_names[i];

      (*target_table)[i].start = (uintptr_t) &targ_fns[i];
      (*target_table)[i].end = (*target_table)[i].start + 1;
    }

  for (j = 0; j < var_entries; j++, i++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      (*target_table)[i].start = (uintptr_t) var;
      (*target_table)[i].end = (*target_table)[i].start + bytes;
    }

  return i;
}

void
GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
{
  void **img_header = (void **) target_data;
  struct targ_fn_descriptor *targ_fns
    = (struct targ_fn_descriptor *) img_header[0];
  struct ptx_image_data *image, *prev = NULL, *newhd = NULL;

  free (targ_fns);

  pthread_mutex_lock (&ptx_image_lock);
  for (image = ptx_images; image != NULL;)
    {
      struct ptx_image_data *next = image->next;

      if (image->target_data == target_data)
	{
	  cuModuleUnload (image->module);
	  free (image);
	  if (prev)
	    prev->next = next;
	}
      else
	{
	  prev = image;
	  if (!newhd)
	    newhd = image;
	}

      image = next;
    }
  ptx_images = newhd;
  pthread_mutex_unlock (&ptx_image_lock);
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs, size_t *sizes,
			       unsigned short *kinds, int num_gangs,
			       int num_workers, int vector_length, int async,
			       void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
	      num_workers, vector_length, async, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}