1/* Plugin for NVPTX execution.
2
3   Copyright (C) 2013-2020 Free Software Foundation, Inc.
4
5   Contributed by Mentor Embedded.
6
7   This file is part of the GNU Offloading and Multi Processing Library
8   (libgomp).
9
10   Libgomp is free software; you can redistribute it and/or modify it
11   under the terms of the GNU General Public License as published by
12   the Free Software Foundation; either version 3, or (at your option)
13   any later version.
14
15   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
18   more details.
19
20   Under Section 7 of GPL version 3, you are granted additional
21   permissions described in the GCC Runtime Library Exception, version
22   3.1, as published by the Free Software Foundation.
23
24   You should have received a copy of the GNU General Public License and
25   a copy of the GCC Runtime Library Exception along with this program;
26   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
27   <http://www.gnu.org/licenses/>.  */
28
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
33
34#define _GNU_SOURCE
35#include "openacc.h"
36#include "config.h"
37#include "libgomp-plugin.h"
38#include "oacc-plugin.h"
39#include "gomp-constants.h"
40#include "oacc-int.h"
41
42#include <pthread.h>
43#include <cuda.h>
44#include <stdbool.h>
45#include <limits.h>
46#include <string.h>
47#include <stdio.h>
48#include <unistd.h>
49#include <assert.h>
50#include <errno.h>
51
52#if CUDA_VERSION < 6000
53extern CUresult cuGetErrorString (CUresult, const char **);
54#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
55#endif
56
57#if CUDA_VERSION >= 6050
58#undef cuLinkCreate
59#undef cuLinkAddData
60CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
61			const char *, unsigned, CUjit_option *, void **);
62CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
63#else
64typedef size_t (*CUoccupancyB2DSize)(int);
65CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
66			   const char *, unsigned, CUjit_option *, void **);
67CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
68CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
69					  CUoccupancyB2DSize, size_t, int);
70#endif
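
/* Note (descriptive only; the exact scheme depends on the CUDA toolkit
   headers in use): from CUDA 6.5 on, cuda.h maps cuLinkCreate and
   cuLinkAddData to their "_v2" variants via macros, so the unsuffixed names
   are undefined above and re-declared to keep a stable spelling for the
   cuda-lib.def machinery; for older toolkits the "_v2" and occupancy entry
   points are declared here because cuda.h does not provide them.  */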
71
72#define DO_PRAGMA(x) _Pragma (#x)
73
74#if PLUGIN_NVPTX_DYNAMIC
75# include <dlfcn.h>
76
77struct cuda_lib_s {
78
79# define CUDA_ONE_CALL(call)			\
80  __typeof (call) *call;
81# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
82  CUDA_ONE_CALL (call)
83#include "cuda-lib.def"
84# undef CUDA_ONE_CALL
85# undef CUDA_ONE_CALL_MAYBE_NULL
86
87} cuda_lib;
88
89/* -1 if init_cuda_lib has not been called yet, false
90   if it has been and failed, true if it has been and succeeded.  */
91static signed char cuda_lib_inited = -1;
92
/* Dynamically load the CUDA driver library and initialize function
   pointers; return false if unsuccessful, true if successful.  */
95static bool
96init_cuda_lib (void)
97{
98  if (cuda_lib_inited != -1)
99    return cuda_lib_inited;
100  const char *cuda_runtime_lib = "libcuda.so.1";
101  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
102  cuda_lib_inited = false;
103  if (h == NULL)
104    return false;
105
106# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108# define CUDA_ONE_CALL_1(call, allow_null)		\
109  cuda_lib.call = dlsym (h, #call);	\
110  if (!allow_null && cuda_lib.call == NULL)		\
111    return false;
112#include "cuda-lib.def"
113# undef CUDA_ONE_CALL
114# undef CUDA_ONE_CALL_1
115# undef CUDA_ONE_CALL_MAYBE_NULL
116
117  cuda_lib_inited = true;
118  return true;
119}
120# define CUDA_CALL_PREFIX cuda_lib.
121#else
122
123# define CUDA_ONE_CALL(call)
124# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125#include "cuda-lib.def"
126#undef CUDA_ONE_CALL_MAYBE_NULL
127#undef CUDA_ONE_CALL
128
129# define CUDA_CALL_PREFIX
130# define init_cuda_lib() true
131#endif
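
/* A minimal sketch of how the X-macro list above is consumed, assuming a
   cuda-lib.def entry such as "CUDA_ONE_CALL (cuMemAlloc)" (illustrative
   here, not quoted from that file):

     With PLUGIN_NVPTX_DYNAMIC, the entry first expands to a struct member
       __typeof (cuMemAlloc) *cuMemAlloc;
     inside 'struct cuda_lib_s', and later, in init_cuda_lib, to
       cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");
       if (cuda_lib.cuMemAlloc == NULL) return false;
     so every call site goes through CUDA_CALL_PREFIX, i.e. 'cuda_lib.'.
     Without PLUGIN_NVPTX_DYNAMIC the entries expand to nothing (or to a
     '#pragma weak' for the MAYBE_NULL ones) and CUDA_CALL_PREFIX is empty,
     so the same call sites bind directly against libcuda.  */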
132
133#include "secure_getenv.h"
134
135#undef MIN
136#undef MAX
137#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
139
/* Convenience macros for the frequently used sequence of a CUDA library
   call followed by error handling, as well as for call sites that do the
   error checking themselves or deliberately skip it.  */
143
144#define CUDA_CALL_ERET(ERET, FN, ...)		\
145  do {						\
146    unsigned __r				\
147      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
148    if (__r != CUDA_SUCCESS)			\
149      {						\
150	GOMP_PLUGIN_error (#FN " error: %s",	\
151			   cuda_error (__r));	\
152	return ERET;				\
153      }						\
154  } while (0)
155
156#define CUDA_CALL(FN, ...)			\
157  CUDA_CALL_ERET (false, FN, __VA_ARGS__)
158
159#define CUDA_CALL_ASSERT(FN, ...)		\
160  do {						\
161    unsigned __r				\
162      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
163    if (__r != CUDA_SUCCESS)			\
164      {						\
165	GOMP_PLUGIN_fatal (#FN " error: %s",	\
166			   cuda_error (__r));	\
167      }						\
168  } while (0)
169
170#define CUDA_CALL_NOCHECK(FN, ...)		\
171  CUDA_CALL_PREFIX FN (__VA_ARGS__)
172
173#define CUDA_CALL_EXISTS(FN)			\
174  CUDA_CALL_PREFIX FN
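
/* Illustrative only: a typical use of the wrappers above, in the style of
   the helpers in this file.  CUDA_CALL is for bool-returning functions,
   CUDA_CALL_ERET for any other error return value, CUDA_CALL_ASSERT where
   failure is fatal, and CUDA_CALL_NOCHECK where the caller inspects the
   CUresult itself.  The function below is a sketch, not part of the plugin:

     static bool
     example_set_device (int n)
     {
       CUdevice dev;
       CUDA_CALL (cuDeviceGet, &dev, n);
       CUresult r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
       if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
	 {
	   GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
	   return false;
	 }
       return true;
     }  */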
175
176static const char *
177cuda_error (CUresult r)
178{
179  const char *fallback = "unknown cuda error";
180  const char *desc;
181
182  if (!CUDA_CALL_EXISTS (cuGetErrorString))
183    return fallback;
184
185  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
186  if (r == CUDA_SUCCESS)
187    return desc;
188
189  return fallback;
190}
191
/* Version of the CUDA driver, in the same MAJOR.MINOR format that is used by
   Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
194static char cuda_driver_version_s[30];
195
196static unsigned int instantiated_devices = 0;
197static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
198
199/* NVPTX/CUDA specific definition of asynchronous queues.  */
200struct goacc_asyncqueue
201{
202  CUstream cuda_stream;
203};
204
205struct nvptx_callback
206{
207  void (*fn) (void *);
208  void *ptr;
209  struct goacc_asyncqueue *aq;
210  struct nvptx_callback *next;
211};
212
213/* Thread-specific data for PTX.  */
214
215struct nvptx_thread
216{
  /* We currently have this embedded inside the plugin because libgomp manages
     devices through integer target_ids.  This might be better done using an
     opaque target-specific pointer directly from gomp_device_descr.  */
220  struct ptx_device *ptx_dev;
221};
222
223/* Target data function launch information.  */
224
225struct targ_fn_launch
226{
227  const char *fn;
228  unsigned short dim[GOMP_DIM_MAX];
229};
230
231/* Target PTX object information.  */
232
233struct targ_ptx_obj
234{
235  const char *code;
236  size_t size;
237};
238
239/* Target data image information.  */
240
241typedef struct nvptx_tdata
242{
243  const struct targ_ptx_obj *ptx_objs;
244  unsigned ptx_num;
245
246  const char *const *var_names;
247  unsigned var_num;
248
249  const struct targ_fn_launch *fn_descs;
250  unsigned fn_num;
251} nvptx_tdata_t;
252
253/* Descriptor of a loaded function.  */
254
255struct targ_fn_descriptor
256{
257  CUfunction fn;
258  const struct targ_fn_launch *launch;
259  int regs_per_thread;
260  int max_threads_per_block;
261};
262
263/* A loaded PTX image.  */
264struct ptx_image_data
265{
266  const void *target_data;
267  CUmodule module;
268
269  struct targ_fn_descriptor *fns;  /* Array of functions.  */
270
271  struct ptx_image_data *next;
272};
273
274struct ptx_free_block
275{
276  void *ptr;
277  struct ptx_free_block *next;
278};
279
280struct ptx_device
281{
282  CUcontext ctx;
283  bool ctx_shared;
284  CUdevice dev;
285
286  int ord;
287  bool overlap;
288  bool map;
289  bool concur;
290  bool mkern;
291  int mode;
292  int clock_khz;
293  int num_sms;
294  int regs_per_block;
295  int regs_per_sm;
296  int warp_size;
297  int max_threads_per_block;
298  int max_threads_per_multiprocessor;
299  int default_dims[GOMP_DIM_MAX];
300
301  /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
302  char name[256];
303
304  struct ptx_image_data *images;  /* Images loaded on device.  */
305  pthread_mutex_t image_lock;     /* Lock for above list.  */
306
307  struct ptx_free_block *free_blocks;
308  pthread_mutex_t free_blocks_lock;
309
310  struct ptx_device *next;
311};
312
313static struct ptx_device **ptx_devices;
314
315static inline struct nvptx_thread *
316nvptx_thread (void)
317{
318  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
319}
320
/* Initialize the CUDA driver and allocate the PTX device table.  Return TRUE
   on success, else FALSE.  PTX_DEV_LOCK should be locked on entry and remains
   locked on exit.  */
323
324static bool
325nvptx_init (void)
326{
327  int ndevs;
328
329  if (instantiated_devices != 0)
330    return true;
331
332  if (!init_cuda_lib ())
333    return false;
334
335  CUDA_CALL (cuInit, 0);
336
337  int cuda_driver_version;
  CUDA_CALL (cuDriverGetVersion, &cuda_driver_version);
339  snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
340	    "CUDA Driver %u.%u",
341	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
342
343  CUDA_CALL (cuDeviceGetCount, &ndevs);
344  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
345					    * ndevs);
346
347  return true;
348}
349
/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */
352
353static bool
354nvptx_attach_host_thread_to_device (int n)
355{
356  CUdevice dev;
357  CUresult r;
358  struct ptx_device *ptx_dev;
359  CUcontext thd_ctx;
360
361  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
362  if (r == CUDA_ERROR_NOT_PERMITTED)
363    {
      /* Assume we're in a CUDA callback; just return true.  */
365      return true;
366    }
367  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
368    {
369      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
370      return false;
371    }
372
373  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
374    return true;
375  else
376    {
377      CUcontext old_ctx;
378
379      ptx_dev = ptx_devices[n];
380      if (!ptx_dev)
381	{
382	  GOMP_PLUGIN_error ("device %d not found", n);
383	  return false;
384	}
385
386      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
387
      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do, though.  */
390      if (thd_ctx != NULL)
391	CUDA_CALL (cuCtxPopCurrent, &old_ctx);
392
393      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
394    }
395  return true;
396}
397
398static struct ptx_device *
399nvptx_open_device (int n)
400{
401  struct ptx_device *ptx_dev;
402  CUdevice dev, ctx_dev;
403  CUresult r;
404  int async_engines, pi;
405
406  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
407
408  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
409
410  ptx_dev->ord = n;
411  ptx_dev->dev = dev;
412  ptx_dev->ctx_shared = false;
413
414  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
415  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
416    {
417      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
418      return NULL;
419    }
420
421  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
422    {
423      /* The current host thread has an active context for a different device.
424         Detach it.  */
425      CUcontext old_ctx;
426      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
427    }
428
429  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
430
431  if (!ptx_dev->ctx)
432    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
433  else
434    ptx_dev->ctx_shared = true;
435
436  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
437		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
438  ptx_dev->overlap = pi;
439
440  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
441		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
442  ptx_dev->map = pi;
443
444  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
445		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
446  ptx_dev->concur = pi;
447
448  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
449		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
450  ptx_dev->mode = pi;
451
452  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
453		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
454  ptx_dev->mkern = pi;
455
456  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
457		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
458  ptx_dev->clock_khz = pi;
459
460  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
461		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
462  ptx_dev->num_sms = pi;
463
464  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
465		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
466  ptx_dev->regs_per_block = pi;
467
468  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
469     in CUDA 6.0 and newer.  */
470  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
471			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
472			 dev);
473  /* Fallback: use limit of registers per block, which is usually equal.  */
474  if (r == CUDA_ERROR_INVALID_VALUE)
475    pi = ptx_dev->regs_per_block;
476  else if (r != CUDA_SUCCESS)
477    {
478      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
479      return NULL;
480    }
481  ptx_dev->regs_per_sm = pi;
482
483  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
484		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
485  if (pi != 32)
486    {
487      GOMP_PLUGIN_error ("Only warp size 32 is supported");
488      return NULL;
489    }
490  ptx_dev->warp_size = pi;
491
492  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
493		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
494  ptx_dev->max_threads_per_block = pi;
495
496  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
497		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
498  ptx_dev->max_threads_per_multiprocessor = pi;
499
500  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
501			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
502  if (r != CUDA_SUCCESS)
503    async_engines = 1;
504
505  for (int i = 0; i != GOMP_DIM_MAX; i++)
506    ptx_dev->default_dims[i] = 0;
507
508  CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
509		  dev);
510
511  ptx_dev->images = NULL;
512  pthread_mutex_init (&ptx_dev->image_lock, NULL);
513
514  ptx_dev->free_blocks = NULL;
515  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
516
517  return ptx_dev;
518}
519
520static bool
521nvptx_close_device (struct ptx_device *ptx_dev)
522{
523  if (!ptx_dev)
524    return true;
525
526  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
527    {
528      struct ptx_free_block *b_next = b->next;
529      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
530      free (b);
531      b = b_next;
532    }
533
534  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
535  pthread_mutex_destroy (&ptx_dev->image_lock);
536
537  if (!ptx_dev->ctx_shared)
538    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
539
540  free (ptx_dev);
541  return true;
542}
543
544static int
545nvptx_get_num_devices (void)
546{
547  int n;
548
549  /* This function will be called before the plugin has been initialized in
550     order to enumerate available devices, but CUDA API routines can't be used
551     until cuInit has been called.  Just call it now (but don't yet do any
552     further initialization).  */
553  if (instantiated_devices == 0)
554    {
555      if (!init_cuda_lib ())
556	return 0;
557      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
558      /* This is not an error: e.g. we may have CUDA libraries installed but
559         no devices available.  */
560      if (r != CUDA_SUCCESS)
561	{
562	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
563			     cuda_error (r));
564	  return 0;
565	}
566    }
567
568  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
569  return n;
570}
571
572static void
573notify_var (const char *var_name, const char *env_var)
574{
575  if (env_var == NULL)
576    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
577  else
578    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
579}
580
581static void
582process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
583{
584  const char *var_name = "GOMP_NVPTX_JIT";
585  const char *env_var = secure_getenv (var_name);
586  notify_var (var_name, env_var);
587
588  if (env_var == NULL)
589    return;
590
591  const char *c = env_var;
592  while (*c != '\0')
593    {
594      while (*c == ' ')
595	c++;
596
597      if (c[0] == '-' && c[1] == 'O'
598	  && '0' <= c[2] && c[2] <= '4'
599	  && (c[3] == '\0' || c[3] == ' '))
600	{
601	  *gomp_nvptx_o = c[2] - '0';
602	  c += 3;
603	  continue;
604	}
605
606      GOMP_PLUGIN_error ("Error parsing %s", var_name);
607      break;
608    }
609}
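
/* For example (illustrative only), running a program with

     GOMP_NVPTX_JIT=-O3

   makes link_ptx below pass CU_JIT_OPTIMIZATION_LEVEL = 3 to the PTX JIT;
   values -O0 through -O4 are accepted, and anything else is reported as a
   parse error by the function above.  */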
610
611static bool
612link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
613	  unsigned num_objs)
614{
615  CUjit_option opts[7];
616  void *optvals[7];
617  float elapsed = 0.0;
618  char elog[1024];
619  char ilog[16384];
620  CUlinkState linkstate;
621  CUresult r;
622  void *linkout;
623  size_t linkoutsize __attribute__ ((unused));
624
625  opts[0] = CU_JIT_WALL_TIME;
626  optvals[0] = &elapsed;
627
628  opts[1] = CU_JIT_INFO_LOG_BUFFER;
629  optvals[1] = &ilog[0];
630
631  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
632  optvals[2] = (void *) sizeof ilog;
633
634  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
635  optvals[3] = &elog[0];
636
637  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
638  optvals[4] = (void *) sizeof elog;
639
640  opts[5] = CU_JIT_LOG_VERBOSE;
641  optvals[5] = (void *) 1;
642
643  static intptr_t gomp_nvptx_o = -1;
644
645  static bool init_done = false;
646  if (!init_done)
647    {
648      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
649      init_done = true;
    }
651
652  int nopts = 6;
653  if (gomp_nvptx_o != -1)
654    {
655      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
656      optvals[nopts] = (void *) gomp_nvptx_o;
657      nopts++;
658    }
659
660  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
661    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
662  else
663    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
664
665  for (; num_objs--; ptx_objs++)
666    {
667      /* cuLinkAddData's 'data' argument erroneously omits the const
668	 qualifier.  */
669      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
670      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
671	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
672			       (char *) ptx_objs->code, ptx_objs->size,
673			       0, 0, 0, 0);
674      else
675	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
676			       (char *) ptx_objs->code, ptx_objs->size,
677			       0, 0, 0, 0);
678      if (r != CUDA_SUCCESS)
679	{
680	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
681	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
682			     cuda_error (r));
683	  return false;
684	}
685    }
686
687  GOMP_PLUGIN_debug (0, "Linking\n");
688  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
689
690  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
691  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
692
693  if (r != CUDA_SUCCESS)
694    {
695      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
696      return false;
697    }
698
699  CUDA_CALL (cuModuleLoadData, module, linkout);
700  CUDA_CALL (cuLinkDestroy, linkstate);
701  return true;
702}
703
704static void
705nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
706	    unsigned *dims, void *targ_mem_desc,
707	    CUdeviceptr dp, CUstream stream)
708{
709  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
710  CUfunction function;
711  int i;
712  void *kargs[1];
713  struct nvptx_thread *nvthd = nvptx_thread ();
714  int warp_size = nvthd->ptx_dev->warp_size;
715
716  function = targ_fn->fn;
717
718  /* Initialize the launch dimensions.  Typically this is constant,
719     provided by the device compiler, but we must permit runtime
720     values.  */
721  int seen_zero = 0;
722  for (i = 0; i != GOMP_DIM_MAX; i++)
723    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
728    }
729
730  if (seen_zero)
731    {
732      pthread_mutex_lock (&ptx_dev_lock);
733
734      static int gomp_openacc_dims[GOMP_DIM_MAX];
735      if (!gomp_openacc_dims[0])
736	{
737	  /* See if the user provided GOMP_OPENACC_DIM environment
738	     variable to specify runtime defaults.  */
739	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
740	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
741	}
742
743      if (!nvthd->ptx_dev->default_dims[0])
744	{
745	  int default_dims[GOMP_DIM_MAX];
746	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
747	    default_dims[i] = gomp_openacc_dims[i];
748
749	  int gang, worker, vector;
750	  {
751	    int block_size = nvthd->ptx_dev->max_threads_per_block;
752	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
753	    int dev_size = nvthd->ptx_dev->num_sms;
754	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
755			       " dev_size=%d, cpu_size=%d\n",
756			       warp_size, block_size, dev_size, cpu_size);
757
758	    gang = (cpu_size / block_size) * dev_size;
759	    worker = block_size / warp_size;
760	    vector = warp_size;
761	  }
762
763	  /* There is no upper bound on the gang size.  The best size
764	     matches the hardware configuration.  Logical gangs are
765	     scheduled onto physical hardware.  To maximize usage, we
766	     should guess a large number.  */
767	  if (default_dims[GOMP_DIM_GANG] < 1)
768	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
769	  /* The worker size must not exceed the hardware.  */
770	  if (default_dims[GOMP_DIM_WORKER] < 1
771	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
772	    default_dims[GOMP_DIM_WORKER] = worker;
773	  /* The vector size must exactly match the hardware.  */
774	  if (default_dims[GOMP_DIM_VECTOR] < 1
775	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
776	    default_dims[GOMP_DIM_VECTOR] = vector;
777
778	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
779			     default_dims[GOMP_DIM_GANG],
780			     default_dims[GOMP_DIM_WORKER],
781			     default_dims[GOMP_DIM_VECTOR]);
782
783	  for (i = 0; i != GOMP_DIM_MAX; i++)
784	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
785	}
786      pthread_mutex_unlock (&ptx_dev_lock);
787
788      {
789	bool default_dim_p[GOMP_DIM_MAX];
790	for (i = 0; i != GOMP_DIM_MAX; i++)
791	  default_dim_p[i] = !dims[i];
792
793	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
794	  {
795	    for (i = 0; i != GOMP_DIM_MAX; i++)
796	      if (default_dim_p[i])
797		dims[i] = nvthd->ptx_dev->default_dims[i];
798
799	    if (default_dim_p[GOMP_DIM_VECTOR])
800	      dims[GOMP_DIM_VECTOR]
801		= MIN (dims[GOMP_DIM_VECTOR],
802		       (targ_fn->max_threads_per_block / warp_size
803			* warp_size));
804
805	    if (default_dim_p[GOMP_DIM_WORKER])
806	      dims[GOMP_DIM_WORKER]
807		= MIN (dims[GOMP_DIM_WORKER],
808		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
809	  }
810	else
811	  {
812	    /* Handle the case that the compiler allows the runtime to choose
813	       the vector-length conservatively, by ignoring
814	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
815	       it.  */
816	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
820	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
821	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
822	    int grids, blocks;
823
824	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
825			      &blocks, function, NULL, 0,
826			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
827	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
828			       "grid = %d, block = %d\n", grids, blocks);
829
	    /* Keep num_gangs proportional to the block size.  In the
	       case where the block size is limited by shared-memory or
	       register-file capacity, this keeps the runtime from
	       over-assigning gangs to the multiprocessor units, whose
	       state would otherwise be swapped out more than necessary.
	       The constant factor 2 is there to prevent threads from
	       idling when there is insufficient work for them.  */
838	    if (gangs == 0)
839	      gangs = 2 * grids * (blocks / warp_size);
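	    /* Illustrative arithmetic only (the actual numbers come from
	       cuOccupancyMaxPotentialBlockSize): with grids = 40,
	       blocks = 256 and warp_size = 32, this picks
	       gangs = 2 * 40 * (256 / 32) = 640.  */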
840
841	    if (vectors == 0)
842	      vectors = warp_size;
843
844	    if (workers == 0)
845	      {
846		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
847				      ? vectors
848				      : dims[GOMP_DIM_VECTOR]);
849		workers = blocks / actual_vectors;
850		workers = MAX (workers, 1);
		/* If we need a per-worker barrier (i.e. vectors spanning
		   more than one warp) ...  */
852		if (actual_vectors > 32)
853		  /* Don't use more barriers than available.  */
854		  workers = MIN (workers, 15);
855	      }
856
857	    for (i = 0; i != GOMP_DIM_MAX; i++)
858	      if (default_dim_p[i])
859		switch (i)
860		  {
861		  case GOMP_DIM_GANG: dims[i] = gangs; break;
862		  case GOMP_DIM_WORKER: dims[i] = workers; break;
863		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
864		  default: GOMP_PLUGIN_fatal ("invalid dim");
865		  }
866	  }
867      }
868    }
869
870  /* Check if the accelerator has sufficient hardware resources to
871     launch the offloaded kernel.  */
872  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
873      > targ_fn->max_threads_per_block)
874    {
875      const char *msg
876	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
877	   " with num_workers = %d and vector_length = %d"
878	   "; "
879	   "recompile the program with 'num_workers = x and vector_length = y'"
880	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
881	   " x * y <= %d"
882	   ".\n");
883      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
884			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
885    }
886
887  /* Check if the accelerator has sufficient barrier resources to
888     launch the offloaded kernel.  */
889  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
890    {
891      const char *msg
892	= ("The Nvidia accelerator has insufficient barrier resources to launch"
893	   " '%s' with num_workers = %d and vector_length = %d"
894	   "; "
895	   "recompile the program with 'num_workers = x' on that offloaded"
896	   " region or '-fopenacc-dim=:x:' where x <= 15"
897	   "; "
898	   "or, recompile the program with 'vector_length = 32' on that"
899	   " offloaded region or '-fopenacc-dim=::32'"
900	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
903    }
904
905  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
906		     " gangs=%u, workers=%u, vectors=%u\n",
907		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
908		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
909
910  // OpenACC		CUDA
911  //
912  // num_gangs		nctaid.x
913  // num_workers	ntid.y
914  // vector length	ntid.x
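  //
  // For instance (illustrative numbers only), gangs=32, workers=4,
  // vectors=32 launches a 32x1x1 grid of 32x4x1 thread blocks, i.e.
  // 128 threads per block, via the cuLaunchKernel call below.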
915
916  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
917  acc_prof_info *prof_info = thr->prof_info;
918  acc_event_info enqueue_launch_event_info;
919  acc_api_info *api_info = thr->api_info;
920  bool profiling_p = __builtin_expect (prof_info != NULL, false);
921  if (profiling_p)
922    {
923      prof_info->event_type = acc_ev_enqueue_launch_start;
924
925      enqueue_launch_event_info.launch_event.event_type
926	= prof_info->event_type;
927      enqueue_launch_event_info.launch_event.valid_bytes
928	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
929      enqueue_launch_event_info.launch_event.parent_construct
930	= acc_construct_parallel;
931      enqueue_launch_event_info.launch_event.implicit = 1;
932      enqueue_launch_event_info.launch_event.tool_info = NULL;
933      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
934      enqueue_launch_event_info.launch_event.num_gangs
935	= dims[GOMP_DIM_GANG];
936      enqueue_launch_event_info.launch_event.num_workers
937	= dims[GOMP_DIM_WORKER];
938      enqueue_launch_event_info.launch_event.vector_length
939	= dims[GOMP_DIM_VECTOR];
940
941      api_info->device_api = acc_device_api_cuda;
942
943      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
944					    api_info);
945    }
946
947  kargs[0] = &dp;
948  CUDA_CALL_ASSERT (cuLaunchKernel, function,
949		    dims[GOMP_DIM_GANG], 1, 1,
950		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
951		    0, stream, kargs, 0);
952
953  if (profiling_p)
954    {
955      prof_info->event_type = acc_ev_enqueue_launch_end;
956      enqueue_launch_event_info.launch_event.event_type
957	= prof_info->event_type;
958      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
959					    api_info);
960    }
961
962  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
963		     targ_fn->launch->fn);
964}
965
966void * openacc_get_current_cuda_context (void);
967
968static void
969goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
970{
971  acc_prof_info *prof_info = thr->prof_info;
972  acc_event_info data_event_info;
973  acc_api_info *api_info = thr->api_info;
974
975  prof_info->event_type = acc_ev_alloc;
976
977  data_event_info.data_event.event_type = prof_info->event_type;
978  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
979  data_event_info.data_event.parent_construct = acc_construct_parallel;
980  data_event_info.data_event.implicit = 1;
981  data_event_info.data_event.tool_info = NULL;
982  data_event_info.data_event.var_name = NULL;
983  data_event_info.data_event.bytes = s;
984  data_event_info.data_event.host_ptr = NULL;
985  data_event_info.data_event.device_ptr = dp;
986
987  api_info->device_api = acc_device_api_cuda;
988
989  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
990}
991
992static void *
993nvptx_alloc (size_t s)
994{
995  CUdeviceptr d;
996
997  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
998  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
999  bool profiling_p
1000    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1001  if (profiling_p)
1002    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1003
1004  return (void *) d;
1005}
1006
1007static void
1008goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1009{
1010  acc_prof_info *prof_info = thr->prof_info;
1011  acc_event_info data_event_info;
1012  acc_api_info *api_info = thr->api_info;
1013
1014  prof_info->event_type = acc_ev_free;
1015
1016  data_event_info.data_event.event_type = prof_info->event_type;
1017  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1018  data_event_info.data_event.parent_construct = acc_construct_parallel;
1019  data_event_info.data_event.implicit = 1;
1020  data_event_info.data_event.tool_info = NULL;
1021  data_event_info.data_event.var_name = NULL;
1022  data_event_info.data_event.bytes = -1;
1023  data_event_info.data_event.host_ptr = NULL;
1024  data_event_info.data_event.device_ptr = p;
1025
1026  api_info->device_api = acc_device_api_cuda;
1027
1028  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1029}
1030
1031static bool
1032nvptx_free (void *p, struct ptx_device *ptx_dev)
1033{
1034  CUdeviceptr pb;
1035  size_t ps;
1036
1037  CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1038				  (CUdeviceptr) p);
1039  if (r == CUDA_ERROR_NOT_PERMITTED)
1040    {
      /* We assume that this error indicates we are in a CUDA callback context,
	 where no CUDA calls are allowed (see the cuStreamAddCallback
	 documentation for details).  Arrange to free this piece of device
	 memory later.  */
1045      struct ptx_free_block *n
1046	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1047      n->ptr = p;
1048      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1049      n->next = ptx_dev->free_blocks;
1050      ptx_dev->free_blocks = n;
1051      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1052      return true;
1053    }
1054  else if (r != CUDA_SUCCESS)
1055    {
1056      GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1057      return false;
1058    }
1059  if ((CUdeviceptr) p != pb)
1060    {
1061      GOMP_PLUGIN_error ("invalid device address");
1062      return false;
1063    }
1064
1065  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1066  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1067  bool profiling_p
1068    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1069  if (profiling_p)
1070    goacc_profiling_acc_ev_free (thr, p);
1071
1072  return true;
1073}
1074
1075static void *
1076nvptx_get_current_cuda_device (void)
1077{
1078  struct nvptx_thread *nvthd = nvptx_thread ();
1079
1080  if (!nvthd || !nvthd->ptx_dev)
1081    return NULL;
1082
1083  return &nvthd->ptx_dev->dev;
1084}
1085
1086static void *
1087nvptx_get_current_cuda_context (void)
1088{
1089  struct nvptx_thread *nvthd = nvptx_thread ();
1090
1091  if (!nvthd || !nvthd->ptx_dev)
1092    return NULL;
1093
1094  return nvthd->ptx_dev->ctx;
1095}
1096
1097/* Plugin entry points.  */
1098
1099const char *
1100GOMP_OFFLOAD_get_name (void)
1101{
1102  return "nvptx";
1103}
1104
1105unsigned int
1106GOMP_OFFLOAD_get_caps (void)
1107{
1108  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1109}
1110
1111int
1112GOMP_OFFLOAD_get_type (void)
1113{
1114  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1115}
1116
1117int
1118GOMP_OFFLOAD_get_num_devices (void)
1119{
1120  return nvptx_get_num_devices ();
1121}
1122
1123bool
1124GOMP_OFFLOAD_init_device (int n)
1125{
1126  struct ptx_device *dev;
1127
1128  pthread_mutex_lock (&ptx_dev_lock);
1129
1130  if (!nvptx_init () || ptx_devices[n] != NULL)
1131    {
1132      pthread_mutex_unlock (&ptx_dev_lock);
1133      return false;
1134    }
1135
1136  dev = nvptx_open_device (n);
1137  if (dev)
1138    {
1139      ptx_devices[n] = dev;
1140      instantiated_devices++;
1141    }
1142
1143  pthread_mutex_unlock (&ptx_dev_lock);
1144
1145  return dev != NULL;
1146}
1147
1148bool
1149GOMP_OFFLOAD_fini_device (int n)
1150{
1151  pthread_mutex_lock (&ptx_dev_lock);
1152
1153  if (ptx_devices[n] != NULL)
1154    {
1155      if (!nvptx_attach_host_thread_to_device (n)
1156	  || !nvptx_close_device (ptx_devices[n]))
1157	{
1158	  pthread_mutex_unlock (&ptx_dev_lock);
1159	  return false;
1160	}
1161      ptx_devices[n] = NULL;
1162      instantiated_devices--;
1163    }
1164
1165  if (instantiated_devices == 0)
1166    {
1167      free (ptx_devices);
1168      ptx_devices = NULL;
1169    }
1170
1171  pthread_mutex_unlock (&ptx_dev_lock);
1172  return true;
1173}
1174
1175/* Return the libgomp version number we're compatible with.  There is
1176   no requirement for cross-version compatibility.  */
1177
1178unsigned
1179GOMP_OFFLOAD_version (void)
1180{
1181  return GOMP_VERSION;
1182}
1183
1184/* Initialize __nvptx_clocktick, if present in MODULE.  */
1185
1186static void
1187nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1188{
1189  CUdeviceptr dptr;
1190  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1191				  module, "__nvptx_clocktick");
1192  if (r == CUDA_ERROR_NOT_FOUND)
1193    return;
1194  if (r != CUDA_SUCCESS)
1195    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1196  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1197  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1198			 sizeof (__nvptx_clocktick));
1199  if (r != CUDA_SUCCESS)
1200    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1201}
1202
1203/* Load the (partial) program described by TARGET_DATA to device
1204   number ORD.  Allocate and return TARGET_TABLE.  */
1205
1206int
1207GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1208			 struct addr_pair **target_table)
1209{
1210  CUmodule module;
1211  const char *const *var_names;
1212  const struct targ_fn_launch *fn_descs;
1213  unsigned int fn_entries, var_entries, i, j;
1214  struct targ_fn_descriptor *targ_fns;
1215  struct addr_pair *targ_tbl;
1216  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1217  struct ptx_image_data *new_image;
1218  struct ptx_device *dev;
1219
1220  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1221    {
1222      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1223			 " (expected %u, received %u)",
1224			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1225      return -1;
1226    }
1227
1228  if (!nvptx_attach_host_thread_to_device (ord)
1229      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1230    return -1;
1231
1232  dev = ptx_devices[ord];
1233
  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     array of function addresses form a one-to-one correspondence.  */
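  /* Sketch of the resulting TARGET_TABLE layout (not a separate data
     structure, just how the loops below fill TARG_TBL): entries 0 ..
     fn_entries-1 point at the targ_fn_descriptor for each kernel, with
     end = start + 1, and entries fn_entries .. fn_entries+var_entries-1
     carry the device address range [start, end) of each global variable.  */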
1237
1238  var_entries = img_header->var_num;
1239  var_names = img_header->var_names;
1240  fn_entries = img_header->fn_num;
1241  fn_descs = img_header->fn_descs;
1242
1243  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1244				 * (fn_entries + var_entries));
1245  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1246				 * fn_entries);
1247
1248  *target_table = targ_tbl;
1249
1250  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1251  new_image->target_data = target_data;
1252  new_image->module = module;
1253  new_image->fns = targ_fns;
1254
1255  pthread_mutex_lock (&dev->image_lock);
1256  new_image->next = dev->images;
1257  dev->images = new_image;
1258  pthread_mutex_unlock (&dev->image_lock);
1259
1260  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1261    {
1262      CUfunction function;
1263      int nregs, mthrs;
1264
1265      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1266		      fn_descs[i].fn);
1267      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1268		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1269      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1270		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1271
1272      targ_fns->fn = function;
1273      targ_fns->launch = &fn_descs[i];
1274      targ_fns->regs_per_thread = nregs;
1275      targ_fns->max_threads_per_block = mthrs;
1276
1277      targ_tbl->start = (uintptr_t) targ_fns;
1278      targ_tbl->end = targ_tbl->start + 1;
1279    }
1280
1281  for (j = 0; j < var_entries; j++, targ_tbl++)
1282    {
1283      CUdeviceptr var;
1284      size_t bytes;
1285
1286      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1287		      &var, &bytes, module, var_names[j]);
1288
1289      targ_tbl->start = (uintptr_t) var;
1290      targ_tbl->end = targ_tbl->start + bytes;
1291    }
1292
1293  nvptx_set_clocktick (module, dev);
1294
1295  return fn_entries + var_entries;
1296}
1297
/* Unload the program described by TARGET_DATA, freeing the function
   descriptors allocated by GOMP_OFFLOAD_load_image.  */
1300
1301bool
1302GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1303{
1304  struct ptx_image_data *image, **prev_p;
1305  struct ptx_device *dev = ptx_devices[ord];
1306
1307  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1308    {
1309      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1310			 " (expected %u, received %u)",
1311			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1312      return false;
1313    }
1314
1315  bool ret = true;
1316  pthread_mutex_lock (&dev->image_lock);
1317  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1318    if (image->target_data == target_data)
1319      {
1320	*prev_p = image->next;
1321	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1322	  ret = false;
1323	free (image->fns);
1324	free (image);
1325	break;
1326      }
1327  pthread_mutex_unlock (&dev->image_lock);
1328  return ret;
1329}
1330
1331void *
1332GOMP_OFFLOAD_alloc (int ord, size_t size)
1333{
1334  if (!nvptx_attach_host_thread_to_device (ord))
1335    return NULL;
1336
1337  struct ptx_device *ptx_dev = ptx_devices[ord];
1338  struct ptx_free_block *blocks, *tmp;
1339
1340  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1341  blocks = ptx_dev->free_blocks;
1342  ptx_dev->free_blocks = NULL;
1343  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1344
1345  while (blocks)
1346    {
1347      tmp = blocks->next;
1348      nvptx_free (blocks->ptr, ptx_dev);
1349      free (blocks);
1350      blocks = tmp;
1351    }
1352
1353  return nvptx_alloc (size);
1354}
1355
1356bool
1357GOMP_OFFLOAD_free (int ord, void *ptr)
1358{
1359  return (nvptx_attach_host_thread_to_device (ord)
1360	  && nvptx_free (ptr, ptx_devices[ord]));
1361}
1362
1363void
1364GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1365			   void **hostaddrs, void **devaddrs,
1366			   unsigned *dims, void *targ_mem_desc)
1367{
1368  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1369
1370  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1371  acc_prof_info *prof_info = thr->prof_info;
1372  acc_event_info data_event_info;
1373  acc_api_info *api_info = thr->api_info;
1374  bool profiling_p = __builtin_expect (prof_info != NULL, false);
1375
1376  void **hp = NULL;
1377  CUdeviceptr dp = 0;
1378
1379  if (mapnum > 0)
1380    {
1381      size_t s = mapnum * sizeof (void *);
1382      hp = alloca (s);
1383      for (int i = 0; i < mapnum; i++)
1384	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1385      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1386      if (profiling_p)
1387	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1388    }
1389
1390  /* Copy the (device) pointers to arguments to the device (dp and hp might in
1391     fact have the same value on a unified-memory system).  */
1392  if (mapnum > 0)
1393    {
1394      if (profiling_p)
1395	{
1396	  prof_info->event_type = acc_ev_enqueue_upload_start;
1397
1398	  data_event_info.data_event.event_type = prof_info->event_type;
1399	  data_event_info.data_event.valid_bytes
1400	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1401	  data_event_info.data_event.parent_construct
1402	    = acc_construct_parallel;
1403	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
1404	  data_event_info.data_event.tool_info = NULL;
1405	  data_event_info.data_event.var_name = NULL;
1406	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
1407	  data_event_info.data_event.host_ptr = hp;
1408	  data_event_info.data_event.device_ptr = (const void *) dp;
1409
1410	  api_info->device_api = acc_device_api_cuda;
1411
1412	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1413						api_info);
1414	}
1415      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1416			mapnum * sizeof (void *));
1417      if (profiling_p)
1418	{
1419	  prof_info->event_type = acc_ev_enqueue_upload_end;
1420	  data_event_info.data_event.event_type = prof_info->event_type;
1421	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1422						api_info);
1423	}
1424    }
1425
1426  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1427	      dp, NULL);
1428
1429  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1430  const char *maybe_abort_msg = "(perhaps abort was called)";
1431  if (r == CUDA_ERROR_LAUNCH_FAILED)
1432    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1433		       maybe_abort_msg);
1434  else if (r != CUDA_SUCCESS)
1435    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1436
1437  CUDA_CALL_ASSERT (cuMemFree, dp);
1438  if (profiling_p)
1439    goacc_profiling_acc_ev_free (thr, (void *) dp);
1440}
1441
1442static void
1443cuda_free_argmem (void *ptr)
1444{
1445  void **block = (void **) ptr;
1446  nvptx_free (block[0], (struct ptx_device *) block[1]);
1447  free (block);
1448}
1449
1450void
1451GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1452				 void **hostaddrs, void **devaddrs,
1453				 unsigned *dims, void *targ_mem_desc,
1454				 struct goacc_asyncqueue *aq)
1455{
1456  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1457
1458  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1459  acc_prof_info *prof_info = thr->prof_info;
1460  acc_event_info data_event_info;
1461  acc_api_info *api_info = thr->api_info;
1462  bool profiling_p = __builtin_expect (prof_info != NULL, false);
1463
1464  void **hp = NULL;
1465  CUdeviceptr dp = 0;
1466  void **block = NULL;
1467
1468  if (mapnum > 0)
1469    {
1470      size_t s = mapnum * sizeof (void *);
1471      block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1472      hp = block + 2;
1473      for (int i = 0; i < mapnum; i++)
1474	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1475      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1476      if (profiling_p)
1477	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1478    }
1479
1480  /* Copy the (device) pointers to arguments to the device (dp and hp might in
1481     fact have the same value on a unified-memory system).  */
1482  if (mapnum > 0)
1483    {
1484      if (profiling_p)
1485	{
1486	  prof_info->event_type = acc_ev_enqueue_upload_start;
1487
1488	  data_event_info.data_event.event_type = prof_info->event_type;
1489	  data_event_info.data_event.valid_bytes
1490	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1491	  data_event_info.data_event.parent_construct
1492	    = acc_construct_parallel;
1493	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
1494	  data_event_info.data_event.tool_info = NULL;
1495	  data_event_info.data_event.var_name = NULL;
1496	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
1497	  data_event_info.data_event.host_ptr = hp;
1498	  data_event_info.data_event.device_ptr = (const void *) dp;
1499
1500	  api_info->device_api = acc_device_api_cuda;
1501
1502	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1503						api_info);
1504	}
1505
1506      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1507			mapnum * sizeof (void *), aq->cuda_stream);
1508      block[0] = (void *) dp;
1509
1510      struct nvptx_thread *nvthd =
1511	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1512      block[1] = (void *) nvthd->ptx_dev;
1513
1514      if (profiling_p)
1515	{
1516	  prof_info->event_type = acc_ev_enqueue_upload_end;
1517	  data_event_info.data_event.event_type = prof_info->event_type;
1518	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1519						api_info);
1520	}
1521    }
1522
1523  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1524	      dp, aq->cuda_stream);
1525
1526  if (mapnum > 0)
1527    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1528}
1529
1530void *
1531GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1532{
1533  struct ptx_device *ptx_dev;
1534  struct nvptx_thread *nvthd
1535    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1536  CUcontext thd_ctx;
1537
1538  ptx_dev = ptx_devices[ord];
1539
1540  assert (ptx_dev);
1541
1542  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1543
1544  assert (ptx_dev->ctx);
1545
1546  if (!thd_ctx)
1547    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1548
1549  nvthd->ptx_dev = ptx_dev;
1550
1551  return (void *) nvthd;
1552}
1553
1554void
1555GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1556{
1557  free (data);
1558}
1559
1560void *
1561GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1562{
1563  return nvptx_get_current_cuda_device ();
1564}
1565
1566void *
1567GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1568{
1569  return nvptx_get_current_cuda_context ();
1570}
1571
1572/* This returns a CUstream.  */
1573void *
1574GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1575{
1576  return (void *) aq->cuda_stream;
1577}
1578
1579/* This takes a CUstream.  */
1580int
1581GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1582{
1583  if (aq->cuda_stream)
1584    {
1585      CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1586      CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1587    }
1588
1589  aq->cuda_stream = (CUstream) stream;
1590  return 1;
1591}
1592
1593struct goacc_asyncqueue *
1594GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1595{
1596  CUstream stream = NULL;
1597  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1598
1599  struct goacc_asyncqueue *aq
1600    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1601  aq->cuda_stream = stream;
1602  return aq;
1603}
1604
1605bool
1606GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1607{
1608  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1609  free (aq);
1610  return true;
1611}
1612
1613int
1614GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1615{
1616  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1617  if (r == CUDA_SUCCESS)
1618    return 1;
1619  if (r == CUDA_ERROR_NOT_READY)
1620    return 0;
1621
1622  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1623  return -1;
1624}
1625
1626bool
1627GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1628{
1629  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1630  return true;
1631}
1632
1633bool
1634GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1635				      struct goacc_asyncqueue *aq2)
1636{
1637  CUevent e;
1638  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1639  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1640  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1641  return true;
1642}
1643
1644static void
1645cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1646{
1647  if (res != CUDA_SUCCESS)
1648    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1649  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1650  cb->fn (cb->ptr);
1651  free (ptr);
1652}
1653
1654void
1655GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1656					   void (*callback_fn)(void *),
1657					   void *userptr)
1658{
1659  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1660  b->fn = callback_fn;
1661  b->ptr = userptr;
1662  b->aq = aq;
1663  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1664		    cuda_callback_wrapper, (void *) b, 0);
1665}
1666
1667static bool
1668cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1669{
1670  CUdeviceptr pb;
1671  size_t ps;
1672  if (!s)
1673    return true;
1674  if (!d)
1675    {
1676      GOMP_PLUGIN_error ("invalid device address");
1677      return false;
1678    }
1679  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1680  if (!pb)
1681    {
1682      GOMP_PLUGIN_error ("invalid device address");
1683      return false;
1684    }
1685  if (!h)
1686    {
1687      GOMP_PLUGIN_error ("invalid host address");
1688      return false;
1689    }
1690  if (d == h)
1691    {
1692      GOMP_PLUGIN_error ("invalid host or device address");
1693      return false;
1694    }
1695  if ((void *)(d + s) > (void *)(pb + ps))
1696    {
1697      GOMP_PLUGIN_error ("invalid size");
1698      return false;
1699    }
1700  return true;
1701}
1702
1703bool
1704GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1705{
1706  if (!nvptx_attach_host_thread_to_device (ord)
1707      || !cuda_memcpy_sanity_check (src, dst, n))
1708    return false;
1709  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1710  return true;
1711}
1712
1713bool
1714GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1715{
1716  if (!nvptx_attach_host_thread_to_device (ord)
1717      || !cuda_memcpy_sanity_check (dst, src, n))
1718    return false;
1719  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1720  return true;
1721}
1722
1723bool
1724GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1725{
1726  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1727  return true;
1728}
1729
1730bool
1731GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1732				     size_t n, struct goacc_asyncqueue *aq)
1733{
1734  if (!nvptx_attach_host_thread_to_device (ord)
1735      || !cuda_memcpy_sanity_check (src, dst, n))
1736    return false;
1737  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1738  return true;
1739}
1740
1741bool
1742GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1743				     size_t n, struct goacc_asyncqueue *aq)
1744{
1745  if (!nvptx_attach_host_thread_to_device (ord)
1746      || !cuda_memcpy_sanity_check (dst, src, n))
1747    return false;
1748  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1749  return true;
1750}
1751
1752union goacc_property_value
1753GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1754{
1755  union goacc_property_value propval = { .val = 0 };
1756
1757  pthread_mutex_lock (&ptx_dev_lock);
1758
1759  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1760    {
1761      pthread_mutex_unlock (&ptx_dev_lock);
1762      return propval;
1763    }
1764
1765  struct ptx_device *ptx_dev = ptx_devices[n];
1766  switch (prop)
1767    {
1768    case GOACC_PROPERTY_MEMORY:
1769      {
1770	size_t total_mem;
1771
1772	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1773	propval.val = total_mem;
1774      }
1775      break;
1776    case GOACC_PROPERTY_FREE_MEMORY:
1777      {
1778	size_t total_mem;
1779	size_t free_mem;
1780	CUdevice ctxdev;
1781
1782	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1783	if (ptx_dev->dev == ctxdev)
1784	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1785	else if (ptx_dev->ctx)
1786	  {
1787	    CUcontext old_ctx;
1788
1789	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1790	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1791	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1792	  }
1793	else
1794	  {
1795	    CUcontext new_ctx;
1796
1797	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1798			    ptx_dev->dev);
1799	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1800	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1801	  }
1802	propval.val = free_mem;
1803      }
1804      break;
1805    case GOACC_PROPERTY_NAME:
1806      propval.ptr = ptx_dev->name;
1807      break;
1808    case GOACC_PROPERTY_VENDOR:
1809      propval.ptr = "Nvidia";
1810      break;
1811    case GOACC_PROPERTY_DRIVER:
1812      propval.ptr = cuda_driver_version_s;
1813      break;
1814    default:
1815      break;
1816    }
1817
1818  pthread_mutex_unlock (&ptx_dev_lock);
1819  return propval;
1820}
1821
/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that the number of warps does not exceed CUDA limits or GCC's
   own limits.  */
1825
1826static void
1827nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1828			    struct ptx_device *ptx_dev,
1829			    int *teams_p, int *threads_p)
1830{
1831  int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     backend and libgcc, which matches the documented limit of all GPUs as of
     2015.  */
1834  if (max_warps_block > 32)
1835    max_warps_block = 32;
1836  if (*threads_p <= 0)
1837    *threads_p = 8;
1838  if (*threads_p > max_warps_block)
1839    *threads_p = max_warps_block;
1840
1841  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     The actual limit, which may be lower, can be queried via the "occupancy
     control" driver interface (available since CUDA 6.0).  */
1845  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1846  if (*teams_p <= 0 || *teams_p > max_blocks)
1847    *teams_p = max_blocks;
1848}
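
/* Worked example with illustrative numbers only: for a kernel with
   regs_per_thread = 32 on a device with regs_per_sm = 65536, num_sms = 20
   and *threads_p = 8 warps, regs_per_block = 32 * 32 * 8 = 8192, so
   max_blocks = (65536 / 8192) * 20 = 160 teams by default.  */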
1849
1850/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1851   target regions.  */
1852
1853static size_t
1854nvptx_stacks_size ()
1855{
1856  return 128 * 1024;
1857}
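
/* For scale (illustrative only): GOMP_OFFLOAD_run below allocates one such
   stack per launched warp, so e.g. teams = 160 and threads = 8 warps imply
   160 * 8 * 128 KiB = 160 MiB of device memory for soft stacks.  */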
1858
1859/* Return contiguous storage for NUM stacks, each SIZE bytes.  */
1860
1861static void *
1862nvptx_stacks_alloc (size_t size, int num)
1863{
1864  CUdeviceptr stacks;
1865  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
1866  if (r != CUDA_SUCCESS)
1867    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1868  return (void *) stacks;
1869}
1870
1871/* Release storage previously allocated by nvptx_stacks_alloc.  */
1872
1873static void
1874nvptx_stacks_free (void *p, int num)
1875{
1876  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
1877  if (r != CUDA_SUCCESS)
1878    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1879}
1880
1881void
1882GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1883{
1884  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
1885  CUresult r;
1886  struct ptx_device *ptx_dev = ptx_devices[ord];
1887  const char *maybe_abort_msg = "(perhaps abort was called)";
1888  int teams = 0, threads = 0;
1889
1890  if (!args)
1891    GOMP_PLUGIN_fatal ("No target arguments provided");
1892  while (*args)
1893    {
1894      intptr_t id = (intptr_t) *args++, val;
1895      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1896	val = (intptr_t) *args++;
1897      else
1898        val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1899      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1900	continue;
1901      val = val > INT_MAX ? INT_MAX : val;
1902      id &= GOMP_TARGET_ARG_ID_MASK;
1903      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
1904	teams = val;
1905      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
1906	threads = val;
1907    }
1908  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
1909
1910  size_t stack_size = nvptx_stacks_size ();
1911  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
1912  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
1913  size_t fn_args_size = sizeof fn_args;
1914  void *config[] = {
1915    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
1916    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
1917    CU_LAUNCH_PARAM_END
1918  };
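  /* Note: the kernel arguments are handed to cuLaunchKernel through its
     "extra" parameter (CU_LAUNCH_PARAM_BUFFER_POINTER/SIZE) rather than the
     kernelParams array, so fn_args is copied as one raw parameter buffer
     containing the pointer to the variables block, the stacks pointer and
     the per-warp stack size.  */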
1919  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
1920			 32, threads, 1, 0, NULL, NULL, config);
1921  if (r != CUDA_SUCCESS)
1922    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
1923
1924  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1925  if (r == CUDA_ERROR_LAUNCH_FAILED)
1926    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1927		       maybe_abort_msg);
1928  else if (r != CUDA_SUCCESS)
1929    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1930  nvptx_stacks_free (stacks, teams * threads);
1931}
1932
1933/* TODO: Implement GOMP_OFFLOAD_async_run. */
1934