1/* { dg-do run { target openacc_nvidia_accel_selected } } */
2/* { dg-additional-options "-lcuda" } */
3
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7#include <openacc.h>
8#include <cuda.h>
9
10int
11main (int argc, char **argv)
12{
13  CUdevice dev;
14  CUfunction delay2;
15  CUmodule module;
16  CUresult r;
17  int N;
18  int i;
19  CUstream *streams;
20  unsigned long **a, **d_a, *tid, ticks;
21  int nbytes;
22  void *kargs[3];
23  int clkrate;
24  int devnum, nprocs;
25
26  acc_init (acc_device_nvidia);
27
28  devnum = acc_get_device_num (acc_device_nvidia);
29
30  r = cuDeviceGet (&dev, devnum);
31  if (r != CUDA_SUCCESS)
32    {
33      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
34      abort ();
35    }
36
37  r =
38    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
39			  dev);
40  if (r != CUDA_SUCCESS)
41    {
42      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
43      abort ();
44    }
45
46  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
47  if (r != CUDA_SUCCESS)
48    {
49      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
50      abort ();
51    }
52
53  r = cuModuleLoad (&module, "subr.ptx");
54  if (r != CUDA_SUCCESS)
55    {
56      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
57      abort ();
58    }
59
60  r = cuModuleGetFunction (&delay2, module, "delay2");
61  if (r != CUDA_SUCCESS)
62    {
63      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
64      abort ();
65    }
66
67  nbytes = sizeof (int);
68
69  ticks = (unsigned long) (200.0 * clkrate);
70
71  N = nprocs;
72
73  streams = (CUstream *) malloc (N * sizeof (void *));
74
75  a = (unsigned long **) malloc (N * sizeof (unsigned long *));
76  d_a = (unsigned long **) malloc (N * sizeof (unsigned long *));
77  tid = (unsigned long *) malloc (N * sizeof (unsigned long));
78
79  for (i = 0; i < N; i++)
80    {
81      a[i] = (unsigned long *) malloc (sizeof (unsigned long));
82      *a[i] = N;
83      d_a[i] = (unsigned long *) acc_malloc (nbytes);
84      tid[i] = i;
85
86      acc_map_data (a[i], d_a[i], nbytes);
87
88      streams[i] = (CUstream) acc_get_cuda_stream (i);
89      if (streams[i] != NULL)
90        abort ();
91
92      r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
93      if (r != CUDA_SUCCESS)
94        {
95          fprintf (stderr, "cuStreamCreate failed: %d\n", r);
96          abort ();
97        }
98
99       if (!acc_set_cuda_stream (i, streams[i]))
100        abort ();
101    }
102
103  for (i = 0; i < N; i++)
104    {
105      kargs[0] = (void *) &d_a[i];
106      kargs[1] = (void *) &ticks;
107      kargs[2] = (void *) &tid[i];
108
109      r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
110      if (r != CUDA_SUCCESS)
111	{
112	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
113	  abort ();
114	}
115
116      ticks = (unsigned long) (50.0 * clkrate);
117    }
118
119  acc_wait_all_async (0);
120
121  for (i = 0; i < N; i++)
122    {
123      acc_copyout (a[i], nbytes);
124      if (*a[i] != i)
125	abort ();
126    }
127
128  free (streams);
129
130  for (i = 0; i < N; i++)
131    {
132      free (a[i]);
133    }
134
135  free (a);
136  free (d_a);
137  free (tid);
138
139  acc_shutdown (acc_device_nvidia);
140
141  exit (0);
142}
143
144/* { dg-output "" } */
145