1/* { dg-do run { target openacc_nvidia_accel_selected } } */
2/* { dg-additional-options "-lcuda -lcublas -lcudart" } */
3
4#include <stdio.h>
5#include <stdlib.h>
6#include <cuda.h>
7#include <cuda_runtime_api.h>
8#include <cublas_v2.h>
9#include <openacc.h>
10
11void
12saxpy (int n, float a, float *x, float *y)
13{
14    int i;
15
16    for (i = 0; i < n; i++)
17    {
18        y[i] = a * x[i] + y[i];
19    }
20}
21
22void
23context_check (CUcontext ctx1)
24{
25    CUcontext ctx2, ctx3;
26    CUresult r;
27
28    r = cuCtxGetCurrent (&ctx2);
29    if (r != CUDA_SUCCESS)
30    {
31        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
32        exit (EXIT_FAILURE);
33    }
34
35    if (ctx1 != ctx2)
36    {
37        fprintf (stderr, "new context established\n");
38        exit (EXIT_FAILURE);
39    }
40
41    ctx3 = (CUcontext) acc_get_current_cuda_context ();
42
43    if (ctx1 != ctx3)
44    {
45        fprintf (stderr, "acc_get_current_cuda_context returned wrong value\n");
46        exit (EXIT_FAILURE);
47    }
48
49    return;
50}
51
52int
53main (int argc, char **argv)
54{
55    cublasStatus_t s;
56    cublasHandle_t h;
57    CUcontext pctx;
58    CUresult r;
59    int i;
60    const int N = 256;
61    float *h_X, *h_Y1, *h_Y2;
62    float *d_X,*d_Y;
63    float alpha = 2.0f;
64    float error_norm;
65    float ref_norm;
66
67    /* Test 4 - OpenACC creates, cuBLAS shares.  */
68
69    acc_set_device_num (0, acc_device_nvidia);
70
71    r = cuCtxGetCurrent (&pctx);
72    if (r != CUDA_SUCCESS)
73    {
74        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
75        exit (EXIT_FAILURE);
76    }
77
78    h_X = (float *) malloc (N * sizeof (float));
79    if (h_X == 0)
80    {
81        fprintf (stderr, "malloc failed: for h_X\n");
82        exit (EXIT_FAILURE);
83    }
84
85    h_Y1 = (float *) malloc (N * sizeof (float));
86    if (h_Y1 == 0)
87    {
88        fprintf (stderr, "malloc failed: for h_Y1\n");
89        exit (EXIT_FAILURE);
90    }
91
92    h_Y2 = (float *) malloc (N * sizeof (float));
93    if (h_Y2 == 0)
94    {
95        fprintf (stderr, "malloc failed: for h_Y2\n");
96        exit (EXIT_FAILURE);
97    }
98
99    for (i = 0; i < N; i++)
100    {
101        h_X[i] = rand () / (float) RAND_MAX;
102        h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX;
103    }
104
105#pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha)
106    {
107        int i;
108
109        for (i = 0; i < N; i++)
110        {
111            h_Y2[i] = alpha * h_X[i] + h_Y2[i];
112        }
113    }
114
115    r = cuCtxGetCurrent (&pctx);
116    if (r != CUDA_SUCCESS)
117    {
118        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
119        exit (EXIT_FAILURE);
120    }
121
122    d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float));
123    if (d_X == NULL)
124    {
125        fprintf (stderr, "copyin error h_Y1\n");
126        exit (EXIT_FAILURE);
127    }
128
129    d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float));
130    if (d_Y == NULL)
131    {
132        fprintf (stderr, "copyin error h_Y1\n");
133        exit (EXIT_FAILURE);
134    }
135
136    s = cublasCreate (&h);
137    if (s != CUBLAS_STATUS_SUCCESS)
138    {
139        fprintf (stderr, "cublasCreate failed: %d\n", s);
140        exit (EXIT_FAILURE);
141    }
142
143    context_check (pctx);
144
145    s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1);
146    if (s != CUBLAS_STATUS_SUCCESS)
147    {
148        fprintf (stderr, "cublasSaxpy failed: %d\n", s);
149        exit (EXIT_FAILURE);
150    }
151
152    context_check (pctx);
153
154    acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float));
155
156    context_check (pctx);
157
158    error_norm = 0;
159    ref_norm = 0;
160
161    for (i = 0; i < N; ++i)
162    {
163        float diff;
164
165        diff = h_Y1[i] - h_Y2[i];
166        error_norm += diff * diff;
167        ref_norm += h_Y2[i] * h_Y2[i];
168    }
169
170    error_norm = (float) sqrt ((double) error_norm);
171    ref_norm = (float) sqrt ((double) ref_norm);
172
173    if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) >= 1e-6f))
174    {
175        fprintf (stderr, "math error\n");
176        exit (EXIT_FAILURE);
177    }
178
179    free (h_X);
180    free (h_Y1);
181    free (h_Y2);
182
183    acc_free (d_X);
184    acc_free (d_Y);
185
186    context_check (pctx);
187
188    s = cublasDestroy (h);
189    if (s != CUBLAS_STATUS_SUCCESS)
190    {
191        fprintf (stderr, "cublasDestroy failed: %d\n", s);
192        exit (EXIT_FAILURE);
193    }
194
195    context_check (pctx);
196
197    acc_shutdown (acc_device_nvidia);
198
199    r = cuCtxGetCurrent (&pctx);
200    if (r != CUDA_SUCCESS)
201    {
202        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
203        exit (EXIT_FAILURE);
204    }
205
206    if (pctx)
207    {
208        fprintf (stderr, "Unexpected context\n");
209        exit (EXIT_FAILURE);
210    }
211
212    return EXIT_SUCCESS;
213}
214