1/* { dg-do run { target openacc_nvidia_accel_selected } } */ 2/* { dg-additional-options "-lcuda -lcublas -lcudart" } */ 3 4#include <stdio.h> 5#include <stdlib.h> 6#include <cuda.h> 7#include <cuda_runtime_api.h> 8#include <cublas_v2.h> 9#include <openacc.h> 10 11void 12saxpy (int n, float a, float *x, float *y) 13{ 14 int i; 15 16 for (i = 0; i < n; i++) 17 { 18 y[i] = a * x[i] + y[i]; 19 } 20} 21 22void 23context_check (CUcontext ctx1) 24{ 25 CUcontext ctx2, ctx3; 26 CUresult r; 27 28 r = cuCtxGetCurrent (&ctx2); 29 if (r != CUDA_SUCCESS) 30 { 31 fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); 32 exit (EXIT_FAILURE); 33 } 34 35 if (ctx1 != ctx2) 36 { 37 fprintf (stderr, "new context established\n"); 38 exit (EXIT_FAILURE); 39 } 40 41 ctx3 = (CUcontext) acc_get_current_cuda_context (); 42 43 if (ctx1 != ctx3) 44 { 45 fprintf (stderr, "acc_get_current_cuda_context returned wrong value\n"); 46 exit (EXIT_FAILURE); 47 } 48 49 return; 50} 51 52int 53main (int argc, char **argv) 54{ 55 cublasStatus_t s; 56 cublasHandle_t h; 57 CUcontext pctx; 58 CUresult r; 59 int i; 60 const int N = 256; 61 float *h_X, *h_Y1, *h_Y2; 62 float *d_X,*d_Y; 63 float alpha = 2.0f; 64 float error_norm; 65 float ref_norm; 66 67 /* Test 4 - OpenACC creates, cuBLAS shares. */ 68 69 acc_set_device_num (0, acc_device_nvidia); 70 71 r = cuCtxGetCurrent (&pctx); 72 if (r != CUDA_SUCCESS) 73 { 74 fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); 75 exit (EXIT_FAILURE); 76 } 77 78 h_X = (float *) malloc (N * sizeof (float)); 79 if (h_X == 0) 80 { 81 fprintf (stderr, "malloc failed: for h_X\n"); 82 exit (EXIT_FAILURE); 83 } 84 85 h_Y1 = (float *) malloc (N * sizeof (float)); 86 if (h_Y1 == 0) 87 { 88 fprintf (stderr, "malloc failed: for h_Y1\n"); 89 exit (EXIT_FAILURE); 90 } 91 92 h_Y2 = (float *) malloc (N * sizeof (float)); 93 if (h_Y2 == 0) 94 { 95 fprintf (stderr, "malloc failed: for h_Y2\n"); 96 exit (EXIT_FAILURE); 97 } 98 99 for (i = 0; i < N; i++) 100 { 101 h_X[i] = rand () / (float) RAND_MAX; 102 h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX; 103 } 104 105#pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha) 106 { 107 int i; 108 109 for (i = 0; i < N; i++) 110 { 111 h_Y2[i] = alpha * h_X[i] + h_Y2[i]; 112 } 113 } 114 115 r = cuCtxGetCurrent (&pctx); 116 if (r != CUDA_SUCCESS) 117 { 118 fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); 119 exit (EXIT_FAILURE); 120 } 121 122 d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float)); 123 if (d_X == NULL) 124 { 125 fprintf (stderr, "copyin error h_Y1\n"); 126 exit (EXIT_FAILURE); 127 } 128 129 d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float)); 130 if (d_Y == NULL) 131 { 132 fprintf (stderr, "copyin error h_Y1\n"); 133 exit (EXIT_FAILURE); 134 } 135 136 s = cublasCreate (&h); 137 if (s != CUBLAS_STATUS_SUCCESS) 138 { 139 fprintf (stderr, "cublasCreate failed: %d\n", s); 140 exit (EXIT_FAILURE); 141 } 142 143 context_check (pctx); 144 145 s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1); 146 if (s != CUBLAS_STATUS_SUCCESS) 147 { 148 fprintf (stderr, "cublasSaxpy failed: %d\n", s); 149 exit (EXIT_FAILURE); 150 } 151 152 context_check (pctx); 153 154 acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float)); 155 156 context_check (pctx); 157 158 error_norm = 0; 159 ref_norm = 0; 160 161 for (i = 0; i < N; ++i) 162 { 163 float diff; 164 165 diff = h_Y1[i] - h_Y2[i]; 166 error_norm += diff * diff; 167 ref_norm += h_Y2[i] * h_Y2[i]; 168 } 169 170 error_norm = (float) sqrt ((double) error_norm); 171 ref_norm = (float) sqrt ((double) ref_norm); 172 173 if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) >= 1e-6f)) 174 { 175 fprintf (stderr, "math error\n"); 176 exit (EXIT_FAILURE); 177 } 178 179 free (h_X); 180 free (h_Y1); 181 free (h_Y2); 182 183 acc_free (d_X); 184 acc_free (d_Y); 185 186 context_check (pctx); 187 188 s = cublasDestroy (h); 189 if (s != CUBLAS_STATUS_SUCCESS) 190 { 191 fprintf (stderr, "cublasDestroy failed: %d\n", s); 192 exit (EXIT_FAILURE); 193 } 194 195 context_check (pctx); 196 197 acc_shutdown (acc_device_nvidia); 198 199 r = cuCtxGetCurrent (&pctx); 200 if (r != CUDA_SUCCESS) 201 { 202 fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); 203 exit (EXIT_FAILURE); 204 } 205 206 if (pctx) 207 { 208 fprintf (stderr, "Unexpected context\n"); 209 exit (EXIT_FAILURE); 210 } 211 212 return EXIT_SUCCESS; 213} 214