1/* Plugin for NVPTX execution. 2 3 Copyright (C) 2013-2015 Free Software Foundation, Inc. 4 5 Contributed by Mentor Embedded. 6 7 This file is part of the GNU Offloading and Multi Processing Library 8 (libgomp). 9 10 Libgomp is free software; you can redistribute it and/or modify it 11 under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 3, or (at your option) 13 any later version. 14 15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY 16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for 18 more details. 19 20 Under Section 7 of GPL version 3, you are granted additional 21 permissions described in the GCC Runtime Library Exception, version 22 3.1, as published by the Free Software Foundation. 23 24 You should have received a copy of the GNU General Public License and 25 a copy of the GCC Runtime Library Exception along with this program; 26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 27 <http://www.gnu.org/licenses/>. */ 28 29/* Nvidia PTX-specific parts of OpenACC support. The cuda driver 30 library appears to hold some implicit state, but the documentation 31 is not clear as to what that state might be. Or how one might 32 propagate it from one thread to another. 
*/ 33 34#include "openacc.h" 35#include "config.h" 36#include "libgomp-plugin.h" 37#include "oacc-ptx.h" 38#include "oacc-plugin.h" 39 40#include <pthread.h> 41#include <cuda.h> 42#include <stdbool.h> 43#include <stdint.h> 44#include <string.h> 45#include <stdio.h> 46#include <dlfcn.h> 47#include <unistd.h> 48#include <assert.h> 49 50#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0])) 51 52static struct 53{ 54 CUresult r; 55 char *m; 56} cuda_errlist[]= 57{ 58 { CUDA_ERROR_INVALID_VALUE, "invalid value" }, 59 { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" }, 60 { CUDA_ERROR_NOT_INITIALIZED, "not initialized" }, 61 { CUDA_ERROR_DEINITIALIZED, "deinitialized" }, 62 { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" }, 63 { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" }, 64 { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" }, 65 { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" }, 66 { CUDA_ERROR_NO_DEVICE, "no device" }, 67 { CUDA_ERROR_INVALID_DEVICE, "invalid device" }, 68 { CUDA_ERROR_INVALID_IMAGE, "invalid image" }, 69 { CUDA_ERROR_INVALID_CONTEXT, "invalid context" }, 70 { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" }, 71 { CUDA_ERROR_MAP_FAILED, "map error" }, 72 { CUDA_ERROR_UNMAP_FAILED, "unmap error" }, 73 { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" }, 74 { CUDA_ERROR_ALREADY_MAPPED, "already mapped" }, 75 { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" }, 76 { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" }, 77 { CUDA_ERROR_NOT_MAPPED, "not mapped" }, 78 { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" }, 79 { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" }, 80 { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" }, 81 { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" }, 82 { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" }, 83 { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" }, 84 { CUDA_ERROR_INVALID_SOURCE, "invalid source" }, 85 { 
CUDA_ERROR_FILE_NOT_FOUND, "file not found" }, 86 { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, 87 "shared object symbol not found" }, 88 { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" }, 89 { CUDA_ERROR_OPERATING_SYSTEM, "operating system" }, 90 { CUDA_ERROR_INVALID_HANDLE, "invalid handle" }, 91 { CUDA_ERROR_NOT_FOUND, "not found" }, 92 { CUDA_ERROR_NOT_READY, "not ready" }, 93 { CUDA_ERROR_LAUNCH_FAILED, "launch error" }, 94 { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" }, 95 { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" }, 96 { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, 97 "launch incompatibe texturing" }, 98 { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" }, 99 { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled " }, 100 { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary cotext active" }, 101 { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" }, 102 { CUDA_ERROR_ASSERT, "assert" }, 103 { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" }, 104 { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, 105 "host memory already registered" }, 106 { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" }, 107 { CUDA_ERROR_NOT_PERMITTED, "not permitted" }, 108 { CUDA_ERROR_NOT_SUPPORTED, "not supported" }, 109 { CUDA_ERROR_UNKNOWN, "unknown" } 110}; 111 112static char errmsg[128]; 113 114static char * 115cuda_error (CUresult r) 116{ 117 int i; 118 119 for (i = 0; i < ARRAYSIZE (cuda_errlist); i++) 120 { 121 if (cuda_errlist[i].r == r) 122 return &cuda_errlist[i].m[0]; 123 } 124 125 sprintf (&errmsg[0], "unknown result code: %5d", r); 126 127 return &errmsg[0]; 128} 129 130struct targ_fn_descriptor 131{ 132 CUfunction fn; 133 const char *name; 134}; 135 136static unsigned int instantiated_devices = 0; 137static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; 138 139struct ptx_stream 140{ 141 CUstream stream; 142 pthread_t host_thread; 143 bool multithreaded; 144 145 CUdeviceptr d; 146 void 
*h;
  /* Host-side ring buffer bounds and cursors for argument staging:
     [h_begin, h_end) is the mapped page; h_next is where the next chunk
     will be pushed, h_prev is the most recently pushed chunk, h_tail is
     the oldest chunk not yet popped.  */
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  /* Link in the owning device's active_streams list.  */
  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

/* Header of one chunk in a stream's staging ring buffer; the payload
   follows in MAPPINGS.  SIZE includes this header.  */
struct map
{
  int async;
  size_t size;
  char mappings[0];
};

/* Allocate one page of page-locked host memory for stream S and set up its
   ring-buffer cursors.  S->d/S->h must still be unset.  Fatal on CUDA
   allocation failure.  */
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  /* Get the device-side alias of the same page-locked memory.  */
  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  /* Empty buffer: all cursors coincide.  */
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}

/* Release the page-locked staging buffer of stream S.  */
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}

/* Retire the oldest chunk from stream S's staging ring buffer, advancing
   the tail cursor (with wrap-around past h_end).  */
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  /* Wrap the tail around the end of the buffer.  */
  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  /* If the buffer is now empty, collapse prev onto next.  */
  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

/* Reserve SIZE payload bytes (plus header) in stream S's staging ring
   buffer for async queue ASYNC.  On return *H and *D point at the host and
   device aliases of the payload.  Fatal if the request cannot fit in the
   buffer even after wrapping.  */
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      /* Not enough room before h_end: absorb the slack into the previous
	 chunk and wrap allocation back to the start of the buffer.  */
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  /* Payload offset within the page; identical for host and device views
     since both alias the same page-locked memory.  */
  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}

/* Per-device (per-context) state.  */
struct ptx_device
{
  CUcontext ctx;
  /* True if we adopted an already-current context rather than creating
     our own (and so must not destroy it).  */
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  /* Growable array mapping OpenACC async values to streams.  */
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.
*/ 306 pthread_mutex_t stream_lock; 307 int ord; 308 bool overlap; 309 bool map; 310 bool concur; 311 int mode; 312 bool mkern; 313 314 struct ptx_device *next; 315}; 316 317enum ptx_event_type 318{ 319 PTX_EVT_MEM, 320 PTX_EVT_KNL, 321 PTX_EVT_SYNC, 322 PTX_EVT_ASYNC_CLEANUP 323}; 324 325struct ptx_event 326{ 327 CUevent *evt; 328 int type; 329 void *addr; 330 int ord; 331 332 struct ptx_event *next; 333}; 334 335struct ptx_image_data 336{ 337 void *target_data; 338 CUmodule module; 339 struct ptx_image_data *next; 340}; 341 342static pthread_mutex_t ptx_event_lock; 343static struct ptx_event *ptx_events; 344 345static struct ptx_device **ptx_devices; 346 347static struct ptx_image_data *ptx_images = NULL; 348static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER; 349 350#define _XSTR(s) _STR(s) 351#define _STR(s) #s 352 353static struct _synames 354{ 355 char *n; 356} cuda_symnames[] = 357{ 358 { _XSTR (cuCtxCreate) }, 359 { _XSTR (cuCtxDestroy) }, 360 { _XSTR (cuCtxGetCurrent) }, 361 { _XSTR (cuCtxPushCurrent) }, 362 { _XSTR (cuCtxSynchronize) }, 363 { _XSTR (cuDeviceGet) }, 364 { _XSTR (cuDeviceGetAttribute) }, 365 { _XSTR (cuDeviceGetCount) }, 366 { _XSTR (cuEventCreate) }, 367 { _XSTR (cuEventDestroy) }, 368 { _XSTR (cuEventQuery) }, 369 { _XSTR (cuEventRecord) }, 370 { _XSTR (cuInit) }, 371 { _XSTR (cuLaunchKernel) }, 372 { _XSTR (cuLinkAddData) }, 373 { _XSTR (cuLinkComplete) }, 374 { _XSTR (cuLinkCreate) }, 375 { _XSTR (cuMemAlloc) }, 376 { _XSTR (cuMemAllocHost) }, 377 { _XSTR (cuMemcpy) }, 378 { _XSTR (cuMemcpyDtoH) }, 379 { _XSTR (cuMemcpyDtoHAsync) }, 380 { _XSTR (cuMemcpyHtoD) }, 381 { _XSTR (cuMemcpyHtoDAsync) }, 382 { _XSTR (cuMemFree) }, 383 { _XSTR (cuMemFreeHost) }, 384 { _XSTR (cuMemGetAddressRange) }, 385 { _XSTR (cuMemHostGetDevicePointer) }, 386 { _XSTR (cuMemHostRegister) }, 387 { _XSTR (cuMemHostUnregister) }, 388 { _XSTR (cuModuleGetFunction) }, 389 { _XSTR (cuModuleLoadData) }, 390 { _XSTR (cuStreamDestroy) }, 391 { _XSTR 
(cuStreamQuery) }, 392 { _XSTR (cuStreamSynchronize) }, 393 { _XSTR (cuStreamWaitEvent) } 394}; 395 396static int 397verify_device_library (void) 398{ 399 int i; 400 void *dh, *ds; 401 402 dh = dlopen ("libcuda.so", RTLD_LAZY); 403 if (!dh) 404 return -1; 405 406 for (i = 0; i < ARRAYSIZE (cuda_symnames); i++) 407 { 408 ds = dlsym (dh, cuda_symnames[i].n); 409 if (!ds) 410 return -1; 411 } 412 413 dlclose (dh); 414 415 return 0; 416} 417 418static inline struct nvptx_thread * 419nvptx_thread (void) 420{ 421 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); 422} 423 424static void 425init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) 426{ 427 int i; 428 struct ptx_stream *null_stream 429 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream)); 430 431 null_stream->stream = NULL; 432 null_stream->host_thread = pthread_self (); 433 null_stream->multithreaded = true; 434 null_stream->d = (CUdeviceptr) NULL; 435 null_stream->h = NULL; 436 map_init (null_stream); 437 ptx_dev->null_stream = null_stream; 438 439 ptx_dev->active_streams = NULL; 440 pthread_mutex_init (&ptx_dev->stream_lock, NULL); 441 442 if (concurrency < 1) 443 concurrency = 1; 444 445 /* This is just a guess -- make space for as many async streams as the 446 current device is capable of concurrently executing. This can grow 447 later as necessary. No streams are created yet. 
*/
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}

/* Tear down all streams of PTX_DEV: the async-stream table, every active
   stream (destroying its CUDA stream and staging buffer), and finally the
   null stream.  NOTE(review): the cuStreamDestroy result is ignored here --
   confirm that is intentional for shutdown paths.  */
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  /* The stream table is only mutated when CREATE is set, so only then is
     the lock needed.  */
  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      /* Grow the table (at least doubling) so index ASYNC is valid.  */
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.
*/
static bool
nvptx_init (void)
{
  CUresult r;
  int rc;
  int ndevs;

  /* Already initialized by an earlier call.  */
  if (instantiated_devices != 0)
    return true;

  /* Make sure the CUDA driver library and all required entry points are
     actually available before committing to initialization.  */
  rc = verify_device_library ();
  if (rc < 0)
    return false;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  /* One slot per device, filled in lazily as devices are opened.  */
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  /* Already attached to the requested device: nothing to do.  */
  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	{
	  r = cuCtxPopCurrent (&old_ctx);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
	}

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}

/* Open PTX device N: create or adopt a CUDA context for it, cache its
   device attributes and set up its streams.  Returns the new ptx_device;
   fatal on CUDA errors.  */
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.
*/ 694 CUcontext old_ctx; 695 696 r = cuCtxPopCurrent (&old_ctx); 697 if (r != CUDA_SUCCESS) 698 GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r)); 699 } 700 701 r = cuCtxGetCurrent (&ptx_dev->ctx); 702 if (r != CUDA_SUCCESS) 703 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); 704 705 if (!ptx_dev->ctx) 706 { 707 r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev); 708 if (r != CUDA_SUCCESS) 709 GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r)); 710 } 711 else 712 ptx_dev->ctx_shared = true; 713 714 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); 715 if (r != CUDA_SUCCESS) 716 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); 717 718 ptx_dev->overlap = pi; 719 720 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); 721 if (r != CUDA_SUCCESS) 722 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); 723 724 ptx_dev->map = pi; 725 726 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); 727 if (r != CUDA_SUCCESS) 728 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); 729 730 ptx_dev->concur = pi; 731 732 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); 733 if (r != CUDA_SUCCESS) 734 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); 735 736 ptx_dev->mode = pi; 737 738 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); 739 if (r != CUDA_SUCCESS) 740 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); 741 742 ptx_dev->mkern = pi; 743 744 r = cuDeviceGetAttribute (&async_engines, 745 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); 746 if (r != CUDA_SUCCESS) 747 async_engines = 1; 748 749 init_streams_for_device (ptx_dev, async_engines); 750 751 return ptx_dev; 752} 753 754static void 755nvptx_close_device (struct ptx_device *ptx_dev) 756{ 757 CUresult r; 758 759 if (!ptx_dev) 760 return; 761 762 fini_streams_for_device 
(ptx_dev); 763 764 if (!ptx_dev->ctx_shared) 765 { 766 r = cuCtxDestroy (ptx_dev->ctx); 767 if (r != CUDA_SUCCESS) 768 GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r)); 769 } 770 771 free (ptx_dev); 772} 773 774static int 775nvptx_get_num_devices (void) 776{ 777 int n; 778 CUresult r; 779 780 /* PR libgomp/65099: Currently, we only support offloading in 64-bit 781 configurations. */ 782 if (sizeof (void *) != 8) 783 return 0; 784 785 /* This function will be called before the plugin has been initialized in 786 order to enumerate available devices, but CUDA API routines can't be used 787 until cuInit has been called. Just call it now (but don't yet do any 788 further initialization). */ 789 if (instantiated_devices == 0) 790 cuInit (0); 791 792 r = cuDeviceGetCount (&n); 793 if (r!= CUDA_SUCCESS) 794 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r)); 795 796 return n; 797} 798 799 800static void 801link_ptx (CUmodule *module, char *ptx_code) 802{ 803 CUjit_option opts[7]; 804 void *optvals[7]; 805 float elapsed = 0.0; 806#define LOGSIZE 8192 807 char elog[LOGSIZE]; 808 char ilog[LOGSIZE]; 809 unsigned long logsize = LOGSIZE; 810 CUlinkState linkstate; 811 CUresult r; 812 void *linkout; 813 size_t linkoutsize __attribute__ ((unused)); 814 815 GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code); 816 817 opts[0] = CU_JIT_WALL_TIME; 818 optvals[0] = &elapsed; 819 820 opts[1] = CU_JIT_INFO_LOG_BUFFER; 821 optvals[1] = &ilog[0]; 822 823 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 824 optvals[2] = (void *) logsize; 825 826 opts[3] = CU_JIT_ERROR_LOG_BUFFER; 827 optvals[3] = &elog[0]; 828 829 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; 830 optvals[4] = (void *) logsize; 831 832 opts[5] = CU_JIT_LOG_VERBOSE; 833 optvals[5] = (void *) 1; 834 835 opts[6] = CU_JIT_TARGET; 836 optvals[6] = (void *) CU_TARGET_COMPUTE_30; 837 838 r = cuLinkCreate (7, opts, optvals, &linkstate); 839 if (r != CUDA_SUCCESS) 840 GOMP_PLUGIN_fatal 
("cuLinkCreate error: %s", cuda_error (r)); 841 842 char *abort_ptx = ABORT_PTX; 843 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx, 844 strlen (abort_ptx) + 1, 0, 0, 0, 0); 845 if (r != CUDA_SUCCESS) 846 { 847 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); 848 GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r)); 849 } 850 851 char *acc_on_device_ptx = ACC_ON_DEVICE_PTX; 852 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx, 853 strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0); 854 if (r != CUDA_SUCCESS) 855 { 856 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); 857 GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s", 858 cuda_error (r)); 859 } 860 861 char *goacc_internal_ptx = GOACC_INTERNAL_PTX; 862 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx, 863 strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0); 864 if (r != CUDA_SUCCESS) 865 { 866 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); 867 GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s", 868 cuda_error (r)); 869 } 870 871 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code, 872 strlen (ptx_code) + 1, 0, 0, 0, 0); 873 if (r != CUDA_SUCCESS) 874 { 875 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); 876 GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r)); 877 } 878 879 r = cuLinkComplete (linkstate, &linkout, &linkoutsize); 880 if (r != CUDA_SUCCESS) 881 GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r)); 882 883 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed); 884 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]); 885 886 r = cuModuleLoadData (module, linkout); 887 if (r != CUDA_SUCCESS) 888 GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r)); 889} 890 891static void 892event_gc (bool memmap_lockable) 893{ 894 struct ptx_event *ptx_event = ptx_events; 895 struct nvptx_thread *nvthd = nvptx_thread (); 896 897 pthread_mutex_lock (&ptx_event_lock); 898 899 while 
(ptx_event != NULL) 900 { 901 CUresult r; 902 struct ptx_event *e = ptx_event; 903 904 ptx_event = ptx_event->next; 905 906 if (e->ord != nvthd->ptx_dev->ord) 907 continue; 908 909 r = cuEventQuery (*e->evt); 910 if (r == CUDA_SUCCESS) 911 { 912 CUevent *te; 913 914 te = e->evt; 915 916 switch (e->type) 917 { 918 case PTX_EVT_MEM: 919 case PTX_EVT_SYNC: 920 break; 921 922 case PTX_EVT_KNL: 923 map_pop (e->addr); 924 break; 925 926 case PTX_EVT_ASYNC_CLEANUP: 927 { 928 /* The function gomp_plugin_async_unmap_vars needs to claim the 929 memory-map splay tree lock for the current device, so we 930 can't call it when one of our callers has already claimed 931 the lock. In that case, just delay the GC for this event 932 until later. */ 933 if (!memmap_lockable) 934 continue; 935 936 GOMP_PLUGIN_async_unmap_vars (e->addr); 937 } 938 break; 939 } 940 941 cuEventDestroy (*te); 942 free ((void *)te); 943 944 if (ptx_events == e) 945 ptx_events = ptx_events->next; 946 else 947 { 948 struct ptx_event *e_ = ptx_events; 949 while (e_->next != e) 950 e_ = e_->next; 951 e_->next = e_->next->next; 952 } 953 954 free (e); 955 } 956 } 957 958 pthread_mutex_unlock (&ptx_event_lock); 959} 960 961static void 962event_add (enum ptx_event_type type, CUevent *e, void *h) 963{ 964 struct ptx_event *ptx_event; 965 struct nvptx_thread *nvthd = nvptx_thread (); 966 967 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC 968 || type == PTX_EVT_ASYNC_CLEANUP); 969 970 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event)); 971 ptx_event->type = type; 972 ptx_event->evt = e; 973 ptx_event->addr = h; 974 ptx_event->ord = nvthd->ptx_dev->ord; 975 976 pthread_mutex_lock (&ptx_event_lock); 977 978 ptx_event->next = ptx_events; 979 ptx_events = ptx_event; 980 981 pthread_mutex_unlock (&ptx_event_lock); 982} 983 984void 985nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, 986 size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers, 987 
int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device. HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.
     NOTE(review): I is an int compared against the size_t MAPNUM —
     harmless for realistic map counts, but a signed/unsigned mismatch.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);

  // OpenACC        CUDA
  //
  // num_gangs      blocks
  // num_workers    warps (where a warp is equivalent to 32 threads)
  // vector length  threads
  //

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However, the
     CUDA interface abstracts away from that, and only shows us warp size
     indirectly in maximum number of threads per block, which is a product of
     warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a cuda error.  */
  nthreads_in_block = vector_length;

  /* The kernel takes a single argument: the device address of the pointer
     block prepared above.  */
  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      num_gangs, 1, 1,
                      nthreads_in_block, 1, 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      /* Synchronous launch: block until the kernel finishes.  A launch
         failure here usually means the device code trapped.  */
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      /* Asynchronous launch: record an event after the kernel so event_gc
         can pop the argument mapping once the kernel completes.  */
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->name);

#ifndef DISABLE_ASYNC
  /* In the async case the mapping is popped by event_gc when the recorded
     PTX_EVT_KNL event completes; here we only pop for sync launches.  */
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void * openacc_get_current_cuda_context (void);

/* Allocate S bytes of device memory.  Returns the device pointer cast to
   void *, or 0 when the device is out of memory; any other driver failure
   is fatal.  */
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}

/* Free device memory P previously returned by nvptx_alloc.  Validates via
   cuMemGetAddressRange that P is the base of an allocation (not an interior
   pointer) before freeing; aborts otherwise.  */
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

/* Copy S bytes from host address H to device address D.  Validates both
   pointers and that [D, D+S) lies within a single device allocation; a zero
   size is a no-op.  Uses an async copy plus a PTX_EVT_MEM event when the
   current stream is not the null stream.  Always returns 0.  */
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  /* Bounds check; note (d + s) is arithmetic on void *, a GCC extension.  */
  if ((void *)(d + s) > (void *)(pb + ps))
GOMP_PLUGIN_fatal ("invalid size"); 1168 1169#ifndef DISABLE_ASYNC 1170 if (nvthd->current_stream != nvthd->ptx_dev->null_stream) 1171 { 1172 CUevent *e; 1173 1174 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); 1175 1176 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); 1177 if (r != CUDA_SUCCESS) 1178 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); 1179 1180 event_gc (false); 1181 1182 r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s, 1183 nvthd->current_stream->stream); 1184 if (r != CUDA_SUCCESS) 1185 GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r)); 1186 1187 r = cuEventRecord (*e, nvthd->current_stream->stream); 1188 if (r != CUDA_SUCCESS) 1189 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); 1190 1191 event_add (PTX_EVT_MEM, e, (void *)h); 1192 } 1193 else 1194#endif 1195 { 1196 r = cuMemcpyHtoD ((CUdeviceptr)d, h, s); 1197 if (r != CUDA_SUCCESS) 1198 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); 1199 } 1200 1201 return 0; 1202} 1203 1204static void * 1205nvptx_dev2host (void *h, const void *d, size_t s) 1206{ 1207 CUresult r; 1208 CUdeviceptr pb; 1209 size_t ps; 1210 struct nvptx_thread *nvthd = nvptx_thread (); 1211 1212 if (!s) 1213 return 0; 1214 1215 if (!d) 1216 GOMP_PLUGIN_fatal ("invalid device address"); 1217 1218 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d); 1219 if (r != CUDA_SUCCESS) 1220 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r)); 1221 1222 if (!pb) 1223 GOMP_PLUGIN_fatal ("invalid device address"); 1224 1225 if (!h) 1226 GOMP_PLUGIN_fatal ("invalid host address"); 1227 1228 if (d == h) 1229 GOMP_PLUGIN_fatal ("invalid host or device address"); 1230 1231 if ((void *)(d + s) > (void *)(pb + ps)) 1232 GOMP_PLUGIN_fatal ("invalid size"); 1233 1234#ifndef DISABLE_ASYNC 1235 if (nvthd->current_stream != nvthd->ptx_dev->null_stream) 1236 { 1237 CUevent *e; 1238 1239 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); 1240 1241 r = cuEventCreate (e, 
CU_EVENT_DISABLE_TIMING); 1242 if (r != CUDA_SUCCESS) 1243 GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r)); 1244 1245 event_gc (false); 1246 1247 r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, 1248 nvthd->current_stream->stream); 1249 if (r != CUDA_SUCCESS) 1250 GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r)); 1251 1252 r = cuEventRecord (*e, nvthd->current_stream->stream); 1253 if (r != CUDA_SUCCESS) 1254 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); 1255 1256 event_add (PTX_EVT_MEM, e, (void *)h); 1257 } 1258 else 1259#endif 1260 { 1261 r = cuMemcpyDtoH (h, (CUdeviceptr)d, s); 1262 if (r != CUDA_SUCCESS) 1263 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r)); 1264 } 1265 1266 return 0; 1267} 1268 1269static void 1270nvptx_set_async (int async) 1271{ 1272 struct nvptx_thread *nvthd = nvptx_thread (); 1273 nvthd->current_stream 1274 = select_stream_for_async (async, pthread_self (), true, NULL); 1275} 1276 1277static int 1278nvptx_async_test (int async) 1279{ 1280 CUresult r; 1281 struct ptx_stream *s; 1282 1283 s = select_stream_for_async (async, pthread_self (), false, NULL); 1284 1285 if (!s) 1286 GOMP_PLUGIN_fatal ("unknown async %d", async); 1287 1288 r = cuStreamQuery (s->stream); 1289 if (r == CUDA_SUCCESS) 1290 { 1291 /* The oacc-parallel.c:goacc_wait function calls this hook to determine 1292 whether all work has completed on this stream, and if so omits the call 1293 to the wait hook. If that happens, event_gc might not get called 1294 (which prevents variables from getting unmapped and their associated 1295 device storage freed), so call it here. 
*/ 1296 event_gc (true); 1297 return 1; 1298 } 1299 else if (r == CUDA_ERROR_NOT_READY) 1300 return 0; 1301 1302 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); 1303 1304 return 0; 1305} 1306 1307static int 1308nvptx_async_test_all (void) 1309{ 1310 struct ptx_stream *s; 1311 pthread_t self = pthread_self (); 1312 struct nvptx_thread *nvthd = nvptx_thread (); 1313 1314 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); 1315 1316 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) 1317 { 1318 if ((s->multithreaded || pthread_equal (s->host_thread, self)) 1319 && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY) 1320 { 1321 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); 1322 return 0; 1323 } 1324 } 1325 1326 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); 1327 1328 event_gc (true); 1329 1330 return 1; 1331} 1332 1333static void 1334nvptx_wait (int async) 1335{ 1336 CUresult r; 1337 struct ptx_stream *s; 1338 1339 s = select_stream_for_async (async, pthread_self (), false, NULL); 1340 1341 if (!s) 1342 GOMP_PLUGIN_fatal ("unknown async %d", async); 1343 1344 r = cuStreamSynchronize (s->stream); 1345 if (r != CUDA_SUCCESS) 1346 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); 1347 1348 event_gc (true); 1349} 1350 1351static void 1352nvptx_wait_async (int async1, int async2) 1353{ 1354 CUresult r; 1355 CUevent *e; 1356 struct ptx_stream *s1, *s2; 1357 pthread_t self = pthread_self (); 1358 1359 /* The stream that is waiting (rather than being waited for) doesn't 1360 necessarily have to exist already. 
*/ 1361 s2 = select_stream_for_async (async2, self, true, NULL); 1362 1363 s1 = select_stream_for_async (async1, self, false, NULL); 1364 if (!s1) 1365 GOMP_PLUGIN_fatal ("invalid async 1\n"); 1366 1367 if (s1 == s2) 1368 GOMP_PLUGIN_fatal ("identical parameters"); 1369 1370 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); 1371 1372 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); 1373 if (r != CUDA_SUCCESS) 1374 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); 1375 1376 event_gc (true); 1377 1378 r = cuEventRecord (*e, s1->stream); 1379 if (r != CUDA_SUCCESS) 1380 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); 1381 1382 event_add (PTX_EVT_SYNC, e, NULL); 1383 1384 r = cuStreamWaitEvent (s2->stream, *e, 0); 1385 if (r != CUDA_SUCCESS) 1386 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r)); 1387} 1388 1389static void 1390nvptx_wait_all (void) 1391{ 1392 CUresult r; 1393 struct ptx_stream *s; 1394 pthread_t self = pthread_self (); 1395 struct nvptx_thread *nvthd = nvptx_thread (); 1396 1397 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); 1398 1399 /* Wait for active streams initiated by this thread (or by multiple threads) 1400 to complete. 
*/ 1401 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) 1402 { 1403 if (s->multithreaded || pthread_equal (s->host_thread, self)) 1404 { 1405 r = cuStreamQuery (s->stream); 1406 if (r == CUDA_SUCCESS) 1407 continue; 1408 else if (r != CUDA_ERROR_NOT_READY) 1409 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); 1410 1411 r = cuStreamSynchronize (s->stream); 1412 if (r != CUDA_SUCCESS) 1413 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); 1414 } 1415 } 1416 1417 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); 1418 1419 event_gc (true); 1420} 1421 1422static void 1423nvptx_wait_all_async (int async) 1424{ 1425 CUresult r; 1426 struct ptx_stream *waiting_stream, *other_stream; 1427 CUevent *e; 1428 struct nvptx_thread *nvthd = nvptx_thread (); 1429 pthread_t self = pthread_self (); 1430 1431 /* The stream doing the waiting. This could be the first mention of the 1432 stream, so create it if necessary. */ 1433 waiting_stream 1434 = select_stream_for_async (async, pthread_self (), true, NULL); 1435 1436 /* Launches on the null stream already block on other streams in the 1437 context. */ 1438 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream) 1439 return; 1440 1441 event_gc (true); 1442 1443 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); 1444 1445 for (other_stream = nvthd->ptx_dev->active_streams; 1446 other_stream != NULL; 1447 other_stream = other_stream->next) 1448 { 1449 if (!other_stream->multithreaded 1450 && !pthread_equal (other_stream->host_thread, self)) 1451 continue; 1452 1453 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); 1454 1455 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); 1456 if (r != CUDA_SUCCESS) 1457 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); 1458 1459 /* Record an event on the waited-for stream. 
*/ 1460 r = cuEventRecord (*e, other_stream->stream); 1461 if (r != CUDA_SUCCESS) 1462 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); 1463 1464 event_add (PTX_EVT_SYNC, e, NULL); 1465 1466 r = cuStreamWaitEvent (waiting_stream->stream, *e, 0); 1467 if (r != CUDA_SUCCESS) 1468 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r)); 1469 } 1470 1471 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); 1472} 1473 1474static void * 1475nvptx_get_current_cuda_device (void) 1476{ 1477 struct nvptx_thread *nvthd = nvptx_thread (); 1478 1479 if (!nvthd || !nvthd->ptx_dev) 1480 return NULL; 1481 1482 return &nvthd->ptx_dev->dev; 1483} 1484 1485static void * 1486nvptx_get_current_cuda_context (void) 1487{ 1488 struct nvptx_thread *nvthd = nvptx_thread (); 1489 1490 if (!nvthd || !nvthd->ptx_dev) 1491 return NULL; 1492 1493 return nvthd->ptx_dev->ctx; 1494} 1495 1496static void * 1497nvptx_get_cuda_stream (int async) 1498{ 1499 struct ptx_stream *s; 1500 struct nvptx_thread *nvthd = nvptx_thread (); 1501 1502 if (!nvthd || !nvthd->ptx_dev) 1503 return NULL; 1504 1505 s = select_stream_for_async (async, pthread_self (), false, NULL); 1506 1507 return s ? s->stream : NULL; 1508} 1509 1510static int 1511nvptx_set_cuda_stream (int async, void *stream) 1512{ 1513 struct ptx_stream *oldstream; 1514 pthread_t self = pthread_self (); 1515 struct nvptx_thread *nvthd = nvptx_thread (); 1516 1517 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); 1518 1519 if (async < 0) 1520 GOMP_PLUGIN_fatal ("bad async %d", async); 1521 1522 /* We have a list of active streams and an array mapping async values to 1523 entries of that list. We need to take "ownership" of the passed-in stream, 1524 and add it to our list, removing the previous entry also (if there was one) 1525 in order to prevent resource leaks. 
Note the potential for surprise 1526 here: maybe we should keep track of passed-in streams and leave it up to 1527 the user to tidy those up, but that doesn't work for stream handles 1528 returned from acc_get_cuda_stream above... */ 1529 1530 oldstream = select_stream_for_async (async, self, false, NULL); 1531 1532 if (oldstream) 1533 { 1534 if (nvthd->ptx_dev->active_streams == oldstream) 1535 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next; 1536 else 1537 { 1538 struct ptx_stream *s = nvthd->ptx_dev->active_streams; 1539 while (s->next != oldstream) 1540 s = s->next; 1541 s->next = s->next->next; 1542 } 1543 1544 cuStreamDestroy (oldstream->stream); 1545 map_fini (oldstream); 1546 free (oldstream); 1547 } 1548 1549 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); 1550 1551 (void) select_stream_for_async (async, self, true, (CUstream) stream); 1552 1553 return 1; 1554} 1555 1556/* Plugin entry points. */ 1557 1558const char * 1559GOMP_OFFLOAD_get_name (void) 1560{ 1561 return "nvptx"; 1562} 1563 1564unsigned int 1565GOMP_OFFLOAD_get_caps (void) 1566{ 1567 return GOMP_OFFLOAD_CAP_OPENACC_200; 1568} 1569 1570int 1571GOMP_OFFLOAD_get_type (void) 1572{ 1573 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX; 1574} 1575 1576int 1577GOMP_OFFLOAD_get_num_devices (void) 1578{ 1579 return nvptx_get_num_devices (); 1580} 1581 1582void 1583GOMP_OFFLOAD_init_device (int n) 1584{ 1585 pthread_mutex_lock (&ptx_dev_lock); 1586 1587 if (!nvptx_init () || ptx_devices[n] != NULL) 1588 { 1589 pthread_mutex_unlock (&ptx_dev_lock); 1590 return; 1591 } 1592 1593 ptx_devices[n] = nvptx_open_device (n); 1594 instantiated_devices++; 1595 1596 pthread_mutex_unlock (&ptx_dev_lock); 1597} 1598 1599void 1600GOMP_OFFLOAD_fini_device (int n) 1601{ 1602 pthread_mutex_lock (&ptx_dev_lock); 1603 1604 if (ptx_devices[n] != NULL) 1605 { 1606 nvptx_attach_host_thread_to_device (n); 1607 nvptx_close_device (ptx_devices[n]); 1608 ptx_devices[n] = NULL; 1609 instantiated_devices--; 1610 } 
1611 1612 pthread_mutex_unlock (&ptx_dev_lock); 1613} 1614 1615int 1616GOMP_OFFLOAD_load_image (int ord, void *target_data, 1617 struct addr_pair **target_table) 1618{ 1619 CUmodule module; 1620 char **fn_names, **var_names; 1621 unsigned int fn_entries, var_entries, i, j; 1622 CUresult r; 1623 struct targ_fn_descriptor *targ_fns; 1624 void **img_header = (void **) target_data; 1625 struct ptx_image_data *new_image; 1626 1627 GOMP_OFFLOAD_init_device (ord); 1628 1629 nvptx_attach_host_thread_to_device (ord); 1630 1631 link_ptx (&module, img_header[0]); 1632 1633 pthread_mutex_lock (&ptx_image_lock); 1634 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data)); 1635 new_image->target_data = target_data; 1636 new_image->module = module; 1637 new_image->next = ptx_images; 1638 ptx_images = new_image; 1639 pthread_mutex_unlock (&ptx_image_lock); 1640 1641 /* The mkoffload utility emits a table of pointers/integers at the start of 1642 each offload image: 1643 1644 img_header[0] -> ptx code 1645 img_header[1] -> number of variables 1646 img_header[2] -> array of variable names (pointers to strings) 1647 img_header[3] -> number of kernels 1648 img_header[4] -> array of kernel names (pointers to strings) 1649 1650 The array of kernel names and the functions addresses form a 1651 one-to-one correspondence. 
*/ 1652 1653 var_entries = (uintptr_t) img_header[1]; 1654 var_names = (char **) img_header[2]; 1655 fn_entries = (uintptr_t) img_header[3]; 1656 fn_names = (char **) img_header[4]; 1657 1658 *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair) 1659 * (fn_entries + var_entries)); 1660 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor) 1661 * fn_entries); 1662 1663 for (i = 0; i < fn_entries; i++) 1664 { 1665 CUfunction function; 1666 1667 r = cuModuleGetFunction (&function, module, fn_names[i]); 1668 if (r != CUDA_SUCCESS) 1669 GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r)); 1670 1671 targ_fns[i].fn = function; 1672 targ_fns[i].name = (const char *) fn_names[i]; 1673 1674 (*target_table)[i].start = (uintptr_t) &targ_fns[i]; 1675 (*target_table)[i].end = (*target_table)[i].start + 1; 1676 } 1677 1678 for (j = 0; j < var_entries; j++, i++) 1679 { 1680 CUdeviceptr var; 1681 size_t bytes; 1682 1683 r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]); 1684 if (r != CUDA_SUCCESS) 1685 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); 1686 1687 (*target_table)[i].start = (uintptr_t) var; 1688 (*target_table)[i].end = (*target_table)[i].start + bytes; 1689 } 1690 1691 return i; 1692} 1693 1694void 1695GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data) 1696{ 1697 void **img_header = (void **) target_data; 1698 struct targ_fn_descriptor *targ_fns 1699 = (struct targ_fn_descriptor *) img_header[0]; 1700 struct ptx_image_data *image, *prev = NULL, *newhd = NULL; 1701 1702 free (targ_fns); 1703 1704 pthread_mutex_lock (&ptx_image_lock); 1705 for (image = ptx_images; image != NULL;) 1706 { 1707 struct ptx_image_data *next = image->next; 1708 1709 if (image->target_data == target_data) 1710 { 1711 cuModuleUnload (image->module); 1712 free (image); 1713 if (prev) 1714 prev->next = next; 1715 } 1716 else 1717 { 1718 prev = image; 1719 if (!newhd) 1720 newhd = image; 1721 } 1722 1723 
image = next; 1724 } 1725 ptx_images = newhd; 1726 pthread_mutex_unlock (&ptx_image_lock); 1727} 1728 1729void * 1730GOMP_OFFLOAD_alloc (int ord, size_t size) 1731{ 1732 nvptx_attach_host_thread_to_device (ord); 1733 return nvptx_alloc (size); 1734} 1735 1736void 1737GOMP_OFFLOAD_free (int ord, void *ptr) 1738{ 1739 nvptx_attach_host_thread_to_device (ord); 1740 nvptx_free (ptr); 1741} 1742 1743void * 1744GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) 1745{ 1746 nvptx_attach_host_thread_to_device (ord); 1747 return nvptx_dev2host (dst, src, n); 1748} 1749 1750void * 1751GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) 1752{ 1753 nvptx_attach_host_thread_to_device (ord); 1754 return nvptx_host2dev (dst, src, n); 1755} 1756 1757void (*device_run) (int n, void *fn_ptr, void *vars) = NULL; 1758 1759void 1760GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum, 1761 void **hostaddrs, void **devaddrs, size_t *sizes, 1762 unsigned short *kinds, int num_gangs, 1763 int num_workers, int vector_length, int async, 1764 void *targ_mem_desc) 1765{ 1766 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs, 1767 num_workers, vector_length, async, targ_mem_desc); 1768} 1769 1770void 1771GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc) 1772{ 1773 CUevent *e; 1774 CUresult r; 1775 struct nvptx_thread *nvthd = nvptx_thread (); 1776 1777 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); 1778 1779 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); 1780 if (r != CUDA_SUCCESS) 1781 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); 1782 1783 r = cuEventRecord (*e, nvthd->current_stream->stream); 1784 if (r != CUDA_SUCCESS) 1785 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); 1786 1787 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc); 1788} 1789 1790int 1791GOMP_OFFLOAD_openacc_async_test (int async) 1792{ 1793 return nvptx_async_test (async); 1794} 1795 1796int 
1797GOMP_OFFLOAD_openacc_async_test_all (void) 1798{ 1799 return nvptx_async_test_all (); 1800} 1801 1802void 1803GOMP_OFFLOAD_openacc_async_wait (int async) 1804{ 1805 nvptx_wait (async); 1806} 1807 1808void 1809GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2) 1810{ 1811 nvptx_wait_async (async1, async2); 1812} 1813 1814void 1815GOMP_OFFLOAD_openacc_async_wait_all (void) 1816{ 1817 nvptx_wait_all (); 1818} 1819 1820void 1821GOMP_OFFLOAD_openacc_async_wait_all_async (int async) 1822{ 1823 nvptx_wait_all_async (async); 1824} 1825 1826void 1827GOMP_OFFLOAD_openacc_async_set_async (int async) 1828{ 1829 nvptx_set_async (async); 1830} 1831 1832void * 1833GOMP_OFFLOAD_openacc_create_thread_data (int ord) 1834{ 1835 struct ptx_device *ptx_dev; 1836 struct nvptx_thread *nvthd 1837 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread)); 1838 CUresult r; 1839 CUcontext thd_ctx; 1840 1841 ptx_dev = ptx_devices[ord]; 1842 1843 assert (ptx_dev); 1844 1845 r = cuCtxGetCurrent (&thd_ctx); 1846 if (r != CUDA_SUCCESS) 1847 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); 1848 1849 assert (ptx_dev->ctx); 1850 1851 if (!thd_ctx) 1852 { 1853 r = cuCtxPushCurrent (ptx_dev->ctx); 1854 if (r != CUDA_SUCCESS) 1855 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r)); 1856 } 1857 1858 nvthd->current_stream = ptx_dev->null_stream; 1859 nvthd->ptx_dev = ptx_dev; 1860 1861 return (void *) nvthd; 1862} 1863 1864void 1865GOMP_OFFLOAD_openacc_destroy_thread_data (void *data) 1866{ 1867 free (data); 1868} 1869 1870void * 1871GOMP_OFFLOAD_openacc_get_current_cuda_device (void) 1872{ 1873 return nvptx_get_current_cuda_device (); 1874} 1875 1876void * 1877GOMP_OFFLOAD_openacc_get_current_cuda_context (void) 1878{ 1879 return nvptx_get_current_cuda_context (); 1880} 1881 1882/* NOTE: This returns a CUstream, not a ptx_stream pointer. 
*/ 1883 1884void * 1885GOMP_OFFLOAD_openacc_get_cuda_stream (int async) 1886{ 1887 return nvptx_get_cuda_stream (async); 1888} 1889 1890/* NOTE: This takes a CUstream, not a ptx_stream pointer. */ 1891 1892int 1893GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream) 1894{ 1895 return nvptx_set_cuda_stream (async, stream); 1896} 1897