1/* Plugin for NVPTX execution. 2 3 Copyright (C) 2013-2020 Free Software Foundation, Inc. 4 5 Contributed by Mentor Embedded. 6 7 This file is part of the GNU Offloading and Multi Processing Library 8 (libgomp). 9 10 Libgomp is free software; you can redistribute it and/or modify it 11 under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 3, or (at your option) 13 any later version. 14 15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY 16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for 18 more details. 19 20 Under Section 7 of GPL version 3, you are granted additional 21 permissions described in the GCC Runtime Library Exception, version 22 3.1, as published by the Free Software Foundation. 23 24 You should have received a copy of the GNU General Public License and 25 a copy of the GCC Runtime Library Exception along with this program; 26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 27 <http://www.gnu.org/licenses/>. */ 28 29/* Nvidia PTX-specific parts of OpenACC support. The cuda driver 30 library appears to hold some implicit state, but the documentation 31 is not clear as to what that state might be. Or how one might 32 propagate it from one thread to another. 
*/ 33 34#define _GNU_SOURCE 35#include "openacc.h" 36#include "config.h" 37#include "libgomp-plugin.h" 38#include "oacc-plugin.h" 39#include "gomp-constants.h" 40#include "oacc-int.h" 41 42#include <pthread.h> 43#include <cuda.h> 44#include <stdbool.h> 45#include <limits.h> 46#include <string.h> 47#include <stdio.h> 48#include <unistd.h> 49#include <assert.h> 50#include <errno.h> 51 52#if CUDA_VERSION < 6000 53extern CUresult cuGetErrorString (CUresult, const char **); 54#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82 55#endif 56 57#if CUDA_VERSION >= 6050 58#undef cuLinkCreate 59#undef cuLinkAddData 60CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, 61 const char *, unsigned, CUjit_option *, void **); 62CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *); 63#else 64typedef size_t (*CUoccupancyB2DSize)(int); 65CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t, 66 const char *, unsigned, CUjit_option *, void **); 67CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *); 68CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, 69 CUoccupancyB2DSize, size_t, int); 70#endif 71 72#define DO_PRAGMA(x) _Pragma (#x) 73 74#if PLUGIN_NVPTX_DYNAMIC 75# include <dlfcn.h> 76 77struct cuda_lib_s { 78 79# define CUDA_ONE_CALL(call) \ 80 __typeof (call) *call; 81# define CUDA_ONE_CALL_MAYBE_NULL(call) \ 82 CUDA_ONE_CALL (call) 83#include "cuda-lib.def" 84# undef CUDA_ONE_CALL 85# undef CUDA_ONE_CALL_MAYBE_NULL 86 87} cuda_lib; 88 89/* -1 if init_cuda_lib has not been called yet, false 90 if it has been and failed, true if it has been and succeeded. */ 91static signed char cuda_lib_inited = -1; 92 93/* Dynamically load the CUDA runtime library and initialize function 94 pointers, return false if unsuccessful, true if successful. 
*/ 95static bool 96init_cuda_lib (void) 97{ 98 if (cuda_lib_inited != -1) 99 return cuda_lib_inited; 100 const char *cuda_runtime_lib = "libcuda.so.1"; 101 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY); 102 cuda_lib_inited = false; 103 if (h == NULL) 104 return false; 105 106# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false) 107# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true) 108# define CUDA_ONE_CALL_1(call, allow_null) \ 109 cuda_lib.call = dlsym (h, #call); \ 110 if (!allow_null && cuda_lib.call == NULL) \ 111 return false; 112#include "cuda-lib.def" 113# undef CUDA_ONE_CALL 114# undef CUDA_ONE_CALL_1 115# undef CUDA_ONE_CALL_MAYBE_NULL 116 117 cuda_lib_inited = true; 118 return true; 119} 120# define CUDA_CALL_PREFIX cuda_lib. 121#else 122 123# define CUDA_ONE_CALL(call) 124# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call) 125#include "cuda-lib.def" 126#undef CUDA_ONE_CALL_MAYBE_NULL 127#undef CUDA_ONE_CALL 128 129# define CUDA_CALL_PREFIX 130# define init_cuda_lib() true 131#endif 132 133#include "secure_getenv.h" 134 135#undef MIN 136#undef MAX 137#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) 138#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) 139 140/* Convenience macros for the frequently used CUDA library call and 141 error handling sequence as well as CUDA library calls that 142 do the error checking themselves or don't do it at all. */ 143 144#define CUDA_CALL_ERET(ERET, FN, ...) \ 145 do { \ 146 unsigned __r \ 147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \ 148 if (__r != CUDA_SUCCESS) \ 149 { \ 150 GOMP_PLUGIN_error (#FN " error: %s", \ 151 cuda_error (__r)); \ 152 return ERET; \ 153 } \ 154 } while (0) 155 156#define CUDA_CALL(FN, ...) \ 157 CUDA_CALL_ERET (false, FN, __VA_ARGS__) 158 159#define CUDA_CALL_ASSERT(FN, ...) 
\ 160 do { \ 161 unsigned __r \ 162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \ 163 if (__r != CUDA_SUCCESS) \ 164 { \ 165 GOMP_PLUGIN_fatal (#FN " error: %s", \ 166 cuda_error (__r)); \ 167 } \ 168 } while (0) 169 170#define CUDA_CALL_NOCHECK(FN, ...) \ 171 CUDA_CALL_PREFIX FN (__VA_ARGS__) 172 173#define CUDA_CALL_EXISTS(FN) \ 174 CUDA_CALL_PREFIX FN 175 176static const char * 177cuda_error (CUresult r) 178{ 179 const char *fallback = "unknown cuda error"; 180 const char *desc; 181 182 if (!CUDA_CALL_EXISTS (cuGetErrorString)) 183 return fallback; 184 185 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc); 186 if (r == CUDA_SUCCESS) 187 return desc; 188 189 return fallback; 190} 191 192/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by 193 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */ 194static char cuda_driver_version_s[30]; 195 196static unsigned int instantiated_devices = 0; 197static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; 198 199/* NVPTX/CUDA specific definition of asynchronous queues. */ 200struct goacc_asyncqueue 201{ 202 CUstream cuda_stream; 203}; 204 205struct nvptx_callback 206{ 207 void (*fn) (void *); 208 void *ptr; 209 struct goacc_asyncqueue *aq; 210 struct nvptx_callback *next; 211}; 212 213/* Thread-specific data for PTX. */ 214 215struct nvptx_thread 216{ 217 /* We currently have this embedded inside the plugin because libgomp manages 218 devices through integer target_ids. This might be better if using an 219 opaque target-specific pointer directly from gomp_device_descr. */ 220 struct ptx_device *ptx_dev; 221}; 222 223/* Target data function launch information. */ 224 225struct targ_fn_launch 226{ 227 const char *fn; 228 unsigned short dim[GOMP_DIM_MAX]; 229}; 230 231/* Target PTX object information. */ 232 233struct targ_ptx_obj 234{ 235 const char *code; 236 size_t size; 237}; 238 239/* Target data image information. 
*/ 240 241typedef struct nvptx_tdata 242{ 243 const struct targ_ptx_obj *ptx_objs; 244 unsigned ptx_num; 245 246 const char *const *var_names; 247 unsigned var_num; 248 249 const struct targ_fn_launch *fn_descs; 250 unsigned fn_num; 251} nvptx_tdata_t; 252 253/* Descriptor of a loaded function. */ 254 255struct targ_fn_descriptor 256{ 257 CUfunction fn; 258 const struct targ_fn_launch *launch; 259 int regs_per_thread; 260 int max_threads_per_block; 261}; 262 263/* A loaded PTX image. */ 264struct ptx_image_data 265{ 266 const void *target_data; 267 CUmodule module; 268 269 struct targ_fn_descriptor *fns; /* Array of functions. */ 270 271 struct ptx_image_data *next; 272}; 273 274struct ptx_free_block 275{ 276 void *ptr; 277 struct ptx_free_block *next; 278}; 279 280struct ptx_device 281{ 282 CUcontext ctx; 283 bool ctx_shared; 284 CUdevice dev; 285 286 int ord; 287 bool overlap; 288 bool map; 289 bool concur; 290 bool mkern; 291 int mode; 292 int clock_khz; 293 int num_sms; 294 int regs_per_block; 295 int regs_per_sm; 296 int warp_size; 297 int max_threads_per_block; 298 int max_threads_per_multiprocessor; 299 int default_dims[GOMP_DIM_MAX]; 300 301 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */ 302 char name[256]; 303 304 struct ptx_image_data *images; /* Images loaded on device. */ 305 pthread_mutex_t image_lock; /* Lock for above list. */ 306 307 struct ptx_free_block *free_blocks; 308 pthread_mutex_t free_blocks_lock; 309 310 struct ptx_device *next; 311}; 312 313static struct ptx_device **ptx_devices; 314 315static inline struct nvptx_thread * 316nvptx_thread (void) 317{ 318 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); 319} 320 321/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK 322 should be locked on entry and remains locked on exit. 
*/ 323 324static bool 325nvptx_init (void) 326{ 327 int ndevs; 328 329 if (instantiated_devices != 0) 330 return true; 331 332 if (!init_cuda_lib ()) 333 return false; 334 335 CUDA_CALL (cuInit, 0); 336 337 int cuda_driver_version; 338 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version); 339 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s, 340 "CUDA Driver %u.%u", 341 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10); 342 343 CUDA_CALL (cuDeviceGetCount, &ndevs); 344 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *) 345 * ndevs); 346 347 return true; 348} 349 350/* Select the N'th PTX device for the current host thread. The device must 351 have been previously opened before calling this function. */ 352 353static bool 354nvptx_attach_host_thread_to_device (int n) 355{ 356 CUdevice dev; 357 CUresult r; 358 struct ptx_device *ptx_dev; 359 CUcontext thd_ctx; 360 361 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev); 362 if (r == CUDA_ERROR_NOT_PERMITTED) 363 { 364 /* Assume we're in a CUDA callback, just return true. */ 365 return true; 366 } 367 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) 368 { 369 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r)); 370 return false; 371 } 372 373 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n) 374 return true; 375 else 376 { 377 CUcontext old_ctx; 378 379 ptx_dev = ptx_devices[n]; 380 if (!ptx_dev) 381 { 382 GOMP_PLUGIN_error ("device %d not found", n); 383 return false; 384 } 385 386 CUDA_CALL (cuCtxGetCurrent, &thd_ctx); 387 388 /* We don't necessarily have a current context (e.g. if it has been 389 destroyed. Pop it if we do though. 
*/
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}

/* Open device N: allocate a struct ptx_device, ensure a CUDA context
   targeting the device (reusing the host thread's current context if it
   already targets device N, else creating a new one), and cache commonly
   used device attributes.  Returns the new descriptor, or NULL on error.  */

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  /* Reuse the thread's existing context if there is one ("shared"), so we
     know not to destroy it when the device is closed.  */
  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  /* NOTE(review): CU_DEVICE_ATTRIBUTE_INTEGRATED is stored into 'mkern';
     the attribute/field pairing looks mismatched -- confirm intended.  */
  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL,
cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
			 dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  /* The rest of the plugin assumes 32-lane warps (see nvptx_exec).  */
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }
  ptx_dev->warp_size = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
  ptx_dev->max_threads_per_block = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
  ptx_dev->max_threads_per_multiprocessor = pi;

  /* NOTE(review): 'async_engines' is queried but not used after the
     fallback assignment in the code visible here -- presumably historical;
     confirm before removing.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  /* Default launch dimensions are computed lazily at first launch.  */
  for (int i = 0; i != GOMP_DIM_MAX; i++)
    ptx_dev->default_dims[i] = 0;

  CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
		  dev);

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  ptx_dev->free_blocks = NULL;
  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);

  return ptx_dev;
}

/* Release all resources held by PTX_DEV: deferred free blocks, its mutexes,
   and (unless shared with the host thread) its CUDA context.  Returns true
   on success.  */

static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if
(!ptx_dev)
    return true;

  /* Release device memory whose freeing was deferred because it was
     requested from CUDA callback context (see nvptx_free).  */
  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
    {
      struct ptx_free_block *b_next = b->next;
      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
      free (b);
      b = b_next;
    }

  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
  pthread_mutex_destroy (&ptx_dev->image_lock);

  /* Only destroy the context if this plugin created it.  */
  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}

/* Return the number of CUDA devices, 0 if the CUDA library/driver is
   unavailable, or -1 on error.  */

static int
nvptx_get_num_devices (void)
{
  int n;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.
*/ 560 if (r != CUDA_SUCCESS) 561 { 562 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n", 563 cuda_error (r)); 564 return 0; 565 } 566 } 567 568 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n); 569 return n; 570} 571 572static void 573notify_var (const char *var_name, const char *env_var) 574{ 575 if (env_var == NULL) 576 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name); 577 else 578 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var); 579} 580 581static void 582process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o) 583{ 584 const char *var_name = "GOMP_NVPTX_JIT"; 585 const char *env_var = secure_getenv (var_name); 586 notify_var (var_name, env_var); 587 588 if (env_var == NULL) 589 return; 590 591 const char *c = env_var; 592 while (*c != '\0') 593 { 594 while (*c == ' ') 595 c++; 596 597 if (c[0] == '-' && c[1] == 'O' 598 && '0' <= c[2] && c[2] <= '4' 599 && (c[3] == '\0' || c[3] == ' ')) 600 { 601 *gomp_nvptx_o = c[2] - '0'; 602 c += 3; 603 continue; 604 } 605 606 GOMP_PLUGIN_error ("Error parsing %s", var_name); 607 break; 608 } 609} 610 611static bool 612link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, 613 unsigned num_objs) 614{ 615 CUjit_option opts[7]; 616 void *optvals[7]; 617 float elapsed = 0.0; 618 char elog[1024]; 619 char ilog[16384]; 620 CUlinkState linkstate; 621 CUresult r; 622 void *linkout; 623 size_t linkoutsize __attribute__ ((unused)); 624 625 opts[0] = CU_JIT_WALL_TIME; 626 optvals[0] = &elapsed; 627 628 opts[1] = CU_JIT_INFO_LOG_BUFFER; 629 optvals[1] = &ilog[0]; 630 631 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 632 optvals[2] = (void *) sizeof ilog; 633 634 opts[3] = CU_JIT_ERROR_LOG_BUFFER; 635 optvals[3] = &elog[0]; 636 637 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; 638 optvals[4] = (void *) sizeof elog; 639 640 opts[5] = CU_JIT_LOG_VERBOSE; 641 optvals[5] = (void *) 1; 642 643 static intptr_t gomp_nvptx_o = -1; 644 645 static bool init_done = false; 646 if (!init_done) 647 { 648 process_GOMP_NVPTX_JIT 
(&gomp_nvptx_o);
      init_done = true;
    }

  /* Only pass CU_JIT_OPTIMIZATION_LEVEL when GOMP_NVPTX_JIT requested
     one; otherwise the driver default is used.  */
  int nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  /* NOTE(review): LINKSTATE is not destroyed on this error path --
	     possible resource leak; confirm whether cuLinkDestroy should be
	     called before returning.  */
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

/* Launch the offloaded kernel described by the targ_fn_descriptor FN,
   passing device pointer DP as the single kernel argument, on STREAM.
   DIMS carries the gang/worker/vector launch geometry; zero entries are
   filled in from compiler-provided values, the GOMP_OPENACC_DIM
   environment variable, device limits, and (when available)
   cuOccupancyMaxPotentialBlockSize.  Dispatches OpenACC profiling events
   around the launch when profiling is active.  */

static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    unsigned *dims, void *targ_mem_desc,
	    CUdeviceptr dp, CUstream stream)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  int i;
  void *kargs[1];
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;

  function = targ_fn->fn;

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      if (!nvthd->ptx_dev->default_dims[0])
	{
	  /* First launch on this device: derive per-device default
	     dimensions from the cached hardware attributes.  */
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);

      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    /* Old drivers: fall back to the per-device defaults, clamped
	       to the kernel's own thread limit.  */
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       it.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);

	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
		workers = MAX (workers, 1);
		/* If we need a per-worker barrier ... .  */
		if (actual_vectors > 32)
		  /* Don't use more barriers than available.  */
		  workers = MIN (workers, 15);
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
	   " with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x and vector_length = y'"
	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
	   " x * y <= %d"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
    }

  /* Check if the accelerator has sufficient barrier resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient barrier resources to launch"
	   " '%s' with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x' on that offloaded"
	   " region or '-fopenacc-dim=:x:' where x <= 15"
	   "; "
	   "or, recompile the program with 'vector_length = 32' on that"
	   " offloaded region or '-fopenacc-dim=::32'"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
    }

  GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  /* Mapping of OpenACC dimensions onto the CUDA launch geometry:
     num_gangs -> nctaid.x, num_workers -> ntid.y, vector length -> ntid.x.  */

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
      enqueue_launch_event_info.launch_event.num_gangs
	= dims[GOMP_DIM_GANG];
      enqueue_launch_event_info.launch_event.num_workers
	= dims[GOMP_DIM_WORKER];
      enqueue_launch_event_info.launch_event.vector_length
	= dims[GOMP_DIM_VECTOR];

api_info->device_api = acc_device_api_cuda;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
					    api_info);
    }

  /* The single kernel argument is the pointer to the block of device
     copies of the mapped data.  */
  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, stream, kargs, 0);

  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
					    api_info);
    }

  GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);
}

void * openacc_get_current_cuda_context (void);

/* Dispatch an acc_ev_alloc profiling event for the allocation of S bytes
   of device memory at DP, on behalf of thread THR.  */

static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_alloc;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = s;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = dp;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

/* Allocate S bytes of device memory; returns the device pointer, or NULL
   on failure.  Emits an acc_ev_alloc profiling event when profiling is
   active.  */

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;

  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
goacc_profiling_acc_ev_alloc (thr, (void *) d, s);

  return (void *) d;
}

/* Dispatch an acc_ev_free profiling event for the release of the device
   memory at P, on behalf of thread THR.  */

static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_free;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = -1;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = p;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

/* Free the device memory at P belonging to PTX_DEV, after validating that
   P is the base of its allocation.  When called from CUDA callback context
   (where driver API calls are forbidden), the free is instead queued on
   PTX_DEV->free_blocks for later processing.  Returns true on success.  */

static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
  CUdeviceptr pb;
  size_t ps;

  CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
				  (CUdeviceptr) p);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* We assume that this error indicates we are in a CUDA callback context,
	 where all CUDA calls are not allowed (see cuStreamAddCallback
	 documentation for description).  Arrange to free this piece of device
	 memory later.
*/
      struct ptx_free_block *n
	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
      n->ptr = p;
      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
      n->next = ptx_dev->free_blocks;
      ptx_dev->free_blocks = n;
      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
      return true;
    }
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
      return false;
    }
  /* Refuse to free a pointer that is not the base of its allocation.  */
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, p);

  return true;
}

/* Return a pointer to the current thread's CUdevice, or NULL if the
   thread is not attached to a PTX device.  */

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

/* Return the current thread's CUDA context, or NULL if the thread is not
   attached to a PTX device.  */

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

/* Plugin entry points.
*/ 1098 1099const char * 1100GOMP_OFFLOAD_get_name (void) 1101{ 1102 return "nvptx"; 1103} 1104 1105unsigned int 1106GOMP_OFFLOAD_get_caps (void) 1107{ 1108 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400; 1109} 1110 1111int 1112GOMP_OFFLOAD_get_type (void) 1113{ 1114 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX; 1115} 1116 1117int 1118GOMP_OFFLOAD_get_num_devices (void) 1119{ 1120 return nvptx_get_num_devices (); 1121} 1122 1123bool 1124GOMP_OFFLOAD_init_device (int n) 1125{ 1126 struct ptx_device *dev; 1127 1128 pthread_mutex_lock (&ptx_dev_lock); 1129 1130 if (!nvptx_init () || ptx_devices[n] != NULL) 1131 { 1132 pthread_mutex_unlock (&ptx_dev_lock); 1133 return false; 1134 } 1135 1136 dev = nvptx_open_device (n); 1137 if (dev) 1138 { 1139 ptx_devices[n] = dev; 1140 instantiated_devices++; 1141 } 1142 1143 pthread_mutex_unlock (&ptx_dev_lock); 1144 1145 return dev != NULL; 1146} 1147 1148bool 1149GOMP_OFFLOAD_fini_device (int n) 1150{ 1151 pthread_mutex_lock (&ptx_dev_lock); 1152 1153 if (ptx_devices[n] != NULL) 1154 { 1155 if (!nvptx_attach_host_thread_to_device (n) 1156 || !nvptx_close_device (ptx_devices[n])) 1157 { 1158 pthread_mutex_unlock (&ptx_dev_lock); 1159 return false; 1160 } 1161 ptx_devices[n] = NULL; 1162 instantiated_devices--; 1163 } 1164 1165 if (instantiated_devices == 0) 1166 { 1167 free (ptx_devices); 1168 ptx_devices = NULL; 1169 } 1170 1171 pthread_mutex_unlock (&ptx_dev_lock); 1172 return true; 1173} 1174 1175/* Return the libgomp version number we're compatible with. There is 1176 no requirement for cross-version compatibility. */ 1177 1178unsigned 1179GOMP_OFFLOAD_version (void) 1180{ 1181 return GOMP_VERSION; 1182} 1183 1184/* Initialize __nvptx_clocktick, if present in MODULE. 
*/

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  /* An image need not reference the clock; absence is not an error.  */
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  /* Seconds per clock tick: clock_khz is in kHz, hence 1e-3 / kHz.  */
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  Returns the number of
   entries written to *TARGET_TABLE (functions first, then variables), or
   -1 on failure.
   NOTE(review): on a CUDA_CALL_ERET failure inside the loops below,
   targ_tbl/targ_fns/new_image are leaked (new_image is already chained on
   dev->images) — confirm whether callers treat -1 as fatal before fixing.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  /* Publish the image on the device's list under the image lock.  */
  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  /* Resolve each kernel and record its register / max-thread attributes,
     used later to size launches.  */
  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      /* Function entries carry a descriptor pointer, not a real address
	 range, hence the dummy one-byte extent.  */
      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  /* Resolve each global variable's device address and size.  */
  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries;
}

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.
*/ 1300 1301bool 1302GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data) 1303{ 1304 struct ptx_image_data *image, **prev_p; 1305 struct ptx_device *dev = ptx_devices[ord]; 1306 1307 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX) 1308 { 1309 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin" 1310 " (expected %u, received %u)", 1311 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version)); 1312 return false; 1313 } 1314 1315 bool ret = true; 1316 pthread_mutex_lock (&dev->image_lock); 1317 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next) 1318 if (image->target_data == target_data) 1319 { 1320 *prev_p = image->next; 1321 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS) 1322 ret = false; 1323 free (image->fns); 1324 free (image); 1325 break; 1326 } 1327 pthread_mutex_unlock (&dev->image_lock); 1328 return ret; 1329} 1330 1331void * 1332GOMP_OFFLOAD_alloc (int ord, size_t size) 1333{ 1334 if (!nvptx_attach_host_thread_to_device (ord)) 1335 return NULL; 1336 1337 struct ptx_device *ptx_dev = ptx_devices[ord]; 1338 struct ptx_free_block *blocks, *tmp; 1339 1340 pthread_mutex_lock (&ptx_dev->free_blocks_lock); 1341 blocks = ptx_dev->free_blocks; 1342 ptx_dev->free_blocks = NULL; 1343 pthread_mutex_unlock (&ptx_dev->free_blocks_lock); 1344 1345 while (blocks) 1346 { 1347 tmp = blocks->next; 1348 nvptx_free (blocks->ptr, ptx_dev); 1349 free (blocks); 1350 blocks = tmp; 1351 } 1352 1353 return nvptx_alloc (size); 1354} 1355 1356bool 1357GOMP_OFFLOAD_free (int ord, void *ptr) 1358{ 1359 return (nvptx_attach_host_thread_to_device (ord) 1360 && nvptx_free (ptr, ptx_devices[ord])); 1361} 1362 1363void 1364GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum, 1365 void **hostaddrs, void **devaddrs, 1366 unsigned *dims, void *targ_mem_desc) 1367{ 1368 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); 1369 1370 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); 
1371 acc_prof_info *prof_info = thr->prof_info; 1372 acc_event_info data_event_info; 1373 acc_api_info *api_info = thr->api_info; 1374 bool profiling_p = __builtin_expect (prof_info != NULL, false); 1375 1376 void **hp = NULL; 1377 CUdeviceptr dp = 0; 1378 1379 if (mapnum > 0) 1380 { 1381 size_t s = mapnum * sizeof (void *); 1382 hp = alloca (s); 1383 for (int i = 0; i < mapnum; i++) 1384 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); 1385 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s); 1386 if (profiling_p) 1387 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s); 1388 } 1389 1390 /* Copy the (device) pointers to arguments to the device (dp and hp might in 1391 fact have the same value on a unified-memory system). */ 1392 if (mapnum > 0) 1393 { 1394 if (profiling_p) 1395 { 1396 prof_info->event_type = acc_ev_enqueue_upload_start; 1397 1398 data_event_info.data_event.event_type = prof_info->event_type; 1399 data_event_info.data_event.valid_bytes 1400 = _ACC_DATA_EVENT_INFO_VALID_BYTES; 1401 data_event_info.data_event.parent_construct 1402 = acc_construct_parallel; 1403 data_event_info.data_event.implicit = 1; /* Always implicit. 
*/ 1404 data_event_info.data_event.tool_info = NULL; 1405 data_event_info.data_event.var_name = NULL; 1406 data_event_info.data_event.bytes = mapnum * sizeof (void *); 1407 data_event_info.data_event.host_ptr = hp; 1408 data_event_info.data_event.device_ptr = (const void *) dp; 1409 1410 api_info->device_api = acc_device_api_cuda; 1411 1412 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, 1413 api_info); 1414 } 1415 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp, 1416 mapnum * sizeof (void *)); 1417 if (profiling_p) 1418 { 1419 prof_info->event_type = acc_ev_enqueue_upload_end; 1420 data_event_info.data_event.event_type = prof_info->event_type; 1421 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, 1422 api_info); 1423 } 1424 } 1425 1426 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, 1427 dp, NULL); 1428 1429 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL); 1430 const char *maybe_abort_msg = "(perhaps abort was called)"; 1431 if (r == CUDA_ERROR_LAUNCH_FAILED) 1432 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r), 1433 maybe_abort_msg); 1434 else if (r != CUDA_SUCCESS) 1435 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); 1436 1437 CUDA_CALL_ASSERT (cuMemFree, dp); 1438 if (profiling_p) 1439 goacc_profiling_acc_ev_free (thr, (void *) dp); 1440} 1441 1442static void 1443cuda_free_argmem (void *ptr) 1444{ 1445 void **block = (void **) ptr; 1446 nvptx_free (block[0], (struct ptx_device *) block[1]); 1447 free (block); 1448} 1449 1450void 1451GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum, 1452 void **hostaddrs, void **devaddrs, 1453 unsigned *dims, void *targ_mem_desc, 1454 struct goacc_asyncqueue *aq) 1455{ 1456 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); 1457 1458 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); 1459 acc_prof_info *prof_info = thr->prof_info; 1460 acc_event_info data_event_info; 1461 
acc_api_info *api_info = thr->api_info; 1462 bool profiling_p = __builtin_expect (prof_info != NULL, false); 1463 1464 void **hp = NULL; 1465 CUdeviceptr dp = 0; 1466 void **block = NULL; 1467 1468 if (mapnum > 0) 1469 { 1470 size_t s = mapnum * sizeof (void *); 1471 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s); 1472 hp = block + 2; 1473 for (int i = 0; i < mapnum; i++) 1474 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); 1475 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s); 1476 if (profiling_p) 1477 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s); 1478 } 1479 1480 /* Copy the (device) pointers to arguments to the device (dp and hp might in 1481 fact have the same value on a unified-memory system). */ 1482 if (mapnum > 0) 1483 { 1484 if (profiling_p) 1485 { 1486 prof_info->event_type = acc_ev_enqueue_upload_start; 1487 1488 data_event_info.data_event.event_type = prof_info->event_type; 1489 data_event_info.data_event.valid_bytes 1490 = _ACC_DATA_EVENT_INFO_VALID_BYTES; 1491 data_event_info.data_event.parent_construct 1492 = acc_construct_parallel; 1493 data_event_info.data_event.implicit = 1; /* Always implicit. 
*/ 1494 data_event_info.data_event.tool_info = NULL; 1495 data_event_info.data_event.var_name = NULL; 1496 data_event_info.data_event.bytes = mapnum * sizeof (void *); 1497 data_event_info.data_event.host_ptr = hp; 1498 data_event_info.data_event.device_ptr = (const void *) dp; 1499 1500 api_info->device_api = acc_device_api_cuda; 1501 1502 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, 1503 api_info); 1504 } 1505 1506 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp, 1507 mapnum * sizeof (void *), aq->cuda_stream); 1508 block[0] = (void *) dp; 1509 1510 struct nvptx_thread *nvthd = 1511 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); 1512 block[1] = (void *) nvthd->ptx_dev; 1513 1514 if (profiling_p) 1515 { 1516 prof_info->event_type = acc_ev_enqueue_upload_end; 1517 data_event_info.data_event.event_type = prof_info->event_type; 1518 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, 1519 api_info); 1520 } 1521 } 1522 1523 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, 1524 dp, aq->cuda_stream); 1525 1526 if (mapnum > 0) 1527 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block); 1528} 1529 1530void * 1531GOMP_OFFLOAD_openacc_create_thread_data (int ord) 1532{ 1533 struct ptx_device *ptx_dev; 1534 struct nvptx_thread *nvthd 1535 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread)); 1536 CUcontext thd_ctx; 1537 1538 ptx_dev = ptx_devices[ord]; 1539 1540 assert (ptx_dev); 1541 1542 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx); 1543 1544 assert (ptx_dev->ctx); 1545 1546 if (!thd_ctx) 1547 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx); 1548 1549 nvthd->ptx_dev = ptx_dev; 1550 1551 return (void *) nvthd; 1552} 1553 1554void 1555GOMP_OFFLOAD_openacc_destroy_thread_data (void *data) 1556{ 1557 free (data); 1558} 1559 1560void * 1561GOMP_OFFLOAD_openacc_cuda_get_current_device (void) 1562{ 1563 return nvptx_get_current_cuda_device (); 1564} 1565 1566void * 
1567GOMP_OFFLOAD_openacc_cuda_get_current_context (void) 1568{ 1569 return nvptx_get_current_cuda_context (); 1570} 1571 1572/* This returns a CUstream. */ 1573void * 1574GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq) 1575{ 1576 return (void *) aq->cuda_stream; 1577} 1578 1579/* This takes a CUstream. */ 1580int 1581GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream) 1582{ 1583 if (aq->cuda_stream) 1584 { 1585 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream); 1586 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream); 1587 } 1588 1589 aq->cuda_stream = (CUstream) stream; 1590 return 1; 1591} 1592 1593struct goacc_asyncqueue * 1594GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused))) 1595{ 1596 CUstream stream = NULL; 1597 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT); 1598 1599 struct goacc_asyncqueue *aq 1600 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue)); 1601 aq->cuda_stream = stream; 1602 return aq; 1603} 1604 1605bool 1606GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq) 1607{ 1608 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream); 1609 free (aq); 1610 return true; 1611} 1612 1613int 1614GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) 1615{ 1616 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream); 1617 if (r == CUDA_SUCCESS) 1618 return 1; 1619 if (r == CUDA_ERROR_NOT_READY) 1620 return 0; 1621 1622 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r)); 1623 return -1; 1624} 1625 1626bool 1627GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq) 1628{ 1629 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream); 1630 return true; 1631} 1632 1633bool 1634GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1, 1635 struct goacc_asyncqueue *aq2) 1636{ 1637 CUevent e; 1638 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING); 1639 CUDA_CALL_ERET (false, 
cuEventRecord, e, aq1->cuda_stream); 1640 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0); 1641 return true; 1642} 1643 1644static void 1645cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr) 1646{ 1647 if (res != CUDA_SUCCESS) 1648 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res)); 1649 struct nvptx_callback *cb = (struct nvptx_callback *) ptr; 1650 cb->fn (cb->ptr); 1651 free (ptr); 1652} 1653 1654void 1655GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, 1656 void (*callback_fn)(void *), 1657 void *userptr) 1658{ 1659 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b)); 1660 b->fn = callback_fn; 1661 b->ptr = userptr; 1662 b->aq = aq; 1663 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream, 1664 cuda_callback_wrapper, (void *) b, 0); 1665} 1666 1667static bool 1668cuda_memcpy_sanity_check (const void *h, const void *d, size_t s) 1669{ 1670 CUdeviceptr pb; 1671 size_t ps; 1672 if (!s) 1673 return true; 1674 if (!d) 1675 { 1676 GOMP_PLUGIN_error ("invalid device address"); 1677 return false; 1678 } 1679 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d); 1680 if (!pb) 1681 { 1682 GOMP_PLUGIN_error ("invalid device address"); 1683 return false; 1684 } 1685 if (!h) 1686 { 1687 GOMP_PLUGIN_error ("invalid host address"); 1688 return false; 1689 } 1690 if (d == h) 1691 { 1692 GOMP_PLUGIN_error ("invalid host or device address"); 1693 return false; 1694 } 1695 if ((void *)(d + s) > (void *)(pb + ps)) 1696 { 1697 GOMP_PLUGIN_error ("invalid size"); 1698 return false; 1699 } 1700 return true; 1701} 1702 1703bool 1704GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) 1705{ 1706 if (!nvptx_attach_host_thread_to_device (ord) 1707 || !cuda_memcpy_sanity_check (src, dst, n)) 1708 return false; 1709 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n); 1710 return true; 1711} 1712 1713bool 1714GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) 1715{ 
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
  return true;
}

/* Copy N bytes between two device addresses on device ORD.
   NOTE(review): unlike the other copy entry points, this neither attaches
   the host thread to ORD nor sanity-checks the addresses, and it issues an
   async copy on the NULL (default) stream without synchronizing here —
   presumably callers rely on later stream synchronization; confirm.  */

bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
  return true;
}

/* Asynchronous host-to-device copy of N bytes on AQ's stream.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}

/* Asynchronous device-to-host copy of N bytes on AQ's stream.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}

/* Query property PROP of device N for acc_get_property.  Returns a
   zero-initialized value for an invalid/unopened device or on error.
   NOTE(review): the CUDA_CALL_ERET paths below return while still holding
   ptx_dev_lock, leaving the lock taken on a driver error — confirm and
   restructure with a single unlock-and-return path.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
	size_t total_mem;

	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
	propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
	size_t total_mem;
	size_t free_mem;
	CUdevice ctxdev;

	/* cuMemGetInfo reports for the *current* context, so make sure a
	   context for PTX_DEV is current: reuse the thread's context when
	   it already belongs to this device, else push the device's own
	   context, else create a temporary one.  */
	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
	if (ptx_dev->dev == ctxdev)
	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	else if (ptx_dev->ctx)
	  {
	    CUcontext old_ctx;

	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
	  }
	else
	  {
	    CUcontext new_ctx;

	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
			    ptx_dev->dev);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
	  }
	propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}

/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that number of warps does not exceed CUDA limits as well as GCC's
   own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in NVPTX backend
     and libgcc, which matches documented limit of all GPUs as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     Actual limit, which may be lower, can be queried with "occupancy control"
     driver interface (since CUDA 6.0).
  */
  /* NOTE(review): if the driver ever reported 0 for
     CU_FUNC_ATTRIBUTE_NUM_REGS, regs_per_block would be 0 and this would
     divide by zero — confirm the attribute is always >= 1 for launchable
     kernels.  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each SIZE bytes.  */

static void *
nvptx_stacks_alloc (size_t size, int num)
{
  CUdeviceptr stacks;
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) stacks;
}

/* Release storage previously allocated by nvptx_stacks_alloc.  */

static void
nvptx_stacks_free (void *p, int num)
{
  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

/* Run an OpenMP target region: decode the num-teams / thread-limit launch
   arguments from ARGS, clamp them via nvptx_adjust_launch_bounds, allocate
   per-warp soft stacks, launch TGT_FN with TGT_VARS via the
   CU_LAUNCH_PARAM_BUFFER mechanism and wait for completion.  */

void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  /* ARGS is an id/value sequence terminated by a NULL entry; only
     device-agnostic (DEVICE_ALL) entries are honored here.  */
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      /* TEAMS and THREADS below are ints; clamp the decoded value.  */
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();
  /* One soft stack per warp: TEAMS blocks of THREADS warps each.  */
  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  /* Block x-dimension is fixed at 32 (one warp wide); THREADS is the
     number of warps per block (block y-dimension).  */
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
  nvptx_stacks_free (stacks, teams * threads);
}

/* TODO: Implement GOMP_OFFLOAD_async_run. */