1/* Plugin for AMD GCN execution. 2 3 Copyright (C) 2013-2020 Free Software Foundation, Inc. 4 5 Contributed by Mentor Embedded 6 7 This file is part of the GNU Offloading and Multi Processing Library 8 (libgomp). 9 10 Libgomp is free software; you can redistribute it and/or modify it 11 under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 3, or (at your option) 13 any later version. 14 15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY 16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for 18 more details. 19 20 Under Section 7 of GPL version 3, you are granted additional 21 permissions described in the GCC Runtime Library Exception, version 22 3.1, as published by the Free Software Foundation. 23 24 You should have received a copy of the GNU General Public License and 25 a copy of the GCC Runtime Library Exception along with this program; 26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 27 <http://www.gnu.org/licenses/>. */ 28 29/* {{{ Includes and defines */ 30 31#include "config.h" 32#include <stdio.h> 33#include <stdlib.h> 34#include <string.h> 35#include <pthread.h> 36#include <inttypes.h> 37#include <stdbool.h> 38#include <limits.h> 39#include <hsa.h> 40#include <dlfcn.h> 41#include <signal.h> 42#include "libgomp-plugin.h" 43#include "gomp-constants.h" 44#include <elf.h> 45#include "oacc-plugin.h" 46#include "oacc-int.h" 47#include <assert.h> 48 49/* Additional definitions not in HSA 1.1. 50 FIXME: this needs to be updated in hsa.h for upstream, but the only source 51 right now is the ROCr source which may cause license issues. */ 52#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002 53 54/* These probably won't be in elf.h for a while. 
/* These probably won't be in elf.h for a while.  */
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
/* NOTE(review): a lower-case object-like macro named "reserved" will silently
   rewrite any later identifier of that name; kept as-is to mirror the
   relocation-number table (value 12 is unused), but consider renaming.  */
#define reserved		12
#define R_AMDGPU_RELATIVE64	13	/* B + A  */

/* GCN specific definitions for asynchronous queues.  */

/* Number of entries in each agent's asynchronous execution queue.  */
#define ASYNC_QUEUE_SIZE 64
/* If true, drain async queues synchronously (debugging aid).  */
#define DRAIN_QUEUE_SYNCHRONOUS_P false
/* Tracing switches for the async-queue machinery (0 = off).  */
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  /* Only trust the environment when real and effective user/group IDs match,
     i.e. we are not running set-uid or set-gid.  */
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
/* No way to check IDs at all; fall back to plain getenv.  */
#define secure_getenv getenv
#endif
#endif

/* }}} */
/* {{{ Types */
/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As an HSA runtime is dlopened, following structure defines function
   pointers utilized by the HSA plug-in.  Each member mirrors the HSA API
   function of the same name (minus the "_fn" suffix).  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
				      hsa_isa_info_t attribute,
				      uint32_t index,
				      void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
					   const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback)(hsa_executable_t executable,
			      hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};

/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  /* The agent the kernel is dispatched on.  */
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};
/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
    Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  /* Kernel argument block to pass at launch time.  */
  void *vars;
  /* Grid/work-group geometry for the deferred launch.  */
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  /* If true, SRC is freed once the copy has been performed.  */
  bool free_src;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};
/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  /* Worker thread that drains this queue, plus the synchronization state
     it shares with producers.  */
  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  /* Circular buffer of pending entries: QUEUE_FIRST indexes the oldest
     entry, QUEUE_N counts how many are pending.  */
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;	/* NOTE(review): spelling matches mkoffload;
			   presumably the VGPR count — confirm before
			   renaming, as mkoffload emits this layout.  */
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
} EF_AMDGPU_MACH;

/* Mask selecting the EF_AMDGPU_MACH bits within the ELF e_flags field.  */
const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this kernel.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};
/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel takes place.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  /* Presumably set once the init_array kernel has been run — confirm at the
     point of use.  */
  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};

/* }}} */
/* {{{ Global variables */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
/* Flag to decide whether to print to stderr information about what is going
   on.  Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Flag to locate HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should support also CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}} */
/* {{{ Debug & Diagnostic */

/* Print a message to stderr if GCN_DEBUG value is set to true.

   Fix: the expansion must not end in a semicolon — the caller supplies it —
   otherwise "if (c) DEBUG_PRINT (...); else ..." expands to two statements
   and orphans the "else".  This also makes it consistent with DEBUG_FLUSH
   and DEBUG_LOG below.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH()				\
  do {						\
    if (debug)					\
      fflush (stderr);				\
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...)			\
  do						\
    {						\
      DEBUG_PRINT (prefix);			\
      DEBUG_PRINT (__VA_ARGS__);		\
      DEBUG_FLUSH ();				\
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
*/ 651 652static void 653hsa_warn (const char *str, hsa_status_t status) 654{ 655 if (!debug) 656 return; 657 658 const char *hsa_error_msg = "[unknown]"; 659 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); 660 661 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str, 662 hsa_error_msg); 663} 664 665/* Report a fatal error STR together with the HSA error corresponding to STATUS 666 and terminate execution of the current process. */ 667 668static void 669hsa_fatal (const char *str, hsa_status_t status) 670{ 671 const char *hsa_error_msg = "[unknown]"; 672 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); 673 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str, 674 hsa_error_msg); 675} 676 677/* Like hsa_fatal, except only report error message, and return FALSE 678 for propagating error processing to outside of plugin. */ 679 680static bool 681hsa_error (const char *str, hsa_status_t status) 682{ 683 const char *hsa_error_msg = "[unknown]"; 684 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); 685 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str, 686 hsa_error_msg); 687 return false; 688} 689 690/* Dump information about the available hardware. 
*/ 691 692static void 693dump_hsa_system_info (void) 694{ 695 hsa_status_t status; 696 697 hsa_endianness_t endianness; 698 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS, 699 &endianness); 700 if (status == HSA_STATUS_SUCCESS) 701 switch (endianness) 702 { 703 case HSA_ENDIANNESS_LITTLE: 704 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n"); 705 break; 706 case HSA_ENDIANNESS_BIG: 707 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n"); 708 break; 709 default: 710 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n"); 711 } 712 else 713 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n"); 714 715 uint8_t extensions[128]; 716 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS, 717 &extensions); 718 if (status == HSA_STATUS_SUCCESS) 719 { 720 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES)) 721 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n"); 722 } 723 else 724 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n"); 725} 726 727/* Dump information about the available hardware. */ 728 729static void 730dump_machine_model (hsa_machine_model_t machine_model, const char *s) 731{ 732 switch (machine_model) 733 { 734 case HSA_MACHINE_MODEL_SMALL: 735 GCN_DEBUG ("%s: SMALL\n", s); 736 break; 737 case HSA_MACHINE_MODEL_LARGE: 738 GCN_DEBUG ("%s: LARGE\n", s); 739 break; 740 default: 741 GCN_WARNING ("%s: UNKNOWN\n", s); 742 break; 743 } 744} 745 746/* Dump information about the available hardware. */ 747 748static void 749dump_profile (hsa_profile_t profile, const char *s) 750{ 751 switch (profile) 752 { 753 case HSA_PROFILE_FULL: 754 GCN_DEBUG ("%s: FULL\n", s); 755 break; 756 case HSA_PROFILE_BASE: 757 GCN_DEBUG ("%s: BASE\n", s); 758 break; 759 default: 760 GCN_WARNING ("%s: UNKNOWN\n", s); 761 break; 762 } 763} 764 765/* Dump information about a device memory region. 
*/ 766 767static hsa_status_t 768dump_hsa_region (hsa_region_t region, void *data __attribute__((unused))) 769{ 770 hsa_status_t status; 771 772 hsa_region_segment_t segment; 773 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, 774 &segment); 775 if (status == HSA_STATUS_SUCCESS) 776 { 777 if (segment == HSA_REGION_SEGMENT_GLOBAL) 778 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n"); 779 else if (segment == HSA_REGION_SEGMENT_READONLY) 780 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n"); 781 else if (segment == HSA_REGION_SEGMENT_PRIVATE) 782 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n"); 783 else if (segment == HSA_REGION_SEGMENT_GROUP) 784 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n"); 785 else 786 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n"); 787 } 788 else 789 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n"); 790 791 if (segment == HSA_REGION_SEGMENT_GLOBAL) 792 { 793 uint32_t flags; 794 status 795 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS, 796 &flags); 797 if (status == HSA_STATUS_SUCCESS) 798 { 799 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) 800 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n"); 801 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) 802 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n"); 803 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) 804 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n"); 805 } 806 else 807 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n"); 808 } 809 810 size_t size; 811 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size); 812 if (status == HSA_STATUS_SUCCESS) 813 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size); 814 else 815 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n"); 816 817 status 818 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE, 819 &size); 820 if (status == HSA_STATUS_SUCCESS) 821 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size); 822 else 823 GCN_WARNING 
("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n"); 824 825 bool alloc_allowed; 826 status 827 = hsa_fns.hsa_region_get_info_fn (region, 828 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED, 829 &alloc_allowed); 830 if (status == HSA_STATUS_SUCCESS) 831 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed); 832 else 833 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n"); 834 835 if (status != HSA_STATUS_SUCCESS || !alloc_allowed) 836 return HSA_STATUS_SUCCESS; 837 838 status 839 = hsa_fns.hsa_region_get_info_fn (region, 840 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE, 841 &size); 842 if (status == HSA_STATUS_SUCCESS) 843 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size); 844 else 845 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n"); 846 847 size_t align; 848 status 849 = hsa_fns.hsa_region_get_info_fn (region, 850 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT, 851 &align); 852 if (status == HSA_STATUS_SUCCESS) 853 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align); 854 else 855 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n"); 856 857 return HSA_STATUS_SUCCESS; 858} 859 860/* Dump information about all the device memory regions. */ 861 862static void 863dump_hsa_regions (hsa_agent_t agent) 864{ 865 hsa_status_t status; 866 status = hsa_fns.hsa_agent_iterate_regions_fn (agent, 867 dump_hsa_region, 868 NULL); 869 if (status != HSA_STATUS_SUCCESS) 870 hsa_error ("Dumping hsa regions failed", status); 871} 872 873/* Dump information about the available devices. 
/* Dump information about the available devices.  Callback for
   hsa_iterate_agents; DATA is unused.  Always returns HSA_STATUS_SUCCESS
   so that iteration continues over all agents.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  /* NOTE(review): assumes the NAME/VENDOR_NAME attributes fit 64 bytes,
     as their struct agent_info counterparts do — confirm against the HSA
     specification.  */
  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  /* AMD vendor extension attribute, not part of HSA 1.1 (see the define
     near the top of this file).  */
  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  /* NOTE(review): the HSA spec types WORKGROUP_MAX_DIM as uint16_t[3];
     reading it into a single uint32_t prints a packed/partial value —
     confirm this is intentional (debug output only).  */
  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
*/ 996 997static hsa_status_t 998dump_executable_symbol (hsa_executable_t executable, 999 hsa_executable_symbol_t symbol, 1000 void *data __attribute__((unused))) 1001{ 1002 char *name = get_executable_symbol_name (symbol); 1003 1004 if (name) 1005 { 1006 GCN_DEBUG ("executable symbol: %s\n", name); 1007 free (name); 1008 } 1009 1010 return HSA_STATUS_SUCCESS; 1011} 1012 1013/* Dump all global symbol in an executable. */ 1014 1015static void 1016dump_executable_symbols (hsa_executable_t executable) 1017{ 1018 hsa_status_t status; 1019 status 1020 = hsa_fns.hsa_executable_iterate_symbols_fn (executable, 1021 dump_executable_symbol, 1022 NULL); 1023 if (status != HSA_STATUS_SUCCESS) 1024 hsa_fatal ("Could not dump HSA executable symbols", status); 1025} 1026 1027/* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */ 1028 1029static void 1030print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent) 1031{ 1032 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address; 1033 1034 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch); 1035 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue); 1036 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs); 1037 fprintf (stderr, "%*sheap address: %p\n", indent, "", 1038 (void*)kernargs->heap_ptr); 1039 fprintf (stderr, "%*sarena address: %p\n", indent, "", 1040 (void*)kernargs->arena_ptr); 1041 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object); 1042 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "", 1043 dispatch->private_segment_size); 1044 fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "", 1045 dispatch->group_segment_size); 1046 fprintf (stderr, "\n"); 1047} 1048 1049/* }}} */ 1050/* {{{ Utility functions */ 1051 1052/* Cast the thread local storage to gcn_thread. 
*/ 1053 1054static inline struct gcn_thread * 1055gcn_thread (void) 1056{ 1057 return (struct gcn_thread *) GOMP_PLUGIN_acc_thread (); 1058} 1059 1060/* Initialize debug and suppress_host_fallback according to the environment. */ 1061 1062static void 1063init_environment_variables (void) 1064{ 1065 if (secure_getenv ("GCN_DEBUG")) 1066 debug = true; 1067 else 1068 debug = false; 1069 1070 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK")) 1071 suppress_host_fallback = true; 1072 else 1073 suppress_host_fallback = false; 1074 1075 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB"); 1076 if (hsa_runtime_lib == NULL) 1077 hsa_runtime_lib = "libhsa-runtime64.so"; 1078 1079 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES"); 1080 1081 const char *x = secure_getenv ("GCN_NUM_TEAMS"); 1082 if (!x) 1083 x = secure_getenv ("GCN_NUM_GANGS"); 1084 if (x) 1085 override_x_dim = atoi (x); 1086 1087 const char *z = secure_getenv ("GCN_NUM_THREADS"); 1088 if (!z) 1089 z = secure_getenv ("GCN_NUM_WORKERS"); 1090 if (z) 1091 override_z_dim = atoi (z); 1092 1093 const char *heap = secure_getenv ("GCN_HEAP_SIZE"); 1094 if (heap) 1095 { 1096 size_t tmp = atol (heap); 1097 if (tmp) 1098 gcn_kernel_heap_size = tmp; 1099 } 1100} 1101 1102/* Return malloc'd string with name of SYMBOL. 
*/ 1103 1104static char * 1105get_executable_symbol_name (hsa_executable_symbol_t symbol) 1106{ 1107 hsa_status_t status; 1108 char *res; 1109 uint32_t len; 1110 const hsa_executable_symbol_info_t info_name_length 1111 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH; 1112 1113 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length, 1114 &len); 1115 if (status != HSA_STATUS_SUCCESS) 1116 { 1117 hsa_error ("Could not get length of symbol name", status); 1118 return NULL; 1119 } 1120 1121 res = GOMP_PLUGIN_malloc (len + 1); 1122 1123 const hsa_executable_symbol_info_t info_name 1124 = HSA_EXECUTABLE_SYMBOL_INFO_NAME; 1125 1126 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res); 1127 1128 if (status != HSA_STATUS_SUCCESS) 1129 { 1130 hsa_error ("Could not get symbol name", status); 1131 free (res); 1132 return NULL; 1133 } 1134 1135 res[len] = '\0'; 1136 1137 return res; 1138} 1139 1140/* Helper function for find_executable_symbol. */ 1141 1142static hsa_status_t 1143find_executable_symbol_1 (hsa_executable_t executable, 1144 hsa_executable_symbol_t symbol, 1145 void *data) 1146{ 1147 hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data; 1148 *res = symbol; 1149 return HSA_STATUS_INFO_BREAK; 1150} 1151 1152/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true. If not 1153 found, return false. */ 1154 1155static bool 1156find_executable_symbol (hsa_executable_t executable, 1157 hsa_executable_symbol_t *symbol) 1158{ 1159 hsa_status_t status; 1160 1161 status 1162 = hsa_fns.hsa_executable_iterate_symbols_fn (executable, 1163 find_executable_symbol_1, 1164 symbol); 1165 if (status != HSA_STATUS_INFO_BREAK) 1166 { 1167 hsa_error ("Could not find executable symbol", status); 1168 return false; 1169 } 1170 1171 return true; 1172} 1173 1174/* Get the number of GPU Compute Units. 
  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  /* This is an AMD vendor extension query, so it may legitimately fail
     on runtimes that do not implement it.  */
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more inteligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  /* INPUT is a NULL-terminated list of tagged values; each entry either
     encodes its value in the high bits of the tag word or, when
     GOMP_TARGET_ARG_SUBSEQUENT_PARAM is set, in the following word.  */
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      /* Clamp to int range; the dimension fields below are 32-bit.  */
      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  /* *INPUT now points at an explicit launch-attributes struct;
	     handled after the loop.  */
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      /* NOTE(review): the GFX900 workaround below looks like a hardware
	 erratum mitigation; only applied when neither the program nor the
	 environment chose a thread count.  */
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1; /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      /* No dimensions specified anywhere: launch a single work item.  */
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  /* Explicit launch attributes were passed; validate and use them.  */
  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      /* A zero-sized grid dimension means there is nothing to run.  */
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  /* The group can never be larger than the grid it tiles.  */
  if (group > grid)
    group = grid;
  return group;
}

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.
  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.
   Returns false if the library or any required symbol cannot be found;
   the caller treats that as "no GCN devices available".  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  /* The handle is deliberately kept open for the life of the process;
     there is no matching dlclose.  */
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}

/* Return true if the agent is a GPU and can accept of concurrent submissions
1408 from different threads. */ 1409 1410static bool 1411suitable_hsa_agent_p (hsa_agent_t agent) 1412{ 1413 hsa_device_type_t device_type; 1414 hsa_status_t status 1415 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE, 1416 &device_type); 1417 if (status != HSA_STATUS_SUCCESS) 1418 return false; 1419 1420 switch (device_type) 1421 { 1422 case HSA_DEVICE_TYPE_GPU: 1423 break; 1424 case HSA_DEVICE_TYPE_CPU: 1425 if (!support_cpu_devices) 1426 return false; 1427 break; 1428 default: 1429 return false; 1430 } 1431 1432 uint32_t features = 0; 1433 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE, 1434 &features); 1435 if (status != HSA_STATUS_SUCCESS 1436 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)) 1437 return false; 1438 hsa_queue_type_t queue_type; 1439 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE, 1440 &queue_type); 1441 if (status != HSA_STATUS_SUCCESS 1442 || (queue_type != HSA_QUEUE_TYPE_MULTI)) 1443 return false; 1444 1445 return true; 1446} 1447 1448/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment 1449 agent_count in hsa_context. */ 1450 1451static hsa_status_t 1452count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused))) 1453{ 1454 if (suitable_hsa_agent_p (agent)) 1455 hsa_context.agent_count++; 1456 return HSA_STATUS_SUCCESS; 1457} 1458 1459/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent 1460 id to the describing structure in the hsa context. The index of the 1461 structure is pointed to by DATA, increment it afterwards. */ 1462 1463static hsa_status_t 1464assign_agent_ids (hsa_agent_t agent, void *data) 1465{ 1466 if (suitable_hsa_agent_p (agent)) 1467 { 1468 int *agent_index = (int *) data; 1469 hsa_context.agents[*agent_index].id = agent; 1470 ++*agent_index; 1471 } 1472 return HSA_STATUS_SUCCESS; 1473} 1474 1475/* Initialize hsa_context if it has not already been done. 1476 Return TRUE on success. 
*/ 1477 1478static bool 1479init_hsa_context (void) 1480{ 1481 hsa_status_t status; 1482 int agent_index = 0; 1483 1484 if (hsa_context.initialized) 1485 return true; 1486 init_environment_variables (); 1487 if (!init_hsa_runtime_functions ()) 1488 { 1489 GCN_WARNING ("Run-time could not be dynamically opened\n"); 1490 if (suppress_host_fallback) 1491 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed"); 1492 return false; 1493 } 1494 status = hsa_fns.hsa_init_fn (); 1495 if (status != HSA_STATUS_SUCCESS) 1496 return hsa_error ("Run-time could not be initialized", status); 1497 GCN_DEBUG ("HSA run-time initialized for GCN\n"); 1498 1499 if (debug) 1500 dump_hsa_system_info (); 1501 1502 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL); 1503 if (status != HSA_STATUS_SUCCESS) 1504 return hsa_error ("GCN GPU devices could not be enumerated", status); 1505 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count); 1506 1507 hsa_context.agents 1508 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count 1509 * sizeof (struct agent_info)); 1510 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index); 1511 if (status != HSA_STATUS_SUCCESS) 1512 return hsa_error ("Scanning compute agents failed", status); 1513 if (agent_index != hsa_context.agent_count) 1514 { 1515 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents"); 1516 return false; 1517 } 1518 1519 if (debug) 1520 { 1521 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL); 1522 if (status != HSA_STATUS_SUCCESS) 1523 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents"); 1524 } 1525 1526 uint16_t minor, major; 1527 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR, 1528 &minor); 1529 if (status != HSA_STATUS_SUCCESS) 1530 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version"); 1531 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR, 1532 &major); 1533 if (status != HSA_STATUS_SUCCESS) 1534 
GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version"); 1535 1536 size_t len = sizeof hsa_context.driver_version_s; 1537 int printed = snprintf (hsa_context.driver_version_s, len, 1538 "HSA Runtime %hu.%hu", (unsigned short int)major, 1539 (unsigned short int)minor); 1540 if (printed >= len) 1541 GCN_WARNING ("HSA runtime version string was truncated." 1542 "Version %hu.%hu is too long.", (unsigned short int)major, 1543 (unsigned short int)minor); 1544 1545 hsa_context.initialized = true; 1546 return true; 1547} 1548 1549/* Verify that hsa_context has already been initialized and return the 1550 agent_info structure describing device number N. Return NULL on error. */ 1551 1552static struct agent_info * 1553get_agent_info (int n) 1554{ 1555 if (!hsa_context.initialized) 1556 { 1557 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context."); 1558 return NULL; 1559 } 1560 if (n >= hsa_context.agent_count) 1561 { 1562 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n); 1563 return NULL; 1564 } 1565 if (!hsa_context.agents[n].initialized) 1566 { 1567 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent."); 1568 return NULL; 1569 } 1570 return &hsa_context.agents[n]; 1571} 1572 1573/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions. 1574 1575 Selects (breaks at) a suitable region of type KIND. 
*/ 1576 1577static hsa_status_t 1578get_memory_region (hsa_region_t region, hsa_region_t *retval, 1579 hsa_region_global_flag_t kind) 1580{ 1581 hsa_status_t status; 1582 hsa_region_segment_t segment; 1583 1584 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, 1585 &segment); 1586 if (status != HSA_STATUS_SUCCESS) 1587 return status; 1588 if (segment != HSA_REGION_SEGMENT_GLOBAL) 1589 return HSA_STATUS_SUCCESS; 1590 1591 uint32_t flags; 1592 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS, 1593 &flags); 1594 if (status != HSA_STATUS_SUCCESS) 1595 return status; 1596 if (flags & kind) 1597 { 1598 *retval = region; 1599 return HSA_STATUS_INFO_BREAK; 1600 } 1601 return HSA_STATUS_SUCCESS; 1602} 1603 1604/* Callback of hsa_agent_iterate_regions. 1605 1606 Selects a kernargs memory region. */ 1607 1608static hsa_status_t 1609get_kernarg_memory_region (hsa_region_t region, void *data) 1610{ 1611 return get_memory_region (region, (hsa_region_t *)data, 1612 HSA_REGION_GLOBAL_FLAG_KERNARG); 1613} 1614 1615/* Callback of hsa_agent_iterate_regions. 1616 1617 Selects a coarse-grained memory region suitable for the heap and 1618 offload data. */ 1619 1620static hsa_status_t 1621get_data_memory_region (hsa_region_t region, void *data) 1622{ 1623 return get_memory_region (region, (hsa_region_t *)data, 1624 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED); 1625} 1626 1627static int 1628elf_gcn_isa_field (Elf64_Ehdr *image) 1629{ 1630 return image->e_flags & EF_AMDGPU_MACH_MASK; 1631} 1632 1633const static char *gcn_gfx803_s = "gfx803"; 1634const static char *gcn_gfx900_s = "gfx900"; 1635const static char *gcn_gfx906_s = "gfx906"; 1636const static int gcn_isa_name_len = 6; 1637 1638/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not 1639 support the ISA. 
*/ 1640 1641static const char* 1642isa_hsa_name (int isa) { 1643 switch(isa) 1644 { 1645 case EF_AMDGPU_MACH_AMDGCN_GFX803: 1646 return gcn_gfx803_s; 1647 case EF_AMDGPU_MACH_AMDGCN_GFX900: 1648 return gcn_gfx900_s; 1649 case EF_AMDGPU_MACH_AMDGCN_GFX906: 1650 return gcn_gfx906_s; 1651 } 1652 return NULL; 1653} 1654 1655/* Returns the user-facing name that GCC uses to identify the architecture (e.g. 1656 with -march) or NULL if we do not support the ISA. 1657 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */ 1658 1659static const char* 1660isa_gcc_name (int isa) { 1661 switch(isa) 1662 { 1663 case EF_AMDGPU_MACH_AMDGCN_GFX803: 1664 return "fiji"; 1665 default: 1666 return isa_hsa_name (isa); 1667 } 1668} 1669 1670/* Returns the code which is used in the GCN object code to identify the ISA with 1671 the given name (as used by the HSA runtime). */ 1672 1673static gcn_isa 1674isa_code(const char *isa) { 1675 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len)) 1676 return EF_AMDGPU_MACH_AMDGCN_GFX803; 1677 1678 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len)) 1679 return EF_AMDGPU_MACH_AMDGCN_GFX900; 1680 1681 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len)) 1682 return EF_AMDGPU_MACH_AMDGCN_GFX906; 1683 1684 return -1; 1685} 1686 1687/* }}} */ 1688/* {{{ Run */ 1689 1690/* Create or reuse a team arena. 1691 1692 Team arenas are used by OpenMP to avoid calling malloc multiple times 1693 while setting up each team. This is purely a performance optimization. 1694 1695 Allocating an arena also costs performance, albeit on the host side, so 1696 this function will reuse an existing arena if a large enough one is idle. 1697 The arena is released, but not deallocated, when the kernel exits. 
*/ 1698 1699static void * 1700get_team_arena (struct agent_info *agent, int num_teams) 1701{ 1702 struct team_arena_list **next_ptr = &agent->team_arena_list; 1703 struct team_arena_list *item; 1704 1705 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next) 1706 { 1707 if (item->num_teams < num_teams) 1708 continue; 1709 1710 if (pthread_mutex_trylock (&item->in_use)) 1711 continue; 1712 1713 return item->arena; 1714 } 1715 1716 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams); 1717 1718 if (pthread_mutex_lock (&agent->team_arena_write_lock)) 1719 { 1720 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); 1721 return false; 1722 } 1723 item = malloc (sizeof (*item)); 1724 item->num_teams = num_teams; 1725 item->next = NULL; 1726 *next_ptr = item; 1727 1728 if (pthread_mutex_init (&item->in_use, NULL)) 1729 { 1730 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex"); 1731 return false; 1732 } 1733 if (pthread_mutex_lock (&item->in_use)) 1734 { 1735 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); 1736 return false; 1737 } 1738 if (pthread_mutex_unlock (&agent->team_arena_write_lock)) 1739 { 1740 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex"); 1741 return false; 1742 } 1743 1744 const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. */ 1745 hsa_status_t status; 1746 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, 1747 TEAM_ARENA_SIZE*num_teams, 1748 &item->arena); 1749 if (status != HSA_STATUS_SUCCESS) 1750 hsa_fatal ("Could not allocate memory for GCN kernel arena", status); 1751 status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id, 1752 HSA_ACCESS_PERMISSION_RW); 1753 if (status != HSA_STATUS_SUCCESS) 1754 hsa_fatal ("Could not assign arena memory to device", status); 1755 1756 return item->arena; 1757} 1758 1759/* Mark a team arena available for reuse. 
*/ 1760 1761static void 1762release_team_arena (struct agent_info* agent, void *arena) 1763{ 1764 struct team_arena_list *item; 1765 1766 for (item = agent->team_arena_list; item; item = item->next) 1767 { 1768 if (item->arena == arena) 1769 { 1770 if (pthread_mutex_unlock (&item->in_use)) 1771 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex"); 1772 return; 1773 } 1774 } 1775 GOMP_PLUGIN_error ("Could not find a GCN arena to release."); 1776} 1777 1778/* Clean up all the allocated team arenas. */ 1779 1780static bool 1781destroy_team_arenas (struct agent_info *agent) 1782{ 1783 struct team_arena_list *item, *next; 1784 1785 for (item = agent->team_arena_list; item; item = next) 1786 { 1787 next = item->next; 1788 hsa_fns.hsa_memory_free_fn (item->arena); 1789 if (pthread_mutex_destroy (&item->in_use)) 1790 { 1791 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex"); 1792 return false; 1793 } 1794 free (item); 1795 } 1796 agent->team_arena_list = NULL; 1797 1798 return true; 1799} 1800 1801/* Allocate memory on a specified device. */ 1802 1803static void * 1804alloc_by_agent (struct agent_info *agent, size_t size) 1805{ 1806 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id); 1807 1808 /* Zero-size allocations are invalid, so in order to return a valid pointer 1809 we need to pass a valid size. One source of zero-size allocations is 1810 kernargs for kernels that have no inputs or outputs (the kernel may 1811 only use console output, for example). 
*/ 1812 if (size == 0) 1813 size = 4; 1814 1815 void *ptr; 1816 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, 1817 size, &ptr); 1818 if (status != HSA_STATUS_SUCCESS) 1819 { 1820 hsa_error ("Could not allocate device memory", status); 1821 return NULL; 1822 } 1823 1824 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id, 1825 HSA_ACCESS_PERMISSION_RW); 1826 if (status != HSA_STATUS_SUCCESS) 1827 { 1828 hsa_error ("Could not assign data memory to device", status); 1829 return NULL; 1830 } 1831 1832 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); 1833 bool profiling_dispatch_p 1834 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false); 1835 if (profiling_dispatch_p) 1836 { 1837 acc_prof_info *prof_info = thr->prof_info; 1838 acc_event_info data_event_info; 1839 acc_api_info *api_info = thr->api_info; 1840 1841 prof_info->event_type = acc_ev_alloc; 1842 1843 data_event_info.data_event.event_type = prof_info->event_type; 1844 data_event_info.data_event.valid_bytes 1845 = _ACC_DATA_EVENT_INFO_VALID_BYTES; 1846 data_event_info.data_event.parent_construct 1847 = acc_construct_parallel; 1848 data_event_info.data_event.implicit = 1; 1849 data_event_info.data_event.tool_info = NULL; 1850 data_event_info.data_event.var_name = NULL; 1851 data_event_info.data_event.bytes = size; 1852 data_event_info.data_event.host_ptr = NULL; 1853 data_event_info.data_event.device_ptr = (void *) ptr; 1854 1855 api_info->device_api = acc_device_api_other; 1856 1857 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, 1858 api_info); 1859 } 1860 1861 return ptr; 1862} 1863 1864/* Create kernel dispatch data structure for given KERNEL, along with 1865 the necessary device signals and memory allocations. 
  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  /* Completion signal, initialized to 1; the GPU decrements it when the
     kernel finishes.  */
  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;
  shadow->group_segment_size = kernel->group_segment_size;

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->heap_ptr = (int64_t)kernel->module->heap;

  /* Create an arena.  Only OpenMP kernels use team arenas.  */
  if (kernel->kind == KIND_OPENMP)
    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
  else
    kernargs->arena_ptr = 0;

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}

/* Output any data written to console output from the kernel.  It is expected
   that this function is polled during kernel execution.

   We print all entries from the last item printed to the next entry without
   a "written" flag.  If the "final" flag is set then it'll continue right to
   the end.

   The print buffer is circular, but the from and to locations don't wrap when
   the buffer does, so the output limit is UINT_MAX.  The target blocks on
   output when the buffer is full.  */

static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
		bool final)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));

  /* CONSUMED and NEXT_OUTPUT are monotonically increasing counters; only
     the index into the queue wraps (i % limit, below).  */
  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
				       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
	printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      /* An entry without the written flag is still being filled in by the
	 device; stop here unless this is the final drain.  */
      if (!data->written && !final)
	break;

      switch (data->type)
	{
	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
	default: printf ("GCN print buffer error!\n"); break;
	}
      data->written = 0;
      /* Publish progress so the device can reuse this slot.  */
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
			__ATOMIC_RELEASE);
    }
  fflush (stdout);
}

/* Release data structure created for a kernel dispatch in SHADOW argument,
   and clean up the signal and memory allocations.  */

static void
release_kernel_dispatch (struct kernel_dispatch *shadow)
{
  GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);

  /* A non-zero arena pointer means an OpenMP team arena was claimed in
     create_kernel_dispatch; return it to the idle pool.  */
  struct kernargs *kernargs = shadow->kernarg_address;
  void *arena = (void *)kernargs->arena_ptr;
  if (arena)
    release_team_arena (shadow->agent, arena);

  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);

  hsa_signal_t s;
  s.handle = shadow->signal;
  hsa_fns.hsa_signal_destroy_fn (s);

  free (shadow);
}

/* Extract the properties from a kernel binary.
  */

static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  /* Look the kernel up by name in the frozen executable.  */
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 kernel->name, agent->id,
						 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", kernel->name);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  /* Query the code handle and the three segment sizes needed to build a
     dispatch packet for this kernel.  */
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
	       status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
	     "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
	     (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
	     (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
	     (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  kernel->initialization_failed = true;
}

/* Do all the work that is necessary before running KERNEL for the first time.
   The function assumes the program has been created, finalized and frozen by
   create_and_finalize_hsa_program.  */

static void
init_kernel (struct kernel_info *kernel)
{
  /* The mutex serializes concurrent first launches of the same kernel;
     only one thread performs the property initialization.  */
  if (pthread_mutex_lock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
  if (kernel->initialized)
    {
      if (pthread_mutex_unlock (&kernel->init_mutex))
	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
			   "mutex");

      return;
    }

  init_kernel_properties (kernel);

  if (!kernel->initialization_failed)
    {
      GCN_DEBUG ("\n");

      kernel->initialized = true;
    }
  if (pthread_mutex_unlock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
		       "mutex");
}

/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.
*/

static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      /* VGPRs are allocated in granules of 4.  */
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (256 / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  struct agent_info *agent = kernel->agent;
  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  /* Reserve a packet slot in the AQL queue.  */
  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  /* NOTE(review): %llu paired with a signed (long long int) cast — the
     value is unsigned; confirm whether this should be (unsigned long long).  */
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.
     This is a busy-wait: the slot must become free before we may
     write into it.  */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  /* Clear the packet, except for the 16-bit header (first 4 bytes hold
     header+setup; the header word is written last, below, to publish the
     packet atomically).  */
  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  /* GNU "?:" extension: use the override if non-zero, else the request.  */
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = kernel->private_segment_size;
  packet->group_segment_size = kernel->group_segment_size;
  packet->kernel_object = kernel->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  /* The completion signal counts down to 0 when the kernel finishes.  */
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  /* The first kernarg is the pointer to the kernel's argument block.  */
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  /* Publishing the header (with release semantics) makes the packet
     visible to the packet processor; it must be the last store.  */
  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  /* Ring the doorbell to tell the device about the new packet.  */
  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal, index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  The timeout lets us drain the
     device's console output while the kernel is still running.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      console_output (kernel, shadow->kernarg_address, false);
    }
  /* Flush any remaining console output after completion.  */
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  /* The top 16 bits of the return value are a tag distinguishing a normal
     return (0xcafe) from a device-side exit() call (0xffff).  */
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      /* Bits 8-15 carry a signal number raised on the device, 0 if none.  */
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      /* Propagate the device-side exit() to the host process.  */
      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}

/* }}} */
/* {{{ Load/Unload  */

/* Initialize KERNEL from D and other parameters.  Return true on success.
*/ 2311 2312static bool 2313init_basic_kernel_info (struct kernel_info *kernel, 2314 struct hsa_kernel_description *d, 2315 struct agent_info *agent, 2316 struct module_info *module) 2317{ 2318 kernel->agent = agent; 2319 kernel->module = module; 2320 kernel->name = d->name; 2321 kernel->description = d; 2322 if (pthread_mutex_init (&kernel->init_mutex, NULL)) 2323 { 2324 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex"); 2325 return false; 2326 } 2327 return true; 2328} 2329 2330/* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If 2331 not found, return false. */ 2332 2333static bool 2334find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent, 2335 struct module_info *module, Elf64_Ehdr *image, 2336 Elf64_Shdr *sections) 2337{ 2338 bool res = false; 2339 2340 hsa_status_t status; 2341 2342 hsa_executable_symbol_t symbol; 2343 if (!find_executable_symbol (agent->executable, &symbol)) 2344 return false; 2345 2346 status = hsa_fns.hsa_executable_symbol_get_info_fn 2347 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset); 2348 if (status != HSA_STATUS_SUCCESS) 2349 { 2350 hsa_error ("Could not extract symbol address", status); 2351 return false; 2352 } 2353 2354 char *symbol_name = get_executable_symbol_name (symbol); 2355 if (symbol_name == NULL) 2356 return false; 2357 2358 /* Find the kernel function in ELF, and calculate actual load offset. 
*/ 2359 for (int i = 0; i < image->e_shnum; i++) 2360 if (sections[i].sh_type == SHT_SYMTAB) 2361 { 2362 Elf64_Shdr *strtab = §ions[sections[i].sh_link]; 2363 char *strings = (char *)image + strtab->sh_offset; 2364 2365 for (size_t offset = 0; 2366 offset < sections[i].sh_size; 2367 offset += sections[i].sh_entsize) 2368 { 2369 Elf64_Sym *sym = (Elf64_Sym*)((char*)image 2370 + sections[i].sh_offset 2371 + offset); 2372 if (strcmp (symbol_name, strings + sym->st_name) == 0) 2373 { 2374 *load_offset -= sym->st_value; 2375 res = true; 2376 break; 2377 } 2378 } 2379 } 2380 2381 free (symbol_name); 2382 return res; 2383} 2384 2385/* Check that the GCN ISA of the given image matches the ISA of the agent. */ 2386 2387static bool 2388isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image) 2389{ 2390 int isa_field = elf_gcn_isa_field (image); 2391 const char* isa_s = isa_hsa_name (isa_field); 2392 if (!isa_s) 2393 { 2394 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR); 2395 return false; 2396 } 2397 2398 if (isa_field != agent->device_isa) 2399 { 2400 char msg[120]; 2401 const char *agent_isa_s = isa_hsa_name (agent->device_isa); 2402 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa); 2403 assert (agent_isa_s); 2404 assert (agent_isa_gcc_s); 2405 2406 snprintf (msg, sizeof msg, 2407 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n" 2408 "Try to recompile with '-foffload=-march=%s'.\n", 2409 isa_s, agent_isa_s, agent_isa_gcc_s); 2410 2411 hsa_error (msg, HSA_STATUS_ERROR); 2412 return false; 2413 } 2414 2415 return true; 2416} 2417 2418/* Create and finalize the program consisting of all loaded modules. 
*/

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  int reloc_count = 0;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      if (!isa_matches_agent (agent, image))
	goto fail;

      /* Hide relocations from the HSA runtime loader.
	 Keep a copy of the unmodified section headers to use later.  */
      Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
						  + image->e_shoff);
      for (int i = image->e_shnum - 1; i >= 0; i--)
	{
	  if (image_sections[i].sh_type == SHT_RELA
	      || image_sections[i].sh_type == SHT_REL)
	    /* Change section type to something harmless.  */
	    image_sections[i].sh_type |= 0x80;
	}

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      if (!module->heap)
	{
	  /* Allocate the device heap once per module and make it
	     device-accessible.  */
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void**)&module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
	    (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  /* Publish the heap size in the heap's own header, where the
	     device-side allocator can read it.  */
	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}

    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

  if (agent->module)
    {
      struct module_info *module = agent->module;
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
      Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);

      Elf64_Addr load_offset;
      if (!find_load_offset (&load_offset, agent, module, image, sections))
	goto fail;

      /* Record the physical load address range.
	 We need this for data copies later.  */
      Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
      Elf64_Addr low = ~0, high = 0;
      for (int i = 0; i < image->e_phnum; i++)
	if (segments[i].p_memsz > 0)
	  {
	    if (segments[i].p_paddr < low)
	      low = segments[i].p_paddr;
	    if (segments[i].p_paddr > high)
	      high = segments[i].p_paddr + segments[i].p_memsz - 1;
	  }
      module->phys_address_start = low + load_offset;
      module->phys_address_end = high + load_offset;

      // Find dynamic symbol table
      Elf64_Shdr *dynsym = NULL;
      for (int i = 0; i < image->e_shnum; i++)
	if (sections[i].sh_type == SHT_DYNSYM)
	  {
	    dynsym = &sections[i];
	    break;
	  }

      /* Fix up relocations.  Only the SHT_RELA sections hidden above
	 (type ORed with 0x80) are processed here.  */
      for (int i = 0; i < image->e_shnum; i++)
	{
	  if (sections[i].sh_type == (SHT_RELA | 0x80))
	    for (size_t offset = 0;
		 offset < sections[i].sh_size;
		 offset += sections[i].sh_entsize)
	      {
		Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
						  + sections[i].sh_offset
						  + offset);
		Elf64_Sym *sym =
		  (dynsym
		   ? (Elf64_Sym*)((char*)image
				  + dynsym->sh_offset
				  + (dynsym->sh_entsize
				     * ELF64_R_SYM (reloc->r_info)))
		   : NULL);

		/* S, A, B, P are the standard ELF relocation inputs:
		   symbol value, addend, base (load offset) and place.  */
		int64_t S = (sym ? sym->st_value : 0);
		int64_t P = reloc->r_offset + load_offset;
		int64_t A = reloc->r_addend;
		int64_t B = load_offset;
		int64_t V, size;
		switch (ELF64_R_TYPE (reloc->r_info))
		  {
		  case R_AMDGPU_ABS32_LO:
		    V = (S + A) & 0xFFFFFFFF;
		    size = 4;
		    break;
		  case R_AMDGPU_ABS32_HI:
		    V = (S + A) >> 32;
		    size = 4;
		    break;
		  case R_AMDGPU_ABS64:
		    V = S + A;
		    size = 8;
		    break;
		  case R_AMDGPU_REL32:
		    V = S + A - P;
		    size = 4;
		    break;
		  case R_AMDGPU_REL64:
		    /* FIXME
		       LLD seems to emit REL64 where the assembler has
		       ABS64.  This is clearly wrong because it's not what the
		       compiler is expecting.  Let's assume, for now, that
		       it's a bug.  In any case, GCN kernels are always self
		       contained and therefore relative relocations will have
		       been resolved already, so this should be a safe
		       workaround.  */
		    V = S + A/* - P*/;
		    size = 8;
		    break;
		  case R_AMDGPU_ABS32:
		    V = S + A;
		    size = 4;
		    break;
		  /* TODO R_AMDGPU_GOTPCREL */
		  /* TODO R_AMDGPU_GOTPCREL32_LO */
		  /* TODO R_AMDGPU_GOTPCREL32_HI */
		  case R_AMDGPU_REL32_LO:
		    V = (S + A - P) & 0xFFFFFFFF;
		    size = 4;
		    break;
		  case R_AMDGPU_REL32_HI:
		    V = (S + A - P) >> 32;
		    size = 4;
		    break;
		  case R_AMDGPU_RELATIVE64:
		    V = B + A;
		    size = 8;
		    break;
		  default:
		    fprintf (stderr, "Error: unsupported relocation type.\n");
		    exit (1);
		  }
		/* Patch the computed value into device memory at P.  */
		status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
		if (status != HSA_STATUS_SUCCESS)
		  {
		    hsa_error ("Failed to fix up relocation", status);
		    goto fail;
		  }
		reloc_count++;
	      }
	}
    }

  GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
	     agent->device_id, reloc_count);

final:
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}

/* Free the HSA program in agent and everything associated with it and set
   agent->prog_finalized and the initialized flags of all kernels to false.
   Return TRUE on success.
*/

static bool
destroy_hsa_program (struct agent_info *agent)
{
  /* Nothing to do if the program was never finalized.  */
  if (!agent->prog_finalized)
    return true;

  hsa_status_t status;

  GCN_DEBUG ("Destroying the current GCN program.\n");

  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Could not destroy GCN executable", status);

  if (agent->module)
    {
      /* Mark all kernels uninitialized so that they are re-initialized
	 against any newly created executable.  */
      int i;
      for (i = 0; i < agent->module->kernel_count; i++)
	agent->module->kernels[i].initialized = false;

      if (agent->module->heap)
	{
	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
	  agent->module->heap = NULL;
	}
    }
  agent->prog_finalized = false;
  return true;
}

/* Deinitialize all information associated with MODULE and kernels within
   it.  Return TRUE on success.  */

static bool
destroy_module (struct module_info *module, bool locked)
{
  /* Run destructors before destroying module.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->fini_array_func)
    {
      init_kernel (module->fini_array_func);
      run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
    }
  module->constructors_run_p = false;

  int i;
  for (i = 0; i < module->kernel_count; i++)
    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
      {
	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
			   "mutex");
	return false;
      }

  return true;
}

/* }}} */
/* {{{ Async  */

/* Execute the queue entry at position INDEX of asynchronous queue AQ:
   launch a kernel, invoke a callback, wait on a placeholder, or mark a
   placeholder as reached.
*/

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      /* Launch synchronously: run_kernel returns only when the kernel
	 is complete, preserving queue ordering.  */
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
	struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	/* Block until the other queue's thread marks the placeholder as
	   executed (see the ASYNC_PLACEHOLDER case below).  */
	pthread_mutex_lock (&placeholderp->mutex);

	while (!placeholderp->executed)
	  pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	/* The placeholder has served its purpose; tear down its
	   synchronization objects.  */
	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      /* Mark the placeholder reached and wake any thread blocked in the
	 ASYNC_WAIT case above.  */
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}

/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.
*/

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      /* Debug mode: entries are executed synchronously by wait_queue,
	 so this thread has nothing to do.  Report it as stopped.  */
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  /* Drop the lock while executing the entry; producers may
	     append new entries meanwhile.  */
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  /* Wake anyone waiting for space or for the queue to empty.  */
	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  /* Queue is empty: sleep until a producer signals new work.  */
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  /* Advertise that this thread has stopped (finalize_async_thread waits
     for drain_queue_stop to become 2).  */
  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}

/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
   is not usually the case.  This is just a debug tool.
*/

static void
drain_queue_synchronous (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);

  /* Execute every pending entry in order, on the calling thread.  */
  while (aq->queue_n > 0)
    {
      execute_queue_entry (aq, aq->queue_first);

      aq->queue_first = ((aq->queue_first + 1)
			 % ASYNC_QUEUE_SIZE);
      aq->queue_n--;
    }

  pthread_mutex_unlock (&aq->mutex);
}

/* Block the current thread until an async queue is writable.  The aq->mutex
   lock should be held on entry, and remains locked on exit.  */

static void
wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
{
  if (aq->queue_n == ASYNC_QUEUE_SIZE)
    {
      /* Queue is full.  Wait for it to not be full.  The drain thread
	 broadcasts queue_cond_out each time it retires an entry.  */
      while (aq->queue_n == ASYNC_QUEUE_SIZE)
	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
    }
}

/* Request an asynchronous kernel launch on the specified queue.  This
   may block if the queue is full, but returns without waiting for the
   kernel to run.  */

static void
queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
		   void *vars, struct GOMP_kernel_launch_attributes *kla)
{
  assert (aq->agent == kernel->agent);

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  /* The new entry goes in the slot after the last occupied one.  */
  int queue_last = ((aq->queue_first + aq->queue_n)
		    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = KERNEL_LAUNCH;
  aq->queue[queue_last].u.launch.kernel = kernel;
  aq->queue[queue_last].u.launch.vars = vars;
  aq->queue[queue_last].u.launch.kla = *kla;

  aq->queue_n++;

  /* Wake the drain thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Request an asynchronous
callback on the specified queue. The callback 2939 function will be called, with the given opaque data, from the appropriate 2940 async thread, when all previous items on that queue are complete. */ 2941 2942static void 2943queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *), 2944 void *data) 2945{ 2946 pthread_mutex_lock (&aq->mutex); 2947 2948 wait_for_queue_nonfull (aq); 2949 2950 int queue_last = ((aq->queue_first + aq->queue_n) 2951 % ASYNC_QUEUE_SIZE); 2952 if (DEBUG_QUEUES) 2953 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id, 2954 aq->id, queue_last); 2955 2956 aq->queue[queue_last].type = CALLBACK; 2957 aq->queue[queue_last].u.callback.fn = fn; 2958 aq->queue[queue_last].u.callback.data = data; 2959 2960 aq->queue_n++; 2961 2962 if (DEBUG_THREAD_SIGNAL) 2963 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n", 2964 aq->agent->device_id, aq->id); 2965 pthread_cond_signal (&aq->queue_cond_in); 2966 2967 pthread_mutex_unlock (&aq->mutex); 2968} 2969 2970/* Request that a given async thread wait for another thread (unspecified) to 2971 reach the given placeholder. The wait will occur when all previous entries 2972 on the queue are complete. A placeholder is effectively a kind of signal 2973 which simply sets a flag when encountered in a queue. 
*/

static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
		      struct placeholder *placeholderp)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  /* The new entry goes in the slot after the last occupied one.  */
  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_WAIT;
  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;

  aq->queue_n++;

  /* Wake the drain thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Add a placeholder into an async queue.  When the async thread reaches the
   placeholder it will set the "executed" flag to true and continue.
   Another thread may be waiting on this thread reaching the placeholder.
*/

static struct placeholder *
queue_push_placeholder (struct goacc_asyncqueue *aq)
{
  struct placeholder *placeholderp;

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  /* The new entry goes in the slot after the last occupied one.  */
  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
  placeholderp = &aq->queue[queue_last].u.placeholder;

  /* NOTE(review): on init failure GOMP_PLUGIN_error returns and execution
     continues with an uninitialized mutex/cond — confirm whether this
     should be GOMP_PLUGIN_fatal instead.  */
  if (pthread_mutex_init (&placeholderp->mutex, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
    }

  if (pthread_cond_init (&placeholderp->cond, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
    }

  placeholderp->executed = 0;

  aq->queue_n++;

  /* Wake the drain thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);

  return placeholderp;
}

/* Signal an asynchronous thread to terminate, and wait for it to do so.
*/

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  if (aq->drain_queue_stop == 2)
    {
      /* The drain thread already stopped on its own.  */
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  /* Request the drain thread to stop (1 = stop requested; the thread
     acknowledges by setting it to 2).  */
  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}

/* Set up an async queue for OpenMP.  There will be only one.  The
   implementation simply uses an OpenACC async queue.
   FIXME: is this thread-safe if two threads call this function?  */

static void
maybe_init_omp_async (struct agent_info *agent)
{
  if (!agent->omp_async_queue)
    agent->omp_async_queue
      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
}

/* A wrapper that works around an issue in the HSA runtime with host-to-device
   copies from read-only pages.
*/ 3103 3104static void 3105hsa_memory_copy_wrapper (void *dst, const void *src, size_t len) 3106{ 3107 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len); 3108 3109 if (status == HSA_STATUS_SUCCESS) 3110 return; 3111 3112 /* It appears that the copy fails if the source data is in a read-only page. 3113 We can't detect that easily, so try copying the data to a temporary buffer 3114 and doing the copy again if we got an error above. */ 3115 3116 GCN_WARNING ("Read-only data transfer bug workaround triggered for " 3117 "[%p:+%d]\n", (void *) src, (int) len); 3118 3119 void *src_copy = malloc (len); 3120 memcpy (src_copy, src, len); 3121 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len); 3122 free (src_copy); 3123 if (status != HSA_STATUS_SUCCESS) 3124 GOMP_PLUGIN_error ("memory copy failed"); 3125} 3126 3127/* Copy data to or from a device. This is intended for use as an async 3128 callback event. */ 3129 3130static void 3131copy_data (void *data_) 3132{ 3133 struct copy_data *data = (struct copy_data *)data_; 3134 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n", 3135 data->aq->agent->device_id, data->aq->id, data->len, data->src, 3136 data->dst); 3137 hsa_memory_copy_wrapper (data->dst, data->src, data->len); 3138 if (data->free_src) 3139 free ((void *) data->src); 3140 free (data); 3141} 3142 3143/* Free device data. This is intended for use as an async callback event. */ 3144 3145static void 3146gomp_offload_free (void *ptr) 3147{ 3148 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr); 3149 GOMP_OFFLOAD_free (0, ptr); 3150} 3151 3152/* Request an asynchronous data copy, to or from a device, on a given queue. 3153 The event will be registered as a callback. If FREE_SRC is true 3154 then the source data will be freed following the copy. 
*/ 3155 3156static void 3157queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src, 3158 size_t len, bool free_src) 3159{ 3160 if (DEBUG_QUEUES) 3161 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n", 3162 aq->agent->device_id, aq->id, len, src, dst); 3163 struct copy_data *data 3164 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data)); 3165 data->dst = dst; 3166 data->src = src; 3167 data->len = len; 3168 data->free_src = free_src; 3169 data->aq = aq; 3170 queue_push_callback (aq, copy_data, data); 3171} 3172 3173/* Return true if the given queue is currently empty. */ 3174 3175static int 3176queue_empty (struct goacc_asyncqueue *aq) 3177{ 3178 pthread_mutex_lock (&aq->mutex); 3179 int res = aq->queue_n == 0 ? 1 : 0; 3180 pthread_mutex_unlock (&aq->mutex); 3181 3182 return res; 3183} 3184 3185/* Wait for a given queue to become empty. This implements an OpenACC wait 3186 directive. */ 3187 3188static void 3189wait_queue (struct goacc_asyncqueue *aq) 3190{ 3191 if (DRAIN_QUEUE_SYNCHRONOUS_P) 3192 { 3193 drain_queue_synchronous (aq); 3194 return; 3195 } 3196 3197 pthread_mutex_lock (&aq->mutex); 3198 3199 while (aq->queue_n > 0) 3200 { 3201 if (DEBUG_THREAD_SLEEP) 3202 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n", 3203 aq->agent->device_id, aq->id); 3204 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex); 3205 if (DEBUG_THREAD_SLEEP) 3206 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id, 3207 aq->id); 3208 } 3209 3210 pthread_mutex_unlock (&aq->mutex); 3211 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id); 3212} 3213 3214/* }}} */ 3215/* {{{ OpenACC support */ 3216 3217/* Execute an OpenACC kernel, synchronously or asynchronously. 
*/

static void
gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
	  struct goacc_asyncqueue *aq)
{
  /* TARG_MEM_DESC is accepted for interface compatibility but not used in
     this function.  */
  if (!GOMP_OFFLOAD_can_run (kernel))
    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");

  /* If we get here then this must be an OpenACC kernel.  */
  kernel->kind = KIND_OPENACC;

  /* devaddrs must be double-indirect on the target.  Build a device-side
     array of pointers, preferring the device address when one exists.  */
  void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
  for (size_t i = 0; i < mapnum; i++)
    hsa_fns.hsa_memory_copy_fn (&ind_da[i],
				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
				sizeof (void *));

  /* Look up the image's descriptor for this kernel.
     NOTE(review): this is a pointer-identity compare, not strcmp -- it
     assumes kernel->name aliases the image_desc string; and if no entry
     matches, hsa_kernel_desc stays NULL and is dereferenced below.
     Confirm both invariants against init_basic_kernel_info.  */
  struct hsa_kernel_description *hsa_kernel_desc = NULL;
  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
    {
      struct hsa_kernel_description *d
	= &kernel->module->image_desc->kernel_infos[i];
      if (d->name == kernel->name)
	{
	  hsa_kernel_desc = d;
	  break;
	}
    }

  /* We may have statically-determined dimensions in
     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
     invocation at runtime in dims[].  We allow static dimensions to take
     priority over dynamic dimensions when present (non-zero).  */
  if (hsa_kernel_desc->oacc_dims[0] > 0)
    dims[0] = hsa_kernel_desc->oacc_dims[0];
  if (hsa_kernel_desc->oacc_dims[1] > 0)
    dims[1] = hsa_kernel_desc->oacc_dims[1];
  if (hsa_kernel_desc->oacc_dims[2] > 0)
    dims[2] = hsa_kernel_desc->oacc_dims[2];

  /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
     There isn't really a correct answer for this without a clue about the
     problem size, so let's do a reasonable number of single-worker gangs.
     64 gangs matches a typical Fiji device.
*/

  /* NOTE: Until support for middle-end worker partitioning is merged, use 1
     for the default number of workers.  */
  if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs.  */
  if (dims[1] == 0) dims[1] = 1; /* Workers.  */

  /* The incoming dimensions are expressed in terms of gangs, workers, and
     vectors.  The HSA dimensions are expressed in terms of "work-items",
     which means multiples of vector lanes.

     The "grid size" specifies the size of the problem space, and the
     "work-group size" specifies how much of that we want a single compute
     unit to chew on at once.

     The three dimensions do not really correspond to hardware, but the
     important thing is that the HSA runtime will launch as many
     work-groups as it takes to process the entire grid, and each
     work-group will contain as many wave-fronts as it takes to process
     the work-items in that group.

     Essentially, as long as we set the Y dimension to 64 (the number of
     vector lanes in hardware), and the Z group size to the maximum (16),
     then we will get the gangs (X) and workers (Z) launched as we expect.

     The reason for the apparent reversal of vector and worker dimension
     order is to do with the way the run-time distributes work-items across
     v1 and v2.  */
  struct GOMP_kernel_launch_attributes kla =
    {3,
     /* Grid size.  */
     {dims[0], 64, dims[1]},
     /* Work-group size.
*/
     {1, 64, 16}
    };

  /* Emit the OpenACC profiling "launch start" event, if a tool is
     attached (prof_info non-NULL).  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
	= (char *) kernel->name;
      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info, api_info);
    }

  /* Synchronous: run now and free the indirection array here.
     Asynchronous: queue the launch, then queue freeing IND_DA so it
     survives until the kernel has actually run.  */
  if (!async)
    {
      run_kernel (kernel, ind_da, &kla, NULL, false);
      gomp_offload_free (ind_da);
    }
  else
    {
      queue_push_launch (aq, kernel, ind_da, &kla);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
		   aq->agent->device_id, aq->id, ind_da);
      queue_push_callback (aq, gomp_offload_free, ind_da);
    }

  /* Matching "launch end" profiling event.  */
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }
}

/* }}} */
/* {{{ Generic
Plugin API */ 3354 3355/* Return the name of the accelerator, which is "gcn". */ 3356 3357const char * 3358GOMP_OFFLOAD_get_name (void) 3359{ 3360 return "gcn"; 3361} 3362 3363/* Return the specific capabilities the HSA accelerator have. */ 3364 3365unsigned int 3366GOMP_OFFLOAD_get_caps (void) 3367{ 3368 /* FIXME: Enable shared memory for APU, but not discrete GPU. */ 3369 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400 3370 | GOMP_OFFLOAD_CAP_OPENACC_200; 3371} 3372 3373/* Identify as GCN accelerator. */ 3374 3375int 3376GOMP_OFFLOAD_get_type (void) 3377{ 3378 return OFFLOAD_TARGET_TYPE_GCN; 3379} 3380 3381/* Return the libgomp version number we're compatible with. There is 3382 no requirement for cross-version compatibility. */ 3383 3384unsigned 3385GOMP_OFFLOAD_version (void) 3386{ 3387 return GOMP_VERSION; 3388} 3389 3390/* Return the number of GCN devices on the system. */ 3391 3392int 3393GOMP_OFFLOAD_get_num_devices (void) 3394{ 3395 if (!init_hsa_context ()) 3396 return 0; 3397 return hsa_context.agent_count; 3398} 3399 3400/* Initialize device (agent) number N so that it can be used for computation. 3401 Return TRUE on success. 
*/

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context ())
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  /* Idempotent: a second call for an already-initialized agent is a no-op.  */
  if (agent->initialized)
    return true;

  agent->device_id = n;

  /* Set up the agent's locks before anything that might use them.  */
  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->team_arena_list = NULL;

  /* Query agent properties needed for queue creation and ISA selection.  */
  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
					  &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
		      status);

  /* NOTE(review): agent->name is filled directly by the HSA query; it must
     be large enough for HSA_AGENT_INFO_NAME -- confirm the field size in
     struct agent_info.  */
  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
					  &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa < 0)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
					  &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  /* Create the synchronous dispatch queue for this agent.  */
  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
					HSA_QUEUE_TYPE_MULTI,
					hsa_queue_callback, NULL, UINT32_MAX,
					UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  /* Find a region suitable for kernel arguments; a handle of -1 marks
     "not found".  */
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  /* Likewise find a region for device data.  */
  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}

/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.
*/ 3515 3516int 3517GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, 3518 struct addr_pair **target_table) 3519{ 3520 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN) 3521 { 3522 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin" 3523 " (expected %u, received %u)", 3524 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version)); 3525 return -1; 3526 } 3527 3528 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data; 3529 struct agent_info *agent; 3530 struct addr_pair *pair; 3531 struct module_info *module; 3532 struct kernel_info *kernel; 3533 int kernel_count = image_desc->kernel_count; 3534 unsigned var_count = image_desc->global_variable_count; 3535 3536 agent = get_agent_info (ord); 3537 if (!agent) 3538 return -1; 3539 3540 if (pthread_rwlock_wrlock (&agent->module_rwlock)) 3541 { 3542 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock"); 3543 return -1; 3544 } 3545 if (agent->prog_finalized 3546 && !destroy_hsa_program (agent)) 3547 return -1; 3548 3549 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count); 3550 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count); 3551 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2) 3552 * sizeof (struct addr_pair)); 3553 *target_table = pair; 3554 module = (struct module_info *) 3555 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info) 3556 + kernel_count * sizeof (struct kernel_info)); 3557 module->image_desc = image_desc; 3558 module->kernel_count = kernel_count; 3559 module->heap = NULL; 3560 module->constructors_run_p = false; 3561 3562 kernel = &module->kernels[0]; 3563 3564 /* Allocate memory for kernel dependencies. 
*/ 3565 for (unsigned i = 0; i < kernel_count; i++) 3566 { 3567 struct hsa_kernel_description *d = &image_desc->kernel_infos[i]; 3568 if (!init_basic_kernel_info (kernel, d, agent, module)) 3569 return -1; 3570 if (strcmp (d->name, "_init_array") == 0) 3571 module->init_array_func = kernel; 3572 else if (strcmp (d->name, "_fini_array") == 0) 3573 module->fini_array_func = kernel; 3574 else 3575 { 3576 pair->start = (uintptr_t) kernel; 3577 pair->end = (uintptr_t) (kernel + 1); 3578 pair++; 3579 } 3580 kernel++; 3581 } 3582 3583 agent->module = module; 3584 if (pthread_rwlock_unlock (&agent->module_rwlock)) 3585 { 3586 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock"); 3587 return -1; 3588 } 3589 3590 if (!create_and_finalize_hsa_program (agent)) 3591 return -1; 3592 3593 for (unsigned i = 0; i < var_count; i++) 3594 { 3595 struct global_var_info *v = &image_desc->global_variables[i]; 3596 GCN_DEBUG ("Looking for variable %s\n", v->name); 3597 3598 hsa_status_t status; 3599 hsa_executable_symbol_t var_symbol; 3600 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL, 3601 v->name, agent->id, 3602 0, &var_symbol); 3603 3604 if (status != HSA_STATUS_SUCCESS) 3605 hsa_fatal ("Could not find symbol for variable in the code object", 3606 status); 3607 3608 uint64_t var_addr; 3609 uint32_t var_size; 3610 status = hsa_fns.hsa_executable_symbol_get_info_fn 3611 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr); 3612 if (status != HSA_STATUS_SUCCESS) 3613 hsa_fatal ("Could not extract a variable from its symbol", status); 3614 status = hsa_fns.hsa_executable_symbol_get_info_fn 3615 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size); 3616 if (status != HSA_STATUS_SUCCESS) 3617 hsa_fatal ("Could not extract a variable size from its symbol", status); 3618 3619 pair->start = var_addr; 3620 pair->end = var_addr + var_size; 3621 GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name, 3622 (void *)var_addr, 
var_size); 3623 pair++; 3624 } 3625 3626 /* Ensure that constructors are run first. */ 3627 struct GOMP_kernel_launch_attributes kla = 3628 { 3, 3629 /* Grid size. */ 3630 { 1, 64, 1 }, 3631 /* Work-group size. */ 3632 { 1, 64, 1 } 3633 }; 3634 3635 if (module->init_array_func) 3636 { 3637 init_kernel (module->init_array_func); 3638 run_kernel (module->init_array_func, NULL, &kla, NULL, false); 3639 } 3640 module->constructors_run_p = true; 3641 3642 /* Don't report kernels that libgomp need not know about. */ 3643 if (module->init_array_func) 3644 kernel_count--; 3645 if (module->fini_array_func) 3646 kernel_count--; 3647 3648 return kernel_count + var_count; 3649} 3650 3651/* Unload GCN object-code module described by struct gcn_image_desc in 3652 TARGET_DATA from agent number N. Return TRUE on success. */ 3653 3654bool 3655GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data) 3656{ 3657 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN) 3658 { 3659 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin" 3660 " (expected %u, received %u)", 3661 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version)); 3662 return false; 3663 } 3664 3665 struct agent_info *agent; 3666 agent = get_agent_info (n); 3667 if (!agent) 3668 return false; 3669 3670 if (pthread_rwlock_wrlock (&agent->module_rwlock)) 3671 { 3672 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock"); 3673 return false; 3674 } 3675 3676 if (!agent->module || agent->module->image_desc != target_data) 3677 { 3678 GOMP_PLUGIN_error ("Attempt to unload an image that has never been " 3679 "loaded before"); 3680 return false; 3681 } 3682 3683 if (!destroy_module (agent->module, true)) 3684 return false; 3685 free (agent->module); 3686 agent->module = NULL; 3687 if (!destroy_hsa_program (agent)) 3688 return false; 3689 if (pthread_rwlock_unlock (&agent->module_rwlock)) 3690 { 3691 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock"); 3692 return false; 3693 } 3694 return true; 3695} 

/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  /* Nothing to do for a device that was never (or is no longer)
     initialized.  */
  if (!agent->initialized)
    return true;

  /* Tear down in reverse order of construction: the async queue first
     (it may reference the module), then the module, team arenas and HSA
     program, then the HSA queue, and the locks last.  */
  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_team_arenas (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}

/* Return true if the HSA runtime can run function FN_PTR.
*/ 3762 3763bool 3764GOMP_OFFLOAD_can_run (void *fn_ptr) 3765{ 3766 struct kernel_info *kernel = (struct kernel_info *) fn_ptr; 3767 3768 init_kernel (kernel); 3769 if (kernel->initialization_failed) 3770 goto failure; 3771 3772 return true; 3773 3774failure: 3775 if (suppress_host_fallback) 3776 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed"); 3777 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n"); 3778 return false; 3779} 3780 3781/* Allocate memory on device N. */ 3782 3783void * 3784GOMP_OFFLOAD_alloc (int n, size_t size) 3785{ 3786 struct agent_info *agent = get_agent_info (n); 3787 return alloc_by_agent (agent, size); 3788} 3789 3790/* Free memory from device N. */ 3791 3792bool 3793GOMP_OFFLOAD_free (int device, void *ptr) 3794{ 3795 GCN_DEBUG ("Freeing memory on device %d\n", device); 3796 3797 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr); 3798 if (status != HSA_STATUS_SUCCESS) 3799 { 3800 hsa_error ("Could not free device memory", status); 3801 return false; 3802 } 3803 3804 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); 3805 bool profiling_dispatch_p 3806 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false); 3807 if (profiling_dispatch_p) 3808 { 3809 acc_prof_info *prof_info = thr->prof_info; 3810 acc_event_info data_event_info; 3811 acc_api_info *api_info = thr->api_info; 3812 3813 prof_info->event_type = acc_ev_free; 3814 3815 data_event_info.data_event.event_type = prof_info->event_type; 3816 data_event_info.data_event.valid_bytes 3817 = _ACC_DATA_EVENT_INFO_VALID_BYTES; 3818 data_event_info.data_event.parent_construct 3819 = acc_construct_parallel; 3820 data_event_info.data_event.implicit = 1; 3821 data_event_info.data_event.tool_info = NULL; 3822 data_event_info.data_event.var_name = NULL; 3823 data_event_info.data_event.bytes = 0; 3824 data_event_info.data_event.host_ptr = NULL; 3825 data_event_info.data_event.device_ptr = (void *) ptr; 3826 3827 api_info->device_api = 
acc_device_api_other; 3828 3829 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, 3830 api_info); 3831 } 3832 3833 return true; 3834} 3835 3836/* Copy data from DEVICE to host. */ 3837 3838bool 3839GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n) 3840{ 3841 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device, 3842 src, dst); 3843 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n); 3844 if (status != HSA_STATUS_SUCCESS) 3845 GOMP_PLUGIN_error ("memory copy failed"); 3846 return true; 3847} 3848 3849/* Copy data from host to DEVICE. */ 3850 3851bool 3852GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n) 3853{ 3854 GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src, 3855 device, dst); 3856 hsa_memory_copy_wrapper (dst, src, n); 3857 return true; 3858} 3859 3860/* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */ 3861 3862bool 3863GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n) 3864{ 3865 struct gcn_thread *thread_data = gcn_thread (); 3866 3867 if (thread_data && !async_synchronous_p (thread_data->async)) 3868 { 3869 struct agent_info *agent = get_agent_info (device); 3870 maybe_init_omp_async (agent); 3871 queue_push_copy (agent->omp_async_queue, dst, src, n, false); 3872 return true; 3873 } 3874 3875 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n, 3876 device, src, device, dst); 3877 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n); 3878 if (status != HSA_STATUS_SUCCESS) 3879 GOMP_PLUGIN_error ("memory copy failed"); 3880 return true; 3881} 3882 3883/* }}} */ 3884/* {{{ OpenMP Plugin API */ 3885 3886/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers 3887 in VARS as a parameter. The kernel is identified by FN_PTR which must point 3888 to a kernel_info structure, and must have previously been loaded to the 3889 specified device. 
*/ 3890 3891void 3892GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args) 3893{ 3894 struct agent_info *agent = get_agent_info (device); 3895 struct kernel_info *kernel = (struct kernel_info *) fn_ptr; 3896 struct GOMP_kernel_launch_attributes def; 3897 struct GOMP_kernel_launch_attributes *kla; 3898 assert (agent == kernel->agent); 3899 3900 /* If we get here then the kernel must be OpenMP. */ 3901 kernel->kind = KIND_OPENMP; 3902 3903 if (!parse_target_attributes (args, &def, &kla, agent)) 3904 { 3905 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n"); 3906 return; 3907 } 3908 run_kernel (kernel, vars, kla, NULL, false); 3909} 3910 3911/* Run an asynchronous OpenMP kernel on DEVICE. This is similar to 3912 GOMP_OFFLOAD_run except that the launch is queued and there is a call to 3913 GOMP_PLUGIN_target_task_completion when it has finished. */ 3914 3915void 3916GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars, 3917 void **args, void *async_data) 3918{ 3919 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n"); 3920 struct agent_info *agent = get_agent_info (device); 3921 struct kernel_info *kernel = (struct kernel_info *) tgt_fn; 3922 struct GOMP_kernel_launch_attributes def; 3923 struct GOMP_kernel_launch_attributes *kla; 3924 assert (agent == kernel->agent); 3925 3926 /* If we get here then the kernel must be OpenMP. */ 3927 kernel->kind = KIND_OPENMP; 3928 3929 if (!parse_target_attributes (args, &def, &kla, agent)) 3930 { 3931 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n"); 3932 return; 3933 } 3934 3935 maybe_init_omp_async (agent); 3936 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla); 3937 queue_push_callback (agent->omp_async_queue, 3938 GOMP_PLUGIN_target_task_completion, async_data); 3939} 3940 3941/* }}} */ 3942/* {{{ OpenACC Plugin API */ 3943 3944/* Run a synchronous OpenACC kernel. The device number is inferred from the 3945 already-loaded KERNEL. 
*/ 3946 3947void 3948GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum, 3949 void **hostaddrs, void **devaddrs, unsigned *dims, 3950 void *targ_mem_desc) 3951{ 3952 struct kernel_info *kernel = (struct kernel_info *) fn_ptr; 3953 3954 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false, 3955 NULL); 3956} 3957 3958/* Run an asynchronous OpenACC kernel on the specified queue. */ 3959 3960void 3961GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum, 3962 void **hostaddrs, void **devaddrs, 3963 unsigned *dims, void *targ_mem_desc, 3964 struct goacc_asyncqueue *aq) 3965{ 3966 struct kernel_info *kernel = (struct kernel_info *) fn_ptr; 3967 3968 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true, 3969 aq); 3970} 3971 3972/* Create a new asynchronous thread and queue for running future kernels. */ 3973 3974struct goacc_asyncqueue * 3975GOMP_OFFLOAD_openacc_async_construct (int device) 3976{ 3977 struct agent_info *agent = get_agent_info (device); 3978 3979 pthread_mutex_lock (&agent->async_queues_mutex); 3980 3981 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq)); 3982 aq->agent = get_agent_info (device); 3983 aq->prev = NULL; 3984 aq->next = agent->async_queues; 3985 if (aq->next) 3986 { 3987 aq->next->prev = aq; 3988 aq->id = aq->next->id + 1; 3989 } 3990 else 3991 aq->id = 1; 3992 agent->async_queues = aq; 3993 3994 aq->queue_first = 0; 3995 aq->queue_n = 0; 3996 aq->drain_queue_stop = 0; 3997 3998 if (pthread_mutex_init (&aq->mutex, NULL)) 3999 { 4000 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex"); 4001 return false; 4002 } 4003 if (pthread_cond_init (&aq->queue_cond_in, NULL)) 4004 { 4005 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond"); 4006 return false; 4007 } 4008 if (pthread_cond_init (&aq->queue_cond_out, NULL)) 4009 { 4010 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond"); 4011 return false; 4012 } 4013 4014 
hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id, 4015 ASYNC_QUEUE_SIZE, 4016 HSA_QUEUE_TYPE_MULTI, 4017 hsa_queue_callback, NULL, 4018 UINT32_MAX, UINT32_MAX, 4019 &aq->hsa_queue); 4020 if (status != HSA_STATUS_SUCCESS) 4021 hsa_fatal ("Error creating command queue", status); 4022 4023 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq); 4024 if (err != 0) 4025 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s", 4026 strerror (err)); 4027 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id, 4028 aq->id); 4029 4030 pthread_mutex_unlock (&agent->async_queues_mutex); 4031 4032 return aq; 4033} 4034 4035/* Destroy an existing asynchronous thread and queue. Waits for any 4036 currently-running task to complete, but cancels any queued tasks. */ 4037 4038bool 4039GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq) 4040{ 4041 struct agent_info *agent = aq->agent; 4042 4043 finalize_async_thread (aq); 4044 4045 pthread_mutex_lock (&agent->async_queues_mutex); 4046 4047 int err; 4048 if ((err = pthread_mutex_destroy (&aq->mutex))) 4049 { 4050 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err); 4051 goto fail; 4052 } 4053 if (pthread_cond_destroy (&aq->queue_cond_in)) 4054 { 4055 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond"); 4056 goto fail; 4057 } 4058 if (pthread_cond_destroy (&aq->queue_cond_out)) 4059 { 4060 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond"); 4061 goto fail; 4062 } 4063 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue); 4064 if (status != HSA_STATUS_SUCCESS) 4065 { 4066 hsa_error ("Error destroying command queue", status); 4067 goto fail; 4068 } 4069 4070 if (aq->prev) 4071 aq->prev->next = aq->next; 4072 if (aq->next) 4073 aq->next->prev = aq->prev; 4074 if (agent->async_queues == aq) 4075 agent->async_queues = aq->next; 4076 4077 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id); 4078 4079 
free (aq); 4080 pthread_mutex_unlock (&agent->async_queues_mutex); 4081 return true; 4082 4083fail: 4084 pthread_mutex_unlock (&agent->async_queues_mutex); 4085 return false; 4086} 4087 4088/* Return true if the specified async queue is currently empty. */ 4089 4090int 4091GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) 4092{ 4093 return queue_empty (aq); 4094} 4095 4096/* Block until the specified queue has executed all its tasks and the 4097 queue is empty. */ 4098 4099bool 4100GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq) 4101{ 4102 wait_queue (aq); 4103 return true; 4104} 4105 4106/* Add a serialization point across two async queues. Any new tasks added to 4107 AQ2, after this call, will not run until all tasks on AQ1, at the time 4108 of this call, have completed. */ 4109 4110bool 4111GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1, 4112 struct goacc_asyncqueue *aq2) 4113{ 4114 /* For serialize, stream aq2 waits for aq1 to complete work that has been 4115 scheduled to run on it up to this point. */ 4116 if (aq1 != aq2) 4117 { 4118 struct placeholder *placeholderp = queue_push_placeholder (aq1); 4119 queue_push_asyncwait (aq2, placeholderp); 4120 } 4121 return true; 4122} 4123 4124/* Add an opaque callback to the given async queue. */ 4125 4126void 4127GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, 4128 void (*fn) (void *), void *data) 4129{ 4130 queue_push_callback (aq, fn, data); 4131} 4132 4133/* Queue up an asynchronous data copy from host to DEVICE. */ 4134 4135bool 4136GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src, 4137 size_t n, struct goacc_asyncqueue *aq) 4138{ 4139 struct agent_info *agent = get_agent_info (device); 4140 assert (agent == aq->agent); 4141 /* The source data does not necessarily remain live until the deferred 4142 copy happens. 
Taking a snapshot of the data here avoids reading 4143 uninitialised data later, but means that (a) data is copied twice and 4144 (b) modifications to the copied data between the "spawning" point of 4145 the asynchronous kernel and when it is executed will not be seen. 4146 But, that is probably correct. */ 4147 void *src_copy = GOMP_PLUGIN_malloc (n); 4148 memcpy (src_copy, src, n); 4149 queue_push_copy (aq, dst, src_copy, n, true); 4150 return true; 4151} 4152 4153/* Queue up an asynchronous data copy from DEVICE to host. */ 4154 4155bool 4156GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src, 4157 size_t n, struct goacc_asyncqueue *aq) 4158{ 4159 struct agent_info *agent = get_agent_info (device); 4160 assert (agent == aq->agent); 4161 queue_push_copy (aq, dst, src, n, false); 4162 return true; 4163} 4164 4165union goacc_property_value 4166GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop) 4167{ 4168 struct agent_info *agent = get_agent_info (device); 4169 4170 union goacc_property_value propval = { .val = 0 }; 4171 4172 switch (prop) 4173 { 4174 case GOACC_PROPERTY_FREE_MEMORY: 4175 /* Not supported. */ 4176 break; 4177 case GOACC_PROPERTY_MEMORY: 4178 { 4179 size_t size; 4180 hsa_region_t region = agent->data_region; 4181 hsa_status_t status = 4182 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size); 4183 if (status == HSA_STATUS_SUCCESS) 4184 propval.val = size; 4185 break; 4186 } 4187 case GOACC_PROPERTY_NAME: 4188 propval.ptr = agent->name; 4189 break; 4190 case GOACC_PROPERTY_VENDOR: 4191 propval.ptr = agent->vendor_name; 4192 break; 4193 case GOACC_PROPERTY_DRIVER: 4194 propval.ptr = hsa_context.driver_version_s; 4195 break; 4196 } 4197 4198 return propval; 4199} 4200 4201/* Set up plugin-specific thread-local-data (host-side). 
*/

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  /* Allocate the per-thread state; new threads start out synchronous.  */
  struct gcn_thread *td = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
  td->async = GOMP_ASYNC_SYNC;
  return (void *) td;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  /* DATA was allocated by GOMP_OFFLOAD_openacc_create_thread_data.  */
  free (data);
}

/* }}} */