/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* Additional definitions not in HSA 1.1.
   FIXME: this needs to be updated in hsa.h for upstream, but the only source
	  right now is the ROCr source which may cause license issues.  */
#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002

/* These probably won't be in elf.h for a while.  */
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
/* Relocation number 12 is reserved.  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
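
/* Worked example (illustrative values only, not from the ABI text): if a
   symbol resolves to S + A = 0x1122334455667788, an ABS32_LO/ABS32_HI
   relocation pair patches in 0x55667788 and 0x11223344 respectively, the
   HI half being the full 64-bit sum shifted right by 32 bits.  */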

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   where we at least have the means to test the real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* Since the HSA runtime is dlopened, the following structure defines the
   function pointers through which the plug-in accesses it.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback)(hsa_executable_t executable,
			      hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};
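
/* Note that every HSA call in this plug-in goes through the table above,
   e.g. hsa_fns.hsa_init_fn () rather than hsa_init (), so that no
   link-time dependency on libhsa-runtime64.so is created; the library is
   located and dlopened at run time (see init_hsa_runtime_functions).  */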

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
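
/* For illustration (hypothetical values, corresponding to the OpenMP
   defaults chosen in parse_target_attributes below for a 40-CU device):

     struct GOMP_kernel_launch_attributes kla = {
       .ndim  = 3,
       .gdims = {40, 64, 16},	// teams x work items x wavefronts
       .wdims = {1, 64, 16}	// one team per work-group
     };  */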

/* Collection of information needed for a kernel dispatch.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for passing kernel arguments.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};
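
/* A minimal host-side sketch of draining the printf ring buffer above.
   The type codes and the wrap-around handling here are illustrative only;
   the real consumer in this file is more careful:

     struct output *out = &kernargs->output_data;
     while (out->consumed < out->next_output)
       {
	 struct printf_data *entry = &out->queue[out->consumed++ % 1024];
	 if (!entry->written)
	   break;	// The device has not finished this slot yet.
	 if (entry->type == 0)		// Hypothetical: integer payload.
	   printf ("%s%" PRId64 "\n", entry->msg, entry->ivalue);
	 else if (entry->type == 1)	// Hypothetical: double payload.
	   printf ("%s%f\n", entry->msg, entry->dvalue);
	 else				// Hypothetical: text payload.
	   printf ("%s%s\n", entry->msg, entry->text);
       }  */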

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  bool free_src;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
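
/* The QUEUE array above forms a circular buffer: a producer appends at
   index (queue_first + queue_n) % ASYNC_QUEUE_SIZE while holding MUTEX,
   then signals QUEUE_COND_IN; the drain thread consumes the entry at
   QUEUE_FIRST and signals QUEUE_COND_OUT when space is available again.
   (A description of the intended protocol; the queue-push helpers and the
   drain thread are defined later in this file.)  */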

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
} EF_AMDGPU_MACH;

static const int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The module that contains this kernel.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};
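
/* For example (a sketch only; Newlib's real allocator is more involved),
   a trivial device-side bump allocator over this layout might look like:

     static char *heap_next;	// Hypothetical cursor, set to heap->data
				// before the first allocation.
     static void *
     bump_alloc (struct heap *h, int64_t bytes)
     {
       if (heap_next + bytes > h->data + h->size)
	 return NULL;		// Out of heap space.
       void *p = heap_next;
       heap_next += bytes;
       return p;
     }

   The host only stores the initial SIZE before launching a kernel.  */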

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag deciding whether to print information about what is going on to
   stderr.  Set in init_environment_variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the shared library in which the HSA runtime resides, to be
   dlopened by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide whether the runtime should also support CPU devices (which
   may be a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH()				\
  do {						\
    if (debug)					\
      fflush (stderr);				\
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...)			\
  do						\
    {						\
      DEBUG_PRINT (prefix);			\
      DEBUG_PRINT (__VA_ARGS__);		\
      DEBUG_FLUSH ();				\
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report error message, and return FALSE
   for propagating error processing to outside of plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump information about the available hardware.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about the available hardware.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void*)kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and the other configuration flags above according to the
   environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }
}
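
/* For example (illustrative values and program name):

     GCN_DEBUG=1 GCN_NUM_TEAMS=4 GCN_NUM_THREADS=4 \
     GCN_HEAP_SIZE=268435456 ./my-offload-program

   enables debug output, overrides the default launch dimensions, and
   requests a 256MB device heap.  */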

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Helper function for find_executable_symbol.  */

static hsa_status_t
find_executable_symbol_1 (hsa_executable_t executable,
			  hsa_executable_symbol_t symbol,
			  void *data)
{
  hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
  *res = symbol;
  return HSA_STATUS_INFO_BREAK;
}

/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  If not
   found, return false.  */

static bool
find_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t *symbol)
{
  hsa_status_t status;

  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 find_executable_symbol_1,
						 symbol);
  if (status != HSA_STATUS_INFO_BREAK)
    {
      hsa_error ("Could not find executable symbol", status);
      return false;
    }

  return true;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If no launch attributes are found, fill
   DEF with default values and store DEF in *RESULT; otherwise store the
   attributes found in *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1; /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
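
/* Worked example: a one-dimensional launch with GRID = 32 and no requested
   group size (GROUP = 0) first takes the default of 64 work items and is
   then clamped to the grid size, so get_group_size returns 32.  */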

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t *packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
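
/* For example, a sketch of how this is used when publishing an AQL
   dispatch packet (the exact header bits at the real call sites are built
   from HSA_PACKET_TYPE_* and fence-scope values):

     uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH
		       << HSA_PACKET_HEADER_TYPE;
     uint16_t setup = (uint16_t) kla->ndim
		      << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
     packet_store_release ((uint32_t *) packet, header, setup);

   Writing both half-words in one atomic release store publishes the packet
   to the GPU only after the rest of its payload is complete.  */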

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_isa_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}
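
/* For example, DLSYM_FN (hsa_init) above expands (via token pasting and
   stringizing) to roughly:

     hsa_fns.hsa_init_fn = dlsym (handle, "hsa_init");
     if (hsa_fns.hsa_init_fn == NULL)
       return false;  */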

/* Return true if the agent is a GPU that can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA, increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int)major,
			  (unsigned short int)minor);
  if (printed >= (int) len)
    GCN_WARNING ("HSA runtime version string was truncated.  "
		 "Version %hu.%hu is too long.", (unsigned short int)major,
		 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}

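/* Return the ISA field (an EF_AMDGPU_MACH value) encoded in the e_flags of
   the ELF header IMAGE.  */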
static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

static const char *gcn_gfx803_s = "gfx803";
static const char *gcn_gfx900_s = "gfx900";
static const char *gcn_gfx906_s = "gfx906";
static const int gcn_isa_name_len = 6;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char *
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char *
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa)
{
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  return -1;
}

/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   Allocating an arena also costs performance, albeit on the host side, so
   this function will reuse an existing arena if a large enough one is idle.
   The arena is released, but not deallocated, when the kernel exits.  */

static void *
get_team_arena (struct agent_info *agent, int num_teams)
{
  struct team_arena_list **next_ptr = &agent->team_arena_list;
  struct team_arena_list *item;

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->num_teams < num_teams)
	continue;

      if (pthread_mutex_trylock (&item->in_use))
	continue;

      return item->arena;
    }

  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);

  if (pthread_mutex_lock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  item = malloc (sizeof (*item));
  item->num_teams = num_teams;
  item->next = NULL;
  *next_ptr = item;

  if (pthread_mutex_init (&item->in_use, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return NULL;
    }
  if (pthread_mutex_lock (&item->in_use))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      return NULL;
    }
1743
1744  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
1745  hsa_status_t status;
1746  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1747					   TEAM_ARENA_SIZE*num_teams,
1748					   &item->arena);
1749  if (status != HSA_STATUS_SUCCESS)
1750    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1751  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1752					       HSA_ACCESS_PERMISSION_RW);
1753  if (status != HSA_STATUS_SUCCESS)
1754    hsa_fatal ("Could not assign arena memory to device", status);
1755
1756  return item->arena;
1757}
1758
1759/* Mark a team arena available for reuse.  */
1760
1761static void
1762release_team_arena (struct agent_info* agent, void *arena)
1763{
1764  struct team_arena_list *item;
1765
1766  for (item = agent->team_arena_list; item; item = item->next)
1767    {
1768      if (item->arena == arena)
1769	{
1770	  if (pthread_mutex_unlock (&item->in_use))
1771	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1772	  return;
1773	}
1774    }
1775  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1776}
1777
1778/* Clean up all the allocated team arenas.  */
1779
1780static bool
1781destroy_team_arenas (struct agent_info *agent)
1782{
1783  struct team_arena_list *item, *next;
1784
1785  for (item = agent->team_arena_list; item; item = next)
1786    {
1787      next = item->next;
1788      hsa_fns.hsa_memory_free_fn (item->arena);
1789      if (pthread_mutex_destroy (&item->in_use))
1790	{
1791	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1792	  return false;
1793	}
1794      free (item);
1795    }
1796  agent->team_arena_list = NULL;
1797
1798  return true;
1799}
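
/* For illustration, the arena life cycle as used in this file: a dispatch
   for an OpenMP kernel calls get_team_arena, which either trylocks an idle
   arena's in_use mutex or allocates a new one; release_kernel_dispatch
   calls release_team_arena, which unlocks in_use so the arena can be
   reused; and destroy_team_arenas frees everything when the agent shuts
   down.  */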

/* Allocate memory on a specified device.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  /* Zero-size allocations are invalid, so in order to return a valid pointer
     we need to pass a valid size.  One source of zero-size allocations is
     kernargs for kernels that have no inputs or outputs (the kernel may
     only use console output, for example).  */
  if (size == 0)
    size = 4;

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}

/* Create kernel dispatch data structure for given KERNEL, along with
   the necessary device signals and memory allocations.  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;
  shadow->group_segment_size = kernel->group_segment_size;

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->heap_ptr = (int64_t)kernel->module->heap;

  /* Create an arena.  */
  if (kernel->kind == KIND_OPENMP)
    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
  else
    kernargs->arena_ptr = 0;

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}
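
/* For illustration, after create_kernel_dispatch the kernargs block looks
   roughly like this (the authoritative layout is struct kernargs, defined
   earlier in this file):

     kernargs->out_ptr   = &kernargs->output_data  (console ring buffer)
     kernargs->heap_ptr  = device heap
     kernargs->arena_ptr = team arena (OpenMP) or 0 (OpenACC)
     output_data.return_value = 0xcafe0000  (sentinel: exit not called)

   run_kernel later memcpys the kernel's single explicit pointer argument
   into the start of this block.  */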

/* Output any data written to console output from the kernel.  It is expected
   that this function is polled during kernel execution.

   We print all entries from the last item printed to the next entry without
   a "written" flag.  If the "final" flag is set then it will continue right
   to the end.

   The print buffer is circular, but the from and to locations don't wrap when
   the buffer does, so the output limit is UINT_MAX.  The target blocks on
   output when the buffer is full.  */

static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
		bool final)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));

  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
				       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
	printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      if (!data->written && !final)
	break;

      switch (data->type)
	{
	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
	default: printf ("GCN print buffer error!\n"); break;
	}
      data->written = 0;
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
			__ATOMIC_RELEASE);
    }
  fflush (stdout);
}
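
/* Worked example: with a 4-entry queue, consumed == 3 and next_output == 6
   means entries 3, 4 and 5 are pending, stored at slots 3 % 4 == 3,
   4 % 4 == 0 and 5 % 4 == 1.  Because both indices increase monotonically,
   from > to can only happen once next_output wraps past UINT_MAX, which is
   reported as an overflow above.  Entry types 0-3 select the int, double,
   string-with-newline and bare string formats respectively.  */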

/* Release the data structure created for a kernel dispatch in the SHADOW
   argument, and clean up its signal and memory allocations.  */

static void
release_kernel_dispatch (struct kernel_dispatch *shadow)
{
  GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);

  struct kernargs *kernargs = shadow->kernarg_address;
  void *arena = (void *)kernargs->arena_ptr;
  if (arena)
    release_team_arena (shadow->agent, arena);

  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);

  hsa_signal_t s;
  s.handle = shadow->signal;
  hsa_fns.hsa_signal_destroy_fn (s);

  free (shadow);
}

/* Extract the properties from a kernel binary.  */

static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 kernel->name, agent->id,
						 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "kernel name not found: '%s'\n", kernel->name);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
	       status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with the "
	     "following segment sizes:\n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
	     (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
	     (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
	     (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  kernel->initialization_failed = true;
}

/* Do all the work that is necessary before running KERNEL for the first time.
   The function assumes the program has been created, finalized and frozen by
   create_and_finalize_hsa_program.  */

static void
init_kernel (struct kernel_info *kernel)
{
  if (pthread_mutex_lock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
  if (kernel->initialized)
    {
      if (pthread_mutex_unlock (&kernel->init_mutex))
	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
			   "mutex");

      return;
    }

  init_kernel_properties (kernel);

  if (!kernel->initialization_failed)
    {
      GCN_DEBUG ("\n");

      kernel->initialized = true;
    }
  if (pthread_mutex_unlock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
		       "mutex");
}

/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.  */

static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (256 / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  struct agent_info *agent = kernel->agent;
  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (unsigned long long int) index);

  /* Wait until the queue is not full before writing the packet.  */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = kernel->private_segment_size;
  packet->group_segment_size = kernel->group_segment_size;
  packet->kernel_object = kernel->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
				       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      console_output (kernel, shadow->kernarg_address, false);
    }
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; /* Exit not called; normal termination.  */
  else if (upper == 0xffff)
    ; /* Exit called.  */
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, two most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
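
/* Worked example of the return-value encoding decoded above: the sentinel
   0xcafe0000 set in create_kernel_dispatch survives untouched when the
   kernel never calls exit.  If the kernel exits, the top half is 0xffff,
   bits 8-15 carry a signal number and bits 0-7 the exit code; for instance
   0xffff0600 means SIGABRT (6) was raised, while 0xffff0002 means a plain
   exit (2).  Anything else in the top half indicates corruption and aborts
   the host.  */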

/* }}}  */
/* {{{ Load/Unload  */

/* Initialize KERNEL from D and other parameters.  Return true on success.  */

static bool
init_basic_kernel_info (struct kernel_info *kernel,
			struct hsa_kernel_description *d,
			struct agent_info *agent,
			struct module_info *module)
{
  kernel->agent = agent;
  kernel->module = module;
  kernel->name = d->name;
  kernel->description = d;
  if (pthread_mutex_init (&kernel->init_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
      return false;
    }
  return true;
}

/* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true.  If
   not found, return false.  */

static bool
find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
		  struct module_info *module, Elf64_Ehdr *image,
		  Elf64_Shdr *sections)
{
  bool res = false;

  hsa_status_t status;

  hsa_executable_symbol_t symbol;
  if (!find_executable_symbol (agent->executable, &symbol))
    return false;

  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not extract symbol address", status);
      return false;
    }

  char *symbol_name = get_executable_symbol_name (symbol);
  if (symbol_name == NULL)
    return false;

  /* Find the kernel function in ELF, and calculate actual load offset.  */
  for (int i = 0; i < image->e_shnum; i++)
    if (sections[i].sh_type == SHT_SYMTAB)
      {
	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
	char *strings = (char *)image + strtab->sh_offset;

	for (size_t offset = 0;
	     offset < sections[i].sh_size;
	     offset += sections[i].sh_entsize)
	  {
	    Elf64_Sym *sym = (Elf64_Sym*)((char*)image
					  + sections[i].sh_offset
					  + offset);
	    if (strcmp (symbol_name, strings + sym->st_name) == 0)
	      {
		*load_offset -= sym->st_value;
		res = true;
		break;
	      }
	  }
      }

  free (symbol_name);
  return res;
}
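
/* For example: if the symbol's loaded address reported by the HSA runtime
   is 0x7f5000001000 and its st_value in the ELF symbol table is 0x1000,
   then *load_offset becomes 0x7f5000000000, the base that the relocation
   fix-ups below add to every link-time address.  */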

/* Check that the GCN ISA of the given image matches the ISA of the agent.  */

static bool
isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
{
  int isa_field = elf_gcn_isa_field (image);
  const char* isa_s = isa_hsa_name (isa_field);
  if (!isa_s)
    {
      hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
      return false;
    }

  if (isa_field != agent->device_isa)
    {
      char msg[120];
      const char *agent_isa_s = isa_hsa_name (agent->device_isa);
      const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
      assert (agent_isa_s);
      assert (agent_isa_gcc_s);

      snprintf (msg, sizeof msg,
		"GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
		"Try to recompile with '-foffload=-march=%s'.\n",
		isa_s, agent_isa_s, agent_isa_gcc_s);

      hsa_error (msg, HSA_STATUS_ERROR);
      return false;
    }

  return true;
}
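
/* For example, loading a gfx803 image on a gfx900 device would produce:
     GCN code object ISA 'gfx803' does not match GPU ISA 'gfx900'.
     Try to recompile with '-foffload=-march=gfx900'.
   (assuming the ISA names follow the gfx* convention used by isa_hsa_name
   above).  */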

/* Create and finalize the program consisting of all loaded modules.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  int reloc_count = 0;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      if (!isa_matches_agent (agent, image))
	goto fail;

      /* Hide relocations from the HSA runtime loader.
	 Keep a copy of the unmodified section headers to use later.  */
      Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
						  + image->e_shoff);
      for (int i = image->e_shnum - 1; i >= 0; i--)
	{
	  if (image_sections[i].sh_type == SHT_RELA
	      || image_sections[i].sh_type == SHT_REL)
	    /* Change section type to something harmless.  */
	    image_sections[i].sh_type |= 0x80;
	}

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      if (!module->heap)
	{
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void**)&module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
			(module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}

    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

  if (agent->module)
    {
      struct module_info *module = agent->module;
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
      Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);

      Elf64_Addr load_offset;
      if (!find_load_offset (&load_offset, agent, module, image, sections))
	goto fail;

      /* Record the physical load address range.
	 We need this for data copies later.  */
      Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
      Elf64_Addr low = ~0, high = 0;
      for (int i = 0; i < image->e_phnum; i++)
	if (segments[i].p_memsz > 0)
	  {
	    if (segments[i].p_paddr < low)
	      low = segments[i].p_paddr;
	    if (segments[i].p_paddr + segments[i].p_memsz - 1 > high)
	      high = segments[i].p_paddr + segments[i].p_memsz - 1;
	  }
      module->phys_address_start = low + load_offset;
      module->phys_address_end = high + load_offset;

      /* Find the dynamic symbol table.  */
      Elf64_Shdr *dynsym = NULL;
      for (int i = 0; i < image->e_shnum; i++)
	if (sections[i].sh_type == SHT_DYNSYM)
	  {
	    dynsym = &sections[i];
	    break;
	  }

      /* Fix up relocations.  */
      for (int i = 0; i < image->e_shnum; i++)
	{
	  if (sections[i].sh_type == (SHT_RELA | 0x80))
	    for (size_t offset = 0;
		 offset < sections[i].sh_size;
		 offset += sections[i].sh_entsize)
	      {
		Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
						  + sections[i].sh_offset
						  + offset);
		Elf64_Sym *sym =
		  (dynsym
		   ? (Elf64_Sym*)((char*)image
				  + dynsym->sh_offset
				  + (dynsym->sh_entsize
				     * ELF64_R_SYM (reloc->r_info)))
		   : NULL);

		int64_t S = (sym ? sym->st_value : 0);
		int64_t P = reloc->r_offset + load_offset;
		int64_t A = reloc->r_addend;
		int64_t B = load_offset;
		int64_t V, size;
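
		/* Worked example: an R_AMDGPU_REL32 entry with symbol
		   value S = 0x2000, addend A = 4 and a patch site at
		   P = load_offset + 0x1000 stores V = S + A - P as a
		   4-byte value at P, i.e. a PC-relative reference to
		   the symbol.  */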
		switch (ELF64_R_TYPE (reloc->r_info))
		  {
		  case R_AMDGPU_ABS32_LO:
		    V = (S + A) & 0xFFFFFFFF;
		    size = 4;
		    break;
		  case R_AMDGPU_ABS32_HI:
		    V = (S + A) >> 32;
		    size = 4;
		    break;
		  case R_AMDGPU_ABS64:
		    V = S + A;
		    size = 8;
		    break;
		  case R_AMDGPU_REL32:
		    V = S + A - P;
		    size = 4;
		    break;
		  case R_AMDGPU_REL64:
		    /* FIXME
		       LLD seems to emit REL64 where the assembler has
		       ABS64.  This is clearly wrong because it's not what
		       the compiler is expecting.  Let's assume, for now,
		       that it's a bug.  In any case, GCN kernels are always
		       self-contained and therefore relative relocations
		       will have been resolved already, so this should be a
		       safe workaround.  */
		    V = S + A/* - P*/;
		    size = 8;
		    break;
		  case R_AMDGPU_ABS32:
		    V = S + A;
		    size = 4;
		    break;
		    /* TODO R_AMDGPU_GOTPCREL */
		    /* TODO R_AMDGPU_GOTPCREL32_LO */
		    /* TODO R_AMDGPU_GOTPCREL32_HI */
		  case R_AMDGPU_REL32_LO:
		    V = (S + A - P) & 0xFFFFFFFF;
		    size = 4;
		    break;
		  case R_AMDGPU_REL32_HI:
		    V = (S + A - P) >> 32;
		    size = 4;
		    break;
		  case R_AMDGPU_RELATIVE64:
		    V = B + A;
		    size = 8;
		    break;
		  default:
		    fprintf (stderr, "Error: unsupported relocation type"
			     " %d.\n", (int) ELF64_R_TYPE (reloc->r_info));
		    exit (1);
		  }
		status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
		if (status != HSA_STATUS_SUCCESS)
		  {
		    hsa_error ("Failed to fix up relocation", status);
		    goto fail;
		  }
		reloc_count++;
	      }
	}
    }

  GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
	     agent->device_id, reloc_count);

final:
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}

/* Free the HSA program in agent and everything associated with it and set
   agent->prog_finalized and the initialized flags of all kernels to false.
   Return TRUE on success.  */

static bool
destroy_hsa_program (struct agent_info *agent)
{
  if (!agent->prog_finalized)
    return true;

  hsa_status_t status;

  GCN_DEBUG ("Destroying the current GCN program.\n");

  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Could not destroy GCN executable", status);

  if (agent->module)
    {
      int i;
      for (i = 0; i < agent->module->kernel_count; i++)
	agent->module->kernels[i].initialized = false;

      if (agent->module->heap)
	{
	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
	  agent->module->heap = NULL;
	}
    }
  agent->prog_finalized = false;
  return true;
}

/* Deinitialize all information associated with MODULE and kernels within
   it.  Return TRUE on success.  */

static bool
destroy_module (struct module_info *module, bool locked)
{
  /* Run destructors before destroying module.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->fini_array_func)
    {
      init_kernel (module->fini_array_func);
      run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
    }
  module->constructors_run_p = false;

  int i;
  for (i = 0; i < module->kernel_count; i++)
    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
      {
	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
			   "mutex");
	return false;
      }

  return true;
}

/* }}}  */
/* {{{ Async  */

/* Execute a single entry from a given asynchronous queue.  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
	struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	pthread_mutex_lock (&placeholderp->mutex);

	while (!placeholderp->executed)
	  pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}

/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}

/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
   is not usually the case.  This is just a debug tool.  */

static void
drain_queue_synchronous (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);

  while (aq->queue_n > 0)
    {
      execute_queue_entry (aq, aq->queue_first);

      aq->queue_first = ((aq->queue_first + 1)
			 % ASYNC_QUEUE_SIZE);
      aq->queue_n--;
    }

  pthread_mutex_unlock (&aq->mutex);
}

/* Block the current thread until an async queue is writable.  The aq->mutex
   lock should be held on entry, and remains locked on exit.  */

static void
wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
{
  /* If the queue is full, wait until it is not.  */
  while (aq->queue_n == ASYNC_QUEUE_SIZE)
    pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
}
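
/* For illustration of the ring arithmetic used by the queue_push_*
   functions below: with ASYNC_QUEUE_SIZE == 64, queue_first == 62 and
   queue_n == 3, the pending entries occupy slots 62, 63 and 0, and the
   next entry is written at (62 + 3) % 64 == 1.  */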

/* Request an asynchronous kernel launch on the specified queue.  This
   may block if the queue is full, but returns without waiting for the
   kernel to run.  */

static void
queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
		   void *vars, struct GOMP_kernel_launch_attributes *kla)
{
  assert (aq->agent == kernel->agent);

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n)
		    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = KERNEL_LAUNCH;
  aq->queue[queue_last].u.launch.kernel = kernel;
  aq->queue[queue_last].u.launch.vars = vars;
  aq->queue[queue_last].u.launch.kla = *kla;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Request an asynchronous callback on the specified queue.  The callback
   function will be called, with the given opaque data, from the appropriate
   async thread, when all previous items on that queue are complete.  */

static void
queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
		     void *data)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n)
		    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = CALLBACK;
  aq->queue[queue_last].u.callback.fn = fn;
  aq->queue[queue_last].u.callback.data = data;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Request that a given async thread wait for another thread (unspecified) to
   reach the given placeholder.  The wait will occur when all previous entries
   on the queue are complete.  A placeholder is effectively a kind of signal
   which simply sets a flag when encountered in a queue.  */

static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
		      struct placeholder *placeholderp)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_WAIT;
  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Add a placeholder into an async queue.  When the async thread reaches the
   placeholder it will set the "executed" flag to true and continue.
   Another thread may be waiting on this thread reaching the placeholder.  */

static struct placeholder *
queue_push_placeholder (struct goacc_asyncqueue *aq)
{
  struct placeholder *placeholderp;

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
  placeholderp = &aq->queue[queue_last].u.placeholder;

  if (pthread_mutex_init (&placeholderp->mutex, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
    }

  if (pthread_cond_init (&placeholderp->cond, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
    }

  placeholderp->executed = 0;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);

  return placeholderp;
}
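
/* For illustration, a sketch of how the placeholder and asyncwait entries
   pair up to serialize two queues (this is how a wait on another async
   queue can be built from these primitives):

     struct placeholder *p = queue_push_placeholder (aq1);
     queue_push_asyncwait (aq2, p);

   aq2's drain thread blocks in execute_queue_entry until aq1's drain
   thread reaches the placeholder and signals p->cond; the asyncwait then
   destroys the placeholder's mutex and cond.  */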

/* Signal an asynchronous thread to terminate, and wait for it to do so.  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  if (aq->drain_queue_stop == 2)
    {
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}

/* Set up an async queue for OpenMP.  There will be only one.  The
   implementation simply uses an OpenACC async queue.
   FIXME: is this thread-safe if two threads call this function?  */

static void
maybe_init_omp_async (struct agent_info *agent)
{
  if (!agent->omp_async_queue)
    agent->omp_async_queue
      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
}

/* A wrapper that works around an issue in the HSA runtime with host-to-device
   copies from read-only pages.  */

static void
hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
{
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);

  if (status == HSA_STATUS_SUCCESS)
    return;

  /* It appears that the copy fails if the source data is in a read-only page.
     We can't detect that easily, so try copying the data to a temporary buffer
     and doing the copy again if we got an error above.  */

  GCN_WARNING ("Read-only data transfer bug workaround triggered for "
	       "[%p:+%d]\n", (void *) src, (int) len);

  void *src_copy = malloc (len);
  if (!src_copy)
    {
      GOMP_PLUGIN_error ("memory allocation for copy workaround failed");
      return;
    }
  memcpy (src_copy, src, len);
  status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
  free (src_copy);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
}
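
/* A plausible trigger for the workaround above is copying initialized data
   that the host linker placed in a read-only segment, such as a string
   literal or a const array, directly to the device.  */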

/* Copy data to or from a device.  This is intended for use as an async
   callback event.  */

static void
copy_data (void *data_)
{
  struct copy_data *data = (struct copy_data *)data_;
  GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
	     data->dst);
  hsa_memory_copy_wrapper (data->dst, data->src, data->len);
  if (data->free_src)
    free ((void *) data->src);
  free (data);
}

/* Free device data.  This is intended for use as an async callback event.  */

static void
gomp_offload_free (void *ptr)
{
  GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
  GOMP_OFFLOAD_free (0, ptr);
}

/* Request an asynchronous data copy, to or from a device, on a given queue.
   The event will be registered as a callback.  If FREE_SRC is true
   then the source data will be freed following the copy.  */

static void
queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
		 size_t len, bool free_src)
{
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
	       aq->agent->device_id, aq->id, len, src, dst);
  struct copy_data *data
    = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
  data->dst = dst;
  data->src = src;
  data->len = len;
  data->free_src = free_src;
  data->aq = aq;
  queue_push_callback (aq, copy_data, data);
}

/* Return true if the given queue is currently empty.  */

static int
queue_empty (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  int res = aq->queue_n == 0 ? 1 : 0;
  pthread_mutex_unlock (&aq->mutex);

  return res;
}

/* Wait for a given queue to become empty.  This implements an OpenACC wait
   directive.  */

static void
wait_queue (struct goacc_asyncqueue *aq)
{
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      drain_queue_synchronous (aq);
      return;
    }

  pthread_mutex_lock (&aq->mutex);

  while (aq->queue_n > 0)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
		   aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
		   aq->id);
    }

  pthread_mutex_unlock (&aq->mutex);
  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
}

/* }}}  */
/* {{{ OpenACC support  */

/* Execute an OpenACC kernel, synchronously or asynchronously.  */

static void
gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
	  struct goacc_asyncqueue *aq)
{
  if (!GOMP_OFFLOAD_can_run (kernel))
    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");

  /* If we get here then this must be an OpenACC kernel.  */
  kernel->kind = KIND_OPENACC;

  /* devaddrs must be double-indirect on the target.  */
  void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
  for (size_t i = 0; i < mapnum; i++)
    hsa_fns.hsa_memory_copy_fn (&ind_da[i],
				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
				sizeof (void *));

  struct hsa_kernel_description *hsa_kernel_desc = NULL;
  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
    {
      struct hsa_kernel_description *d
	= &kernel->module->image_desc->kernel_infos[i];
      /* Pointer comparison is intentional here: kernel->name was set from
	 this same kernel_infos table in init_basic_kernel_info.  */
      if (d->name == kernel->name)
	{
	  hsa_kernel_desc = d;
	  break;
	}
    }

  /* We may have statically-determined dimensions in
     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
     invocation at runtime in dims[].  We allow static dimensions to take
     priority over dynamic dimensions when present (non-zero).  */
  if (hsa_kernel_desc->oacc_dims[0] > 0)
    dims[0] = hsa_kernel_desc->oacc_dims[0];
  if (hsa_kernel_desc->oacc_dims[1] > 0)
    dims[1] = hsa_kernel_desc->oacc_dims[1];
  if (hsa_kernel_desc->oacc_dims[2] > 0)
    dims[2] = hsa_kernel_desc->oacc_dims[2];

  /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
     There isn't really a correct answer for this without a clue about the
     problem size, so let's do a reasonable number of single-worker gangs.
     64 gangs matches a typical Fiji device.  */

  /* NOTE: Until support for middle-end worker partitioning is merged, use 1
     for the default number of workers.  */
  if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs.  */
  if (dims[1] == 0) dims[1] = 1;  /* Workers.  */

  /* The incoming dimensions are expressed in terms of gangs, workers, and
     vectors.  The HSA dimensions are expressed in terms of "work-items",
     which means multiples of vector lanes.

     The "grid size" specifies the size of the problem space, and the
     "work-group size" specifies how much of that we want a single compute
     unit to chew on at once.

     The three dimensions do not really correspond to hardware, but the
     important thing is that the HSA runtime will launch as many
     work-groups as it takes to process the entire grid, and each
     work-group will contain as many wave-fronts as it takes to process
     the work-items in that group.

     Essentially, as long as we set the Y dimension to 64 (the number of
     vector lanes in hardware), and the Z group size to the maximum (16),
     then we will get the gangs (X) and workers (Z) launched as we expect.

     The reason for the apparent reversal of vector and worker dimension
     order is to do with the way the run-time distributes work-items across
     v1 and v2.  */
  struct GOMP_kernel_launch_attributes kla =
    {3,
     /* Grid size.  */
     {dims[0], 64, dims[1]},
     /* Work-group size.  */
     {1,       64, 16}
    };
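
  /* Worked example: with dims[] resolved to 64 gangs and 16 workers, the
     grid is {64, 64, 16} and the work-group size {1, 64, 16}, so the HSA
     runtime launches 64 work-groups (one per gang), each containing
     64 * 16 == 1024 work-items, i.e. 16 wave-fronts of 64 lanes: one
     wave-front per worker.  */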

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
	= (char *) kernel->name;
      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
	&enqueue_launch_event_info, api_info);
    }

  if (!async)
    {
      run_kernel (kernel, ind_da, &kla, NULL, false);
      gomp_offload_free (ind_da);
    }
  else
    {
      queue_push_launch (aq, kernel, ind_da, &kla);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
		   aq->agent->device_id, aq->id, ind_da);
      queue_push_callback (aq, gomp_offload_free, ind_da);
    }

  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }
}
3351
3352/* }}}  */
3353/* {{{ Generic Plugin API  */
3354
3355/* Return the name of the accelerator, which is "gcn".  */
3356
3357const char *
3358GOMP_OFFLOAD_get_name (void)
3359{
3360  return "gcn";
3361}
3362
3363/* Return the specific capabilities the HSA accelerator have.  */

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
	    | GOMP_OFFLOAD_CAP_OPENACC_200;
}

/* Identify as GCN accelerator.  */

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_GCN;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Return the number of GCN devices on the system.  */

int
GOMP_OFFLOAD_get_num_devices (void)
{
  if (!init_hsa_context ())
    return 0;
  return hsa_context.agent_count;
}

/* Initialize device (agent) number N so that it can be used for computation.
   Return TRUE on success.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context ())
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  if (agent->initialized)
    return true;

  agent->device_id = n;

  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->team_arena_list = NULL;

  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
					  &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
		      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
					  &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa < 0)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
					  &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
					HSA_QUEUE_TYPE_MULTI,
					hsa_queue_callback, NULL, UINT32_MAX,
					UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

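  /* An all-ones handle serves as a "no suitable region found yet" sentinel;
     the iterator callback overwrites it with the first matching region, as
     the checks below rely on.  */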
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}

/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  unsigned var_count = image_desc->global_variable_count;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
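  /* The "- 2" assumes the image provides both an _init_array and a
     _fini_array kernel; those two are recognized in the loop below and
     omitted from the table reported to libgomp.  */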
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
			     * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
				+ kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
	return -1;
      if (strcmp (d->name, "_init_array") == 0)
	module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
	module->fini_array_func = kernel;
      else
	{
	  pair->start = (uintptr_t) kernel;
	  pair->end = (uintptr_t) (kernel + 1);
	  pair++;
	}
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  for (unsigned i = 0; i < var_count; i++)
    {
      struct global_var_info *v = &image_desc->global_variables[i];
      GCN_DEBUG ("Looking for variable %s\n", v->name);

      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     v->name, agent->id,
						     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);

      uint64_t var_addr;
      uint32_t var_size;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol", status);

      pair->start = var_addr;
      pair->end = var_addr + var_size;
      GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
		 (void *)var_addr, var_size);
      pair++;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };
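  /* That is, a single work-group of one 64-lane wavefront, which is
     presumably all the serial constructor code needs.  */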

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  return kernel_count + var_count;
}

/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
			 "loaded before");
      return false;
    }

  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}

/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  if (!agent->initialized)
    return true;

  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_team_arenas (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}

/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    goto failure;

  return true;

failure:
  if (suppress_host_fallback)
    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
  GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
  return false;
}

/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}

/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

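  /* If an OpenACC profiling callback is active on this thread, fire an
     acc_ev_free event for the freed pointer.  */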
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return true;
}

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
	     src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

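  /* If the calling thread has selected a non-synchronous async mode, queue
     the copy on the agent's OpenMP async queue rather than blocking.  */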
  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
	     device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* }}}  */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
			void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

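  /* The OpenMP async queue is created lazily, on first use; the callback
     pushed after the launch is what eventually notifies libgomp that the
     target task has completed.  */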
  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
		       GOMP_PLUGIN_target_task_completion, async_data);
}

/* }}} */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs, unsigned *dims,
			   void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
	    NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
	    aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;
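
  /* Entry-queue state: QUEUE_FIRST indexes the oldest entry and QUEUE_N
     counts the valid entries, forming a circular buffer of ASYNC_QUEUE_SIZE
     slots consumed by the drain thread.  */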

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
						     ASYNC_QUEUE_SIZE,
						     HSA_QUEUE_TYPE_MULTI,
						     hsa_queue_callback, NULL,
						     UINT32_MAX, UINT32_MAX,
						     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
		       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
	     aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2 after this call will not run until all tasks that were on AQ1 at the
   time of this call have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  /* The source data does not necessarily remain live until the deferred
     copy happens.  Taking a snapshot of the data here avoids reading
     uninitialised data later, but means that (a) data is copied twice and
     (b) modifications to the copied data between the "spawning" point of
     the asynchronous kernel and when it is executed will not be seen.
     But, that is probably correct.  */
  void *src_copy = GOMP_PLUGIN_malloc (n);
  memcpy (src_copy, src, n);
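  /* The final TRUE argument asks the queue to free SRC_COPY once the copy
     has completed.  */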
  queue_push_copy (aq, dst, src_copy, n, true);
  return true;
}

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n, false);
  return true;
}

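/* Return the relevant property of agent number DEVICE, in the manner of
   OpenACC's acc_get_property.  */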
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
	size_t size;
	hsa_region_t region = agent->data_region;
	hsa_status_t status =
	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
	if (status == HSA_STATUS_SUCCESS)
	  propval.val = size;
	break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}

/* Set up plugin-specific thread-local-data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}} */
