1/* Plugin for AMD GCN execution.
2
3   Copyright (C) 2013-2022 Free Software Foundation, Inc.
4
5   Contributed by Mentor Embedded
6
7   This file is part of the GNU Offloading and Multi Processing Library
8   (libgomp).
9
10   Libgomp is free software; you can redistribute it and/or modify it
11   under the terms of the GNU General Public License as published by
12   the Free Software Foundation; either version 3, or (at your option)
13   any later version.
14
15   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
18   more details.
19
20   Under Section 7 of GPL version 3, you are granted additional
21   permissions described in the GCC Runtime Library Exception, version
22   3.1, as published by the Free Software Foundation.
23
24   You should have received a copy of the GNU General Public License and
25   a copy of the GCC Runtime Library Exception along with this program;
26   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
27   <http://www.gnu.org/licenses/>.  */
28
29/* {{{ Includes and defines  */
30
31#include "config.h"
32#include "symcat.h"
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <pthread.h>
37#include <inttypes.h>
38#include <stdbool.h>
39#include <limits.h>
40#include <hsa.h>
41#include <hsa_ext_amd.h>
42#include <dlfcn.h>
43#include <signal.h>
44#include "libgomp-plugin.h"
45#include "gomp-constants.h"
46#include <elf.h>
47#include "oacc-plugin.h"
48#include "oacc-int.h"
49#include <assert.h>
50
51/* These probably won't be in elf.h for a while.  */
52#ifndef R_AMDGPU_NONE
53#define R_AMDGPU_NONE		0
54#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
55#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
56#define R_AMDGPU_ABS64		3	/* S + A  */
57#define R_AMDGPU_REL32		4	/* S + A - P  */
58#define R_AMDGPU_REL64		5	/* S + A - P  */
59#define R_AMDGPU_ABS32		6	/* S + A  */
60#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
61#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
62#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
63#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
64#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
65#define R_AMDGPU_RELATIVE64	13	/* B + A  */
66#endif
67
68/* GCN specific definitions for asynchronous queues.  */
69
70#define ASYNC_QUEUE_SIZE 64
71#define DRAIN_QUEUE_SYNCHRONOUS_P false
72#define DEBUG_QUEUES 0
73#define DEBUG_THREAD_SLEEP 0
74#define DEBUG_THREAD_SIGNAL 0
75
76/* Defaults.  */
77#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
78
79/* Secure getenv() which returns NULL if running as SUID/SGID.  */
80#ifndef HAVE_SECURE_GETENV
81#ifdef HAVE___SECURE_GETENV
82#define secure_getenv __secure_getenv
83#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
84  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
85
86#include <unistd.h>
87
88/* Implementation of secure_getenv() for targets where it is not provided but
89   we have at least means to test real and effective IDs. */
90
91static char *
92secure_getenv (const char *name)
93{
94  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
95    return getenv (name);
96  else
97    return NULL;
98}
99
100#else
101#define secure_getenv getenv
102#endif
103#endif
104
105/* }}}  */
106/* {{{ Types  */
107
108/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
109
110struct gcn_thread
111{
112  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
113  int async;
114};
115
116/* As an HSA runtime is dlopened, following structure defines function
117   pointers utilized by the HSA plug-in.  */
118
119struct hsa_runtime_fn_info
120{
121  /* HSA runtime.  */
122  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
123					const char **status_string);
124  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
125					  void *value);
126  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
127					 hsa_agent_info_t attribute,
128					 void *value);
129  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
130				      hsa_isa_info_t attribute,
131				      uint32_t index,
132				      void *value);
133  hsa_status_t (*hsa_init_fn) (void);
134  hsa_status_t (*hsa_iterate_agents_fn)
135    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
136  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
137					  hsa_region_info_t attribute,
138					  void *value);
139  hsa_status_t (*hsa_queue_create_fn)
140    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
141     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
142     void *data, uint32_t private_segment_size,
143     uint32_t group_segment_size, hsa_queue_t **queue);
144  hsa_status_t (*hsa_agent_iterate_regions_fn)
145    (hsa_agent_t agent,
146     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
147  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
148  hsa_status_t (*hsa_executable_create_fn)
149    (hsa_profile_t profile, hsa_executable_state_t executable_state,
150     const char *options, hsa_executable_t *executable);
151  hsa_status_t (*hsa_executable_global_variable_define_fn)
152    (hsa_executable_t executable, const char *variable_name, void *address);
153  hsa_status_t (*hsa_executable_load_code_object_fn)
154    (hsa_executable_t executable, hsa_agent_t agent,
155     hsa_code_object_t code_object, const char *options);
156  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
157					   const char *options);
158  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
159					uint32_t num_consumers,
160					const hsa_agent_t *consumers,
161					hsa_signal_t *signal);
162  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
163					  void **ptr);
164  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
165					      hsa_access_permission_t access);
166  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
167  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
168  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
169  hsa_status_t (*hsa_executable_get_symbol_fn)
170    (hsa_executable_t executable, const char *module_name,
171     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
172     hsa_executable_symbol_t *symbol);
173  hsa_status_t (*hsa_executable_symbol_get_info_fn)
174    (hsa_executable_symbol_t executable_symbol,
175     hsa_executable_symbol_info_t attribute, void *value);
176  hsa_status_t (*hsa_executable_iterate_symbols_fn)
177    (hsa_executable_t executable,
178     hsa_status_t (*callback)(hsa_executable_t executable,
179			      hsa_executable_symbol_t symbol, void *data),
180     void *data);
181  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
182						    uint64_t value);
183  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
184  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
185				       hsa_signal_value_t value);
186  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
187				       hsa_signal_value_t value);
188  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
189    (hsa_signal_t signal, hsa_signal_condition_t condition,
190     hsa_signal_value_t compare_value, uint64_t timeout_hint,
191     hsa_wait_state_t wait_state_hint);
192  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
193  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
194
195  hsa_status_t (*hsa_code_object_deserialize_fn)
196    (void *serialized_code_object, size_t serialized_code_object_size,
197     const char *options, hsa_code_object_t *code_object);
198};
199
200/* Structure describing the run-time and grid properties of an HSA kernel
201   lauch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
202
203struct GOMP_kernel_launch_attributes
204{
205  /* Number of dimensions the workload has.  Maximum number is 3.  */
206  uint32_t ndim;
207  /* Size of the grid in the three respective dimensions.  */
208  uint32_t gdims[3];
209  /* Size of work-groups in the respective dimensions.  */
210  uint32_t wdims[3];
211};
212
213/* Collection of information needed for a dispatch of a kernel from a
214   kernel.  */
215
216struct kernel_dispatch
217{
218  struct agent_info *agent;
219  /* Pointer to a command queue associated with a kernel dispatch agent.  */
220  void *queue;
221  /* Pointer to a memory space used for kernel arguments passing.  */
222  void *kernarg_address;
223  /* Kernel object.  */
224  uint64_t object;
225  /* Synchronization signal used for dispatch synchronization.  */
226  uint64_t signal;
227  /* Private segment size.  */
228  uint32_t private_segment_size;
229  /* Group segment size.  */
230  uint32_t group_segment_size;
231};
232
233/* Structure of the kernargs segment, supporting console output.
234
235   This needs to match the definitions in Newlib, and the expectations
236   in libgomp target code.  */
237
238struct kernargs {
239  /* Leave space for the real kernel arguments.
240     OpenACC and OpenMP only use one pointer.  */
241  int64_t dummy1;
242  int64_t dummy2;
243
244  /* A pointer to struct output, below, for console output data.  */
245  int64_t out_ptr;
246
247  /* A pointer to struct heap, below.  */
248  int64_t heap_ptr;
249
250  /* A pointer to an ephemeral memory arena.
251    Only needed for OpenMP.  */
252  int64_t arena_ptr;
253
254  /* Output data.  */
255  struct output {
256    int return_value;
257    unsigned int next_output;
258    struct printf_data {
259      int written;
260      char msg[128];
261      int type;
262      union {
263	int64_t ivalue;
264	double dvalue;
265	char text[128];
266      };
267    } queue[1024];
268    unsigned int consumed;
269  } output_data;
270};
271
272/* A queue entry for a future asynchronous launch.  */
273
274struct kernel_launch
275{
276  struct kernel_info *kernel;
277  void *vars;
278  struct GOMP_kernel_launch_attributes kla;
279};
280
281/* A queue entry for a future callback.  */
282
283struct callback
284{
285  void (*fn)(void *);
286  void *data;
287};
288
289/* A data struct for the copy_data callback.  */
290
291struct copy_data
292{
293  void *dst;
294  const void *src;
295  size_t len;
296  struct goacc_asyncqueue *aq;
297};
298
299/* A queue entry for a placeholder.  These correspond to a wait event.  */
300
301struct placeholder
302{
303  int executed;
304  pthread_cond_t cond;
305  pthread_mutex_t mutex;
306};
307
308/* A queue entry for a wait directive.  */
309
310struct asyncwait_info
311{
312  struct placeholder *placeholderp;
313};
314
315/* Encode the type of an entry in an async queue.  */
316
317enum entry_type
318{
319  KERNEL_LAUNCH,
320  CALLBACK,
321  ASYNC_WAIT,
322  ASYNC_PLACEHOLDER
323};
324
325/* An entry in an async queue.  */
326
327struct queue_entry
328{
329  enum entry_type type;
330  union {
331    struct kernel_launch launch;
332    struct callback callback;
333    struct asyncwait_info asyncwait;
334    struct placeholder placeholder;
335  } u;
336};
337
338/* An async queue header.
339
340   OpenMP may create one of these.
341   OpenACC may create many.  */
342
343struct goacc_asyncqueue
344{
345  struct agent_info *agent;
346  hsa_queue_t *hsa_queue;
347
348  pthread_t thread_drain_queue;
349  pthread_mutex_t mutex;
350  pthread_cond_t queue_cond_in;
351  pthread_cond_t queue_cond_out;
352  struct queue_entry queue[ASYNC_QUEUE_SIZE];
353  int queue_first;
354  int queue_n;
355  int drain_queue_stop;
356
357  int id;
358  struct goacc_asyncqueue *prev;
359  struct goacc_asyncqueue *next;
360};
361
362/* Mkoffload uses this structure to describe a kernel.
363
364   OpenMP kernel dimensions are passed at runtime.
365   OpenACC kernel dimensions are passed at compile time, here.  */
366
367struct hsa_kernel_description
368{
369  const char *name;
370  int oacc_dims[3];  /* Only present for GCN kernels.  */
371  int sgpr_count;
372  int vpgr_count;
373};
374
375/* Mkoffload uses this structure to describe an offload variable.  */
376
377struct global_var_info
378{
379  const char *name;
380  void *address;
381};
382
383/* Mkoffload uses this structure to describe all the kernels in a
384   loadable module.  These are passed the libgomp via static constructors.  */
385
386struct gcn_image_desc
387{
388  struct gcn_image {
389    size_t size;
390    void *image;
391  } *gcn_image;
392  const unsigned kernel_count;
393  struct hsa_kernel_description *kernel_infos;
394  const unsigned global_variable_count;
395};
396
397/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
398   support.
399   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
400
401typedef enum {
402  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
403  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
404  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
405  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030
406} EF_AMDGPU_MACH;
407
408const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
409typedef EF_AMDGPU_MACH gcn_isa;
410
411/* Description of an HSA GPU agent (device) and the program associated with
412   it.  */
413
414struct agent_info
415{
416  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
417  hsa_agent_t id;
418  /* The user-visible device number.  */
419  int device_id;
420  /* Whether the agent has been initialized.  The fields below are usable only
421     if it has been.  */
422  bool initialized;
423
424  /* The instruction set architecture of the device. */
425  gcn_isa device_isa;
426  /* Name of the agent. */
427  char name[64];
428  /* Name of the vendor of the agent. */
429  char vendor_name[64];
430  /* Command queues of the agent.  */
431  hsa_queue_t *sync_queue;
432  struct goacc_asyncqueue *async_queues, *omp_async_queue;
433  pthread_mutex_t async_queues_mutex;
434
435  /* The HSA memory region from which to allocate kernel arguments.  */
436  hsa_region_t kernarg_region;
437
438  /* The HSA memory region from which to allocate device data.  */
439  hsa_region_t data_region;
440
441  /* Allocated team arenas.  */
442  struct team_arena_list *team_arena_list;
443  pthread_mutex_t team_arena_write_lock;
444
445  /* Read-write lock that protects kernels which are running or about to be run
446     from interference with loading and unloading of images.  Needs to be
447     locked for reading while a kernel is being run, and for writing if the
448     list of modules is manipulated (and thus the HSA program invalidated).  */
449  pthread_rwlock_t module_rwlock;
450
451  /* The module associated with this kernel.  */
452  struct module_info *module;
453
454  /* Mutex enforcing that only one thread will finalize the HSA program.  A
455     thread should have locked agent->module_rwlock for reading before
456     acquiring it.  */
457  pthread_mutex_t prog_mutex;
458  /* Flag whether the HSA program that consists of all the modules has been
459     finalized.  */
460  bool prog_finalized;
461  /* HSA executable - the finalized program that is used to locate kernels.  */
462  hsa_executable_t executable;
463};
464
465/* Information required to identify, finalize and run any given kernel.  */
466
467enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
468
469struct kernel_info
470{
471  /* Name of the kernel, required to locate it within the GCN object-code
472     module.  */
473  const char *name;
474  /* The specific agent the kernel has been or will be finalized for and run
475     on.  */
476  struct agent_info *agent;
477  /* The specific module where the kernel takes place.  */
478  struct module_info *module;
479  /* Information provided by mkoffload associated with the kernel.  */
480  struct hsa_kernel_description *description;
481  /* Mutex enforcing that at most once thread ever initializes a kernel for
482     use.  A thread should have locked agent->module_rwlock for reading before
483     acquiring it.  */
484  pthread_mutex_t init_mutex;
485  /* Flag indicating whether the kernel has been initialized and all fields
486     below it contain valid data.  */
487  bool initialized;
488  /* Flag indicating that the kernel has a problem that blocks an execution.  */
489  bool initialization_failed;
490  /* The object to be put into the dispatch queue.  */
491  uint64_t object;
492  /* Required size of kernel arguments.  */
493  uint32_t kernarg_segment_size;
494  /* Required size of group segment.  */
495  uint32_t group_segment_size;
496  /* Required size of private segment.  */
497  uint32_t private_segment_size;
498  /* Set up for OpenMP or OpenACC?  */
499  enum offload_kind kind;
500};
501
502/* Information about a particular GCN module, its image and kernels.  */
503
504struct module_info
505{
506  /* The description with which the program has registered the image.  */
507  struct gcn_image_desc *image_desc;
508  /* GCN heap allocation.  */
509  struct heap *heap;
510  /* Physical boundaries of the loaded module.  */
511  Elf64_Addr phys_address_start;
512  Elf64_Addr phys_address_end;
513
514  bool constructors_run_p;
515  struct kernel_info *init_array_func, *fini_array_func;
516
517  /* Number of kernels in this module.  */
518  int kernel_count;
519  /* An array of kernel_info structures describing each kernel in this
520     module.  */
521  struct kernel_info kernels[];
522};
523
524/* A linked list of memory arenas allocated on the device.
525   These are only used by OpenMP, as a means to optimize per-team malloc.  */
526
527struct team_arena_list
528{
529  struct team_arena_list *next;
530
531  /* The number of teams determines the size of the allocation.  */
532  int num_teams;
533  /* The device address of the arena itself.  */
534  void *arena;
535  /* A flag to prevent two asynchronous kernels trying to use the same arena.
536     The mutex is locked until the kernel exits.  */
537  pthread_mutex_t in_use;
538};
539
540/* Information about the whole HSA environment and all of its agents.  */
541
542struct hsa_context_info
543{
544  /* Whether the structure has been initialized.  */
545  bool initialized;
546  /* Number of usable GPU HSA agents in the system.  */
547  int agent_count;
548  /* Array of agent_info structures describing the individual HSA agents.  */
549  struct agent_info *agents;
550  /* Driver version string. */
551  char driver_version_s[30];
552};
553
554/* Format of the on-device heap.
555
556   This must match the definition in Newlib and gcn-run.  */
557
558struct heap {
559  int64_t size;
560  char data[0];
561};
562
563/* }}}  */
564/* {{{ Global variables  */
565
566/* Information about the whole HSA environment and all of its agents.  */
567
568static struct hsa_context_info hsa_context;
569
570/* HSA runtime functions that are initialized in init_hsa_context.  */
571
572static struct hsa_runtime_fn_info hsa_fns;
573
574/* Heap space, allocated target-side, provided for use of newlib malloc.
575   Each module should have it's own heap allocated.
576   Beware that heap usage increases with OpenMP teams.  See also arenas.  */
577
578static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
579
580/* Flag to decide whether print to stderr information about what is going on.
581   Set in init_debug depending on environment variables.  */
582
583static bool debug;
584
585/* Flag to decide if the runtime should suppress a possible fallback to host
586   execution.  */
587
588static bool suppress_host_fallback;
589
590/* Flag to locate HSA runtime shared library that is dlopened
591   by this plug-in.  */
592
593static const char *hsa_runtime_lib;
594
595/* Flag to decide if the runtime should support also CPU devices (can be
596   a simulator).  */
597
598static bool support_cpu_devices;
599
600/* Runtime dimension overrides.  Zero indicates default.  */
601
602static int override_x_dim = 0;
603static int override_z_dim = 0;
604
605/* }}}  */
606/* {{{ Debug & Diagnostic  */
607
608/* Print a message to stderr if GCN_DEBUG value is set to true.  */
609
610#define DEBUG_PRINT(...) \
611  do \
612  { \
613    if (debug) \
614      { \
615	fprintf (stderr, __VA_ARGS__); \
616      } \
617  } \
618  while (false);
619
620/* Flush stderr if GCN_DEBUG value is set to true.  */
621
622#define DEBUG_FLUSH()				\
623  do {						\
624    if (debug)					\
625      fflush (stderr);				\
626  } while (false)
627
628/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
629   is set to true.  */
630
631#define DEBUG_LOG(prefix, ...)			\
632  do						\
633    {						\
634      DEBUG_PRINT (prefix);			\
635      DEBUG_PRINT (__VA_ARGS__);			\
636      DEBUG_FLUSH ();				\
637    } while (false)
638
639/* Print a debugging message to stderr.  */
640
641#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
642
643/* Print a warning message to stderr.  */
644
645#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
646
647/* Print HSA warning STR with an HSA STATUS code.  */
648
649static void
650hsa_warn (const char *str, hsa_status_t status)
651{
652  if (!debug)
653    return;
654
655  const char *hsa_error_msg = "[unknown]";
656  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
657
658  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
659	   hsa_error_msg);
660}
661
662/* Report a fatal error STR together with the HSA error corresponding to STATUS
663   and terminate execution of the current process.  */
664
665static void
666hsa_fatal (const char *str, hsa_status_t status)
667{
668  const char *hsa_error_msg = "[unknown]";
669  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
670  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
671		     hsa_error_msg);
672}
673
674/* Like hsa_fatal, except only report error message, and return FALSE
675   for propagating error processing to outside of plugin.  */
676
677static bool
678hsa_error (const char *str, hsa_status_t status)
679{
680  const char *hsa_error_msg = "[unknown]";
681  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
682  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
683		     hsa_error_msg);
684  return false;
685}
686
687/* Dump information about the available hardware.  */
688
689static void
690dump_hsa_system_info (void)
691{
692  hsa_status_t status;
693
694  hsa_endianness_t endianness;
695  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
696					   &endianness);
697  if (status == HSA_STATUS_SUCCESS)
698    switch (endianness)
699      {
700      case HSA_ENDIANNESS_LITTLE:
701	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
702	break;
703      case HSA_ENDIANNESS_BIG:
704	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
705	break;
706      default:
707	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
708      }
709  else
710    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
711
712  uint8_t extensions[128];
713  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
714					   &extensions);
715  if (status == HSA_STATUS_SUCCESS)
716    {
717      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
718	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
719    }
720  else
721    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
722}
723
724/* Dump information about the available hardware.  */
725
726static void
727dump_machine_model (hsa_machine_model_t machine_model, const char *s)
728{
729  switch (machine_model)
730    {
731    case HSA_MACHINE_MODEL_SMALL:
732      GCN_DEBUG ("%s: SMALL\n", s);
733      break;
734    case HSA_MACHINE_MODEL_LARGE:
735      GCN_DEBUG ("%s: LARGE\n", s);
736      break;
737    default:
738      GCN_WARNING ("%s: UNKNOWN\n", s);
739      break;
740    }
741}
742
743/* Dump information about the available hardware.  */
744
745static void
746dump_profile (hsa_profile_t profile, const char *s)
747{
748  switch (profile)
749    {
750    case HSA_PROFILE_FULL:
751      GCN_DEBUG ("%s: FULL\n", s);
752      break;
753    case HSA_PROFILE_BASE:
754      GCN_DEBUG ("%s: BASE\n", s);
755      break;
756    default:
757      GCN_WARNING ("%s: UNKNOWN\n", s);
758      break;
759    }
760}
761
762/* Dump information about a device memory region.  */
763
764static hsa_status_t
765dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
766{
767  hsa_status_t status;
768
769  hsa_region_segment_t segment;
770  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
771					   &segment);
772  if (status == HSA_STATUS_SUCCESS)
773    {
774      if (segment == HSA_REGION_SEGMENT_GLOBAL)
775	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
776      else if (segment == HSA_REGION_SEGMENT_READONLY)
777	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
778      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
779	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
780      else if (segment == HSA_REGION_SEGMENT_GROUP)
781	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
782      else
783	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
784    }
785  else
786    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
787
788  if (segment == HSA_REGION_SEGMENT_GLOBAL)
789    {
790      uint32_t flags;
791      status
792	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
793					  &flags);
794      if (status == HSA_STATUS_SUCCESS)
795	{
796	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
797	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
798	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
799	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
800	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
801	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
802	}
803      else
804	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
805    }
806
807  size_t size;
808  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
809  if (status == HSA_STATUS_SUCCESS)
810    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
811  else
812    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
813
814  status
815    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
816				      &size);
817  if (status == HSA_STATUS_SUCCESS)
818    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
819  else
820    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
821
822  bool alloc_allowed;
823  status
824    = hsa_fns.hsa_region_get_info_fn (region,
825				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
826				      &alloc_allowed);
827  if (status == HSA_STATUS_SUCCESS)
828    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
829  else
830    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
831
832  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
833    return HSA_STATUS_SUCCESS;
834
835  status
836    = hsa_fns.hsa_region_get_info_fn (region,
837				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
838				      &size);
839  if (status == HSA_STATUS_SUCCESS)
840    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
841  else
842    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
843
844  size_t align;
845  status
846    = hsa_fns.hsa_region_get_info_fn (region,
847				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
848				      &align);
849  if (status == HSA_STATUS_SUCCESS)
850    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
851  else
852    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
853
854  return HSA_STATUS_SUCCESS;
855}
856
857/* Dump information about all the device memory regions.  */
858
859static void
860dump_hsa_regions (hsa_agent_t agent)
861{
862  hsa_status_t status;
863  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
864						 dump_hsa_region,
865						 NULL);
866  if (status != HSA_STATUS_SUCCESS)
867    hsa_error ("Dumping hsa regions failed", status);
868}
869
870/* Dump information about the available devices.  */
871
872static hsa_status_t
873dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
874{
875  hsa_status_t status;
876
877  char buf[64];
878  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
879					  &buf);
880  if (status == HSA_STATUS_SUCCESS)
881    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
882  else
883    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
884
885  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
886					  &buf);
887  if (status == HSA_STATUS_SUCCESS)
888    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
889  else
890    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
891
892  hsa_machine_model_t machine_model;
893  status
894    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
895				     &machine_model);
896  if (status == HSA_STATUS_SUCCESS)
897    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
898  else
899    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
900
901  hsa_profile_t profile;
902  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
903					  &profile);
904  if (status == HSA_STATUS_SUCCESS)
905    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
906  else
907    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
908
909  hsa_device_type_t device_type;
910  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
911					  &device_type);
912  if (status == HSA_STATUS_SUCCESS)
913    {
914      switch (device_type)
915	{
916	case HSA_DEVICE_TYPE_CPU:
917	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
918	  break;
919	case HSA_DEVICE_TYPE_GPU:
920	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
921	  break;
922	case HSA_DEVICE_TYPE_DSP:
923	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
924	  break;
925	default:
926	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
927	  break;
928	}
929    }
930  else
931    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
932
933  uint32_t cu_count;
934  status = hsa_fns.hsa_agent_get_info_fn
935    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
936  if (status == HSA_STATUS_SUCCESS)
937    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
938  else
939    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
940
941  uint32_t size;
942  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
943					  &size);
944  if (status == HSA_STATUS_SUCCESS)
945    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
946  else
947    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
948
949  uint32_t max_dim;
950  status = hsa_fns.hsa_agent_get_info_fn (agent,
951					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
952					  &max_dim);
953  if (status == HSA_STATUS_SUCCESS)
954    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
955  else
956    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
957
958  uint32_t max_size;
959  status = hsa_fns.hsa_agent_get_info_fn (agent,
960					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
961					  &max_size);
962  if (status == HSA_STATUS_SUCCESS)
963    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
964  else
965    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
966
967  uint32_t grid_max_dim;
968  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
969					  &grid_max_dim);
970  if (status == HSA_STATUS_SUCCESS)
971    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
972  else
973    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
974
975  uint32_t grid_max_size;
976  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
977					  &grid_max_size);
978  if (status == HSA_STATUS_SUCCESS)
979    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
980  else
981    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
982
983  dump_hsa_regions (agent);
984
985  return HSA_STATUS_SUCCESS;
986}
987
988/* Forward reference.  */
989
990static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
991
992/* Helper function for dump_executable_symbols.  */
993
994static hsa_status_t
995dump_executable_symbol (hsa_executable_t executable,
996			hsa_executable_symbol_t symbol,
997			void *data __attribute__((unused)))
998{
999  char *name = get_executable_symbol_name (symbol);
1000
1001  if (name)
1002    {
1003      GCN_DEBUG ("executable symbol: %s\n", name);
1004      free (name);
1005    }
1006
1007  return HSA_STATUS_SUCCESS;
1008}
1009
1010/* Dump all global symbol in an executable.  */
1011
1012static void
1013dump_executable_symbols (hsa_executable_t executable)
1014{
1015  hsa_status_t status;
1016  status
1017    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1018						 dump_executable_symbol,
1019						 NULL);
1020  if (status != HSA_STATUS_SUCCESS)
1021    hsa_fatal ("Could not dump HSA executable symbols", status);
1022}
1023
1024/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
1025
1026static void
1027print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1028{
1029  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1030
1031  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1032  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1033  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1034  fprintf (stderr, "%*sheap address: %p\n", indent, "",
1035	   (void*)kernargs->heap_ptr);
1036  fprintf (stderr, "%*sarena address: %p\n", indent, "",
1037	   (void*)kernargs->arena_ptr);
1038  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1039  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1040	   dispatch->private_segment_size);
1041  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
1042	   dispatch->group_segment_size);
1043  fprintf (stderr, "\n");
1044}
1045
1046/* }}}  */
1047/* {{{ Utility functions  */
1048
1049/* Cast the thread local storage to gcn_thread.  */
1050
1051static inline struct gcn_thread *
1052gcn_thread (void)
1053{
1054  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1055}
1056
1057/* Initialize debug and suppress_host_fallback according to the environment.  */
1058
1059static void
1060init_environment_variables (void)
1061{
1062  if (secure_getenv ("GCN_DEBUG"))
1063    debug = true;
1064  else
1065    debug = false;
1066
1067  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1068    suppress_host_fallback = true;
1069  else
1070    suppress_host_fallback = false;
1071
1072  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1073  if (hsa_runtime_lib == NULL)
1074    hsa_runtime_lib = "libhsa-runtime64.so.1";
1075
1076  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1077
1078  const char *x = secure_getenv ("GCN_NUM_TEAMS");
1079  if (!x)
1080    x = secure_getenv ("GCN_NUM_GANGS");
1081  if (x)
1082    override_x_dim = atoi (x);
1083
1084  const char *z = secure_getenv ("GCN_NUM_THREADS");
1085  if (!z)
1086    z = secure_getenv ("GCN_NUM_WORKERS");
1087  if (z)
1088    override_z_dim = atoi (z);
1089
1090  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1091  if (heap)
1092    {
1093      size_t tmp = atol (heap);
1094      if (tmp)
1095	gcn_kernel_heap_size = tmp;
1096    }
1097}
1098
1099/* Return malloc'd string with name of SYMBOL.  */
1100
1101static char *
1102get_executable_symbol_name (hsa_executable_symbol_t symbol)
1103{
1104  hsa_status_t status;
1105  char *res;
1106  uint32_t len;
1107  const hsa_executable_symbol_info_t info_name_length
1108    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1109
1110  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1111						      &len);
1112  if (status != HSA_STATUS_SUCCESS)
1113    {
1114      hsa_error ("Could not get length of symbol name", status);
1115      return NULL;
1116    }
1117
1118  res = GOMP_PLUGIN_malloc (len + 1);
1119
1120  const hsa_executable_symbol_info_t info_name
1121    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1122
1123  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1124
1125  if (status != HSA_STATUS_SUCCESS)
1126    {
1127      hsa_error ("Could not get symbol name", status);
1128      free (res);
1129      return NULL;
1130    }
1131
1132  res[len] = '\0';
1133
1134  return res;
1135}
1136
1137/* Get the number of GPU Compute Units.  */
1138
1139static int
1140get_cu_count (struct agent_info *agent)
1141{
1142  uint32_t cu_count;
1143  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1144    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1145  if (status == HSA_STATUS_SUCCESS)
1146    return cu_count;
1147  else
1148    return 64;  /* The usual number for older devices.  */
1149}
1150
1151/* Calculate the maximum grid size for OMP threads / OACC workers.
1152   This depends on the kernel's resource usage levels.  */
1153
1154static int
1155limit_worker_threads (int threads)
1156{
1157  /* FIXME Do something more inteligent here.
1158     GCN can always run 4 threads within a Compute Unit, but
1159     more than that depends on register usage.  */
1160  if (threads > 16)
1161    threads = 16;
1162  return threads;
1163}
1164
1165/* Parse the target attributes INPUT provided by the compiler and return true
1166   if we should run anything all.  If INPUT is NULL, fill DEF with default
1167   values, then store INPUT or DEF into *RESULT.
1168
1169   This is used for OpenMP only.  */
1170
1171static bool
1172parse_target_attributes (void **input,
1173			 struct GOMP_kernel_launch_attributes *def,
1174			 struct GOMP_kernel_launch_attributes **result,
1175			 struct agent_info *agent)
1176{
1177  if (!input)
1178    GOMP_PLUGIN_fatal ("No target arguments provided");
1179
1180  bool grid_attrs_found = false;
1181  bool gcn_dims_found = false;
1182  int gcn_teams = 0;
1183  int gcn_threads = 0;
1184  while (*input)
1185    {
1186      intptr_t id = (intptr_t) *input++, val;
1187
1188      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1189	val = (intptr_t) *input++;
1190      else
1191	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1192
1193      val = (val > INT_MAX) ? INT_MAX : val;
1194
1195      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1196	  && ((id & GOMP_TARGET_ARG_ID_MASK)
1197	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1198	{
1199	  grid_attrs_found = true;
1200	  break;
1201	}
1202      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1203	       == GOMP_TARGET_ARG_DEVICE_ALL)
1204	{
1205	  gcn_dims_found = true;
1206	  switch (id & GOMP_TARGET_ARG_ID_MASK)
1207	    {
1208	    case GOMP_TARGET_ARG_NUM_TEAMS:
1209	      gcn_teams = val;
1210	      break;
1211	    case GOMP_TARGET_ARG_THREAD_LIMIT:
1212	      gcn_threads = limit_worker_threads (val);
1213	      break;
1214	    default:
1215	      ;
1216	    }
1217	}
1218    }
1219
1220  if (gcn_dims_found)
1221    {
1222      bool gfx900_workaround_p = false;
1223
1224      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1225	  && gcn_threads == 0 && override_z_dim == 0)
1226	{
1227	  gfx900_workaround_p = true;
1228	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1229		       "threads to at most 4 per team.\n");
1230	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
1231		       "GCN_NUM_THREADS=16\n");
1232	}
1233
1234      /* Ideally, when a dimension isn't explicitly specified, we should
1235	 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1236	 In practice, we tune for peak performance on BabelStream, which
1237	 for OpenACC is currently 32 threads per CU.  */
1238      def->ndim = 3;
1239      if (gcn_teams <= 0 && gcn_threads <= 0)
1240	{
1241	  /* Set up a reasonable number of teams and threads.  */
1242	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1243	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1244	  def->gdims[2] = gcn_threads;
1245	}
1246      else if (gcn_teams <= 0 && gcn_threads > 0)
1247	{
1248	  /* Auto-scale the number of teams with the number of threads.  */
1249	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1250	  def->gdims[2] = gcn_threads;
1251	}
1252      else if (gcn_teams > 0 && gcn_threads <= 0)
1253	{
1254	  int max_threads = gfx900_workaround_p ? 4 : 16;
1255
1256	  /* Auto-scale the number of threads with the number of teams.  */
1257	  def->gdims[0] = gcn_teams;
1258	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1259	  if (def->gdims[2] == 0)
1260	    def->gdims[2] = 1;
1261	  else if (def->gdims[2] > max_threads)
1262	    def->gdims[2] = max_threads;
1263	}
1264      else
1265	{
1266	  def->gdims[0] = gcn_teams;
1267	  def->gdims[2] = gcn_threads;
1268	}
1269      def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
1270      def->wdims[0] = 1;  /* Single team per work-group.  */
1271      def->wdims[1] = 64;
1272      def->wdims[2] = 16;
1273      *result = def;
1274      return true;
1275    }
1276  else if (!grid_attrs_found)
1277    {
1278      def->ndim = 1;
1279      def->gdims[0] = 1;
1280      def->gdims[1] = 1;
1281      def->gdims[2] = 1;
1282      def->wdims[0] = 1;
1283      def->wdims[1] = 1;
1284      def->wdims[2] = 1;
1285      *result = def;
1286      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1287      return true;
1288    }
1289
1290  struct GOMP_kernel_launch_attributes *kla;
1291  kla = (struct GOMP_kernel_launch_attributes *) *input;
1292  *result = kla;
1293  if (kla->ndim == 0 || kla->ndim > 3)
1294    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1295
1296  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1297  unsigned i;
1298  for (i = 0; i < kla->ndim; i++)
1299    {
1300      GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
1301		 kla->gdims[i], kla->wdims[i]);
1302      if (kla->gdims[i] == 0)
1303	return false;
1304    }
1305  return true;
1306}
1307
1308/* Return the group size given the requested GROUP size, GRID size and number
1309   of grid dimensions NDIM.  */
1310
1311static uint32_t
1312get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1313{
1314  if (group == 0)
1315    {
1316      /* TODO: Provide a default via environment or device characteristics.  */
1317      if (ndim == 1)
1318	group = 64;
1319      else if (ndim == 2)
1320	group = 8;
1321      else
1322	group = 4;
1323    }
1324
1325  if (group > grid)
1326    group = grid;
1327  return group;
1328}
1329
1330/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
1331
1332static void
1333packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1334{
1335  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1336}
1337
1338/* A never-called callback for the HSA command queues.  These signal events
1339   that we don't use, so we trigger an error.
1340
1341   This "queue" is not to be confused with the async queues, below.  */
1342
1343static void
1344hsa_queue_callback (hsa_status_t status,
1345		hsa_queue_t *queue __attribute__ ((unused)),
1346		void *data __attribute__ ((unused)))
1347{
1348  hsa_fatal ("Asynchronous queue error", status);
1349}
1350
1351/* }}}  */
1352/* {{{ HSA initialization  */
1353
1354/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
1355
1356static bool
1357init_hsa_runtime_functions (void)
1358{
1359#define DLSYM_FN(function) \
1360  hsa_fns.function##_fn = dlsym (handle, #function); \
1361  if (hsa_fns.function##_fn == NULL) \
1362    return false;
1363  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1364  if (handle == NULL)
1365    return false;
1366
1367  DLSYM_FN (hsa_status_string)
1368  DLSYM_FN (hsa_system_get_info)
1369  DLSYM_FN (hsa_agent_get_info)
1370  DLSYM_FN (hsa_init)
1371  DLSYM_FN (hsa_iterate_agents)
1372  DLSYM_FN (hsa_region_get_info)
1373  DLSYM_FN (hsa_queue_create)
1374  DLSYM_FN (hsa_agent_iterate_regions)
1375  DLSYM_FN (hsa_executable_destroy)
1376  DLSYM_FN (hsa_executable_create)
1377  DLSYM_FN (hsa_executable_global_variable_define)
1378  DLSYM_FN (hsa_executable_load_code_object)
1379  DLSYM_FN (hsa_executable_freeze)
1380  DLSYM_FN (hsa_signal_create)
1381  DLSYM_FN (hsa_memory_allocate)
1382  DLSYM_FN (hsa_memory_assign_agent)
1383  DLSYM_FN (hsa_memory_copy)
1384  DLSYM_FN (hsa_memory_free)
1385  DLSYM_FN (hsa_signal_destroy)
1386  DLSYM_FN (hsa_executable_get_symbol)
1387  DLSYM_FN (hsa_executable_symbol_get_info)
1388  DLSYM_FN (hsa_executable_iterate_symbols)
1389  DLSYM_FN (hsa_queue_add_write_index_release)
1390  DLSYM_FN (hsa_queue_load_read_index_acquire)
1391  DLSYM_FN (hsa_signal_wait_acquire)
1392  DLSYM_FN (hsa_signal_store_relaxed)
1393  DLSYM_FN (hsa_signal_store_release)
1394  DLSYM_FN (hsa_signal_load_acquire)
1395  DLSYM_FN (hsa_queue_destroy)
1396  DLSYM_FN (hsa_code_object_deserialize)
1397  return true;
1398#undef DLSYM_FN
1399}
1400
1401/* Return true if the agent is a GPU and can accept of concurrent submissions
1402   from different threads.  */
1403
1404static bool
1405suitable_hsa_agent_p (hsa_agent_t agent)
1406{
1407  hsa_device_type_t device_type;
1408  hsa_status_t status
1409    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1410				     &device_type);
1411  if (status != HSA_STATUS_SUCCESS)
1412    return false;
1413
1414  switch (device_type)
1415    {
1416    case HSA_DEVICE_TYPE_GPU:
1417      break;
1418    case HSA_DEVICE_TYPE_CPU:
1419      if (!support_cpu_devices)
1420	return false;
1421      break;
1422    default:
1423      return false;
1424    }
1425
1426  uint32_t features = 0;
1427  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1428					  &features);
1429  if (status != HSA_STATUS_SUCCESS
1430      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1431    return false;
1432  hsa_queue_type_t queue_type;
1433  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1434					  &queue_type);
1435  if (status != HSA_STATUS_SUCCESS
1436      || (queue_type != HSA_QUEUE_TYPE_MULTI))
1437    return false;
1438
1439  return true;
1440}
1441
1442/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1443   agent_count in hsa_context.  */
1444
1445static hsa_status_t
1446count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1447{
1448  if (suitable_hsa_agent_p (agent))
1449    hsa_context.agent_count++;
1450  return HSA_STATUS_SUCCESS;
1451}
1452
1453/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1454   id to the describing structure in the hsa context.  The index of the
1455   structure is pointed to by DATA, increment it afterwards.  */
1456
1457static hsa_status_t
1458assign_agent_ids (hsa_agent_t agent, void *data)
1459{
1460  if (suitable_hsa_agent_p (agent))
1461    {
1462      int *agent_index = (int *) data;
1463      hsa_context.agents[*agent_index].id = agent;
1464      ++*agent_index;
1465    }
1466  return HSA_STATUS_SUCCESS;
1467}
1468
1469/* Initialize hsa_context if it has not already been done.
1470   Return TRUE on success.  */
1471
1472static bool
1473init_hsa_context (void)
1474{
1475  hsa_status_t status;
1476  int agent_index = 0;
1477
1478  if (hsa_context.initialized)
1479    return true;
1480  init_environment_variables ();
1481  if (!init_hsa_runtime_functions ())
1482    {
1483      GCN_WARNING ("Run-time could not be dynamically opened\n");
1484      if (suppress_host_fallback)
1485	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1486      return false;
1487    }
1488  status = hsa_fns.hsa_init_fn ();
1489  if (status != HSA_STATUS_SUCCESS)
1490    return hsa_error ("Run-time could not be initialized", status);
1491  GCN_DEBUG ("HSA run-time initialized for GCN\n");
1492
1493  if (debug)
1494    dump_hsa_system_info ();
1495
1496  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1497  if (status != HSA_STATUS_SUCCESS)
1498    return hsa_error ("GCN GPU devices could not be enumerated", status);
1499  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1500
1501  hsa_context.agents
1502    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1503				  * sizeof (struct agent_info));
1504  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1505  if (status != HSA_STATUS_SUCCESS)
1506    return hsa_error ("Scanning compute agents failed", status);
1507  if (agent_index != hsa_context.agent_count)
1508    {
1509      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1510      return false;
1511    }
1512
1513  if (debug)
1514    {
1515      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1516      if (status != HSA_STATUS_SUCCESS)
1517	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1518    }
1519
1520  uint16_t minor, major;
1521  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1522					   &minor);
1523  if (status != HSA_STATUS_SUCCESS)
1524    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1525  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1526					   &major);
1527  if (status != HSA_STATUS_SUCCESS)
1528    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1529
1530  size_t len = sizeof hsa_context.driver_version_s;
1531  int printed = snprintf (hsa_context.driver_version_s, len,
1532			  "HSA Runtime %hu.%hu", (unsigned short int)major,
1533			  (unsigned short int)minor);
1534  if (printed >= len)
1535    GCN_WARNING ("HSA runtime version string was truncated."
1536		 "Version %hu.%hu is too long.", (unsigned short int)major,
1537		 (unsigned short int)minor);
1538
1539  hsa_context.initialized = true;
1540  return true;
1541}
1542
1543/* Verify that hsa_context has already been initialized and return the
1544   agent_info structure describing device number N.  Return NULL on error.  */
1545
1546static struct agent_info *
1547get_agent_info (int n)
1548{
1549  if (!hsa_context.initialized)
1550    {
1551      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1552      return NULL;
1553    }
1554  if (n >= hsa_context.agent_count)
1555    {
1556      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1557      return NULL;
1558    }
1559  if (!hsa_context.agents[n].initialized)
1560    {
1561      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1562      return NULL;
1563    }
1564  return &hsa_context.agents[n];
1565}
1566
1567/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1568
1569   Selects (breaks at) a suitable region of type KIND.  */
1570
1571static hsa_status_t
1572get_memory_region (hsa_region_t region, hsa_region_t *retval,
1573		   hsa_region_global_flag_t kind)
1574{
1575  hsa_status_t status;
1576  hsa_region_segment_t segment;
1577
1578  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1579					   &segment);
1580  if (status != HSA_STATUS_SUCCESS)
1581    return status;
1582  if (segment != HSA_REGION_SEGMENT_GLOBAL)
1583    return HSA_STATUS_SUCCESS;
1584
1585  uint32_t flags;
1586  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1587					   &flags);
1588  if (status != HSA_STATUS_SUCCESS)
1589    return status;
1590  if (flags & kind)
1591    {
1592      *retval = region;
1593      return HSA_STATUS_INFO_BREAK;
1594    }
1595  return HSA_STATUS_SUCCESS;
1596}
1597
1598/* Callback of hsa_agent_iterate_regions.
1599
1600   Selects a kernargs memory region.  */
1601
1602static hsa_status_t
1603get_kernarg_memory_region (hsa_region_t region, void *data)
1604{
1605  return get_memory_region (region, (hsa_region_t *)data,
1606			    HSA_REGION_GLOBAL_FLAG_KERNARG);
1607}
1608
1609/* Callback of hsa_agent_iterate_regions.
1610
1611   Selects a coarse-grained memory region suitable for the heap and
1612   offload data.  */
1613
1614static hsa_status_t
1615get_data_memory_region (hsa_region_t region, void *data)
1616{
1617  return get_memory_region (region, (hsa_region_t *)data,
1618			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1619}
1620
1621static int
1622elf_gcn_isa_field (Elf64_Ehdr *image)
1623{
1624  return image->e_flags & EF_AMDGPU_MACH_MASK;
1625}
1626
1627const static char *gcn_gfx803_s = "gfx803";
1628const static char *gcn_gfx900_s = "gfx900";
1629const static char *gcn_gfx906_s = "gfx906";
1630const static char *gcn_gfx908_s = "gfx908";
1631const static int gcn_isa_name_len = 6;
1632
1633/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1634   support the ISA. */
1635
1636static const char*
1637isa_hsa_name (int isa) {
1638  switch(isa)
1639    {
1640    case EF_AMDGPU_MACH_AMDGCN_GFX803:
1641      return gcn_gfx803_s;
1642    case EF_AMDGPU_MACH_AMDGCN_GFX900:
1643      return gcn_gfx900_s;
1644    case EF_AMDGPU_MACH_AMDGCN_GFX906:
1645      return gcn_gfx906_s;
1646    case EF_AMDGPU_MACH_AMDGCN_GFX908:
1647      return gcn_gfx908_s;
1648    }
1649  return NULL;
1650}
1651
1652/* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1653   with -march) or NULL if we do not support the ISA.
1654   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */
1655
1656static const char*
1657isa_gcc_name (int isa) {
1658  switch(isa)
1659    {
1660    case EF_AMDGPU_MACH_AMDGCN_GFX803:
1661      return "fiji";
1662    default:
1663      return isa_hsa_name (isa);
1664    }
1665}
1666
1667/* Returns the code which is used in the GCN object code to identify the ISA with
1668   the given name (as used by the HSA runtime).  */
1669
1670static gcn_isa
1671isa_code(const char *isa) {
1672  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1673    return EF_AMDGPU_MACH_AMDGCN_GFX803;
1674
1675  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1676    return EF_AMDGPU_MACH_AMDGCN_GFX900;
1677
1678  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1679    return EF_AMDGPU_MACH_AMDGCN_GFX906;
1680
1681  if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1682    return EF_AMDGPU_MACH_AMDGCN_GFX908;
1683
1684  return -1;
1685}
1686
1687/* }}}  */
1688/* {{{ Run  */
1689
1690/* Create or reuse a team arena.
1691
1692   Team arenas are used by OpenMP to avoid calling malloc multiple times
1693   while setting up each team.  This is purely a performance optimization.
1694
1695   Allocating an arena also costs performance, albeit on the host side, so
1696   this function will reuse an existing arena if a large enough one is idle.
1697   The arena is released, but not deallocated, when the kernel exits.  */
1698
1699static void *
1700get_team_arena (struct agent_info *agent, int num_teams)
1701{
1702  struct team_arena_list **next_ptr = &agent->team_arena_list;
1703  struct team_arena_list *item;
1704
1705  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1706    {
1707      if (item->num_teams < num_teams)
1708	continue;
1709
1710      if (pthread_mutex_trylock (&item->in_use))
1711	continue;
1712
1713      return item->arena;
1714    }
1715
1716  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
1717
1718  if (pthread_mutex_lock (&agent->team_arena_write_lock))
1719    {
1720      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1721      return false;
1722    }
1723  item = malloc (sizeof (*item));
1724  item->num_teams = num_teams;
1725  item->next = NULL;
1726  *next_ptr = item;
1727
1728  if (pthread_mutex_init (&item->in_use, NULL))
1729    {
1730      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1731      return false;
1732    }
1733  if (pthread_mutex_lock (&item->in_use))
1734    {
1735      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1736      return false;
1737    }
1738  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
1739    {
1740      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1741      return false;
1742    }
1743
1744  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
1745  hsa_status_t status;
1746  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1747					   TEAM_ARENA_SIZE*num_teams,
1748					   &item->arena);
1749  if (status != HSA_STATUS_SUCCESS)
1750    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1751  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1752					       HSA_ACCESS_PERMISSION_RW);
1753  if (status != HSA_STATUS_SUCCESS)
1754    hsa_fatal ("Could not assign arena memory to device", status);
1755
1756  return item->arena;
1757}
1758
1759/* Mark a team arena available for reuse.  */
1760
1761static void
1762release_team_arena (struct agent_info* agent, void *arena)
1763{
1764  struct team_arena_list *item;
1765
1766  for (item = agent->team_arena_list; item; item = item->next)
1767    {
1768      if (item->arena == arena)
1769	{
1770	  if (pthread_mutex_unlock (&item->in_use))
1771	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1772	  return;
1773	}
1774    }
1775  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1776}
1777
1778/* Clean up all the allocated team arenas.  */
1779
1780static bool
1781destroy_team_arenas (struct agent_info *agent)
1782{
1783  struct team_arena_list *item, *next;
1784
1785  for (item = agent->team_arena_list; item; item = next)
1786    {
1787      next = item->next;
1788      hsa_fns.hsa_memory_free_fn (item->arena);
1789      if (pthread_mutex_destroy (&item->in_use))
1790	{
1791	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1792	  return false;
1793	}
1794      free (item);
1795    }
1796  agent->team_arena_list = NULL;
1797
1798  return true;
1799}
1800
1801/* Allocate memory on a specified device.  */
1802
1803static void *
1804alloc_by_agent (struct agent_info *agent, size_t size)
1805{
1806  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1807
1808  /* Zero-size allocations are invalid, so in order to return a valid pointer
1809     we need to pass a valid size.  One source of zero-size allocations is
1810     kernargs for kernels that have no inputs or outputs (the kernel may
1811     only use console output, for example).  */
1812  if (size == 0)
1813    size = 4;
1814
1815  void *ptr;
1816  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1817							size, &ptr);
1818  if (status != HSA_STATUS_SUCCESS)
1819    {
1820      hsa_error ("Could not allocate device memory", status);
1821      return NULL;
1822    }
1823
1824  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1825					       HSA_ACCESS_PERMISSION_RW);
1826  if (status != HSA_STATUS_SUCCESS)
1827    {
1828      hsa_error ("Could not assign data memory to device", status);
1829      return NULL;
1830    }
1831
1832  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1833  bool profiling_dispatch_p
1834    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1835  if (profiling_dispatch_p)
1836    {
1837      acc_prof_info *prof_info = thr->prof_info;
1838      acc_event_info data_event_info;
1839      acc_api_info *api_info = thr->api_info;
1840
1841      prof_info->event_type = acc_ev_alloc;
1842
1843      data_event_info.data_event.event_type = prof_info->event_type;
1844      data_event_info.data_event.valid_bytes
1845	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
1846      data_event_info.data_event.parent_construct
1847	= acc_construct_parallel;
1848      data_event_info.data_event.implicit = 1;
1849      data_event_info.data_event.tool_info = NULL;
1850      data_event_info.data_event.var_name = NULL;
1851      data_event_info.data_event.bytes = size;
1852      data_event_info.data_event.host_ptr = NULL;
1853      data_event_info.data_event.device_ptr = (void *) ptr;
1854
1855      api_info->device_api = acc_device_api_other;
1856
1857      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1858					    api_info);
1859    }
1860
1861  return ptr;
1862}
1863
1864/* Create kernel dispatch data structure for given KERNEL, along with
1865   the necessary device signals and memory allocations.  */
1866
1867static struct kernel_dispatch *
1868create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1869{
1870  struct agent_info *agent = kernel->agent;
1871  struct kernel_dispatch *shadow
1872    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1873
1874  shadow->agent = kernel->agent;
1875  shadow->object = kernel->object;
1876
1877  hsa_signal_t sync_signal;
1878  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1879  if (status != HSA_STATUS_SUCCESS)
1880    hsa_fatal ("Error creating the GCN sync signal", status);
1881
1882  shadow->signal = sync_signal.handle;
1883  shadow->private_segment_size = kernel->private_segment_size;
1884  shadow->group_segment_size = kernel->group_segment_size;
1885
1886  /* We expect kernels to request a single pointer, explicitly, and the
1887     rest of struct kernargs, implicitly.  If they request anything else
1888     then something is wrong.  */
1889  if (kernel->kernarg_segment_size > 8)
1890    {
1891      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1892      return NULL;
1893    }
1894
1895  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1896					   sizeof (struct kernargs),
1897					   &shadow->kernarg_address);
1898  if (status != HSA_STATUS_SUCCESS)
1899    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1900  struct kernargs *kernargs = shadow->kernarg_address;
1901
1902  /* Zero-initialize the output_data (minimum needed).  */
1903  kernargs->out_ptr = (int64_t)&kernargs->output_data;
1904  kernargs->output_data.next_output = 0;
1905  for (unsigned i = 0;
1906       i < (sizeof (kernargs->output_data.queue)
1907	    / sizeof (kernargs->output_data.queue[0]));
1908       i++)
1909    kernargs->output_data.queue[i].written = 0;
1910  kernargs->output_data.consumed = 0;
1911
1912  /* Pass in the heap location.  */
1913  kernargs->heap_ptr = (int64_t)kernel->module->heap;
1914
1915  /* Create an arena.  */
1916  if (kernel->kind == KIND_OPENMP)
1917    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1918  else
1919    kernargs->arena_ptr = 0;
1920
1921  /* Ensure we can recognize unset return values.  */
1922  kernargs->output_data.return_value = 0xcafe0000;
1923
1924  return shadow;
1925}
1926
1927/* Output any data written to console output from the kernel.  It is expected
1928   that this function is polled during kernel execution.
1929
1930   We print all entries from the last item printed to the next entry without
1931   a "written" flag.  If the "final" flag is set then it'll continue right to
1932   the end.
1933
1934   The print buffer is circular, but the from and to locations don't wrap when
1935   the buffer does, so the output limit is UINT_MAX.  The target blocks on
1936   output when the buffer is full.  */
1937
1938static void
1939console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1940		bool final)
1941{
1942  unsigned int limit = (sizeof (kernargs->output_data.queue)
1943			/ sizeof (kernargs->output_data.queue[0]));
1944
1945  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1946				       __ATOMIC_ACQUIRE);
1947  unsigned int to = kernargs->output_data.next_output;
1948
1949  if (from > to)
1950    {
1951      /* Overflow.  */
1952      if (final)
1953	printf ("GCN print buffer overflowed.\n");
1954      return;
1955    }
1956
1957  unsigned int i;
1958  for (i = from; i < to; i++)
1959    {
1960      struct printf_data *data = &kernargs->output_data.queue[i%limit];
1961
1962      if (!data->written && !final)
1963	break;
1964
1965      switch (data->type)
1966	{
1967	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1968	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1969	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1970	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1971	default: printf ("GCN print buffer error!\n"); break;
1972	}
1973      data->written = 0;
1974      __atomic_store_n (&kernargs->output_data.consumed, i+1,
1975			__ATOMIC_RELEASE);
1976    }
1977  fflush (stdout);
1978}
1979
1980/* Release data structure created for a kernel dispatch in SHADOW argument,
1981   and clean up the signal and memory allocations.  */
1982
1983static void
1984release_kernel_dispatch (struct kernel_dispatch *shadow)
1985{
1986  GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1987
1988  struct kernargs *kernargs = shadow->kernarg_address;
1989  void *arena = (void *)kernargs->arena_ptr;
1990  if (arena)
1991    release_team_arena (shadow->agent, arena);
1992
1993  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1994
1995  hsa_signal_t s;
1996  s.handle = shadow->signal;
1997  hsa_fns.hsa_signal_destroy_fn (s);
1998
1999  free (shadow);
2000}
2001
2002/* Extract the properties from a kernel binary.  */
2003
2004static void
2005init_kernel_properties (struct kernel_info *kernel)
2006{
2007  hsa_status_t status;
2008  struct agent_info *agent = kernel->agent;
2009  hsa_executable_symbol_t kernel_symbol;
2010  char *buf = alloca (strlen (kernel->name) + 4);
2011  sprintf (buf, "%s.kd", kernel->name);
2012  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2013						 buf, agent->id,
2014						 0, &kernel_symbol);
2015  if (status != HSA_STATUS_SUCCESS)
2016    {
2017      hsa_warn ("Could not find symbol for kernel in the code object", status);
2018      fprintf (stderr, "not found name: '%s'\n", buf);
2019      dump_executable_symbols (agent->executable);
2020      goto failure;
2021    }
2022  GCN_DEBUG ("Located kernel %s\n", kernel->name);
2023  status = hsa_fns.hsa_executable_symbol_get_info_fn
2024    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2025  if (status != HSA_STATUS_SUCCESS)
2026    hsa_fatal ("Could not extract a kernel object from its symbol", status);
2027  status = hsa_fns.hsa_executable_symbol_get_info_fn
2028    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2029     &kernel->kernarg_segment_size);
2030  if (status != HSA_STATUS_SUCCESS)
2031    hsa_fatal ("Could not get info about kernel argument size", status);
2032  status = hsa_fns.hsa_executable_symbol_get_info_fn
2033    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2034     &kernel->group_segment_size);
2035  if (status != HSA_STATUS_SUCCESS)
2036    hsa_fatal ("Could not get info about kernel group segment size", status);
2037  status = hsa_fns.hsa_executable_symbol_get_info_fn
2038    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2039     &kernel->private_segment_size);
2040  if (status != HSA_STATUS_SUCCESS)
2041    hsa_fatal ("Could not get info about kernel private segment size",
2042	       status);
2043
2044  /* The kernel type is not known until something tries to launch it.  */
2045  kernel->kind = KIND_UNKNOWN;
2046
2047  GCN_DEBUG ("Kernel structure for %s fully initialized with "
2048	     "following segment sizes: \n", kernel->name);
2049  GCN_DEBUG ("  group_segment_size: %u\n",
2050	     (unsigned) kernel->group_segment_size);
2051  GCN_DEBUG ("  private_segment_size: %u\n",
2052	     (unsigned) kernel->private_segment_size);
2053  GCN_DEBUG ("  kernarg_segment_size: %u\n",
2054	     (unsigned) kernel->kernarg_segment_size);
2055  return;
2056
2057failure:
2058  kernel->initialization_failed = true;
2059}
2060
2061/* Do all the work that is necessary before running KERNEL for the first time.
2062   The function assumes the program has been created, finalized and frozen by
2063   create_and_finalize_hsa_program.  */
2064
2065static void
2066init_kernel (struct kernel_info *kernel)
2067{
2068  if (pthread_mutex_lock (&kernel->init_mutex))
2069    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2070  if (kernel->initialized)
2071    {
2072      if (pthread_mutex_unlock (&kernel->init_mutex))
2073	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2074			   "mutex");
2075
2076      return;
2077    }
2078
2079  init_kernel_properties (kernel);
2080
2081  if (!kernel->initialization_failed)
2082    {
2083      GCN_DEBUG ("\n");
2084
2085      kernel->initialized = true;
2086    }
2087  if (pthread_mutex_unlock (&kernel->init_mutex))
2088    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2089		       "mutex");
2090}
2091
2092/* Run KERNEL on its agent, pass VARS to it as arguments and take
2093   launch attributes from KLA.
2094
2095   MODULE_LOCKED indicates that the caller already holds the lock and
2096   run_kernel need not lock it again.
2097   If AQ is NULL then agent->sync_queue will be used.  */
2098
2099static void
2100run_kernel (struct kernel_info *kernel, void *vars,
2101	    struct GOMP_kernel_launch_attributes *kla,
2102	    struct goacc_asyncqueue *aq, bool module_locked)
2103{
2104  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2105	     kernel->description->vpgr_count);
2106
2107  /* Reduce the number of threads/workers if there are insufficient
2108     VGPRs available to run the kernels together.  */
2109  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2110    {
2111      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2112      int max_threads = (256 / granulated_vgprs) * 4;
2113      if (kla->gdims[2] > max_threads)
2114	{
2115	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2116		       " per team/gang - reducing to %d threads/workers.\n",
2117		       kla->gdims[2], max_threads);
2118	  kla->gdims[2] = max_threads;
2119	}
2120    }
2121
2122  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2123	     (aq ? aq->id : 0));
2124  GCN_DEBUG ("GCN launch attribs: gdims:[");
2125  int i;
2126  for (i = 0; i < kla->ndim; ++i)
2127    {
2128      if (i)
2129	DEBUG_PRINT (", ");
2130      DEBUG_PRINT ("%u", kla->gdims[i]);
2131    }
2132  DEBUG_PRINT ("], normalized gdims:[");
2133  for (i = 0; i < kla->ndim; ++i)
2134    {
2135      if (i)
2136	DEBUG_PRINT (", ");
2137      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2138    }
2139  DEBUG_PRINT ("], wdims:[");
2140  for (i = 0; i < kla->ndim; ++i)
2141    {
2142      if (i)
2143	DEBUG_PRINT (", ");
2144      DEBUG_PRINT ("%u", kla->wdims[i]);
2145    }
2146  DEBUG_PRINT ("]\n");
2147  DEBUG_FLUSH ();
2148
2149  struct agent_info *agent = kernel->agent;
2150  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2151    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2152
2153  if (!agent->initialized)
2154    GOMP_PLUGIN_fatal ("Agent must be initialized");
2155
2156  if (!kernel->initialized)
2157    GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2158
2159  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2160
2161  uint64_t index
2162    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2163  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2164
2165  /* Wait until the queue is not full before writing the packet.   */
2166  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2167	 >= command_q->size)
2168    ;
2169
2170  /* Do not allow the dimensions to be overridden when running
2171     constructors or destructors.  */
2172  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2173  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2174
2175  hsa_kernel_dispatch_packet_t *packet;
2176  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2177	   + index % command_q->size;
2178
2179  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2180  packet->grid_size_x = override_x ? : kla->gdims[0];
2181  packet->workgroup_size_x = get_group_size (kla->ndim,
2182					     packet->grid_size_x,
2183					     kla->wdims[0]);
2184
2185  if (kla->ndim >= 2)
2186    {
2187      packet->grid_size_y = kla->gdims[1];
2188      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2189						 kla->wdims[1]);
2190    }
2191  else
2192    {
2193      packet->grid_size_y = 1;
2194      packet->workgroup_size_y = 1;
2195    }
2196
2197  if (kla->ndim == 3)
2198    {
2199      packet->grid_size_z = limit_worker_threads (override_z
2200						  ? : kla->gdims[2]);
2201      packet->workgroup_size_z = get_group_size (kla->ndim,
2202						 packet->grid_size_z,
2203						 kla->wdims[2]);
2204    }
2205  else
2206    {
2207      packet->grid_size_z = 1;
2208      packet->workgroup_size_z = 1;
2209    }
2210
2211  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2212	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2213	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2214	     packet->grid_size_x / packet->workgroup_size_x,
2215	     packet->grid_size_y / packet->workgroup_size_y,
2216	     packet->grid_size_z / packet->workgroup_size_z,
2217	     packet->workgroup_size_x, packet->workgroup_size_y,
2218	     packet->workgroup_size_z);
2219
2220  struct kernel_dispatch *shadow
2221    = create_kernel_dispatch (kernel, packet->grid_size_x);
2222  shadow->queue = command_q;
2223
2224  if (debug)
2225    {
2226      fprintf (stderr, "\nKernel has following dependencies:\n");
2227      print_kernel_dispatch (shadow, 2);
2228    }
2229
2230  packet->private_segment_size = kernel->private_segment_size;
2231  packet->group_segment_size = kernel->group_segment_size;
2232  packet->kernel_object = kernel->object;
2233  packet->kernarg_address = shadow->kernarg_address;
2234  hsa_signal_t s;
2235  s.handle = shadow->signal;
2236  packet->completion_signal = s;
2237  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2238  memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2239
2240  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2241
2242  uint16_t header;
2243  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2244  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2245  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2246
2247  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2248	     agent->device_id);
2249
2250  packet_store_release ((uint32_t *) packet, header,
2251			(uint16_t) kla->ndim
2252			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2253
2254  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2255				       index);
2256
2257  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2258
2259  /* Root signal waits with 1ms timeout.  */
2260  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2261					     1000 * 1000,
2262					     HSA_WAIT_STATE_BLOCKED) != 0)
2263    {
2264      console_output (kernel, shadow->kernarg_address, false);
2265    }
2266  console_output (kernel, shadow->kernarg_address, true);
2267
2268  struct kernargs *kernargs = shadow->kernarg_address;
2269  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2270
2271  release_kernel_dispatch (shadow);
2272
2273  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2274    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2275
2276  unsigned int upper = (return_value & ~0xffff) >> 16;
2277  if (upper == 0xcafe)
2278    ; // exit not called, normal termination.
2279  else if (upper == 0xffff)
2280    ; // exit called.
2281  else
2282    {
2283      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2284			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2285			 return_value);
2286      abort ();
2287    }
2288
2289  if (upper == 0xffff)
2290    {
2291      unsigned int signal = (return_value >> 8) & 0xff;
2292
2293      if (signal == SIGABRT)
2294	{
2295	  GCN_WARNING ("GCN Kernel aborted\n");
2296	  abort ();
2297	}
2298      else if (signal != 0)
2299	{
2300	  GCN_WARNING ("GCN Kernel received unknown signal\n");
2301	  abort ();
2302	}
2303
2304      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2305      exit (return_value & 0xff);
2306    }
2307}
2308
2309/* }}}  */
2310/* {{{ Load/Unload  */
2311
2312/* Initialize KERNEL from D and other parameters.  Return true on success. */
2313
2314static bool
2315init_basic_kernel_info (struct kernel_info *kernel,
2316			struct hsa_kernel_description *d,
2317			struct agent_info *agent,
2318			struct module_info *module)
2319{
2320  kernel->agent = agent;
2321  kernel->module = module;
2322  kernel->name = d->name;
2323  kernel->description = d;
2324  if (pthread_mutex_init (&kernel->init_mutex, NULL))
2325    {
2326      GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2327      return false;
2328    }
2329  return true;
2330}
2331
2332/* Check that the GCN ISA of the given image matches the ISA of the agent. */
2333
2334static bool
2335isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2336{
2337  int isa_field = elf_gcn_isa_field (image);
2338  const char* isa_s = isa_hsa_name (isa_field);
2339  if (!isa_s)
2340    {
2341      hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2342      return false;
2343    }
2344
2345  if (isa_field != agent->device_isa)
2346    {
2347      char msg[120];
2348      const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2349      const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2350      assert (agent_isa_s);
2351      assert (agent_isa_gcc_s);
2352
2353      snprintf (msg, sizeof msg,
2354		"GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2355		"Try to recompile with '-foffload=-march=%s'.\n",
2356		isa_s, agent_isa_s, agent_isa_gcc_s);
2357
2358      hsa_error (msg, HSA_STATUS_ERROR);
2359      return false;
2360    }
2361
2362  return true;
2363}
2364
2365/* Create and finalize the program consisting of all loaded modules.  */
2366
2367static bool
2368create_and_finalize_hsa_program (struct agent_info *agent)
2369{
2370  hsa_status_t status;
2371  bool res = true;
2372  if (pthread_mutex_lock (&agent->prog_mutex))
2373    {
2374      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2375      return false;
2376    }
2377  if (agent->prog_finalized)
2378    goto final;
2379
2380  status
2381    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2382					HSA_EXECUTABLE_STATE_UNFROZEN,
2383					"", &agent->executable);
2384  if (status != HSA_STATUS_SUCCESS)
2385    {
2386      hsa_error ("Could not create GCN executable", status);
2387      goto fail;
2388    }
2389
2390  /* Load any GCN modules.  */
2391  struct module_info *module = agent->module;
2392  if (module)
2393    {
2394      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2395
2396      if (!isa_matches_agent (agent, image))
2397	goto fail;
2398
2399      hsa_code_object_t co = { 0 };
2400      status = hsa_fns.hsa_code_object_deserialize_fn
2401	(module->image_desc->gcn_image->image,
2402	 module->image_desc->gcn_image->size,
2403	 NULL, &co);
2404      if (status != HSA_STATUS_SUCCESS)
2405	{
2406	  hsa_error ("Could not deserialize GCN code object", status);
2407	  goto fail;
2408	}
2409
2410      status = hsa_fns.hsa_executable_load_code_object_fn
2411	(agent->executable, agent->id, co, "");
2412      if (status != HSA_STATUS_SUCCESS)
2413	{
2414	  hsa_error ("Could not load GCN code object", status);
2415	  goto fail;
2416	}
2417
2418      if (!module->heap)
2419	{
2420	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2421						   gcn_kernel_heap_size,
2422						   (void**)&module->heap);
2423	  if (status != HSA_STATUS_SUCCESS)
2424	    {
2425	      hsa_error ("Could not allocate memory for GCN heap", status);
2426	      goto fail;
2427	    }
2428
2429	  status = hsa_fns.hsa_memory_assign_agent_fn
2430			(module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2431	  if (status != HSA_STATUS_SUCCESS)
2432	    {
2433	      hsa_error ("Could not assign GCN heap memory to device", status);
2434	      goto fail;
2435	    }
2436
2437	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2438				      &gcn_kernel_heap_size,
2439				      sizeof (gcn_kernel_heap_size));
2440	}
2441
2442    }
2443
2444  if (debug)
2445    dump_executable_symbols (agent->executable);
2446
2447  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2448  if (status != HSA_STATUS_SUCCESS)
2449    {
2450      hsa_error ("Could not freeze the GCN executable", status);
2451      goto fail;
2452    }
2453
2454final:
2455  agent->prog_finalized = true;
2456
2457  if (pthread_mutex_unlock (&agent->prog_mutex))
2458    {
2459      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2460      res = false;
2461    }
2462
2463  return res;
2464
2465fail:
2466  res = false;
2467  goto final;
2468}
2469
2470/* Free the HSA program in agent and everything associated with it and set
2471   agent->prog_finalized and the initialized flags of all kernels to false.
2472   Return TRUE on success.  */
2473
2474static bool
2475destroy_hsa_program (struct agent_info *agent)
2476{
2477  if (!agent->prog_finalized)
2478    return true;
2479
2480  hsa_status_t status;
2481
2482  GCN_DEBUG ("Destroying the current GCN program.\n");
2483
2484  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2485  if (status != HSA_STATUS_SUCCESS)
2486    return hsa_error ("Could not destroy GCN executable", status);
2487
2488  if (agent->module)
2489    {
2490      int i;
2491      for (i = 0; i < agent->module->kernel_count; i++)
2492	agent->module->kernels[i].initialized = false;
2493
2494      if (agent->module->heap)
2495	{
2496	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
2497	  agent->module->heap = NULL;
2498	}
2499    }
2500  agent->prog_finalized = false;
2501  return true;
2502}
2503
2504/* Deinitialize all information associated with MODULE and kernels within
2505   it.  Return TRUE on success.  */
2506
2507static bool
2508destroy_module (struct module_info *module, bool locked)
2509{
2510  /* Run destructors before destroying module.  */
2511  struct GOMP_kernel_launch_attributes kla =
2512    { 3,
2513      /* Grid size.  */
2514      { 1, 64, 1 },
2515      /* Work-group size.  */
2516      { 1, 64, 1 }
2517    };
2518
2519  if (module->fini_array_func)
2520    {
2521      init_kernel (module->fini_array_func);
2522      run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2523    }
2524  module->constructors_run_p = false;
2525
2526  int i;
2527  for (i = 0; i < module->kernel_count; i++)
2528    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2529      {
2530	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2531			   "mutex");
2532	return false;
2533      }
2534
2535  return true;
2536}
2537
2538/* }}}  */
2539/* {{{ Async  */
2540
2541/* Callback of dispatch queues to report errors.  */
2542
2543static void
2544execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2545{
2546  struct queue_entry *entry = &aq->queue[index];
2547
2548  switch (entry->type)
2549    {
2550    case KERNEL_LAUNCH:
2551      if (DEBUG_QUEUES)
2552	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2553		   aq->agent->device_id, aq->id, index);
2554      run_kernel (entry->u.launch.kernel,
2555		  entry->u.launch.vars,
2556		  &entry->u.launch.kla, aq, false);
2557      if (DEBUG_QUEUES)
2558	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2559		   aq->agent->device_id, aq->id, index);
2560      break;
2561
2562    case CALLBACK:
2563      if (DEBUG_QUEUES)
2564	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2565		   aq->agent->device_id, aq->id, index);
2566      entry->u.callback.fn (entry->u.callback.data);
2567      if (DEBUG_QUEUES)
2568	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2569		   aq->agent->device_id, aq->id, index);
2570      break;
2571
2572    case ASYNC_WAIT:
2573      {
2574	/* FIXME: is it safe to access a placeholder that may already have
2575	   been executed?  */
2576        struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2577
2578	if (DEBUG_QUEUES)
2579          GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2580		     aq->agent->device_id, aq->id, index);
2581
2582	pthread_mutex_lock (&placeholderp->mutex);
2583
2584	while (!placeholderp->executed)
2585          pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2586
2587	pthread_mutex_unlock (&placeholderp->mutex);
2588
2589	if (pthread_cond_destroy (&placeholderp->cond))
2590	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2591
2592	if (pthread_mutex_destroy (&placeholderp->mutex))
2593	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2594
2595	if (DEBUG_QUEUES)
2596          GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2597		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2598      }
2599      break;
2600
2601    case ASYNC_PLACEHOLDER:
2602      pthread_mutex_lock (&entry->u.placeholder.mutex);
2603      entry->u.placeholder.executed = 1;
2604      pthread_cond_signal (&entry->u.placeholder.cond);
2605      pthread_mutex_unlock (&entry->u.placeholder.mutex);
2606      break;
2607
2608    default:
2609      GOMP_PLUGIN_fatal ("Unknown queue element");
2610    }
2611}
2612
2613/* This function is run as a thread to service an async queue in the
2614   background.  It runs continuously until the stop flag is set.  */
2615
2616static void *
2617drain_queue (void *thread_arg)
2618{
2619  struct goacc_asyncqueue *aq = thread_arg;
2620
2621  if (DRAIN_QUEUE_SYNCHRONOUS_P)
2622    {
2623      aq->drain_queue_stop = 2;
2624      return NULL;
2625    }
2626
2627  pthread_mutex_lock (&aq->mutex);
2628
2629  while (true)
2630    {
2631      if (aq->drain_queue_stop)
2632	break;
2633
2634      if (aq->queue_n > 0)
2635	{
2636	  pthread_mutex_unlock (&aq->mutex);
2637	  execute_queue_entry (aq, aq->queue_first);
2638
2639	  pthread_mutex_lock (&aq->mutex);
2640	  aq->queue_first = ((aq->queue_first + 1)
2641			     % ASYNC_QUEUE_SIZE);
2642	  aq->queue_n--;
2643
2644	  if (DEBUG_THREAD_SIGNAL)
2645	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2646		       aq->agent->device_id, aq->id);
2647	  pthread_cond_broadcast (&aq->queue_cond_out);
2648	  pthread_mutex_unlock (&aq->mutex);
2649
2650	  if (DEBUG_QUEUES)
2651	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2652		       aq->id);
2653	  pthread_mutex_lock (&aq->mutex);
2654	}
2655      else
2656	{
2657	  if (DEBUG_THREAD_SLEEP)
2658	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2659		       aq->agent->device_id, aq->id);
2660	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2661	  if (DEBUG_THREAD_SLEEP)
2662	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2663		       aq->agent->device_id, aq->id);
2664	}
2665    }
2666
2667  aq->drain_queue_stop = 2;
2668  if (DEBUG_THREAD_SIGNAL)
2669    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2670	       aq->agent->device_id, aq->id);
2671  pthread_cond_broadcast (&aq->queue_cond_out);
2672  pthread_mutex_unlock (&aq->mutex);
2673
2674  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2675  return NULL;
2676}
2677
2678/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2679   is not usually the case.  This is just a debug tool.  */
2680
2681static void
2682drain_queue_synchronous (struct goacc_asyncqueue *aq)
2683{
2684  pthread_mutex_lock (&aq->mutex);
2685
2686  while (aq->queue_n > 0)
2687    {
2688      execute_queue_entry (aq, aq->queue_first);
2689
2690      aq->queue_first = ((aq->queue_first + 1)
2691			 % ASYNC_QUEUE_SIZE);
2692      aq->queue_n--;
2693    }
2694
2695  pthread_mutex_unlock (&aq->mutex);
2696}
2697
2698/* Block the current thread until an async queue is writable.  The aq->mutex
2699   lock should be held on entry, and remains locked on exit.  */
2700
2701static void
2702wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2703{
2704  if (aq->queue_n == ASYNC_QUEUE_SIZE)
2705    {
2706      /* Queue is full.  Wait for it to not be full.  */
2707      while (aq->queue_n == ASYNC_QUEUE_SIZE)
2708	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2709    }
2710}
2711
2712/* Request an asynchronous kernel launch on the specified queue.  This
2713   may block if the queue is full, but returns without waiting for the
2714   kernel to run.  */
2715
2716static void
2717queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2718		   void *vars, struct GOMP_kernel_launch_attributes *kla)
2719{
2720  assert (aq->agent == kernel->agent);
2721
2722  pthread_mutex_lock (&aq->mutex);
2723
2724  wait_for_queue_nonfull (aq);
2725
2726  int queue_last = ((aq->queue_first + aq->queue_n)
2727		    % ASYNC_QUEUE_SIZE);
2728  if (DEBUG_QUEUES)
2729    GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2730	       aq->id, queue_last);
2731
2732  aq->queue[queue_last].type = KERNEL_LAUNCH;
2733  aq->queue[queue_last].u.launch.kernel = kernel;
2734  aq->queue[queue_last].u.launch.vars = vars;
2735  aq->queue[queue_last].u.launch.kla = *kla;
2736
2737  aq->queue_n++;
2738
2739  if (DEBUG_THREAD_SIGNAL)
2740    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2741	       aq->agent->device_id, aq->id);
2742  pthread_cond_signal (&aq->queue_cond_in);
2743
2744  pthread_mutex_unlock (&aq->mutex);
2745}
2746
2747/* Request an asynchronous callback on the specified queue.  The callback
2748   function will be called, with the given opaque data, from the appropriate
2749   async thread, when all previous items on that queue are complete.  */
2750
2751static void
2752queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2753		     void *data)
2754{
2755  pthread_mutex_lock (&aq->mutex);
2756
2757  wait_for_queue_nonfull (aq);
2758
2759  int queue_last = ((aq->queue_first + aq->queue_n)
2760		    % ASYNC_QUEUE_SIZE);
2761  if (DEBUG_QUEUES)
2762    GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2763	       aq->id, queue_last);
2764
2765  aq->queue[queue_last].type = CALLBACK;
2766  aq->queue[queue_last].u.callback.fn = fn;
2767  aq->queue[queue_last].u.callback.data = data;
2768
2769  aq->queue_n++;
2770
2771  if (DEBUG_THREAD_SIGNAL)
2772    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2773	       aq->agent->device_id, aq->id);
2774  pthread_cond_signal (&aq->queue_cond_in);
2775
2776  pthread_mutex_unlock (&aq->mutex);
2777}
2778
2779/* Request that a given async thread wait for another thread (unspecified) to
2780   reach the given placeholder.  The wait will occur when all previous entries
2781   on the queue are complete.  A placeholder is effectively a kind of signal
2782   which simply sets a flag when encountered in a queue.  */
2783
2784static void
2785queue_push_asyncwait (struct goacc_asyncqueue *aq,
2786		      struct placeholder *placeholderp)
2787{
2788  pthread_mutex_lock (&aq->mutex);
2789
2790  wait_for_queue_nonfull (aq);
2791
2792  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2793  if (DEBUG_QUEUES)
2794    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2795	       aq->id, queue_last);
2796
2797  aq->queue[queue_last].type = ASYNC_WAIT;
2798  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2799
2800  aq->queue_n++;
2801
2802  if (DEBUG_THREAD_SIGNAL)
2803    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2804	       aq->agent->device_id, aq->id);
2805  pthread_cond_signal (&aq->queue_cond_in);
2806
2807  pthread_mutex_unlock (&aq->mutex);
2808}
2809
2810/* Add a placeholder into an async queue.  When the async thread reaches the
2811   placeholder it will set the "executed" flag to true and continue.
2812   Another thread may be waiting on this thread reaching the placeholder.  */
2813
2814static struct placeholder *
2815queue_push_placeholder (struct goacc_asyncqueue *aq)
2816{
2817  struct placeholder *placeholderp;
2818
2819  pthread_mutex_lock (&aq->mutex);
2820
2821  wait_for_queue_nonfull (aq);
2822
2823  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2824  if (DEBUG_QUEUES)
2825    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2826	       aq->id, queue_last);
2827
2828  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2829  placeholderp = &aq->queue[queue_last].u.placeholder;
2830
2831  if (pthread_mutex_init (&placeholderp->mutex, NULL))
2832    {
2833      pthread_mutex_unlock (&aq->mutex);
2834      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2835    }
2836
2837  if (pthread_cond_init (&placeholderp->cond, NULL))
2838    {
2839      pthread_mutex_unlock (&aq->mutex);
2840      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2841    }
2842
2843  placeholderp->executed = 0;
2844
2845  aq->queue_n++;
2846
2847  if (DEBUG_THREAD_SIGNAL)
2848    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2849	       aq->agent->device_id, aq->id);
2850  pthread_cond_signal (&aq->queue_cond_in);
2851
2852  pthread_mutex_unlock (&aq->mutex);
2853
2854  return placeholderp;
2855}
2856
2857/* Signal an asynchronous thread to terminate, and wait for it to do so.  */
2858
2859static void
2860finalize_async_thread (struct goacc_asyncqueue *aq)
2861{
2862  pthread_mutex_lock (&aq->mutex);
2863  if (aq->drain_queue_stop == 2)
2864    {
2865      pthread_mutex_unlock (&aq->mutex);
2866      return;
2867    }
2868
2869  aq->drain_queue_stop = 1;
2870
2871  if (DEBUG_THREAD_SIGNAL)
2872    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2873	       aq->agent->device_id, aq->id);
2874  pthread_cond_signal (&aq->queue_cond_in);
2875
2876  while (aq->drain_queue_stop != 2)
2877    {
2878      if (DEBUG_THREAD_SLEEP)
2879	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2880		   " to sleep\n", aq->agent->device_id, aq->id);
2881      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2882      if (DEBUG_THREAD_SLEEP)
2883	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
2884		   aq->agent->device_id, aq->id);
2885    }
2886
2887  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2888	     aq->id);
2889  pthread_mutex_unlock (&aq->mutex);
2890
2891  int err = pthread_join (aq->thread_drain_queue, NULL);
2892  if (err != 0)
2893    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2894		       aq->agent->device_id, aq->id, strerror (err));
2895  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2896}
2897
2898/* Set up an async queue for OpenMP.  There will be only one.  The
2899   implementation simply uses an OpenACC async queue.
2900   FIXME: is this thread-safe if two threads call this function?  */
2901
2902static void
2903maybe_init_omp_async (struct agent_info *agent)
2904{
2905  if (!agent->omp_async_queue)
2906    agent->omp_async_queue
2907      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2908}
2909
2910/* A wrapper that works around an issue in the HSA runtime with host-to-device
2911   copies from read-only pages.  */
2912
2913static void
2914hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2915{
2916  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2917
2918  if (status == HSA_STATUS_SUCCESS)
2919    return;
2920
2921  /* It appears that the copy fails if the source data is in a read-only page.
2922     We can't detect that easily, so try copying the data to a temporary buffer
2923     and doing the copy again if we got an error above.  */
2924
2925  GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2926	       "[%p:+%d]\n", (void *) src, (int) len);
2927
2928  void *src_copy = malloc (len);
2929  memcpy (src_copy, src, len);
2930  status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2931  free (src_copy);
2932  if (status != HSA_STATUS_SUCCESS)
2933    GOMP_PLUGIN_error ("memory copy failed");
2934}
2935
2936/* Copy data to or from a device.  This is intended for use as an async
2937   callback event.  */
2938
2939static void
2940copy_data (void *data_)
2941{
2942  struct copy_data *data = (struct copy_data *)data_;
2943  GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2944	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
2945	     data->dst);
2946  hsa_memory_copy_wrapper (data->dst, data->src, data->len);
2947  free (data);
2948}
2949
2950/* Free device data.  This is intended for use as an async callback event.  */
2951
2952static void
2953gomp_offload_free (void *ptr)
2954{
2955  GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2956  GOMP_OFFLOAD_free (0, ptr);
2957}
2958
2959/* Request an asynchronous data copy, to or from a device, on a given queue.
2960   The event will be registered as a callback.  */
2961
2962static void
2963queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2964		 size_t len)
2965{
2966  if (DEBUG_QUEUES)
2967    GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2968	       aq->agent->device_id, aq->id, len, src, dst);
2969  struct copy_data *data
2970    = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2971  data->dst = dst;
2972  data->src = src;
2973  data->len = len;
2974  data->aq = aq;
2975  queue_push_callback (aq, copy_data, data);
2976}
2977
2978/* Return true if the given queue is currently empty.  */
2979
2980static int
2981queue_empty (struct goacc_asyncqueue *aq)
2982{
2983  pthread_mutex_lock (&aq->mutex);
2984  int res = aq->queue_n == 0 ? 1 : 0;
2985  pthread_mutex_unlock (&aq->mutex);
2986
2987  return res;
2988}
2989
2990/* Wait for a given queue to become empty.  This implements an OpenACC wait
2991   directive.  */
2992
2993static void
2994wait_queue (struct goacc_asyncqueue *aq)
2995{
2996  if (DRAIN_QUEUE_SYNCHRONOUS_P)
2997    {
2998      drain_queue_synchronous (aq);
2999      return;
3000    }
3001
3002  pthread_mutex_lock (&aq->mutex);
3003
3004  while (aq->queue_n > 0)
3005    {
3006      if (DEBUG_THREAD_SLEEP)
3007	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3008		   aq->agent->device_id, aq->id);
3009      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3010      if (DEBUG_THREAD_SLEEP)
3011	GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
3012		   aq->id);
3013    }
3014
3015  pthread_mutex_unlock (&aq->mutex);
3016  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3017}
3018
3019/* }}}  */
3020/* {{{ OpenACC support  */
3021
3022/* Execute an OpenACC kernel, synchronously or asynchronously.  */
3023
3024static void
3025gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3026	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3027	  struct goacc_asyncqueue *aq)
3028{
3029  if (!GOMP_OFFLOAD_can_run (kernel))
3030    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3031
3032  /* If we get here then this must be an OpenACC kernel.  */
3033  kernel->kind = KIND_OPENACC;
3034
3035  /* devaddrs must be double-indirect on the target.  */
3036  void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3037  for (size_t i = 0; i < mapnum; i++)
3038    hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3039				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3040				sizeof (void *));
3041
3042  struct hsa_kernel_description *hsa_kernel_desc = NULL;
3043  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3044    {
3045      struct hsa_kernel_description *d
3046	= &kernel->module->image_desc->kernel_infos[i];
3047      if (d->name == kernel->name)
3048	{
3049	  hsa_kernel_desc = d;
3050	  break;
3051	}
3052    }
3053
3054  /* We may have statically-determined dimensions in
3055     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3056     invocation at runtime in dims[].  We allow static dimensions to take
3057     priority over dynamic dimensions when present (non-zero).  */
3058  if (hsa_kernel_desc->oacc_dims[0] > 0)
3059    dims[0] = hsa_kernel_desc->oacc_dims[0];
3060  if (hsa_kernel_desc->oacc_dims[1] > 0)
3061    dims[1] = hsa_kernel_desc->oacc_dims[1];
3062  if (hsa_kernel_desc->oacc_dims[2] > 0)
3063    dims[2] = hsa_kernel_desc->oacc_dims[2];
3064
3065  /* Ideally, when a dimension isn't explicitly specified, we should
3066     tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3067     In practice, we tune for peak performance on BabelStream, which
3068     for OpenACC is currently 32 threads per CU.  */
3069  if (dims[0] == 0 && dims[1] == 0)
3070    {
3071      /* If any of the OpenACC dimensions remain 0 then we get to pick a
3072	 number.  There isn't really a correct answer for this without a clue
3073	 about the problem size, so let's do a reasonable number of workers
3074	 and gangs.  */
3075
3076      dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
3077      dims[1] = 8; /* Workers.  */
3078    }
3079  else if (dims[0] == 0 && dims[1] > 0)
3080    {
3081      /* Auto-scale the number of gangs with the requested number of workers.  */
3082      dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3083    }
3084  else if (dims[0] > 0 && dims[1] == 0)
3085    {
3086      /* Auto-scale the number of workers with the requested number of gangs.  */
3087      dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3088      if (dims[1] == 0)
3089	dims[1] = 1;
3090      if (dims[1] > 16)
3091	dims[1] = 16;
3092    }
3093
3094  /* The incoming dimensions are expressed in terms of gangs, workers, and
3095     vectors.  The HSA dimensions are expressed in terms of "work-items",
3096     which means multiples of vector lanes.
3097
3098     The "grid size" specifies the size of the problem space, and the
3099     "work-group size" specifies how much of that we want a single compute
3100     unit to chew on at once.
3101
3102     The three dimensions do not really correspond to hardware, but the
3103     important thing is that the HSA runtime will launch as many
3104     work-groups as it takes to process the entire grid, and each
3105     work-group will contain as many wave-fronts as it takes to process
3106     the work-items in that group.
3107
3108     Essentially, as long as we set the Y dimension to 64 (the number of
3109     vector lanes in hardware), and the Z group size to the maximum (16),
3110     then we will get the gangs (X) and workers (Z) launched as we expect.
3111
3112     The reason for the apparent reversal of vector and worker dimension
3113     order is to do with the way the run-time distributes work-items across
3114     v1 and v2.  */
3115  struct GOMP_kernel_launch_attributes kla =
3116    {3,
3117     /* Grid size.  */
3118     {dims[0], 64, dims[1]},
3119     /* Work-group size.  */
3120     {1,       64, 16}
3121    };
3122
3123  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3124  acc_prof_info *prof_info = thr->prof_info;
3125  acc_event_info enqueue_launch_event_info;
3126  acc_api_info *api_info = thr->api_info;
3127  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3128  if (profiling_dispatch_p)
3129    {
3130      prof_info->event_type = acc_ev_enqueue_launch_start;
3131
3132      enqueue_launch_event_info.launch_event.event_type
3133	= prof_info->event_type;
3134      enqueue_launch_event_info.launch_event.valid_bytes
3135	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3136      enqueue_launch_event_info.launch_event.parent_construct
3137	= acc_construct_parallel;
3138      enqueue_launch_event_info.launch_event.implicit = 1;
3139      enqueue_launch_event_info.launch_event.tool_info = NULL;
3140      enqueue_launch_event_info.launch_event.kernel_name
3141	= (char *) kernel->name;
3142      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3143      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3144      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3145
3146      api_info->device_api = acc_device_api_other;
3147
3148      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3149	&enqueue_launch_event_info, api_info);
3150    }
3151
3152  if (!async)
3153    {
3154      run_kernel (kernel, ind_da, &kla, NULL, false);
3155      gomp_offload_free (ind_da);
3156    }
3157  else
3158    {
3159      queue_push_launch (aq, kernel, ind_da, &kla);
3160      if (DEBUG_QUEUES)
3161	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3162		   aq->agent->device_id, aq->id, ind_da);
3163      queue_push_callback (aq, gomp_offload_free, ind_da);
3164    }
3165
3166  if (profiling_dispatch_p)
3167    {
3168      prof_info->event_type = acc_ev_enqueue_launch_end;
3169      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3170      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3171					    &enqueue_launch_event_info,
3172					    api_info);
3173    }
3174}
3175
3176/* }}}  */
3177/* {{{ Generic Plugin API  */
3178
3179/* Return the name of the accelerator, which is "gcn".  */
3180
3181const char *
3182GOMP_OFFLOAD_get_name (void)
3183{
3184  return "gcn";
3185}
3186
3187/* Return the specific capabilities the HSA accelerator have.  */
3188
3189unsigned int
3190GOMP_OFFLOAD_get_caps (void)
3191{
3192  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
3193  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3194	    | GOMP_OFFLOAD_CAP_OPENACC_200;
3195}
3196
3197/* Identify as GCN accelerator.  */
3198
3199int
3200GOMP_OFFLOAD_get_type (void)
3201{
3202  return OFFLOAD_TARGET_TYPE_GCN;
3203}
3204
3205/* Return the libgomp version number we're compatible with.  There is
3206   no requirement for cross-version compatibility.  */
3207
3208unsigned
3209GOMP_OFFLOAD_version (void)
3210{
3211  return GOMP_VERSION;
3212}
3213
3214/* Return the number of GCN devices on the system.  */
3215
3216int
3217GOMP_OFFLOAD_get_num_devices (void)
3218{
3219  if (!init_hsa_context ())
3220    return 0;
3221  return hsa_context.agent_count;
3222}
3223
3224/* Initialize device (agent) number N so that it can be used for computation.
3225   Return TRUE on success.  */
3226
3227bool
3228GOMP_OFFLOAD_init_device (int n)
3229{
3230  if (!init_hsa_context ())
3231    return false;
3232  if (n >= hsa_context.agent_count)
3233    {
3234      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3235      return false;
3236    }
3237  struct agent_info *agent = &hsa_context.agents[n];
3238
3239  if (agent->initialized)
3240    return true;
3241
3242  agent->device_id = n;
3243
3244  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3245    {
3246      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3247      return false;
3248    }
3249  if (pthread_mutex_init (&agent->prog_mutex, NULL))
3250    {
3251      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3252      return false;
3253    }
3254  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3255    {
3256      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3257      return false;
3258    }
3259  if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3260    {
3261      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3262      return false;
3263    }
3264  agent->async_queues = NULL;
3265  agent->omp_async_queue = NULL;
3266  agent->team_arena_list = NULL;
3267
3268  uint32_t queue_size;
3269  hsa_status_t status;
3270  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3271					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3272					  &queue_size);
3273  if (status != HSA_STATUS_SUCCESS)
3274    return hsa_error ("Error requesting maximum queue size of the GCN agent",
3275		      status);
3276
3277  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3278					  &agent->name);
3279  if (status != HSA_STATUS_SUCCESS)
3280    return hsa_error ("Error querying the name of the agent", status);
3281
3282  agent->device_isa = isa_code (agent->name);
3283  if (agent->device_isa < 0)
3284    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3285
3286  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3287					  &agent->vendor_name);
3288  if (status != HSA_STATUS_SUCCESS)
3289    return hsa_error ("Error querying the vendor name of the agent", status);
3290
3291  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3292					HSA_QUEUE_TYPE_MULTI,
3293					hsa_queue_callback, NULL, UINT32_MAX,
3294					UINT32_MAX, &agent->sync_queue);
3295  if (status != HSA_STATUS_SUCCESS)
3296    return hsa_error ("Error creating command queue", status);
3297
3298  agent->kernarg_region.handle = (uint64_t) -1;
3299  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3300						 get_kernarg_memory_region,
3301						 &agent->kernarg_region);
3302  if (status != HSA_STATUS_SUCCESS
3303      && status != HSA_STATUS_INFO_BREAK)
3304    hsa_error ("Scanning memory regions failed", status);
3305  if (agent->kernarg_region.handle == (uint64_t) -1)
3306    {
3307      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3308			 "arguments");
3309      return false;
3310    }
3311  GCN_DEBUG ("Selected kernel arguments memory region:\n");
3312  dump_hsa_region (agent->kernarg_region, NULL);
3313
3314  agent->data_region.handle = (uint64_t) -1;
3315  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3316						 get_data_memory_region,
3317						 &agent->data_region);
3318  if (status != HSA_STATUS_SUCCESS
3319      && status != HSA_STATUS_INFO_BREAK)
3320    hsa_error ("Scanning memory regions failed", status);
3321  if (agent->data_region.handle == (uint64_t) -1)
3322    {
3323      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3324			 "data");
3325      return false;
3326    }
3327  GCN_DEBUG ("Selected device data memory region:\n");
3328  dump_hsa_region (agent->data_region, NULL);
3329
3330  GCN_DEBUG ("GCN agent %d initialized\n", n);
3331
3332  agent->initialized = true;
3333  return true;
3334}
3335
3336/* Load GCN object-code module described by struct gcn_image_desc in
3337   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3338   If there are any constructors then run them.  */
3339
3340int
3341GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3342			 struct addr_pair **target_table)
3343{
3344  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3345    {
3346      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3347			 " (expected %u, received %u)",
3348			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3349      return -1;
3350    }
3351
3352  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3353  struct agent_info *agent;
3354  struct addr_pair *pair;
3355  struct module_info *module;
3356  struct kernel_info *kernel;
3357  int kernel_count = image_desc->kernel_count;
3358  unsigned var_count = image_desc->global_variable_count;
3359  int other_count = 1;
3360
3361  agent = get_agent_info (ord);
3362  if (!agent)
3363    return -1;
3364
3365  if (pthread_rwlock_wrlock (&agent->module_rwlock))
3366    {
3367      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3368      return -1;
3369    }
3370  if (agent->prog_finalized
3371      && !destroy_hsa_program (agent))
3372    return -1;
3373
3374  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3375  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3376  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3377  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3378			     * sizeof (struct addr_pair));
3379  *target_table = pair;
3380  module = (struct module_info *)
3381    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3382				+ kernel_count * sizeof (struct kernel_info));
3383  module->image_desc = image_desc;
3384  module->kernel_count = kernel_count;
3385  module->heap = NULL;
3386  module->constructors_run_p = false;
3387
3388  kernel = &module->kernels[0];
3389
3390  /* Allocate memory for kernel dependencies.  */
3391  for (unsigned i = 0; i < kernel_count; i++)
3392    {
3393      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3394      if (!init_basic_kernel_info (kernel, d, agent, module))
3395	return -1;
3396      if (strcmp (d->name, "_init_array") == 0)
3397	module->init_array_func = kernel;
3398      else if (strcmp (d->name, "_fini_array") == 0)
3399        module->fini_array_func = kernel;
3400      else
3401	{
3402	  pair->start = (uintptr_t) kernel;
3403	  pair->end = (uintptr_t) (kernel + 1);
3404	  pair++;
3405	}
3406      kernel++;
3407    }
3408
3409  agent->module = module;
3410  if (pthread_rwlock_unlock (&agent->module_rwlock))
3411    {
3412      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3413      return -1;
3414    }
3415
3416  if (!create_and_finalize_hsa_program (agent))
3417    return -1;
3418
3419  if (var_count > 0)
3420    {
3421      hsa_status_t status;
3422      hsa_executable_symbol_t var_symbol;
3423      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3424						     ".offload_var_table",
3425						     agent->id,
3426						     0, &var_symbol);
3427
3428      if (status != HSA_STATUS_SUCCESS)
3429	hsa_fatal ("Could not find symbol for variable in the code object",
3430		   status);
3431
3432      uint64_t var_table_addr;
3433      status = hsa_fns.hsa_executable_symbol_get_info_fn
3434	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3435	 &var_table_addr);
3436      if (status != HSA_STATUS_SUCCESS)
3437	hsa_fatal ("Could not extract a variable from its symbol", status);
3438
3439      struct {
3440	uint64_t addr;
3441	uint64_t size;
3442      } var_table[var_count];
3443      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3444			     (void*)var_table_addr, sizeof (var_table));
3445
3446      for (unsigned i = 0; i < var_count; i++)
3447	{
3448	  pair->start = var_table[i].addr;
3449	  pair->end = var_table[i].addr + var_table[i].size;
3450	  GCN_DEBUG ("Found variable at %p with size %lu\n",
3451		     (void *)var_table[i].addr, var_table[i].size);
3452	  pair++;
3453	}
3454    }
3455
3456  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_DEVICE_NUM_VAR));
3457
3458  hsa_status_t status;
3459  hsa_executable_symbol_t var_symbol;
3460  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3461						 XSTRING (GOMP_DEVICE_NUM_VAR),
3462						 agent->id, 0, &var_symbol);
3463  if (status == HSA_STATUS_SUCCESS)
3464    {
3465      uint64_t device_num_varptr;
3466      uint32_t device_num_varsize;
3467
3468      status = hsa_fns.hsa_executable_symbol_get_info_fn
3469	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3470	 &device_num_varptr);
3471      if (status != HSA_STATUS_SUCCESS)
3472	hsa_fatal ("Could not extract a variable from its symbol", status);
3473      status = hsa_fns.hsa_executable_symbol_get_info_fn
3474	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3475	 &device_num_varsize);
3476      if (status != HSA_STATUS_SUCCESS)
3477	hsa_fatal ("Could not extract a variable size from its symbol", status);
3478
3479      pair->start = device_num_varptr;
3480      pair->end = device_num_varptr + device_num_varsize;
3481    }
3482  else
3483    /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
3484    pair->start = pair->end = 0;
3485  pair++;
3486
3487  /* Ensure that constructors are run first.  */
3488  struct GOMP_kernel_launch_attributes kla =
3489    { 3,
3490      /* Grid size.  */
3491      { 1, 64, 1 },
3492      /* Work-group size.  */
3493      { 1, 64, 1 }
3494    };
3495
3496  if (module->init_array_func)
3497    {
3498      init_kernel (module->init_array_func);
3499      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3500    }
3501  module->constructors_run_p = true;
3502
3503  /* Don't report kernels that libgomp need not know about.  */
3504  if (module->init_array_func)
3505    kernel_count--;
3506  if (module->fini_array_func)
3507    kernel_count--;
3508
3509  return kernel_count + var_count + other_count;
3510}
3511
3512/* Unload GCN object-code module described by struct gcn_image_desc in
3513   TARGET_DATA from agent number N.  Return TRUE on success.  */
3514
3515bool
3516GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3517{
3518  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3519    {
3520      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3521			 " (expected %u, received %u)",
3522			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3523      return false;
3524    }
3525
3526  struct agent_info *agent;
3527  agent = get_agent_info (n);
3528  if (!agent)
3529    return false;
3530
3531  if (pthread_rwlock_wrlock (&agent->module_rwlock))
3532    {
3533      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3534      return false;
3535    }
3536
3537  if (!agent->module || agent->module->image_desc != target_data)
3538    {
3539      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3540			 "loaded before");
3541      return false;
3542    }
3543
3544  if (!destroy_module (agent->module, true))
3545    return false;
3546  free (agent->module);
3547  agent->module = NULL;
3548  if (!destroy_hsa_program (agent))
3549    return false;
3550  if (pthread_rwlock_unlock (&agent->module_rwlock))
3551    {
3552      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3553      return false;
3554    }
3555  return true;
3556}
3557
3558/* Deinitialize all information and status associated with agent number N.  We
3559   do not attempt any synchronization, assuming the user and libgomp will not
3560   attempt deinitialization of a device that is in any way being used at the
3561   same time.  Return TRUE on success.  */
3562
3563bool
3564GOMP_OFFLOAD_fini_device (int n)
3565{
3566  struct agent_info *agent = get_agent_info (n);
3567  if (!agent)
3568    return false;
3569
3570  if (!agent->initialized)
3571    return true;
3572
3573  if (agent->omp_async_queue)
3574    {
3575      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3576      agent->omp_async_queue = NULL;
3577    }
3578
3579  if (agent->module)
3580    {
3581      if (!destroy_module (agent->module, false))
3582	return false;
3583      free (agent->module);
3584      agent->module = NULL;
3585    }
3586
3587  if (!destroy_team_arenas (agent))
3588    return false;
3589
3590  if (!destroy_hsa_program (agent))
3591    return false;
3592
3593  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3594  if (status != HSA_STATUS_SUCCESS)
3595    return hsa_error ("Error destroying command queue", status);
3596
3597  if (pthread_mutex_destroy (&agent->prog_mutex))
3598    {
3599      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3600      return false;
3601    }
3602  if (pthread_rwlock_destroy (&agent->module_rwlock))
3603    {
3604      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3605      return false;
3606    }
3607
3608  if (pthread_mutex_destroy (&agent->async_queues_mutex))
3609    {
3610      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3611      return false;
3612    }
3613  if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3614    {
3615      GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3616      return false;
3617    }
3618  agent->initialized = false;
3619  return true;
3620}
3621
3622/* Return true if the HSA runtime can run function FN_PTR.  */
3623
3624bool
3625GOMP_OFFLOAD_can_run (void *fn_ptr)
3626{
3627  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3628
3629  init_kernel (kernel);
3630  if (kernel->initialization_failed)
3631    goto failure;
3632
3633  return true;
3634
3635failure:
3636  if (suppress_host_fallback)
3637    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3638  GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3639  return false;
3640}
3641
3642/* Allocate memory on device N.  */
3643
3644void *
3645GOMP_OFFLOAD_alloc (int n, size_t size)
3646{
3647  struct agent_info *agent = get_agent_info (n);
3648  return alloc_by_agent (agent, size);
3649}
3650
3651/* Free memory from device N.  */
3652
3653bool
3654GOMP_OFFLOAD_free (int device, void *ptr)
3655{
3656  GCN_DEBUG ("Freeing memory on device %d\n", device);
3657
3658  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3659  if (status != HSA_STATUS_SUCCESS)
3660    {
3661      hsa_error ("Could not free device memory", status);
3662      return false;
3663    }
3664
3665  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3666  bool profiling_dispatch_p
3667    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3668  if (profiling_dispatch_p)
3669    {
3670      acc_prof_info *prof_info = thr->prof_info;
3671      acc_event_info data_event_info;
3672      acc_api_info *api_info = thr->api_info;
3673
3674      prof_info->event_type = acc_ev_free;
3675
3676      data_event_info.data_event.event_type = prof_info->event_type;
3677      data_event_info.data_event.valid_bytes
3678	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
3679      data_event_info.data_event.parent_construct
3680	= acc_construct_parallel;
3681      data_event_info.data_event.implicit = 1;
3682      data_event_info.data_event.tool_info = NULL;
3683      data_event_info.data_event.var_name = NULL;
3684      data_event_info.data_event.bytes = 0;
3685      data_event_info.data_event.host_ptr = NULL;
3686      data_event_info.data_event.device_ptr = (void *) ptr;
3687
3688      api_info->device_api = acc_device_api_other;
3689
3690      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3691					    api_info);
3692    }
3693
3694  return true;
3695}
3696
3697/* Copy data from DEVICE to host.  */
3698
3699bool
3700GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3701{
3702  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3703	     src, dst);
3704  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3705  if (status != HSA_STATUS_SUCCESS)
3706    GOMP_PLUGIN_error ("memory copy failed");
3707  return true;
3708}
3709
3710/* Copy data from host to DEVICE.  */
3711
3712bool
3713GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3714{
3715  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3716	     device, dst);
3717  hsa_memory_copy_wrapper (dst, src, n);
3718  return true;
3719}
3720
3721/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
3722
3723bool
3724GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3725{
3726  struct gcn_thread *thread_data = gcn_thread ();
3727
3728  if (thread_data && !async_synchronous_p (thread_data->async))
3729    {
3730      struct agent_info *agent = get_agent_info (device);
3731      maybe_init_omp_async (agent);
3732      queue_push_copy (agent->omp_async_queue, dst, src, n);
3733      return true;
3734    }
3735
3736  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3737	     device, src, device, dst);
3738  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3739  if (status != HSA_STATUS_SUCCESS)
3740    GOMP_PLUGIN_error ("memory copy failed");
3741  return true;
3742}
3743
3744/* }}}  */
3745/* {{{ OpenMP Plugin API  */
3746
3747/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3748   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
3749   to a kernel_info structure, and must have previously been loaded to the
3750   specified device.  */
3751
3752void
3753GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3754{
3755  struct agent_info *agent = get_agent_info (device);
3756  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3757  struct GOMP_kernel_launch_attributes def;
3758  struct GOMP_kernel_launch_attributes *kla;
3759  assert (agent == kernel->agent);
3760
3761  /* If we get here then the kernel must be OpenMP.  */
3762  kernel->kind = KIND_OPENMP;
3763
3764  if (!parse_target_attributes (args, &def, &kla, agent))
3765    {
3766      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3767      return;
3768    }
3769  run_kernel (kernel, vars, kla, NULL, false);
3770}
3771
3772/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
3773   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3774   GOMP_PLUGIN_target_task_completion when it has finished.  */
3775
3776void
3777GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3778			void **args, void *async_data)
3779{
3780  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3781  struct agent_info *agent = get_agent_info (device);
3782  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3783  struct GOMP_kernel_launch_attributes def;
3784  struct GOMP_kernel_launch_attributes *kla;
3785  assert (agent == kernel->agent);
3786
3787  /* If we get here then the kernel must be OpenMP.  */
3788  kernel->kind = KIND_OPENMP;
3789
3790  if (!parse_target_attributes (args, &def, &kla, agent))
3791    {
3792      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3793      return;
3794    }
3795
3796  maybe_init_omp_async (agent);
3797  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3798  queue_push_callback (agent->omp_async_queue,
3799		       GOMP_PLUGIN_target_task_completion, async_data);
3800}
3801
3802/* }}} */
3803/* {{{ OpenACC Plugin API  */
3804
3805/* Run a synchronous OpenACC kernel.  The device number is inferred from the
3806   already-loaded KERNEL.  */
3807
3808void
3809GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3810			   void **hostaddrs, void **devaddrs, unsigned *dims,
3811			   void *targ_mem_desc)
3812{
3813  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3814
3815  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3816	    NULL);
3817}
3818
3819/* Run an asynchronous OpenACC kernel on the specified queue.  */
3820
3821void
3822GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3823				 void **hostaddrs, void **devaddrs,
3824				 unsigned *dims, void *targ_mem_desc,
3825				 struct goacc_asyncqueue *aq)
3826{
3827  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3828
3829  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3830	    aq);
3831}
3832
3833/* Create a new asynchronous thread and queue for running future kernels.  */
3834
3835struct goacc_asyncqueue *
3836GOMP_OFFLOAD_openacc_async_construct (int device)
3837{
3838  struct agent_info *agent = get_agent_info (device);
3839
3840  pthread_mutex_lock (&agent->async_queues_mutex);
3841
3842  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3843  aq->agent = get_agent_info (device);
3844  aq->prev = NULL;
3845  aq->next = agent->async_queues;
3846  if (aq->next)
3847    {
3848      aq->next->prev = aq;
3849      aq->id = aq->next->id + 1;
3850    }
3851  else
3852    aq->id = 1;
3853  agent->async_queues = aq;
3854
3855  aq->queue_first = 0;
3856  aq->queue_n = 0;
3857  aq->drain_queue_stop = 0;
3858
3859  if (pthread_mutex_init (&aq->mutex, NULL))
3860    {
3861      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3862      return false;
3863    }
3864  if (pthread_cond_init (&aq->queue_cond_in, NULL))
3865    {
3866      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3867      return false;
3868    }
3869  if (pthread_cond_init (&aq->queue_cond_out, NULL))
3870    {
3871      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3872      return false;
3873    }
3874
3875  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
3876						     ASYNC_QUEUE_SIZE,
3877						     HSA_QUEUE_TYPE_MULTI,
3878						     hsa_queue_callback, NULL,
3879						     UINT32_MAX, UINT32_MAX,
3880						     &aq->hsa_queue);
3881  if (status != HSA_STATUS_SUCCESS)
3882    hsa_fatal ("Error creating command queue", status);
3883
3884  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
3885  if (err != 0)
3886    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3887		       strerror (err));
3888  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
3889	     aq->id);
3890
3891  pthread_mutex_unlock (&agent->async_queues_mutex);
3892
3893  return aq;
3894}
3895
3896/* Destroy an existing asynchronous thread and queue.  Waits for any
3897   currently-running task to complete, but cancels any queued tasks.  */
3898
3899bool
3900GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
3901{
3902  struct agent_info *agent = aq->agent;
3903
3904  finalize_async_thread (aq);
3905
3906  pthread_mutex_lock (&agent->async_queues_mutex);
3907
3908  int err;
3909  if ((err = pthread_mutex_destroy (&aq->mutex)))
3910    {
3911      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
3912      goto fail;
3913    }
3914  if (pthread_cond_destroy (&aq->queue_cond_in))
3915    {
3916      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3917      goto fail;
3918    }
3919  if (pthread_cond_destroy (&aq->queue_cond_out))
3920    {
3921      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3922      goto fail;
3923    }
3924  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
3925  if (status != HSA_STATUS_SUCCESS)
3926    {
3927      hsa_error ("Error destroying command queue", status);
3928      goto fail;
3929    }
3930
3931  if (aq->prev)
3932    aq->prev->next = aq->next;
3933  if (aq->next)
3934    aq->next->prev = aq->prev;
3935  if (agent->async_queues == aq)
3936    agent->async_queues = aq->next;
3937
3938  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
3939
3940  free (aq);
3941  pthread_mutex_unlock (&agent->async_queues_mutex);
3942  return true;
3943
3944fail:
3945  pthread_mutex_unlock (&agent->async_queues_mutex);
3946  return false;
3947}
3948
3949/* Return true if the specified async queue is currently empty.  */
3950
3951int
3952GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
3953{
3954  return queue_empty (aq);
3955}
3956
3957/* Block until the specified queue has executed all its tasks and the
3958   queue is empty.  */
3959
3960bool
3961GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
3962{
3963  wait_queue (aq);
3964  return true;
3965}
3966
3967/* Add a serialization point across two async queues. Any new tasks added to
3968   AQ2, after this call, will not run until all tasks on AQ1, at the time
3969   of this call, have completed.  */
3970
3971bool
3972GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
3973				      struct goacc_asyncqueue *aq2)
3974{
3975  /* For serialize, stream aq2 waits for aq1 to complete work that has been
3976     scheduled to run on it up to this point.  */
3977  if (aq1 != aq2)
3978    {
3979      struct placeholder *placeholderp = queue_push_placeholder (aq1);
3980      queue_push_asyncwait (aq2, placeholderp);
3981    }
3982  return true;
3983}
3984
3985/* Add an opaque callback to the given async queue.  */
3986
3987void
3988GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
3989					   void (*fn) (void *), void *data)
3990{
3991  queue_push_callback (aq, fn, data);
3992}
3993
3994/* Queue up an asynchronous data copy from host to DEVICE.  */
3995
3996bool
3997GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
3998				     size_t n, struct goacc_asyncqueue *aq)
3999{
4000  struct agent_info *agent = get_agent_info (device);
4001  assert (agent == aq->agent);
4002  queue_push_copy (aq, dst, src, n);
4003  return true;
4004}
4005
4006/* Queue up an asynchronous data copy from DEVICE to host.  */
4007
4008bool
4009GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4010				     size_t n, struct goacc_asyncqueue *aq)
4011{
4012  struct agent_info *agent = get_agent_info (device);
4013  assert (agent == aq->agent);
4014  queue_push_copy (aq, dst, src, n);
4015  return true;
4016}
4017
4018union goacc_property_value
4019GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4020{
4021  struct agent_info *agent = get_agent_info (device);
4022
4023  union goacc_property_value propval = { .val = 0 };
4024
4025  switch (prop)
4026    {
4027    case GOACC_PROPERTY_FREE_MEMORY:
4028      /* Not supported. */
4029      break;
4030    case GOACC_PROPERTY_MEMORY:
4031      {
4032	size_t size;
4033	hsa_region_t region = agent->data_region;
4034	hsa_status_t status =
4035	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4036	if (status == HSA_STATUS_SUCCESS)
4037	  propval.val = size;
4038	break;
4039      }
4040    case GOACC_PROPERTY_NAME:
4041      propval.ptr = agent->name;
4042      break;
4043    case GOACC_PROPERTY_VENDOR:
4044      propval.ptr = agent->vendor_name;
4045      break;
4046    case GOACC_PROPERTY_DRIVER:
4047      propval.ptr = hsa_context.driver_version_s;
4048      break;
4049    }
4050
4051  return propval;
4052}
4053
4054/* Set up plugin-specific thread-local-data (host-side).  */
4055
4056void *
4057GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4058{
4059  struct gcn_thread *thread_data
4060    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4061
4062  thread_data->async = GOMP_ASYNC_SYNC;
4063
4064  return (void *) thread_data;
4065}
4066
4067/* Clean up plugin-specific thread-local-data.  */
4068
4069void
4070GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4071{
4072  free (data);
4073}
4074
4075/* }}} */
4076