1/* Plugin for HSAIL execution.
2
3   Copyright (C) 2013-2020 Free Software Foundation, Inc.
4
5   Contributed by Martin Jambor <mjambor@suse.cz> and
6   Martin Liska <mliska@suse.cz>.
7
8   This file is part of the GNU Offloading and Multi Processing Library
9   (libgomp).
10
11   Libgomp is free software; you can redistribute it and/or modify it
12   under the terms of the GNU General Public License as published by
13   the Free Software Foundation; either version 3, or (at your option)
14   any later version.
15
16   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
17   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
19   more details.
20
21   Under Section 7 of GPL version 3, you are granted additional
22   permissions described in the GCC Runtime Library Exception, version
23   3.1, as published by the Free Software Foundation.
24
25   You should have received a copy of the GNU General Public License and
26   a copy of the GCC Runtime Library Exception along with this program;
27   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
28   <http://www.gnu.org/licenses/>.  */
29
30#include "config.h"
31#include <stdint.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35#include <pthread.h>
36#ifdef HAVE_INTTYPES_H
37#include <inttypes.h>
38#endif
39#include <stdbool.h>
40#include <hsa.h>
41#include <plugin/hsa_ext_finalize.h>
42#include <dlfcn.h>
43#include "libgomp-plugin.h"
44#include "gomp-constants.h"
45#include "secure_getenv.h"
46
47#ifdef HAVE_INTTYPES_H
48typedef uint64_t print_uint64_t;
49#else
50#define PRIu64 "lu"
51typedef unsigned long print_uint64_t;
52#endif
53
54/* As an HSA runtime is dlopened, following structure defines function
55   pointers utilized by the HSA plug-in.  */
56
57struct hsa_runtime_fn_info
58{
59  /* HSA runtime.  */
60  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
61					const char **status_string);
62  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
63					 hsa_agent_info_t attribute,
64					 void *value);
65  hsa_status_t (*hsa_init_fn) (void);
66  hsa_status_t (*hsa_iterate_agents_fn)
67    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
68  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
69					  hsa_region_info_t attribute,
70					  void *value);
71  hsa_status_t (*hsa_queue_create_fn)
72    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
73     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
74     void *data, uint32_t private_segment_size,
75     uint32_t group_segment_size, hsa_queue_t **queue);
76  hsa_status_t (*hsa_agent_iterate_regions_fn)
77    (hsa_agent_t agent,
78     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
79  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
80  hsa_status_t (*hsa_executable_create_fn)
81    (hsa_profile_t profile, hsa_executable_state_t executable_state,
82     const char *options, hsa_executable_t *executable);
83  hsa_status_t (*hsa_executable_global_variable_define_fn)
84    (hsa_executable_t executable, const char *variable_name, void *address);
85  hsa_status_t (*hsa_executable_load_code_object_fn)
86    (hsa_executable_t executable, hsa_agent_t agent,
87     hsa_code_object_t code_object, const char *options);
88  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
89					   const char *options);
90  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
91					uint32_t num_consumers,
92					const hsa_agent_t *consumers,
93					hsa_signal_t *signal);
94  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
95					  void **ptr);
96  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
97  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
98  hsa_status_t (*hsa_executable_get_symbol_fn)
99    (hsa_executable_t executable, const char *module_name,
100     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
101     hsa_executable_symbol_t *symbol);
102  hsa_status_t (*hsa_executable_symbol_get_info_fn)
103    (hsa_executable_symbol_t executable_symbol,
104     hsa_executable_symbol_info_t attribute, void *value);
105  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
106						    uint64_t value);
107  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
108  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
109				       hsa_signal_value_t value);
110  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
111				       hsa_signal_value_t value);
112  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
113    (hsa_signal_t signal, hsa_signal_condition_t condition,
114     hsa_signal_value_t compare_value, uint64_t timeout_hint,
115     hsa_wait_state_t wait_state_hint);
116  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
117  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
118
119  /* HSA finalizer.  */
120  hsa_status_t (*hsa_ext_program_add_module_fn) (hsa_ext_program_t program,
121						 hsa_ext_module_t module);
122  hsa_status_t (*hsa_ext_program_create_fn)
123    (hsa_machine_model_t machine_model, hsa_profile_t profile,
124     hsa_default_float_rounding_mode_t default_float_rounding_mode,
125     const char *options, hsa_ext_program_t *program);
126  hsa_status_t (*hsa_ext_program_destroy_fn) (hsa_ext_program_t program);
127  hsa_status_t (*hsa_ext_program_finalize_fn)
128    (hsa_ext_program_t program,hsa_isa_t isa,
129     int32_t call_convention, hsa_ext_control_directives_t control_directives,
130     const char *options, hsa_code_object_type_t code_object_type,
131     hsa_code_object_t *code_object);
132};
133
134/* HSA runtime functions that are initialized in init_hsa_context.  */
135
136static struct hsa_runtime_fn_info hsa_fns;
137
138/* Keep the following GOMP prefixed structures in sync with respective parts of
139   the compiler.  */
140
141/* Structure describing the run-time and grid properties of an HSA kernel
142   lauch.  */
143
144struct GOMP_kernel_launch_attributes
145{
146  /* Number of dimensions the workload has.  Maximum number is 3.  */
147  uint32_t ndim;
148  /* Size of the grid in the three respective dimensions.  */
149  uint32_t gdims[3];
150  /* Size of work-groups in the respective dimensions.  */
151  uint32_t wdims[3];
152};
153
154/* Collection of information needed for a dispatch of a kernel from a
155   kernel.  */
156
157struct GOMP_hsa_kernel_dispatch
158{
159  /* Pointer to a command queue associated with a kernel dispatch agent.  */
160  void *queue;
161  /* Pointer to reserved memory for OMP data struct copying.  */
162  void *omp_data_memory;
163  /* Pointer to a memory space used for kernel arguments passing.  */
164  void *kernarg_address;
165  /* Kernel object.  */
166  uint64_t object;
167  /* Synchronization signal used for dispatch synchronization.  */
168  uint64_t signal;
169  /* Private segment size.  */
170  uint32_t private_segment_size;
171  /* Group segment size.  */
172  uint32_t group_segment_size;
173  /* Number of children kernel dispatches.  */
174  uint64_t kernel_dispatch_count;
175  /* Debug purpose argument.  */
176  uint64_t debug;
177  /* Levels-var ICV.  */
178  uint64_t omp_level;
179  /* Kernel dispatch structures created for children kernel dispatches.  */
180  struct GOMP_hsa_kernel_dispatch **children_dispatches;
181  /* Number of threads.  */
182  uint32_t omp_num_threads;
183};
184
185/* Part of the libgomp plugin interface.  Return the name of the accelerator,
186   which is "hsa".  */
187
188const char *
189GOMP_OFFLOAD_get_name (void)
190{
191  return "hsa";
192}
193
194/* Part of the libgomp plugin interface.  Return the specific capabilities the
195   HSA accelerator have.  */
196
197unsigned int
198GOMP_OFFLOAD_get_caps (void)
199{
200  return GOMP_OFFLOAD_CAP_SHARED_MEM | GOMP_OFFLOAD_CAP_OPENMP_400;
201}
202
203/* Part of the libgomp plugin interface.  Identify as HSA accelerator.  */
204
205int
206GOMP_OFFLOAD_get_type (void)
207{
208  return OFFLOAD_TARGET_TYPE_HSA;
209}
210
211/* Return the libgomp version number we're compatible with.  There is
212   no requirement for cross-version compatibility.  */
213
214unsigned
215GOMP_OFFLOAD_version (void)
216{
217  return GOMP_VERSION;
218}
219
220/* Flag to decide whether print to stderr information about what is going on.
221   Set in init_debug depending on environment variables.  */
222
223static bool debug;
224
225/* Flag to decide if the runtime should suppress a possible fallback to host
226   execution.  */
227
228static bool suppress_host_fallback;
229
230/* Flag to locate HSA runtime shared library that is dlopened
231   by this plug-in.  */
232
233static const char *hsa_runtime_lib;
234
235/* Flag to decide if the runtime should support also CPU devices (can be
236   a simulator).  */
237
238static bool support_cpu_devices;
239
240/* Initialize debug and suppress_host_fallback according to the environment.  */
241
242static void
243init_enviroment_variables (void)
244{
245  if (secure_getenv ("HSA_DEBUG"))
246    debug = true;
247  else
248    debug = false;
249
250  if (secure_getenv ("HSA_SUPPRESS_HOST_FALLBACK"))
251    suppress_host_fallback = true;
252  else
253    suppress_host_fallback = false;
254
255  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
256  if (hsa_runtime_lib == NULL)
257    hsa_runtime_lib = "libhsa-runtime64.so";
258
259  support_cpu_devices = secure_getenv ("HSA_SUPPORT_CPU_DEVICES");
260}
261
262/* Print a logging message with PREFIX to stderr if HSA_DEBUG value
263   is set to true.  */
264
265#define HSA_LOG(prefix, ...) \
266  do \
267  { \
268    if (debug) \
269      { \
270	fprintf (stderr, prefix); \
271	fprintf (stderr, __VA_ARGS__); \
272      } \
273  } \
274  while (false)
275
276/* Print a debugging message to stderr.  */
277
278#define HSA_DEBUG(...) HSA_LOG ("HSA debug: ", __VA_ARGS__)
279
280/* Print a warning message to stderr.  */
281
282#define HSA_WARNING(...) HSA_LOG ("HSA warning: ", __VA_ARGS__)
283
284/* Print HSA warning STR with an HSA STATUS code.  */
285
286static void
287hsa_warn (const char *str, hsa_status_t status)
288{
289  if (!debug)
290    return;
291
292  const char *hsa_error_msg = "[unknown]";
293  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
294
295  fprintf (stderr, "HSA warning: %s\nRuntime message: %s", str, hsa_error_msg);
296}
297
298/* Report a fatal error STR together with the HSA error corresponding to STATUS
299   and terminate execution of the current process.  */
300
301static void
302hsa_fatal (const char *str, hsa_status_t status)
303{
304  const char *hsa_error_msg = "[unknown]";
305  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
306  GOMP_PLUGIN_fatal ("HSA fatal error: %s\nRuntime message: %s", str,
307		     hsa_error_msg);
308}
309
310/* Like hsa_fatal, except only report error message, and return FALSE
311   for propagating error processing to outside of plugin.  */
312
313static bool
314hsa_error (const char *str, hsa_status_t status)
315{
316  const char *hsa_error_msg = "[unknown]";
317  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
318  GOMP_PLUGIN_error ("HSA fatal error: %s\nRuntime message: %s", str,
319		     hsa_error_msg);
320  return false;
321}
322
323struct hsa_kernel_description
324{
325  const char *name;
326  unsigned omp_data_size;
327  bool gridified_kernel_p;
328  unsigned kernel_dependencies_count;
329  const char **kernel_dependencies;
330};
331
332struct global_var_info
333{
334  const char *name;
335  void *address;
336};
337
338/* Data passed by the static initializer of a compilation unit containing BRIG
339   to GOMP_offload_register.  */
340
341struct brig_image_desc
342{
343  hsa_ext_module_t brig_module;
344  const unsigned kernel_count;
345  struct hsa_kernel_description *kernel_infos;
346  const unsigned global_variable_count;
347  struct global_var_info *global_variables;
348};
349
350struct agent_info;
351
352/* Information required to identify, finalize and run any given kernel.  */
353
354struct kernel_info
355{
356  /* Name of the kernel, required to locate it within the brig module.  */
357  const char *name;
358  /* Size of memory space for OMP data.  */
359  unsigned omp_data_size;
360  /* The specific agent the kernel has been or will be finalized for and run
361     on.  */
362  struct agent_info *agent;
363  /* The specific module where the kernel takes place.  */
364  struct module_info *module;
365  /* Mutex enforcing that at most once thread ever initializes a kernel for
366     use.  A thread should have locked agent->modules_rwlock for reading before
367     acquiring it.  */
368  pthread_mutex_t init_mutex;
369  /* Flag indicating whether the kernel has been initialized and all fields
370     below it contain valid data.  */
371  bool initialized;
372  /* Flag indicating that the kernel has a problem that blocks an execution.  */
373  bool initialization_failed;
374  /* The object to be put into the dispatch queue.  */
375  uint64_t object;
376  /* Required size of kernel arguments.  */
377  uint32_t kernarg_segment_size;
378  /* Required size of group segment.  */
379  uint32_t group_segment_size;
380  /* Required size of private segment.  */
381  uint32_t private_segment_size;
382  /* List of all kernel dependencies.  */
383  const char **dependencies;
384  /* Number of dependencies.  */
385  unsigned dependencies_count;
386  /* Maximum OMP data size necessary for kernel from kernel dispatches.  */
387  unsigned max_omp_data_size;
388  /* True if the kernel is gridified.  */
389  bool gridified_kernel_p;
390};
391
392/* Information about a particular brig module, its image and kernels.  */
393
394struct module_info
395{
396  /* The next and previous module in the linked list of modules of an agent.  */
397  struct module_info *next, *prev;
398  /* The description with which the program has registered the image.  */
399  struct brig_image_desc *image_desc;
400
401  /* Number of kernels in this module.  */
402  int kernel_count;
403  /* An array of kernel_info structures describing each kernel in this
404     module.  */
405  struct kernel_info kernels[];
406};
407
408/* Information about shared brig library.  */
409
410struct brig_library_info
411{
412  char *file_name;
413  hsa_ext_module_t image;
414};
415
416/* Description of an HSA GPU agent and the program associated with it.  */
417
418struct agent_info
419{
420  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
421  hsa_agent_t id;
422  /* Whether the agent has been initialized.  The fields below are usable only
423     if it has been.  */
424  bool initialized;
425  /* The HSA ISA of this agent.  */
426  hsa_isa_t isa;
427  /* Command queue of the agent.  */
428  hsa_queue_t *command_q;
429  /* Kernel from kernel dispatch command queue.  */
430  hsa_queue_t *kernel_dispatch_command_q;
431  /* The HSA memory region from which to allocate kernel arguments.  */
432  hsa_region_t kernarg_region;
433
434  /* Read-write lock that protects kernels which are running or about to be run
435     from interference with loading and unloading of images.  Needs to be
436     locked for reading while a kernel is being run, and for writing if the
437     list of modules is manipulated (and thus the HSA program invalidated).  */
438  pthread_rwlock_t modules_rwlock;
439  /* The first module in a linked list of modules associated with this
440     kernel.  */
441  struct module_info *first_module;
442
443  /* Mutex enforcing that only one thread will finalize the HSA program.  A
444     thread should have locked agent->modules_rwlock for reading before
445     acquiring it.  */
446  pthread_mutex_t prog_mutex;
447  /* Flag whether the HSA program that consists of all the modules has been
448     finalized.  */
449  bool prog_finalized;
450  /* Flag whether the program was finalized but with a failure.  */
451  bool prog_finalized_error;
452  /* HSA executable - the finalized program that is used to locate kernels.  */
453  hsa_executable_t executable;
454  /* List of BRIG libraries.  */
455  struct brig_library_info **brig_libraries;
456  /* Number of loaded shared BRIG libraries.  */
457  unsigned brig_libraries_count;
458};
459
460/* Information about the whole HSA environment and all of its agents.  */
461
462struct hsa_context_info
463{
464  /* Whether the structure has been initialized.  */
465  bool initialized;
466  /* Number of usable GPU HSA agents in the system.  */
467  int agent_count;
468  /* Array of agent_info structures describing the individual HSA agents.  */
469  struct agent_info *agents;
470};
471
472/* Information about the whole HSA environment and all of its agents.  */
473
474static struct hsa_context_info hsa_context;
475
476#define DLSYM_FN(function) \
477  hsa_fns.function##_fn = dlsym (handle, #function); \
478  if (hsa_fns.function##_fn == NULL) \
479    goto dl_fail;
480
481static bool
482init_hsa_runtime_functions (void)
483{
484  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
485  if (handle == NULL)
486    goto dl_fail;
487
488  DLSYM_FN (hsa_status_string)
489  DLSYM_FN (hsa_agent_get_info)
490  DLSYM_FN (hsa_init)
491  DLSYM_FN (hsa_iterate_agents)
492  DLSYM_FN (hsa_region_get_info)
493  DLSYM_FN (hsa_queue_create)
494  DLSYM_FN (hsa_agent_iterate_regions)
495  DLSYM_FN (hsa_executable_destroy)
496  DLSYM_FN (hsa_executable_create)
497  DLSYM_FN (hsa_executable_global_variable_define)
498  DLSYM_FN (hsa_executable_load_code_object)
499  DLSYM_FN (hsa_executable_freeze)
500  DLSYM_FN (hsa_signal_create)
501  DLSYM_FN (hsa_memory_allocate)
502  DLSYM_FN (hsa_memory_free)
503  DLSYM_FN (hsa_signal_destroy)
504  DLSYM_FN (hsa_executable_get_symbol)
505  DLSYM_FN (hsa_executable_symbol_get_info)
506  DLSYM_FN (hsa_queue_add_write_index_release)
507  DLSYM_FN (hsa_queue_load_read_index_acquire)
508  DLSYM_FN (hsa_signal_wait_acquire)
509  DLSYM_FN (hsa_signal_store_relaxed)
510  DLSYM_FN (hsa_signal_store_release)
511  DLSYM_FN (hsa_signal_load_acquire)
512  DLSYM_FN (hsa_queue_destroy)
513  DLSYM_FN (hsa_ext_program_add_module)
514  DLSYM_FN (hsa_ext_program_create)
515  DLSYM_FN (hsa_ext_program_destroy)
516  DLSYM_FN (hsa_ext_program_finalize)
517  return true;
518
519 dl_fail:
520  HSA_DEBUG ("while loading %s: %s\n", hsa_runtime_lib, dlerror ());
521  return false;
522}
523
524/* Find kernel for an AGENT by name provided in KERNEL_NAME.  */
525
526static struct kernel_info *
527get_kernel_for_agent (struct agent_info *agent, const char *kernel_name)
528{
529  struct module_info *module = agent->first_module;
530
531  while (module)
532    {
533      for (unsigned i = 0; i < module->kernel_count; i++)
534	if (strcmp (module->kernels[i].name, kernel_name) == 0)
535	  return &module->kernels[i];
536
537      module = module->next;
538    }
539
540  return NULL;
541}
542
543/* Return true if the agent is a GPU and acceptable of concurrent submissions
544   from different threads.  */
545
546static bool
547suitable_hsa_agent_p (hsa_agent_t agent)
548{
549  hsa_device_type_t device_type;
550  hsa_status_t status
551    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
552				     &device_type);
553  if (status != HSA_STATUS_SUCCESS)
554    return false;
555
556  switch (device_type)
557    {
558    case HSA_DEVICE_TYPE_GPU:
559      break;
560    case HSA_DEVICE_TYPE_CPU:
561      if (!support_cpu_devices)
562	return false;
563      break;
564    default:
565      return false;
566    }
567
568  uint32_t features = 0;
569  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
570					  &features);
571  if (status != HSA_STATUS_SUCCESS
572      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
573    return false;
574  hsa_queue_type_t queue_type;
575  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
576					  &queue_type);
577  if (status != HSA_STATUS_SUCCESS
578      || (queue_type != HSA_QUEUE_TYPE_MULTI))
579    return false;
580
581  return true;
582}
583
584/* Callback of hsa_iterate_agents, if AGENT is a GPU device, increment
585   agent_count in hsa_context.  */
586
587static hsa_status_t
588count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
589{
590  if (suitable_hsa_agent_p (agent))
591    hsa_context.agent_count++;
592  return HSA_STATUS_SUCCESS;
593}
594
595/* Callback of hsa_iterate_agents, if AGENT is a GPU device, assign the agent
596   id to the describing structure in the hsa context.  The index of the
597   structure is pointed to by DATA, increment it afterwards.  */
598
599static hsa_status_t
600assign_agent_ids (hsa_agent_t agent, void *data)
601{
602  if (suitable_hsa_agent_p (agent))
603    {
604      int *agent_index = (int *) data;
605      hsa_context.agents[*agent_index].id = agent;
606      ++*agent_index;
607    }
608  return HSA_STATUS_SUCCESS;
609}
610
611/* Initialize hsa_context if it has not already been done.
612   Return TRUE on success.  */
613
614static bool
615init_hsa_context (void)
616{
617  hsa_status_t status;
618  int agent_index = 0;
619
620  if (hsa_context.initialized)
621    return true;
622  init_enviroment_variables ();
623  if (!init_hsa_runtime_functions ())
624    {
625      HSA_DEBUG ("Run-time could not be dynamically opened\n");
626      return false;
627    }
628  status = hsa_fns.hsa_init_fn ();
629  if (status != HSA_STATUS_SUCCESS)
630    return hsa_error ("Run-time could not be initialized", status);
631  HSA_DEBUG ("HSA run-time initialized\n");
632  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
633  if (status != HSA_STATUS_SUCCESS)
634    return hsa_error ("HSA GPU devices could not be enumerated", status);
635  HSA_DEBUG ("There are %i HSA GPU devices.\n", hsa_context.agent_count);
636
637  hsa_context.agents
638    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
639				  * sizeof (struct agent_info));
640  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
641  if (agent_index != hsa_context.agent_count)
642    {
643      GOMP_PLUGIN_error ("Failed to assign IDs to all HSA agents");
644      return false;
645    }
646  hsa_context.initialized = true;
647  return true;
648}
649
650/* Callback of dispatch queues to report errors.  */
651
652static void
653queue_callback (hsa_status_t status,
654		hsa_queue_t *queue __attribute__ ((unused)),
655		void *data __attribute__ ((unused)))
656{
657  hsa_fatal ("Asynchronous queue error", status);
658}
659
660/* Callback of hsa_agent_iterate_regions.  Determine if a memory REGION can be
661   used for kernarg allocations and if so write it to the memory pointed to by
662   DATA and break the query.  */
663
664static hsa_status_t
665get_kernarg_memory_region (hsa_region_t region, void *data)
666{
667  hsa_status_t status;
668  hsa_region_segment_t segment;
669
670  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
671					   &segment);
672  if (status != HSA_STATUS_SUCCESS)
673    return status;
674  if (segment != HSA_REGION_SEGMENT_GLOBAL)
675    return HSA_STATUS_SUCCESS;
676
677  uint32_t flags;
678  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
679					   &flags);
680  if (status != HSA_STATUS_SUCCESS)
681    return status;
682  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
683    {
684      hsa_region_t *ret = (hsa_region_t *) data;
685      *ret = region;
686      return HSA_STATUS_INFO_BREAK;
687    }
688  return HSA_STATUS_SUCCESS;
689}
690
691/* Part of the libgomp plugin interface.  Return the number of HSA devices on
692   the system.  */
693
694int
695GOMP_OFFLOAD_get_num_devices (void)
696{
697  if (!init_hsa_context ())
698    return 0;
699  return hsa_context.agent_count;
700}
701
702/* Part of the libgomp plugin interface.  Initialize agent number N so that it
703   can be used for computation.  Return TRUE on success.  */
704
705bool
706GOMP_OFFLOAD_init_device (int n)
707{
708  if (!init_hsa_context ())
709    return false;
710  if (n >= hsa_context.agent_count)
711    {
712      GOMP_PLUGIN_error ("Request to initialize non-existing HSA device %i", n);
713      return false;
714    }
715  struct agent_info *agent = &hsa_context.agents[n];
716
717  if (agent->initialized)
718    return true;
719
720  if (pthread_rwlock_init (&agent->modules_rwlock, NULL))
721    {
722      GOMP_PLUGIN_error ("Failed to initialize an HSA agent rwlock");
723      return false;
724    }
725  if (pthread_mutex_init (&agent->prog_mutex, NULL))
726    {
727      GOMP_PLUGIN_error ("Failed to initialize an HSA agent program mutex");
728      return false;
729    }
730
731  uint32_t queue_size;
732  hsa_status_t status;
733  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
734					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
735					  &queue_size);
736  if (status != HSA_STATUS_SUCCESS)
737    return hsa_error ("Error requesting maximum queue size of the HSA agent",
738    	   	      status);
739  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_ISA,
740					  &agent->isa);
741  if (status != HSA_STATUS_SUCCESS)
742    return hsa_error ("Error querying the ISA of the agent", status);
743  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
744					HSA_QUEUE_TYPE_MULTI,
745					queue_callback, NULL, UINT32_MAX,
746					UINT32_MAX,
747					&agent->command_q);
748  if (status != HSA_STATUS_SUCCESS)
749    return hsa_error ("Error creating command queue", status);
750
751  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
752					HSA_QUEUE_TYPE_MULTI,
753					queue_callback, NULL, UINT32_MAX,
754					UINT32_MAX,
755					&agent->kernel_dispatch_command_q);
756  if (status != HSA_STATUS_SUCCESS)
757    return hsa_error ("Error creating kernel dispatch command queue", status);
758
759  agent->kernarg_region.handle = (uint64_t) -1;
760  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
761						 get_kernarg_memory_region,
762						 &agent->kernarg_region);
763  if (agent->kernarg_region.handle == (uint64_t) -1)
764    {
765      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
766			 "arguments");
767      return false;
768    }
769  HSA_DEBUG ("HSA agent initialized, queue has id %llu\n",
770	     (long long unsigned) agent->command_q->id);
771  HSA_DEBUG ("HSA agent initialized, kernel dispatch queue has id %llu\n",
772	     (long long unsigned) agent->kernel_dispatch_command_q->id);
773  agent->initialized = true;
774  return true;
775}
776
777/* Verify that hsa_context has already been initialized and return the
778   agent_info structure describing device number N.  Return NULL on error.  */
779
780static struct agent_info *
781get_agent_info (int n)
782{
783  if (!hsa_context.initialized)
784    {
785      GOMP_PLUGIN_error ("Attempt to use uninitialized HSA context.");
786      return NULL;
787    }
788  if (n >= hsa_context.agent_count)
789    {
790      GOMP_PLUGIN_error ("Request to operate on anon-existing HSA device %i", n);
791      return NULL;
792    }
793  if (!hsa_context.agents[n].initialized)
794    {
795      GOMP_PLUGIN_error ("Attempt to use an uninitialized HSA agent.");
796      return NULL;
797    }
798  return &hsa_context.agents[n];
799}
800
801/* Insert MODULE to the linked list of modules of AGENT.  */
802
803static void
804add_module_to_agent (struct agent_info *agent, struct module_info *module)
805{
806  if (agent->first_module)
807    agent->first_module->prev = module;
808  module->next = agent->first_module;
809  module->prev = NULL;
810  agent->first_module = module;
811}
812
813/* Remove MODULE from the linked list of modules of AGENT.  */
814
815static void
816remove_module_from_agent (struct agent_info *agent, struct module_info *module)
817{
818  if (agent->first_module == module)
819    agent->first_module = module->next;
820  if (module->prev)
821    module->prev->next = module->next;
822  if (module->next)
823    module->next->prev = module->prev;
824}
825
826/* Free the HSA program in agent and everything associated with it and set
827   agent->prog_finalized and the initialized flags of all kernels to false.
828   Return TRUE on success.  */
829
830static bool
831destroy_hsa_program (struct agent_info *agent)
832{
833  if (!agent->prog_finalized || agent->prog_finalized_error)
834    return true;
835
836  hsa_status_t status;
837
838  HSA_DEBUG ("Destroying the current HSA program.\n");
839
840  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
841  if (status != HSA_STATUS_SUCCESS)
842    return hsa_error ("Could not destroy HSA executable", status);
843
844  struct module_info *module;
845  for (module = agent->first_module; module; module = module->next)
846    {
847      int i;
848      for (i = 0; i < module->kernel_count; i++)
849	module->kernels[i].initialized = false;
850    }
851  agent->prog_finalized = false;
852  return true;
853}
854
855/* Initialize KERNEL from D and other parameters.  Return true on success. */
856
857static bool
858init_basic_kernel_info (struct kernel_info *kernel,
859			struct hsa_kernel_description *d,
860			struct agent_info *agent,
861			struct module_info *module)
862{
863  kernel->agent = agent;
864  kernel->module = module;
865  kernel->name = d->name;
866  kernel->omp_data_size = d->omp_data_size;
867  kernel->gridified_kernel_p = d->gridified_kernel_p;
868  kernel->dependencies_count = d->kernel_dependencies_count;
869  kernel->dependencies = d->kernel_dependencies;
870  if (pthread_mutex_init (&kernel->init_mutex, NULL))
871    {
872      GOMP_PLUGIN_error ("Failed to initialize an HSA kernel mutex");
873      return false;
874    }
875  return true;
876}
877
878/* Part of the libgomp plugin interface.  Load BRIG module described by struct
879   brig_image_desc in TARGET_DATA and return references to kernel descriptors
880   in TARGET_TABLE.  */
881
882int
883GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
884			 struct addr_pair **target_table)
885{
886  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
887    {
888      GOMP_PLUGIN_error ("Offload data incompatible with HSA plugin"
889			 " (expected %u, received %u)",
890			 GOMP_VERSION_HSA, GOMP_VERSION_DEV (version));
891      return -1;
892    }
893
894  struct brig_image_desc *image_desc = (struct brig_image_desc *) target_data;
895  struct agent_info *agent;
896  struct addr_pair *pair;
897  struct module_info *module;
898  struct kernel_info *kernel;
899  int kernel_count = image_desc->kernel_count;
900
901  agent = get_agent_info (ord);
902  if (!agent)
903    return -1;
904
905  if (pthread_rwlock_wrlock (&agent->modules_rwlock))
906    {
907      GOMP_PLUGIN_error ("Unable to write-lock an HSA agent rwlock");
908      return -1;
909    }
910  if (agent->prog_finalized
911      && !destroy_hsa_program (agent))
912    return -1;
913
914  HSA_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
915  pair = GOMP_PLUGIN_malloc (kernel_count * sizeof (struct addr_pair));
916  *target_table = pair;
917  module = (struct module_info *)
918    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
919				+ kernel_count * sizeof (struct kernel_info));
920  module->image_desc = image_desc;
921  module->kernel_count = kernel_count;
922
923  kernel = &module->kernels[0];
924
925  /* Allocate memory for kernel dependencies.  */
926  for (unsigned i = 0; i < kernel_count; i++)
927    {
928      pair->start = (uintptr_t) kernel;
929      pair->end = (uintptr_t) (kernel + 1);
930
931      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
932      if (!init_basic_kernel_info (kernel, d, agent, module))
933	return -1;
934      kernel++;
935      pair++;
936    }
937
938  add_module_to_agent (agent, module);
939  if (pthread_rwlock_unlock (&agent->modules_rwlock))
940    {
941      GOMP_PLUGIN_error ("Unable to unlock an HSA agent rwlock");
942      return -1;
943    }
944  return kernel_count;
945}
946
947/* Add a shared BRIG library from a FILE_NAME to an AGENT.  */
948
949static struct brig_library_info *
950add_shared_library (const char *file_name, struct agent_info *agent)
951{
952  struct brig_library_info *library = NULL;
953
954  void *f = dlopen (file_name, RTLD_NOW);
955  void *start = dlsym (f, "__brig_start");
956  void *end = dlsym (f, "__brig_end");
957
958  if (start == NULL || end == NULL)
959    return NULL;
960
961  unsigned size = end - start;
962  char *buf = (char *) GOMP_PLUGIN_malloc (size);
963  memcpy (buf, start, size);
964
965  library = GOMP_PLUGIN_malloc (sizeof (struct agent_info));
966  library->file_name = (char *) GOMP_PLUGIN_malloc
967    ((strlen (file_name) + 1));
968  strcpy (library->file_name, file_name);
969  library->image = (hsa_ext_module_t) buf;
970
971  return library;
972}
973
974/* Release memory used for BRIG shared libraries that correspond
975   to an AGENT.  */
976
977static void
978release_agent_shared_libraries (struct agent_info *agent)
979{
980  for (unsigned i = 0; i < agent->brig_libraries_count; i++)
981    if (agent->brig_libraries[i])
982      {
983	free (agent->brig_libraries[i]->file_name);
984	free (agent->brig_libraries[i]->image);
985	free (agent->brig_libraries[i]);
986      }
987
988  free (agent->brig_libraries);
989}
990
991/* Create and finalize the program consisting of all loaded modules.  */
992
993static void
994create_and_finalize_hsa_program (struct agent_info *agent)
995{
996  hsa_status_t status;
997  hsa_ext_program_t prog_handle;
998  int mi = 0;
999
1000  if (pthread_mutex_lock (&agent->prog_mutex))
1001    GOMP_PLUGIN_fatal ("Could not lock an HSA agent program mutex");
1002  if (agent->prog_finalized)
1003    goto final;
1004
1005  status = hsa_fns.hsa_ext_program_create_fn
1006    (HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL,
1007     HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
1008     NULL, &prog_handle);
1009  if (status != HSA_STATUS_SUCCESS)
1010    hsa_fatal ("Could not create an HSA program", status);
1011
1012  HSA_DEBUG ("Created a finalized program\n");
1013
1014  struct module_info *module = agent->first_module;
1015  while (module)
1016    {
1017      status = hsa_fns.hsa_ext_program_add_module_fn
1018	(prog_handle, module->image_desc->brig_module);
1019      if (status != HSA_STATUS_SUCCESS)
1020	hsa_fatal ("Could not add a module to the HSA program", status);
1021      module = module->next;
1022      mi++;
1023    }
1024
1025  /* Load all shared libraries.  */
1026  const char *libraries[] = { "libhsamath.so", "libhsastd.so" };
1027  const unsigned libraries_count = sizeof (libraries) / sizeof (const char *);
1028
1029  agent->brig_libraries_count = libraries_count;
1030  agent->brig_libraries = GOMP_PLUGIN_malloc_cleared
1031    (sizeof (struct brig_library_info) * libraries_count);
1032
1033  for (unsigned i = 0; i < libraries_count; i++)
1034    {
1035      struct brig_library_info *library = add_shared_library (libraries[i],
1036							      agent);
1037      if (library == NULL)
1038	{
1039	  HSA_WARNING ("Could not open a shared BRIG library: %s\n",
1040		       libraries[i]);
1041	  continue;
1042	}
1043
1044      status = hsa_fns.hsa_ext_program_add_module_fn (prog_handle,
1045						      library->image);
1046      if (status != HSA_STATUS_SUCCESS)
1047	hsa_warn ("Could not add a shared BRIG library the HSA program",
1048		  status);
1049      else
1050	HSA_DEBUG ("a shared BRIG library has been added to a program: %s\n",
1051		   libraries[i]);
1052    }
1053
1054  hsa_ext_control_directives_t control_directives;
1055  memset (&control_directives, 0, sizeof (control_directives));
1056  hsa_code_object_t code_object;
1057  status = hsa_fns.hsa_ext_program_finalize_fn
1058    (prog_handle, agent->isa,HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO,
1059     control_directives, "", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object);
1060  if (status != HSA_STATUS_SUCCESS)
1061    {
1062      hsa_warn ("Finalization of the HSA program failed", status);
1063      goto failure;
1064    }
1065
1066  HSA_DEBUG ("Finalization done\n");
1067  hsa_fns.hsa_ext_program_destroy_fn (prog_handle);
1068
1069  status
1070    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
1071					HSA_EXECUTABLE_STATE_UNFROZEN,
1072					"", &agent->executable);
1073  if (status != HSA_STATUS_SUCCESS)
1074    hsa_fatal ("Could not create HSA executable", status);
1075
1076  module = agent->first_module;
1077  while (module)
1078    {
1079      /* Initialize all global variables declared in the module.  */
1080      for (unsigned i = 0; i < module->image_desc->global_variable_count; i++)
1081	{
1082	  struct global_var_info *var;
1083	  var = &module->image_desc->global_variables[i];
1084	  status = hsa_fns.hsa_executable_global_variable_define_fn
1085	    (agent->executable, var->name, var->address);
1086
1087	  HSA_DEBUG ("Defining global variable: %s, address: %p\n", var->name,
1088		     var->address);
1089
1090	  if (status != HSA_STATUS_SUCCESS)
1091	    hsa_fatal ("Could not define a global variable in the HSA program",
1092		       status);
1093	}
1094
1095      module = module->next;
1096    }
1097
1098  status = hsa_fns.hsa_executable_load_code_object_fn (agent->executable,
1099						       agent->id,
1100						       code_object, "");
1101  if (status != HSA_STATUS_SUCCESS)
1102    hsa_fatal ("Could not add a code object to the HSA executable", status);
1103  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
1104  if (status != HSA_STATUS_SUCCESS)
1105    hsa_fatal ("Could not freeze the HSA executable", status);
1106
1107  HSA_DEBUG ("Froze HSA executable with the finalized code object\n");
1108
1109  /* If all goes good, jump to final.  */
1110  goto final;
1111
1112failure:
1113  agent->prog_finalized_error = true;
1114
1115final:
1116  agent->prog_finalized = true;
1117
1118  if (pthread_mutex_unlock (&agent->prog_mutex))
1119    GOMP_PLUGIN_fatal ("Could not unlock an HSA agent program mutex");
1120}
1121
1122/* Create kernel dispatch data structure for given KERNEL.  */
1123
1124static struct GOMP_hsa_kernel_dispatch *
1125create_single_kernel_dispatch (struct kernel_info *kernel,
1126			       unsigned omp_data_size)
1127{
1128  struct agent_info *agent = kernel->agent;
1129  struct GOMP_hsa_kernel_dispatch *shadow
1130    = GOMP_PLUGIN_malloc_cleared (sizeof (struct GOMP_hsa_kernel_dispatch));
1131
1132  shadow->queue = agent->command_q;
1133  shadow->omp_data_memory
1134    = omp_data_size > 0 ? GOMP_PLUGIN_malloc (omp_data_size) : NULL;
1135  unsigned dispatch_count = kernel->dependencies_count;
1136  shadow->kernel_dispatch_count = dispatch_count;
1137
1138  shadow->children_dispatches
1139    = GOMP_PLUGIN_malloc (dispatch_count * sizeof (shadow));
1140
1141  shadow->object = kernel->object;
1142
1143  hsa_signal_t sync_signal;
1144  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1145  if (status != HSA_STATUS_SUCCESS)
1146    hsa_fatal ("Error creating the HSA sync signal", status);
1147
1148  shadow->signal = sync_signal.handle;
1149  shadow->private_segment_size = kernel->private_segment_size;
1150  shadow->group_segment_size = kernel->group_segment_size;
1151
1152  status
1153    = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1154				      kernel->kernarg_segment_size,
1155				      &shadow->kernarg_address);
1156  if (status != HSA_STATUS_SUCCESS)
1157    hsa_fatal ("Could not allocate memory for HSA kernel arguments", status);
1158
1159  return shadow;
1160}
1161
1162/* Release data structure created for a kernel dispatch in SHADOW argument.  */
1163
1164static void
1165release_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *shadow)
1166{
1167  HSA_DEBUG ("Released kernel dispatch: %p has value: %" PRIu64 " (%p)\n",
1168	     shadow, (print_uint64_t) shadow->debug,
1169	     (void *) (uintptr_t) shadow->debug);
1170
1171  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1172
1173  hsa_signal_t s;
1174  s.handle = shadow->signal;
1175  hsa_fns.hsa_signal_destroy_fn (s);
1176
1177  free (shadow->omp_data_memory);
1178
1179  for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++)
1180    release_kernel_dispatch (shadow->children_dispatches[i]);
1181
1182  free (shadow->children_dispatches);
1183  free (shadow);
1184}
1185
1186/* Initialize a KERNEL without its dependencies.  MAX_OMP_DATA_SIZE is used
1187   to calculate maximum necessary memory for OMP data allocation.  */
1188
1189static void
1190init_single_kernel (struct kernel_info *kernel, unsigned *max_omp_data_size)
1191{
1192  hsa_status_t status;
1193  struct agent_info *agent = kernel->agent;
1194  hsa_executable_symbol_t kernel_symbol;
1195  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
1196						 kernel->name, agent->id,
1197						 0, &kernel_symbol);
1198  if (status != HSA_STATUS_SUCCESS)
1199    {
1200      hsa_warn ("Could not find symbol for kernel in the code object", status);
1201      goto failure;
1202    }
1203  HSA_DEBUG ("Located kernel %s\n", kernel->name);
1204  status = hsa_fns.hsa_executable_symbol_get_info_fn
1205    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
1206  if (status != HSA_STATUS_SUCCESS)
1207    hsa_fatal ("Could not extract a kernel object from its symbol", status);
1208  status = hsa_fns.hsa_executable_symbol_get_info_fn
1209    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1210     &kernel->kernarg_segment_size);
1211  if (status != HSA_STATUS_SUCCESS)
1212    hsa_fatal ("Could not get info about kernel argument size", status);
1213  status = hsa_fns.hsa_executable_symbol_get_info_fn
1214    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
1215     &kernel->group_segment_size);
1216  if (status != HSA_STATUS_SUCCESS)
1217    hsa_fatal ("Could not get info about kernel group segment size", status);
1218  status = hsa_fns.hsa_executable_symbol_get_info_fn
1219    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
1220     &kernel->private_segment_size);
1221  if (status != HSA_STATUS_SUCCESS)
1222    hsa_fatal ("Could not get info about kernel private segment size",
1223	       status);
1224
1225  HSA_DEBUG ("Kernel structure for %s fully initialized with "
1226	     "following segment sizes: \n", kernel->name);
1227  HSA_DEBUG ("  group_segment_size: %u\n",
1228	     (unsigned) kernel->group_segment_size);
1229  HSA_DEBUG ("  private_segment_size: %u\n",
1230	     (unsigned) kernel->private_segment_size);
1231  HSA_DEBUG ("  kernarg_segment_size: %u\n",
1232	     (unsigned) kernel->kernarg_segment_size);
1233  HSA_DEBUG ("  omp_data_size: %u\n", kernel->omp_data_size);
1234  HSA_DEBUG ("  gridified_kernel_p: %u\n", kernel->gridified_kernel_p);
1235
1236  if (kernel->omp_data_size > *max_omp_data_size)
1237    *max_omp_data_size = kernel->omp_data_size;
1238
1239  for (unsigned i = 0; i < kernel->dependencies_count; i++)
1240    {
1241      struct kernel_info *dependency
1242	= get_kernel_for_agent (agent, kernel->dependencies[i]);
1243
1244      if (dependency == NULL)
1245	{
1246	  HSA_DEBUG ("Could not find a dependency for a kernel: %s, "
1247		     "dependency name: %s\n", kernel->name,
1248		     kernel->dependencies[i]);
1249	  goto failure;
1250	}
1251
1252      if (dependency->dependencies_count > 0)
1253	{
1254	  HSA_DEBUG ("HSA does not allow kernel dispatching code with "
1255		     "a depth bigger than one\n");
1256	  goto failure;
1257	}
1258
1259      init_single_kernel (dependency, max_omp_data_size);
1260    }
1261
1262  return;
1263
1264failure:
1265  kernel->initialization_failed = true;
1266}
1267
1268/* Indent stream F by INDENT spaces.  */
1269
1270static void
1271indent_stream (FILE *f, unsigned indent)
1272{
1273  fprintf (f, "%*s", indent, "");
1274}
1275
1276/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
1277
1278static void
1279print_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *dispatch, unsigned indent)
1280{
1281  indent_stream (stderr, indent);
1282  fprintf (stderr, "this: %p\n", dispatch);
1283  indent_stream (stderr, indent);
1284  fprintf (stderr, "queue: %p\n", dispatch->queue);
1285  indent_stream (stderr, indent);
1286  fprintf (stderr, "omp_data_memory: %p\n", dispatch->omp_data_memory);
1287  indent_stream (stderr, indent);
1288  fprintf (stderr, "kernarg_address: %p\n", dispatch->kernarg_address);
1289  indent_stream (stderr, indent);
1290  fprintf (stderr, "object: %" PRIu64 "\n", (print_uint64_t) dispatch->object);
1291  indent_stream (stderr, indent);
1292  fprintf (stderr, "signal: %" PRIu64 "\n", (print_uint64_t) dispatch->signal);
1293  indent_stream (stderr, indent);
1294  fprintf (stderr, "private_segment_size: %u\n",
1295	   dispatch->private_segment_size);
1296  indent_stream (stderr, indent);
1297  fprintf (stderr, "group_segment_size: %u\n",
1298	   dispatch->group_segment_size);
1299  indent_stream (stderr, indent);
1300  fprintf (stderr, "children dispatches: %" PRIu64 "\n",
1301	   (print_uint64_t) dispatch->kernel_dispatch_count);
1302  indent_stream (stderr, indent);
1303  fprintf (stderr, "omp_num_threads: %u\n",
1304	   dispatch->omp_num_threads);
1305  fprintf (stderr, "\n");
1306
1307  for (unsigned i = 0; i < dispatch->kernel_dispatch_count; i++)
1308    print_kernel_dispatch (dispatch->children_dispatches[i], indent + 2);
1309}
1310
1311/* Create kernel dispatch data structure for a KERNEL and all its
1312   dependencies.  */
1313
1314static struct GOMP_hsa_kernel_dispatch *
1315create_kernel_dispatch (struct kernel_info *kernel, unsigned omp_data_size)
1316{
1317  struct GOMP_hsa_kernel_dispatch *shadow
1318    = create_single_kernel_dispatch (kernel, omp_data_size);
1319  shadow->omp_num_threads = 64;
1320  shadow->debug = 0;
1321  shadow->omp_level = kernel->gridified_kernel_p ? 1 : 0;
1322
1323  /* Create kernel dispatch data structures.  We do not allow to have
1324     a kernel dispatch with depth bigger than one.  */
1325  for (unsigned i = 0; i < kernel->dependencies_count; i++)
1326    {
1327      struct kernel_info *dependency
1328	= get_kernel_for_agent (kernel->agent, kernel->dependencies[i]);
1329      shadow->children_dispatches[i]
1330	= create_single_kernel_dispatch (dependency, omp_data_size);
1331      shadow->children_dispatches[i]->queue
1332	= kernel->agent->kernel_dispatch_command_q;
1333      shadow->children_dispatches[i]->omp_level = 1;
1334    }
1335
1336  return shadow;
1337}
1338
1339/* Do all the work that is necessary before running KERNEL for the first time.
1340   The function assumes the program has been created, finalized and frozen by
1341   create_and_finalize_hsa_program.  */
1342
1343static void
1344init_kernel (struct kernel_info *kernel)
1345{
1346  if (pthread_mutex_lock (&kernel->init_mutex))
1347    GOMP_PLUGIN_fatal ("Could not lock an HSA kernel initialization mutex");
1348  if (kernel->initialized)
1349    {
1350      if (pthread_mutex_unlock (&kernel->init_mutex))
1351	GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
1352			   "mutex");
1353
1354      return;
1355    }
1356
1357  /* Precomputed maximum size of OMP data necessary for a kernel from kernel
1358     dispatch operation.  */
1359  init_single_kernel (kernel, &kernel->max_omp_data_size);
1360
1361  if (!kernel->initialization_failed)
1362    HSA_DEBUG ("\n");
1363
1364  kernel->initialized = true;
1365  if (pthread_mutex_unlock (&kernel->init_mutex))
1366    GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
1367		       "mutex");
1368}
1369
1370/* Parse the target attributes INPUT provided by the compiler and return true
1371   if we should run anything all.  If INPUT is NULL, fill DEF with default
1372   values, then store INPUT or DEF into *RESULT.  */
1373
1374static bool
1375parse_target_attributes (void **input,
1376			 struct GOMP_kernel_launch_attributes *def,
1377			 struct GOMP_kernel_launch_attributes **result)
1378{
1379  if (!input)
1380    GOMP_PLUGIN_fatal ("No target arguments provided");
1381
1382  bool attrs_found = false;
1383  while (*input)
1384    {
1385      uintptr_t id = (uintptr_t) *input;
1386      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_HSA
1387	  && ((id & GOMP_TARGET_ARG_ID_MASK)
1388	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1389	{
1390	  input++;
1391	  attrs_found = true;
1392	  break;
1393	}
1394
1395      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1396	input++;
1397      input++;
1398    }
1399
1400  if (!attrs_found)
1401    {
1402      def->ndim = 1;
1403      def->gdims[0] = 1;
1404      def->gdims[1] = 1;
1405      def->gdims[2] = 1;
1406      def->wdims[0] = 1;
1407      def->wdims[1] = 1;
1408      def->wdims[2] = 1;
1409      *result = def;
1410      HSA_DEBUG ("GOMP_OFFLOAD_run called with no launch attributes\n");
1411      return true;
1412    }
1413
1414  struct GOMP_kernel_launch_attributes *kla;
1415  kla = (struct GOMP_kernel_launch_attributes *) *input;
1416  *result = kla;
1417  if (kla->ndim == 0 || kla->ndim > 3)
1418    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1419
1420  HSA_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1421  unsigned i;
1422  for (i = 0; i < kla->ndim; i++)
1423    {
1424      HSA_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
1425		 kla->gdims[i], kla->wdims[i]);
1426      if (kla->gdims[i] == 0)
1427	return false;
1428    }
1429  return true;
1430}
1431
1432/* Return the group size given the requested GROUP size, GRID size and number
1433   of grid dimensions NDIM.  */
1434
1435static uint32_t
1436get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1437{
1438  if (group == 0)
1439    {
1440      /* TODO: Provide a default via environment or device characteristics.  */
1441      if (ndim == 1)
1442	group = 64;
1443      else if (ndim == 2)
1444	group = 8;
1445      else
1446	group = 4;
1447    }
1448
1449  if (group > grid)
1450    group = grid;
1451  return group;
1452}
1453
1454/* Return true if the HSA runtime can run function FN_PTR.  */
1455
1456bool
1457GOMP_OFFLOAD_can_run (void *fn_ptr)
1458{
1459  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
1460  struct agent_info *agent = kernel->agent;
1461  create_and_finalize_hsa_program (agent);
1462
1463  if (agent->prog_finalized_error)
1464    goto failure;
1465
1466  init_kernel (kernel);
1467  if (kernel->initialization_failed)
1468    goto failure;
1469
1470  return true;
1471
1472failure:
1473  if (suppress_host_fallback)
1474    GOMP_PLUGIN_fatal ("HSA host fallback has been suppressed");
1475  HSA_DEBUG ("HSA target cannot be launched, doing a host fallback\n");
1476  return false;
1477}
1478
1479/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
1480
1481void
1482packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1483{
1484  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1485}
1486
1487/* Run KERNEL on its agent, pass VARS to it as arguments and take
1488   launchattributes from KLA.  */
1489
1490void
1491run_kernel (struct kernel_info *kernel, void *vars,
1492	    struct GOMP_kernel_launch_attributes *kla)
1493{
1494  struct agent_info *agent = kernel->agent;
1495  if (pthread_rwlock_rdlock (&agent->modules_rwlock))
1496    GOMP_PLUGIN_fatal ("Unable to read-lock an HSA agent rwlock");
1497
1498  if (!agent->initialized)
1499    GOMP_PLUGIN_fatal ("Agent must be initialized");
1500
1501  if (!kernel->initialized)
1502    GOMP_PLUGIN_fatal ("Called kernel must be initialized");
1503
1504  struct GOMP_hsa_kernel_dispatch *shadow
1505    = create_kernel_dispatch (kernel, kernel->max_omp_data_size);
1506
1507  if (debug)
1508    {
1509      fprintf (stderr, "\nKernel has following dependencies:\n");
1510      print_kernel_dispatch (shadow, 2);
1511    }
1512
1513  uint64_t index
1514    = hsa_fns.hsa_queue_add_write_index_release_fn (agent->command_q, 1);
1515  HSA_DEBUG ("Got AQL index %llu\n", (long long int) index);
1516
1517  /* Wait until the queue is not full before writing the packet.   */
1518  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (agent->command_q)
1519	 >= agent->command_q->size)
1520    ;
1521
1522  hsa_kernel_dispatch_packet_t *packet;
1523  packet = ((hsa_kernel_dispatch_packet_t *) agent->command_q->base_address)
1524	   + index % agent->command_q->size;
1525
1526  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
1527  packet->grid_size_x = kla->gdims[0];
1528  packet->workgroup_size_x = get_group_size (kla->ndim, kla->gdims[0],
1529					     kla->wdims[0]);
1530
1531  if (kla->ndim >= 2)
1532    {
1533      packet->grid_size_y = kla->gdims[1];
1534      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
1535						 kla->wdims[1]);
1536    }
1537  else
1538    {
1539      packet->grid_size_y = 1;
1540      packet->workgroup_size_y = 1;
1541    }
1542
1543  if (kla->ndim == 3)
1544    {
1545      packet->grid_size_z = kla->gdims[2];
1546      packet->workgroup_size_z = get_group_size (kla->ndim, kla->gdims[2],
1547					     kla->wdims[2]);
1548    }
1549  else
1550    {
1551      packet->grid_size_z = 1;
1552      packet->workgroup_size_z = 1;
1553    }
1554
1555  packet->private_segment_size = kernel->private_segment_size;
1556  packet->group_segment_size = kernel->group_segment_size;
1557  packet->kernel_object = kernel->object;
1558  packet->kernarg_address = shadow->kernarg_address;
1559  hsa_signal_t s;
1560  s.handle = shadow->signal;
1561  packet->completion_signal = s;
1562  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
1563  memcpy (shadow->kernarg_address, &vars, sizeof (vars));
1564
1565  /* PR hsa/70337.  */
1566  size_t vars_size = sizeof (vars);
1567  if (kernel->kernarg_segment_size > vars_size)
1568    {
1569      if (kernel->kernarg_segment_size != vars_size
1570	  + sizeof (struct hsa_kernel_runtime *))
1571	GOMP_PLUGIN_fatal ("Kernel segment size has an unexpected value");
1572      memcpy (packet->kernarg_address + vars_size, &shadow,
1573	      sizeof (struct hsa_kernel_runtime *));
1574    }
1575
1576  HSA_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
1577
1578  uint16_t header;
1579  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
1580  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
1581  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
1582
1583  HSA_DEBUG ("Going to dispatch kernel %s\n", kernel->name);
1584
1585  packet_store_release ((uint32_t *) packet, header,
1586			(uint16_t) kla->ndim << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
1587
1588  hsa_fns.hsa_signal_store_release_fn (agent->command_q->doorbell_signal,
1589				       index);
1590
1591  /* TODO: GPU agents in Carrizo APUs cannot properly update L2 cache for
1592     signal wait and signal load operations on their own and we need to
1593     periodically call the hsa_signal_load_acquire on completion signals of
1594     children kernels in the CPU to make that happen.  As soon the
1595     limitation will be resolved, this workaround can be removed.  */
1596
1597  HSA_DEBUG ("Kernel dispatched, waiting for completion\n");
1598
1599  /* Root signal waits with 1ms timeout.  */
1600  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
1601					     1000 * 1000,
1602					     HSA_WAIT_STATE_BLOCKED) != 0)
1603    for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++)
1604      {
1605	hsa_signal_t child_s;
1606	child_s.handle = shadow->children_dispatches[i]->signal;
1607
1608	HSA_DEBUG ("Waiting for children completion signal: %" PRIu64 "\n",
1609		   (print_uint64_t) shadow->children_dispatches[i]->signal);
1610	hsa_fns.hsa_signal_load_acquire_fn (child_s);
1611      }
1612
1613  release_kernel_dispatch (shadow);
1614
1615  if (pthread_rwlock_unlock (&agent->modules_rwlock))
1616    GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
1617}
1618
1619/* Part of the libgomp plugin interface.  Run a kernel on device N (the number
1620   is actually ignored, we assume the FN_PTR has been mapped using the correct
1621   device) and pass it an array of pointers in VARS as a parameter.  The kernel
1622   is identified by FN_PTR which must point to a kernel_info structure.  */
1623
1624void
1625GOMP_OFFLOAD_run (int n __attribute__((unused)),
1626		  void *fn_ptr, void *vars, void **args)
1627{
1628  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
1629  struct GOMP_kernel_launch_attributes def;
1630  struct GOMP_kernel_launch_attributes *kla;
1631  if (!parse_target_attributes (args, &def, &kla))
1632    {
1633      HSA_DEBUG ("Will not run HSA kernel because the grid size is zero\n");
1634      return;
1635    }
1636  run_kernel (kernel, vars, kla);
1637}
1638
1639/* Information to be passed to a thread running a kernel asycnronously.  */
1640
1641struct async_run_info
1642{
1643  int device;
1644  void *tgt_fn;
1645  void *tgt_vars;
1646  void **args;
1647  void *async_data;
1648};
1649
1650/* Thread routine to run a kernel asynchronously.  */
1651
1652static void *
1653run_kernel_asynchronously (void *thread_arg)
1654{
1655  struct async_run_info *info = (struct async_run_info *) thread_arg;
1656  int device = info->device;
1657  void *tgt_fn = info->tgt_fn;
1658  void *tgt_vars = info->tgt_vars;
1659  void **args = info->args;
1660  void *async_data = info->async_data;
1661
1662  free (info);
1663  GOMP_OFFLOAD_run (device, tgt_fn, tgt_vars, args);
1664  GOMP_PLUGIN_target_task_completion (async_data);
1665  return NULL;
1666}
1667
1668/* Part of the libgomp plugin interface.  Run a kernel like GOMP_OFFLOAD_run
1669   does, but asynchronously and call GOMP_PLUGIN_target_task_completion when it
1670   has finished.  */
1671
1672void
1673GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
1674			void **args, void *async_data)
1675{
1676  pthread_t pt;
1677  struct async_run_info *info;
1678  HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
1679  info = GOMP_PLUGIN_malloc (sizeof (struct async_run_info));
1680
1681  info->device = device;
1682  info->tgt_fn = tgt_fn;
1683  info->tgt_vars = tgt_vars;
1684  info->args = args;
1685  info->async_data = async_data;
1686
1687  int err = pthread_create (&pt, NULL, &run_kernel_asynchronously, info);
1688  if (err != 0)
1689    GOMP_PLUGIN_fatal ("HSA asynchronous thread creation failed: %s",
1690		       strerror (err));
1691  err = pthread_detach (pt);
1692  if (err != 0)
1693    GOMP_PLUGIN_fatal ("Failed to detach a thread to run HSA kernel "
1694		       "asynchronously: %s", strerror (err));
1695}
1696
1697/* Deinitialize all information associated with MODULE and kernels within
1698   it.  Return TRUE on success.  */
1699
1700static bool
1701destroy_module (struct module_info *module)
1702{
1703  int i;
1704  for (i = 0; i < module->kernel_count; i++)
1705    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
1706      {
1707	GOMP_PLUGIN_error ("Failed to destroy an HSA kernel initialization "
1708			   "mutex");
1709	return false;
1710      }
1711  return true;
1712}
1713
1714/* Part of the libgomp plugin interface.  Unload BRIG module described by
1715   struct brig_image_desc in TARGET_DATA from agent number N.  Return
1716   TRUE on success.  */
1717
1718bool
1719GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
1720{
1721  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
1722    {
1723      GOMP_PLUGIN_error ("Offload data incompatible with HSA plugin"
1724			 " (expected %u, received %u)",
1725			 GOMP_VERSION_HSA, GOMP_VERSION_DEV (version));
1726      return false;
1727    }
1728
1729  struct agent_info *agent;
1730  agent = get_agent_info (n);
1731  if (!agent)
1732    return false;
1733
1734  if (pthread_rwlock_wrlock (&agent->modules_rwlock))
1735    {
1736      GOMP_PLUGIN_error ("Unable to write-lock an HSA agent rwlock");
1737      return false;
1738    }
1739  struct module_info *module = agent->first_module;
1740  while (module)
1741    {
1742      if (module->image_desc == target_data)
1743	break;
1744      module = module->next;
1745    }
1746  if (!module)
1747    {
1748      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
1749			 "loaded before");
1750      return false;
1751    }
1752
1753  remove_module_from_agent (agent, module);
1754  if (!destroy_module (module))
1755    return false;
1756  free (module);
1757  if (!destroy_hsa_program (agent))
1758    return false;
1759  if (pthread_rwlock_unlock (&agent->modules_rwlock))
1760    {
1761      GOMP_PLUGIN_error ("Unable to unlock an HSA agent rwlock");
1762      return false;
1763    }
1764  return true;
1765}
1766
1767/* Part of the libgomp plugin interface.  Deinitialize all information and
1768   status associated with agent number N.  We do not attempt any
1769   synchronization, assuming the user and libgomp will not attempt
1770   deinitialization of a device that is in any way being used at the same
1771   time.  Return TRUE on success.  */
1772
1773bool
1774GOMP_OFFLOAD_fini_device (int n)
1775{
1776  struct agent_info *agent = get_agent_info (n);
1777  if (!agent)
1778    return false;
1779
1780  if (!agent->initialized)
1781    return true;
1782
1783  struct module_info *next_module = agent->first_module;
1784  while (next_module)
1785    {
1786      struct module_info *module = next_module;
1787      next_module = module->next;
1788      if (!destroy_module (module))
1789	return false;
1790      free (module);
1791    }
1792  agent->first_module = NULL;
1793  if (!destroy_hsa_program (agent))
1794    return false;
1795
1796  release_agent_shared_libraries (agent);
1797
1798  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->command_q);
1799  if (status != HSA_STATUS_SUCCESS)
1800    return hsa_error ("Error destroying command queue", status);
1801  status = hsa_fns.hsa_queue_destroy_fn (agent->kernel_dispatch_command_q);
1802  if (status != HSA_STATUS_SUCCESS)
1803    return hsa_error ("Error destroying kernel dispatch command queue", status);
1804  if (pthread_mutex_destroy (&agent->prog_mutex))
1805    {
1806      GOMP_PLUGIN_error ("Failed to destroy an HSA agent program mutex");
1807      return false;
1808    }
1809  if (pthread_rwlock_destroy (&agent->modules_rwlock))
1810    {
1811      GOMP_PLUGIN_error ("Failed to destroy an HSA agent rwlock");
1812      return false;
1813    }
1814  agent->initialized = false;
1815  return true;
1816}
1817
1818/* Part of the libgomp plugin interface.  Not implemented as it is not required
1819   for HSA.  */
1820
1821void *
1822GOMP_OFFLOAD_alloc (int ord, size_t size)
1823{
1824  GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_alloc is not implemented because "
1825		     "it should never be called");
1826  return NULL;
1827}
1828
1829/* Part of the libgomp plugin interface.  Not implemented as it is not required
1830   for HSA.  */
1831
1832bool
1833GOMP_OFFLOAD_free (int ord, void *ptr)
1834{
1835  GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_free is not implemented because "
1836		     "it should never be called");
1837  return false;
1838}
1839
1840/* Part of the libgomp plugin interface.  Not implemented as it is not required
1841   for HSA.  */
1842
1843bool
1844GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1845{
1846  GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_dev2host is not implemented because "
1847		     "it should never be called");
1848  return false;
1849}
1850
1851/* Part of the libgomp plugin interface.  Not implemented as it is not required
1852   for HSA.  */
1853
1854bool
1855GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1856{
1857  GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_host2dev is not implemented because "
1858		     "it should never be called");
1859  return false;
1860}
1861
1862/* Part of the libgomp plugin interface.  Not implemented as it is not required
1863   for HSA.  */
1864
1865bool
1866GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1867{
1868  GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_dev2dev is not implemented because "
1869		     "it should never be called");
1870  return false;
1871}
1872